2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
52 * Electrostatics interaction: Coulomb
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
/*
 * Nonbonded kernel for Water4-Water4 pairs, computing potential and force:
 * plain Coulomb electrostatics between sites 1-3 of the two waters (the
 * nine qqXY/rinvXY pairs) and cubic-spline tabulated Van der Waals between
 * site 0 of each water (c6_00/c12_00 with the vftab table).
 *
 * nlist        neighbor list; jindex, shift and iinr[0] are read from it here.
 * xx, ff       coordinate and force arrays. NOTE(review): their use is not
 *              visible in this excerpt - presumably x and f are derived
 *              from them; confirm against the complete file.
 * fr           force record; supplies shift_vec, fshift, epsfac and ntype.
 * mdatoms      supplies chargeA and typeA.
 * kernel_data  supplies the VdW table (table_vdw->data, table_vdw->scale) and
 *              receives the summed energies (energygrp_elec, energygrp_vdw).
 *              It is declared gmx_unused but is read below.
 * nrnb         flop counter, updated through inc_nrnb() at the end.
 *
 * NOTE(review): several names used below (nri, jjnr, x, f, charge, vdwparam,
 * vftab, vfitab, scratch, ...) have no visible declaration or assignment in
 * this excerpt, and fscal is read without a visible assignment from
 * fvdw/felec. Confirm against the complete file before changing any code.
 */
58 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
118 __m128i ifour = _mm_set1_epi32(4);
119 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
121 __m128 dummy_mask,cutoff_mask;
122 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
123 __m128 one = _mm_set1_ps(1.0);
124 __m128 two = _mm_set1_ps(2.0);
130 jindex = nlist->jindex;
132 shiftidx = nlist->shift;
134 shiftvec = fr->shift_vec[0];
135 fshift = fr->fshift[0];
136 facel = _mm_set1_ps(fr->epsfac);
137 charge = mdatoms->chargeA;
138 nvdwtype = fr->ntype;
140 vdwtype = mdatoms->typeA;
142 vftab = kernel_data->table_vdw->data;
143 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
145 /* Setup water-specific parameters */
/* All parameters are taken from the first i water (nlist->iinr[0]); the j side reuses the same charges and VdW type. */
/* The i charges carry the facel prefactor, so each qqXY = iqX*jqY is a ready-to-use pair charge product. */
146 inr = nlist->iinr[0];
147 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
148 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
149 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
150 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
152 jq1 = _mm_set1_ps(charge[inr+1]);
153 jq2 = _mm_set1_ps(charge[inr+2]);
154 jq3 = _mm_set1_ps(charge[inr+3]);
155 vdwjidx0A = 2*vdwtype[inr+0];
156 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
157 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
158 qq11 = _mm_mul_ps(iq1,jq1);
159 qq12 = _mm_mul_ps(iq1,jq2);
160 qq13 = _mm_mul_ps(iq1,jq3);
161 qq21 = _mm_mul_ps(iq2,jq1);
162 qq22 = _mm_mul_ps(iq2,jq2);
163 qq23 = _mm_mul_ps(iq2,jq3);
164 qq31 = _mm_mul_ps(iq3,jq1);
165 qq32 = _mm_mul_ps(iq3,jq2);
166 qq33 = _mm_mul_ps(iq3,jq3);
168 /* Avoid stupid compiler warnings */
169 jnrA = jnrB = jnrC = jnrD = 0;
178 for(iidx=0;iidx<4*DIM;iidx++)
183 /* Start outer loop over neighborlists */
184 for(iidx=0; iidx<nri; iidx++)
186 /* Load shift vector for this list */
187 i_shift_offset = DIM*shiftidx[iidx];
189 /* Load limits for loop over neighbors */
190 j_index_start = jindex[iidx];
191 j_index_end = jindex[iidx+1];
193 /* Get outer coordinate index */
195 i_coord_offset = DIM*inr;
197 /* Load i particle coords and add shift vector */
198 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
199 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Zero the i-water force accumulators for this neighbor list entry. */
201 fix0 = _mm_setzero_ps();
202 fiy0 = _mm_setzero_ps();
203 fiz0 = _mm_setzero_ps();
204 fix1 = _mm_setzero_ps();
205 fiy1 = _mm_setzero_ps();
206 fiz1 = _mm_setzero_ps();
207 fix2 = _mm_setzero_ps();
208 fiy2 = _mm_setzero_ps();
209 fiz2 = _mm_setzero_ps();
210 fix3 = _mm_setzero_ps();
211 fiy3 = _mm_setzero_ps();
212 fiz3 = _mm_setzero_ps();
214 /* Reset potential sums */
215 velecsum = _mm_setzero_ps();
216 vvdwsum = _mm_setzero_ps();
218 /* Start inner kernel loop */
/* Handles full groups of four j entries: the loop only continues while the fourth entry of the group is non-negative. */
219 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
222 /* Get j neighbor index, and coordinate index */
227 j_coord_offsetA = DIM*jnrA;
228 j_coord_offsetB = DIM*jnrB;
229 j_coord_offsetC = DIM*jnrC;
230 j_coord_offsetD = DIM*jnrD;
232 /* load j atom coordinates */
233 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
234 x+j_coord_offsetC,x+j_coord_offsetD,
235 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
236 &jy2,&jz2,&jx3,&jy3,&jz3);
238 /* Calculate displacement vector */
239 dx00 = _mm_sub_ps(ix0,jx0);
240 dy00 = _mm_sub_ps(iy0,jy0);
241 dz00 = _mm_sub_ps(iz0,jz0);
242 dx11 = _mm_sub_ps(ix1,jx1);
243 dy11 = _mm_sub_ps(iy1,jy1);
244 dz11 = _mm_sub_ps(iz1,jz1);
245 dx12 = _mm_sub_ps(ix1,jx2);
246 dy12 = _mm_sub_ps(iy1,jy2);
247 dz12 = _mm_sub_ps(iz1,jz2);
248 dx13 = _mm_sub_ps(ix1,jx3);
249 dy13 = _mm_sub_ps(iy1,jy3);
250 dz13 = _mm_sub_ps(iz1,jz3);
251 dx21 = _mm_sub_ps(ix2,jx1);
252 dy21 = _mm_sub_ps(iy2,jy1);
253 dz21 = _mm_sub_ps(iz2,jz1);
254 dx22 = _mm_sub_ps(ix2,jx2);
255 dy22 = _mm_sub_ps(iy2,jy2);
256 dz22 = _mm_sub_ps(iz2,jz2);
257 dx23 = _mm_sub_ps(ix2,jx3);
258 dy23 = _mm_sub_ps(iy2,jy3);
259 dz23 = _mm_sub_ps(iz2,jz3);
260 dx31 = _mm_sub_ps(ix3,jx1);
261 dy31 = _mm_sub_ps(iy3,jy1);
262 dz31 = _mm_sub_ps(iz3,jz1);
263 dx32 = _mm_sub_ps(ix3,jx2);
264 dy32 = _mm_sub_ps(iy3,jy2);
265 dz32 = _mm_sub_ps(iz3,jz2);
266 dx33 = _mm_sub_ps(ix3,jx3);
267 dy33 = _mm_sub_ps(iy3,jy3);
268 dz33 = _mm_sub_ps(iz3,jz3);
270 /* Calculate squared distance and things based on it */
271 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
272 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
273 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
274 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
275 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
276 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
277 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
278 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
279 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
280 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
282 rinv00 = gmx_mm_invsqrt_ps(rsq00);
283 rinv11 = gmx_mm_invsqrt_ps(rsq11);
284 rinv12 = gmx_mm_invsqrt_ps(rsq12);
285 rinv13 = gmx_mm_invsqrt_ps(rsq13);
286 rinv21 = gmx_mm_invsqrt_ps(rsq21);
287 rinv22 = gmx_mm_invsqrt_ps(rsq22);
288 rinv23 = gmx_mm_invsqrt_ps(rsq23);
289 rinv31 = gmx_mm_invsqrt_ps(rsq31);
290 rinv32 = gmx_mm_invsqrt_ps(rsq32);
291 rinv33 = gmx_mm_invsqrt_ps(rsq33);
/* rinvsq is only formed for the nine Coulomb pairs; the 00 pair uses the table and needs rinv00 only. */
293 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
294 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
295 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
296 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
297 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
298 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
299 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
300 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
301 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
/* Zero the j-water force accumulators for this group of four j waters. */
303 fjx0 = _mm_setzero_ps();
304 fjy0 = _mm_setzero_ps();
305 fjz0 = _mm_setzero_ps();
306 fjx1 = _mm_setzero_ps();
307 fjy1 = _mm_setzero_ps();
308 fjz1 = _mm_setzero_ps();
309 fjx2 = _mm_setzero_ps();
310 fjy2 = _mm_setzero_ps();
311 fjz2 = _mm_setzero_ps();
312 fjx3 = _mm_setzero_ps();
313 fjy3 = _mm_setzero_ps();
314 fjz3 = _mm_setzero_ps();
316 /**************************
317 * CALCULATE INTERACTIONS *
318 **************************/
320 r00 = _mm_mul_ps(rsq00,rinv00);
322 /* Calculate table index by multiplying r with table scale and truncate to integer */
323 rt = _mm_mul_ps(r00,vftabscale);
324 vfitab = _mm_cvttps_epi32(rt);
/* NOTE(review): vfeps is assigned twice in a row; presumably these are alternative branches of a preprocessor conditional that is not visible in this excerpt - confirm. */
326 vfeps = _mm_frcz_ps(rt);
328 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
330 twovfeps = _mm_add_ps(vfeps,vfeps);
/* Shift left by 3 multiplies the table index by 8: four floats (Y,F,G,H) are loaded for dispersion and, after adding ifour, four more for repulsion. */
331 vfitab = _mm_slli_epi32(vfitab,3);
333 /* CUBIC SPLINE TABLE DISPERSION */
334 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
335 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
336 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
337 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
338 _MM_TRANSPOSE4_PS(Y,F,G,H);
339 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
340 VV = _mm_macc_ps(vfeps,Fp,Y);
341 vvdw6 = _mm_mul_ps(c6_00,VV);
342 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
343 fvdw6 = _mm_mul_ps(c6_00,FF);
345 /* CUBIC SPLINE TABLE REPULSION */
346 vfitab = _mm_add_epi32(vfitab,ifour);
347 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
348 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
349 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
350 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
351 _MM_TRANSPOSE4_PS(Y,F,G,H);
352 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
353 VV = _mm_macc_ps(vfeps,Fp,Y);
354 vvdw12 = _mm_mul_ps(c12_00,VV);
355 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
356 fvdw12 = _mm_mul_ps(c12_00,FF);
357 vvdw = _mm_add_ps(vvdw12,vvdw6);
/* Sum both table force terms, scale by vftabscale*rinv00, and flip the sign bit to negate the result. */
358 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
360 /* Update potential sum for this i atom from the interaction with this j atom. */
361 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
365 /* Update vectorial force */
366 fix0 = _mm_macc_ps(dx00,fscal,fix0);
367 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
368 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
370 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
371 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
372 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
374 /**************************
375 * CALCULATE INTERACTIONS *
376 **************************/
378 /* COULOMB ELECTROSTATICS */
379 velec = _mm_mul_ps(qq11,rinv11);
380 felec = _mm_mul_ps(velec,rinvsq11);
382 /* Update potential sum for this i atom from the interaction with this j atom. */
383 velecsum = _mm_add_ps(velecsum,velec);
387 /* Update vectorial force */
388 fix1 = _mm_macc_ps(dx11,fscal,fix1);
389 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
390 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
392 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
393 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
394 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 /* COULOMB ELECTROSTATICS */
401 velec = _mm_mul_ps(qq12,rinv12);
402 felec = _mm_mul_ps(velec,rinvsq12);
404 /* Update potential sum for this i atom from the interaction with this j atom. */
405 velecsum = _mm_add_ps(velecsum,velec);
409 /* Update vectorial force */
410 fix1 = _mm_macc_ps(dx12,fscal,fix1);
411 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
412 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
414 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
415 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
416 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 /* COULOMB ELECTROSTATICS */
423 velec = _mm_mul_ps(qq13,rinv13);
424 felec = _mm_mul_ps(velec,rinvsq13);
426 /* Update potential sum for this i atom from the interaction with this j atom. */
427 velecsum = _mm_add_ps(velecsum,velec);
431 /* Update vectorial force */
432 fix1 = _mm_macc_ps(dx13,fscal,fix1);
433 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
434 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
436 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
437 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
438 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
440 /**************************
441 * CALCULATE INTERACTIONS *
442 **************************/
444 /* COULOMB ELECTROSTATICS */
445 velec = _mm_mul_ps(qq21,rinv21);
446 felec = _mm_mul_ps(velec,rinvsq21);
448 /* Update potential sum for this i atom from the interaction with this j atom. */
449 velecsum = _mm_add_ps(velecsum,velec);
453 /* Update vectorial force */
454 fix2 = _mm_macc_ps(dx21,fscal,fix2);
455 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
456 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
458 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
459 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
460 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 /* COULOMB ELECTROSTATICS */
467 velec = _mm_mul_ps(qq22,rinv22);
468 felec = _mm_mul_ps(velec,rinvsq22);
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velecsum = _mm_add_ps(velecsum,velec);
475 /* Update vectorial force */
476 fix2 = _mm_macc_ps(dx22,fscal,fix2);
477 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
478 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
480 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
481 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
482 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
484 /**************************
485 * CALCULATE INTERACTIONS *
486 **************************/
488 /* COULOMB ELECTROSTATICS */
489 velec = _mm_mul_ps(qq23,rinv23);
490 felec = _mm_mul_ps(velec,rinvsq23);
492 /* Update potential sum for this i atom from the interaction with this j atom. */
493 velecsum = _mm_add_ps(velecsum,velec);
497 /* Update vectorial force */
498 fix2 = _mm_macc_ps(dx23,fscal,fix2);
499 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
500 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
502 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
503 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
504 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
506 /**************************
507 * CALCULATE INTERACTIONS *
508 **************************/
510 /* COULOMB ELECTROSTATICS */
511 velec = _mm_mul_ps(qq31,rinv31);
512 felec = _mm_mul_ps(velec,rinvsq31);
514 /* Update potential sum for this i atom from the interaction with this j atom. */
515 velecsum = _mm_add_ps(velecsum,velec);
519 /* Update vectorial force */
520 fix3 = _mm_macc_ps(dx31,fscal,fix3);
521 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
522 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
524 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
525 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
526 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
528 /**************************
529 * CALCULATE INTERACTIONS *
530 **************************/
532 /* COULOMB ELECTROSTATICS */
533 velec = _mm_mul_ps(qq32,rinv32);
534 felec = _mm_mul_ps(velec,rinvsq32);
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velecsum = _mm_add_ps(velecsum,velec);
541 /* Update vectorial force */
542 fix3 = _mm_macc_ps(dx32,fscal,fix3);
543 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
544 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
546 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
547 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
548 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
550 /**************************
551 * CALCULATE INTERACTIONS *
552 **************************/
554 /* COULOMB ELECTROSTATICS */
555 velec = _mm_mul_ps(qq33,rinv33);
556 felec = _mm_mul_ps(velec,rinvsq33);
558 /* Update potential sum for this i atom from the interaction with this j atom. */
559 velecsum = _mm_add_ps(velecsum,velec);
563 /* Update vectorial force */
564 fix3 = _mm_macc_ps(dx33,fscal,fix3);
565 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
566 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
568 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
569 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
570 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
/* Apply the accumulated j forces to the four j waters in f. */
572 fjptrA = f+j_coord_offsetA;
573 fjptrB = f+j_coord_offsetB;
574 fjptrC = f+j_coord_offsetC;
575 fjptrD = f+j_coord_offsetD;
577 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
578 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
579 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
581 /* Inner loop uses 341 flops */
/* Same computation again, now handling negative (dummy) jjnr entries: their indices are clamped to 0 for loads and their contributions are cleared with dummy_mask. */
587 /* Get j neighbor index, and coordinate index */
588 jnrlistA = jjnr[jidx];
589 jnrlistB = jjnr[jidx+1];
590 jnrlistC = jjnr[jidx+2];
591 jnrlistD = jjnr[jidx+3];
592 /* Sign of each element will be negative for non-real atoms.
593 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
594 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
596 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
597 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
598 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
599 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
600 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
601 j_coord_offsetA = DIM*jnrA;
602 j_coord_offsetB = DIM*jnrB;
603 j_coord_offsetC = DIM*jnrC;
604 j_coord_offsetD = DIM*jnrD;
606 /* load j atom coordinates */
607 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
608 x+j_coord_offsetC,x+j_coord_offsetD,
609 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
610 &jy2,&jz2,&jx3,&jy3,&jz3);
612 /* Calculate displacement vector */
613 dx00 = _mm_sub_ps(ix0,jx0);
614 dy00 = _mm_sub_ps(iy0,jy0);
615 dz00 = _mm_sub_ps(iz0,jz0);
616 dx11 = _mm_sub_ps(ix1,jx1);
617 dy11 = _mm_sub_ps(iy1,jy1);
618 dz11 = _mm_sub_ps(iz1,jz1);
619 dx12 = _mm_sub_ps(ix1,jx2);
620 dy12 = _mm_sub_ps(iy1,jy2);
621 dz12 = _mm_sub_ps(iz1,jz2);
622 dx13 = _mm_sub_ps(ix1,jx3);
623 dy13 = _mm_sub_ps(iy1,jy3);
624 dz13 = _mm_sub_ps(iz1,jz3);
625 dx21 = _mm_sub_ps(ix2,jx1);
626 dy21 = _mm_sub_ps(iy2,jy1);
627 dz21 = _mm_sub_ps(iz2,jz1);
628 dx22 = _mm_sub_ps(ix2,jx2);
629 dy22 = _mm_sub_ps(iy2,jy2);
630 dz22 = _mm_sub_ps(iz2,jz2);
631 dx23 = _mm_sub_ps(ix2,jx3);
632 dy23 = _mm_sub_ps(iy2,jy3);
633 dz23 = _mm_sub_ps(iz2,jz3);
634 dx31 = _mm_sub_ps(ix3,jx1);
635 dy31 = _mm_sub_ps(iy3,jy1);
636 dz31 = _mm_sub_ps(iz3,jz1);
637 dx32 = _mm_sub_ps(ix3,jx2);
638 dy32 = _mm_sub_ps(iy3,jy2);
639 dz32 = _mm_sub_ps(iz3,jz2);
640 dx33 = _mm_sub_ps(ix3,jx3);
641 dy33 = _mm_sub_ps(iy3,jy3);
642 dz33 = _mm_sub_ps(iz3,jz3);
644 /* Calculate squared distance and things based on it */
645 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
646 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
647 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
648 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
649 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
650 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
651 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
652 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
653 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
654 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
656 rinv00 = gmx_mm_invsqrt_ps(rsq00);
657 rinv11 = gmx_mm_invsqrt_ps(rsq11);
658 rinv12 = gmx_mm_invsqrt_ps(rsq12);
659 rinv13 = gmx_mm_invsqrt_ps(rsq13);
660 rinv21 = gmx_mm_invsqrt_ps(rsq21);
661 rinv22 = gmx_mm_invsqrt_ps(rsq22);
662 rinv23 = gmx_mm_invsqrt_ps(rsq23);
663 rinv31 = gmx_mm_invsqrt_ps(rsq31);
664 rinv32 = gmx_mm_invsqrt_ps(rsq32);
665 rinv33 = gmx_mm_invsqrt_ps(rsq33);
667 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
668 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
669 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
670 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
671 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
672 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
673 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
674 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
675 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
677 fjx0 = _mm_setzero_ps();
678 fjy0 = _mm_setzero_ps();
679 fjz0 = _mm_setzero_ps();
680 fjx1 = _mm_setzero_ps();
681 fjy1 = _mm_setzero_ps();
682 fjz1 = _mm_setzero_ps();
683 fjx2 = _mm_setzero_ps();
684 fjy2 = _mm_setzero_ps();
685 fjz2 = _mm_setzero_ps();
686 fjx3 = _mm_setzero_ps();
687 fjy3 = _mm_setzero_ps();
688 fjz3 = _mm_setzero_ps();
690 /**************************
691 * CALCULATE INTERACTIONS *
692 **************************/
694 r00 = _mm_mul_ps(rsq00,rinv00);
/* Clear r for dummy lanes, so rt and the table index derived from it are 0 there. */
695 r00 = _mm_andnot_ps(dummy_mask,r00);
697 /* Calculate table index by multiplying r with table scale and truncate to integer */
698 rt = _mm_mul_ps(r00,vftabscale);
699 vfitab = _mm_cvttps_epi32(rt);
/* NOTE(review): vfeps is assigned twice in a row here as well; presumably alternative preprocessor branches not visible in this excerpt - confirm. */
701 vfeps = _mm_frcz_ps(rt);
703 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
705 twovfeps = _mm_add_ps(vfeps,vfeps);
706 vfitab = _mm_slli_epi32(vfitab,3);
708 /* CUBIC SPLINE TABLE DISPERSION */
709 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
710 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
711 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
712 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
713 _MM_TRANSPOSE4_PS(Y,F,G,H);
714 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
715 VV = _mm_macc_ps(vfeps,Fp,Y);
716 vvdw6 = _mm_mul_ps(c6_00,VV);
717 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
718 fvdw6 = _mm_mul_ps(c6_00,FF);
720 /* CUBIC SPLINE TABLE REPULSION */
721 vfitab = _mm_add_epi32(vfitab,ifour);
722 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
723 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
724 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
725 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
726 _MM_TRANSPOSE4_PS(Y,F,G,H);
727 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
728 VV = _mm_macc_ps(vfeps,Fp,Y);
729 vvdw12 = _mm_mul_ps(c12_00,VV);
730 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
731 fvdw12 = _mm_mul_ps(c12_00,FF);
732 vvdw = _mm_add_ps(vvdw12,vvdw6);
733 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
735 /* Update potential sum for this i atom from the interaction with this j atom. */
736 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
737 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
/* Clear the force scalar for dummy lanes before it is accumulated; the same masking is repeated for every pair below. */
741 fscal = _mm_andnot_ps(dummy_mask,fscal);
743 /* Update vectorial force */
744 fix0 = _mm_macc_ps(dx00,fscal,fix0);
745 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
746 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
748 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
749 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
750 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
752 /**************************
753 * CALCULATE INTERACTIONS *
754 **************************/
756 /* COULOMB ELECTROSTATICS */
757 velec = _mm_mul_ps(qq11,rinv11);
758 felec = _mm_mul_ps(velec,rinvsq11);
760 /* Update potential sum for this i atom from the interaction with this j atom. */
761 velec = _mm_andnot_ps(dummy_mask,velec);
762 velecsum = _mm_add_ps(velecsum,velec);
766 fscal = _mm_andnot_ps(dummy_mask,fscal);
768 /* Update vectorial force */
769 fix1 = _mm_macc_ps(dx11,fscal,fix1);
770 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
771 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
773 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
774 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
775 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
777 /**************************
778 * CALCULATE INTERACTIONS *
779 **************************/
781 /* COULOMB ELECTROSTATICS */
782 velec = _mm_mul_ps(qq12,rinv12);
783 felec = _mm_mul_ps(velec,rinvsq12);
785 /* Update potential sum for this i atom from the interaction with this j atom. */
786 velec = _mm_andnot_ps(dummy_mask,velec);
787 velecsum = _mm_add_ps(velecsum,velec);
791 fscal = _mm_andnot_ps(dummy_mask,fscal);
793 /* Update vectorial force */
794 fix1 = _mm_macc_ps(dx12,fscal,fix1);
795 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
796 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
798 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
799 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
800 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
802 /**************************
803 * CALCULATE INTERACTIONS *
804 **************************/
806 /* COULOMB ELECTROSTATICS */
807 velec = _mm_mul_ps(qq13,rinv13);
808 felec = _mm_mul_ps(velec,rinvsq13);
810 /* Update potential sum for this i atom from the interaction with this j atom. */
811 velec = _mm_andnot_ps(dummy_mask,velec);
812 velecsum = _mm_add_ps(velecsum,velec);
816 fscal = _mm_andnot_ps(dummy_mask,fscal);
818 /* Update vectorial force */
819 fix1 = _mm_macc_ps(dx13,fscal,fix1);
820 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
821 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
823 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
824 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
825 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
827 /**************************
828 * CALCULATE INTERACTIONS *
829 **************************/
831 /* COULOMB ELECTROSTATICS */
832 velec = _mm_mul_ps(qq21,rinv21);
833 felec = _mm_mul_ps(velec,rinvsq21);
835 /* Update potential sum for this i atom from the interaction with this j atom. */
836 velec = _mm_andnot_ps(dummy_mask,velec);
837 velecsum = _mm_add_ps(velecsum,velec);
841 fscal = _mm_andnot_ps(dummy_mask,fscal);
843 /* Update vectorial force */
844 fix2 = _mm_macc_ps(dx21,fscal,fix2);
845 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
846 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
848 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
849 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
850 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
852 /**************************
853 * CALCULATE INTERACTIONS *
854 **************************/
856 /* COULOMB ELECTROSTATICS */
857 velec = _mm_mul_ps(qq22,rinv22);
858 felec = _mm_mul_ps(velec,rinvsq22);
860 /* Update potential sum for this i atom from the interaction with this j atom. */
861 velec = _mm_andnot_ps(dummy_mask,velec);
862 velecsum = _mm_add_ps(velecsum,velec);
866 fscal = _mm_andnot_ps(dummy_mask,fscal);
868 /* Update vectorial force */
869 fix2 = _mm_macc_ps(dx22,fscal,fix2);
870 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
871 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
873 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
874 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
875 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
877 /**************************
878 * CALCULATE INTERACTIONS *
879 **************************/
881 /* COULOMB ELECTROSTATICS */
882 velec = _mm_mul_ps(qq23,rinv23);
883 felec = _mm_mul_ps(velec,rinvsq23);
885 /* Update potential sum for this i atom from the interaction with this j atom. */
886 velec = _mm_andnot_ps(dummy_mask,velec);
887 velecsum = _mm_add_ps(velecsum,velec);
891 fscal = _mm_andnot_ps(dummy_mask,fscal);
893 /* Update vectorial force */
894 fix2 = _mm_macc_ps(dx23,fscal,fix2);
895 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
896 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
898 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
899 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
900 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
902 /**************************
903 * CALCULATE INTERACTIONS *
904 **************************/
906 /* COULOMB ELECTROSTATICS */
907 velec = _mm_mul_ps(qq31,rinv31);
908 felec = _mm_mul_ps(velec,rinvsq31);
910 /* Update potential sum for this i atom from the interaction with this j atom. */
911 velec = _mm_andnot_ps(dummy_mask,velec);
912 velecsum = _mm_add_ps(velecsum,velec);
916 fscal = _mm_andnot_ps(dummy_mask,fscal);
918 /* Update vectorial force */
919 fix3 = _mm_macc_ps(dx31,fscal,fix3);
920 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
921 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
923 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
924 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
925 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
927 /**************************
928 * CALCULATE INTERACTIONS *
929 **************************/
931 /* COULOMB ELECTROSTATICS */
932 velec = _mm_mul_ps(qq32,rinv32);
933 felec = _mm_mul_ps(velec,rinvsq32);
935 /* Update potential sum for this i atom from the interaction with this j atom. */
936 velec = _mm_andnot_ps(dummy_mask,velec);
937 velecsum = _mm_add_ps(velecsum,velec);
941 fscal = _mm_andnot_ps(dummy_mask,fscal);
943 /* Update vectorial force */
944 fix3 = _mm_macc_ps(dx32,fscal,fix3);
945 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
946 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
948 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
949 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
950 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
952 /**************************
953 * CALCULATE INTERACTIONS *
954 **************************/
956 /* COULOMB ELECTROSTATICS */
957 velec = _mm_mul_ps(qq33,rinv33);
958 felec = _mm_mul_ps(velec,rinvsq33);
960 /* Update potential sum for this i atom from the interaction with this j atom. */
961 velec = _mm_andnot_ps(dummy_mask,velec);
962 velecsum = _mm_add_ps(velecsum,velec);
966 fscal = _mm_andnot_ps(dummy_mask,fscal);
968 /* Update vectorial force */
969 fix3 = _mm_macc_ps(dx33,fscal,fix3);
970 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
971 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
973 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
974 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
975 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
/* Dummy entries decrement the scratch buffer instead of f, so no real atom receives their (masked) force. */
977 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
978 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
979 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
980 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
982 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
983 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
984 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
986 /* Inner loop uses 342 flops */
989 /* End of innermost loop */
/* Add the accumulated i-water forces to f and to the shift-force array for this list entry. */
991 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
992 f+i_coord_offset,fshift+i_shift_offset);
995 /* Update potential energies */
996 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
997 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
999 /* Increment number of inner iterations */
1000 inneriter += j_index_end - j_index_start;
1002 /* Outer loop uses 26 flops */
1005 /* Increment number of outer iterations */
1008 /* Update outer/inner flops */
/* The count uses 342 flops per inner iteration (the masked-pass figure) for every inner iteration. */
1010 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*342);
1013 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1014 * Electrostatics interaction: Coulomb
1015 * VdW interaction: CubicSplineTable
1016 * Geometry: Water4-Water4
1017 * Calculate force/pot: Force
1020 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1021 (t_nblist * gmx_restrict nlist,
1022 rvec * gmx_restrict xx,
1023 rvec * gmx_restrict ff,
1024 t_forcerec * gmx_restrict fr,
1025 t_mdatoms * gmx_restrict mdatoms,
1026 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1027 t_nrnb * gmx_restrict nrnb)
1029 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1030 * just 0 for non-waters.
1031 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1032 * jnr indices corresponding to data put in the four positions in the SIMD register.
1034 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1035 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1036 int jnrA,jnrB,jnrC,jnrD;
1037 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1038 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1039 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1040 real rcutoff_scalar;
1041 real *shiftvec,*fshift,*x,*f;
1042 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1043 real scratch[4*DIM];
1044 __m128 fscal,rcutoff,rcutoff2,jidxall;
1046 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1048 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1050 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1052 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1053 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1054 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1055 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1056 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1057 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1058 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1059 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1060 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1061 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1062 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1063 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1064 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1065 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1066 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1067 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1068 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1069 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1070 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1071 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1074 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1077 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1078 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1080 __m128i ifour = _mm_set1_epi32(4);
1081 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1083 __m128 dummy_mask,cutoff_mask;
1084 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1085 __m128 one = _mm_set1_ps(1.0);
1086 __m128 two = _mm_set1_ps(2.0);
1092 jindex = nlist->jindex;
1094 shiftidx = nlist->shift;
1096 shiftvec = fr->shift_vec[0];
1097 fshift = fr->fshift[0];
1098 facel = _mm_set1_ps(fr->epsfac);
1099 charge = mdatoms->chargeA;
1100 nvdwtype = fr->ntype;
1101 vdwparam = fr->nbfp;
1102 vdwtype = mdatoms->typeA;
1104 vftab = kernel_data->table_vdw->data;
1105 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
1107 /* Setup water-specific parameters */
1108 inr = nlist->iinr[0];
1109 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1110 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1111 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1112 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1114 jq1 = _mm_set1_ps(charge[inr+1]);
1115 jq2 = _mm_set1_ps(charge[inr+2]);
1116 jq3 = _mm_set1_ps(charge[inr+3]);
1117 vdwjidx0A = 2*vdwtype[inr+0];
1118 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1119 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1120 qq11 = _mm_mul_ps(iq1,jq1);
1121 qq12 = _mm_mul_ps(iq1,jq2);
1122 qq13 = _mm_mul_ps(iq1,jq3);
1123 qq21 = _mm_mul_ps(iq2,jq1);
1124 qq22 = _mm_mul_ps(iq2,jq2);
1125 qq23 = _mm_mul_ps(iq2,jq3);
1126 qq31 = _mm_mul_ps(iq3,jq1);
1127 qq32 = _mm_mul_ps(iq3,jq2);
1128 qq33 = _mm_mul_ps(iq3,jq3);
1130 /* Avoid stupid compiler warnings */
1131 jnrA = jnrB = jnrC = jnrD = 0;
1132 j_coord_offsetA = 0;
1133 j_coord_offsetB = 0;
1134 j_coord_offsetC = 0;
1135 j_coord_offsetD = 0;
1140 for(iidx=0;iidx<4*DIM;iidx++)
1142 scratch[iidx] = 0.0;
1145 /* Start outer loop over neighborlists */
1146 for(iidx=0; iidx<nri; iidx++)
1148 /* Load shift vector for this list */
1149 i_shift_offset = DIM*shiftidx[iidx];
1151 /* Load limits for loop over neighbors */
1152 j_index_start = jindex[iidx];
1153 j_index_end = jindex[iidx+1];
1155 /* Get outer coordinate index */
1157 i_coord_offset = DIM*inr;
1159 /* Load i particle coords and add shift vector */
1160 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1161 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1163 fix0 = _mm_setzero_ps();
1164 fiy0 = _mm_setzero_ps();
1165 fiz0 = _mm_setzero_ps();
1166 fix1 = _mm_setzero_ps();
1167 fiy1 = _mm_setzero_ps();
1168 fiz1 = _mm_setzero_ps();
1169 fix2 = _mm_setzero_ps();
1170 fiy2 = _mm_setzero_ps();
1171 fiz2 = _mm_setzero_ps();
1172 fix3 = _mm_setzero_ps();
1173 fiy3 = _mm_setzero_ps();
1174 fiz3 = _mm_setzero_ps();
1176 /* Start inner kernel loop */
1177 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1180 /* Get j neighbor index, and coordinate index */
1182 jnrB = jjnr[jidx+1];
1183 jnrC = jjnr[jidx+2];
1184 jnrD = jjnr[jidx+3];
1185 j_coord_offsetA = DIM*jnrA;
1186 j_coord_offsetB = DIM*jnrB;
1187 j_coord_offsetC = DIM*jnrC;
1188 j_coord_offsetD = DIM*jnrD;
1190 /* load j atom coordinates */
1191 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1192 x+j_coord_offsetC,x+j_coord_offsetD,
1193 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1194 &jy2,&jz2,&jx3,&jy3,&jz3);
1196 /* Calculate displacement vector */
1197 dx00 = _mm_sub_ps(ix0,jx0);
1198 dy00 = _mm_sub_ps(iy0,jy0);
1199 dz00 = _mm_sub_ps(iz0,jz0);
1200 dx11 = _mm_sub_ps(ix1,jx1);
1201 dy11 = _mm_sub_ps(iy1,jy1);
1202 dz11 = _mm_sub_ps(iz1,jz1);
1203 dx12 = _mm_sub_ps(ix1,jx2);
1204 dy12 = _mm_sub_ps(iy1,jy2);
1205 dz12 = _mm_sub_ps(iz1,jz2);
1206 dx13 = _mm_sub_ps(ix1,jx3);
1207 dy13 = _mm_sub_ps(iy1,jy3);
1208 dz13 = _mm_sub_ps(iz1,jz3);
1209 dx21 = _mm_sub_ps(ix2,jx1);
1210 dy21 = _mm_sub_ps(iy2,jy1);
1211 dz21 = _mm_sub_ps(iz2,jz1);
1212 dx22 = _mm_sub_ps(ix2,jx2);
1213 dy22 = _mm_sub_ps(iy2,jy2);
1214 dz22 = _mm_sub_ps(iz2,jz2);
1215 dx23 = _mm_sub_ps(ix2,jx3);
1216 dy23 = _mm_sub_ps(iy2,jy3);
1217 dz23 = _mm_sub_ps(iz2,jz3);
1218 dx31 = _mm_sub_ps(ix3,jx1);
1219 dy31 = _mm_sub_ps(iy3,jy1);
1220 dz31 = _mm_sub_ps(iz3,jz1);
1221 dx32 = _mm_sub_ps(ix3,jx2);
1222 dy32 = _mm_sub_ps(iy3,jy2);
1223 dz32 = _mm_sub_ps(iz3,jz2);
1224 dx33 = _mm_sub_ps(ix3,jx3);
1225 dy33 = _mm_sub_ps(iy3,jy3);
1226 dz33 = _mm_sub_ps(iz3,jz3);
1228 /* Calculate squared distance and things based on it */
1229 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1230 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1231 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1232 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1233 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1234 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1235 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1236 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1237 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1238 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1240 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1241 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1242 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1243 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1244 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1245 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1246 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1247 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1248 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1249 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1251 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1252 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1253 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1254 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1255 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1256 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1257 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1258 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1259 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1261 fjx0 = _mm_setzero_ps();
1262 fjy0 = _mm_setzero_ps();
1263 fjz0 = _mm_setzero_ps();
1264 fjx1 = _mm_setzero_ps();
1265 fjy1 = _mm_setzero_ps();
1266 fjz1 = _mm_setzero_ps();
1267 fjx2 = _mm_setzero_ps();
1268 fjy2 = _mm_setzero_ps();
1269 fjz2 = _mm_setzero_ps();
1270 fjx3 = _mm_setzero_ps();
1271 fjy3 = _mm_setzero_ps();
1272 fjz3 = _mm_setzero_ps();
1274 /**************************
1275 * CALCULATE INTERACTIONS *
1276 **************************/
1278 r00 = _mm_mul_ps(rsq00,rinv00);
1280 /* Calculate table index by multiplying r with table scale and truncate to integer */
1281 rt = _mm_mul_ps(r00,vftabscale);
1282 vfitab = _mm_cvttps_epi32(rt);
1284 vfeps = _mm_frcz_ps(rt);
1286 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1288 twovfeps = _mm_add_ps(vfeps,vfeps);
1289 vfitab = _mm_slli_epi32(vfitab,3);
1291 /* CUBIC SPLINE TABLE DISPERSION */
1292 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1293 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1294 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1295 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1296 _MM_TRANSPOSE4_PS(Y,F,G,H);
1297 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1298 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1299 fvdw6 = _mm_mul_ps(c6_00,FF);
1301 /* CUBIC SPLINE TABLE REPULSION */
1302 vfitab = _mm_add_epi32(vfitab,ifour);
1303 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1304 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1305 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1306 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1307 _MM_TRANSPOSE4_PS(Y,F,G,H);
1308 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1309 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1310 fvdw12 = _mm_mul_ps(c12_00,FF);
1311 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1315 /* Update vectorial force */
1316 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1317 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1318 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1320 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1321 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1322 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1324 /**************************
1325 * CALCULATE INTERACTIONS *
1326 **************************/
1328 /* COULOMB ELECTROSTATICS */
1329 velec = _mm_mul_ps(qq11,rinv11);
1330 felec = _mm_mul_ps(velec,rinvsq11);
1334 /* Update vectorial force */
1335 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1336 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1337 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1339 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1340 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1341 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1343 /**************************
1344 * CALCULATE INTERACTIONS *
1345 **************************/
1347 /* COULOMB ELECTROSTATICS */
1348 velec = _mm_mul_ps(qq12,rinv12);
1349 felec = _mm_mul_ps(velec,rinvsq12);
1353 /* Update vectorial force */
1354 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1355 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1356 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1358 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1359 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1360 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1362 /**************************
1363 * CALCULATE INTERACTIONS *
1364 **************************/
1366 /* COULOMB ELECTROSTATICS */
1367 velec = _mm_mul_ps(qq13,rinv13);
1368 felec = _mm_mul_ps(velec,rinvsq13);
1372 /* Update vectorial force */
1373 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1374 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1375 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1377 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1378 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1379 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1381 /**************************
1382 * CALCULATE INTERACTIONS *
1383 **************************/
1385 /* COULOMB ELECTROSTATICS */
1386 velec = _mm_mul_ps(qq21,rinv21);
1387 felec = _mm_mul_ps(velec,rinvsq21);
1391 /* Update vectorial force */
1392 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1393 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1394 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1396 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1397 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1398 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1400 /**************************
1401 * CALCULATE INTERACTIONS *
1402 **************************/
1404 /* COULOMB ELECTROSTATICS */
1405 velec = _mm_mul_ps(qq22,rinv22);
1406 felec = _mm_mul_ps(velec,rinvsq22);
1410 /* Update vectorial force */
1411 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1412 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1413 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1415 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1416 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1417 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1419 /**************************
1420 * CALCULATE INTERACTIONS *
1421 **************************/
1423 /* COULOMB ELECTROSTATICS */
1424 velec = _mm_mul_ps(qq23,rinv23);
1425 felec = _mm_mul_ps(velec,rinvsq23);
1429 /* Update vectorial force */
1430 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1431 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1432 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1434 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1435 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1436 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1438 /**************************
1439 * CALCULATE INTERACTIONS *
1440 **************************/
1442 /* COULOMB ELECTROSTATICS */
1443 velec = _mm_mul_ps(qq31,rinv31);
1444 felec = _mm_mul_ps(velec,rinvsq31);
1448 /* Update vectorial force */
1449 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1450 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1451 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1453 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1454 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1455 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1457 /**************************
1458 * CALCULATE INTERACTIONS *
1459 **************************/
1461 /* COULOMB ELECTROSTATICS */
1462 velec = _mm_mul_ps(qq32,rinv32);
1463 felec = _mm_mul_ps(velec,rinvsq32);
1467 /* Update vectorial force */
1468 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1469 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1470 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1472 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1473 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1474 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1476 /**************************
1477 * CALCULATE INTERACTIONS *
1478 **************************/
1480 /* COULOMB ELECTROSTATICS */
1481 velec = _mm_mul_ps(qq33,rinv33);
1482 felec = _mm_mul_ps(velec,rinvsq33);
1486 /* Update vectorial force */
1487 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1488 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1489 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1491 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1492 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1493 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1495 fjptrA = f+j_coord_offsetA;
1496 fjptrB = f+j_coord_offsetB;
1497 fjptrC = f+j_coord_offsetC;
1498 fjptrD = f+j_coord_offsetD;
1500 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1501 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1502 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1504 /* Inner loop uses 324 flops */
1507 if(jidx<j_index_end)
1510 /* Get j neighbor index, and coordinate index */
1511 jnrlistA = jjnr[jidx];
1512 jnrlistB = jjnr[jidx+1];
1513 jnrlistC = jjnr[jidx+2];
1514 jnrlistD = jjnr[jidx+3];
1515 /* Sign of each element will be negative for non-real atoms.
1516 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1517 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1519 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1520 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1521 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1522 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1523 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1524 j_coord_offsetA = DIM*jnrA;
1525 j_coord_offsetB = DIM*jnrB;
1526 j_coord_offsetC = DIM*jnrC;
1527 j_coord_offsetD = DIM*jnrD;
1529 /* load j atom coordinates */
1530 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1531 x+j_coord_offsetC,x+j_coord_offsetD,
1532 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1533 &jy2,&jz2,&jx3,&jy3,&jz3);
1535 /* Calculate displacement vector */
1536 dx00 = _mm_sub_ps(ix0,jx0);
1537 dy00 = _mm_sub_ps(iy0,jy0);
1538 dz00 = _mm_sub_ps(iz0,jz0);
1539 dx11 = _mm_sub_ps(ix1,jx1);
1540 dy11 = _mm_sub_ps(iy1,jy1);
1541 dz11 = _mm_sub_ps(iz1,jz1);
1542 dx12 = _mm_sub_ps(ix1,jx2);
1543 dy12 = _mm_sub_ps(iy1,jy2);
1544 dz12 = _mm_sub_ps(iz1,jz2);
1545 dx13 = _mm_sub_ps(ix1,jx3);
1546 dy13 = _mm_sub_ps(iy1,jy3);
1547 dz13 = _mm_sub_ps(iz1,jz3);
1548 dx21 = _mm_sub_ps(ix2,jx1);
1549 dy21 = _mm_sub_ps(iy2,jy1);
1550 dz21 = _mm_sub_ps(iz2,jz1);
1551 dx22 = _mm_sub_ps(ix2,jx2);
1552 dy22 = _mm_sub_ps(iy2,jy2);
1553 dz22 = _mm_sub_ps(iz2,jz2);
1554 dx23 = _mm_sub_ps(ix2,jx3);
1555 dy23 = _mm_sub_ps(iy2,jy3);
1556 dz23 = _mm_sub_ps(iz2,jz3);
1557 dx31 = _mm_sub_ps(ix3,jx1);
1558 dy31 = _mm_sub_ps(iy3,jy1);
1559 dz31 = _mm_sub_ps(iz3,jz1);
1560 dx32 = _mm_sub_ps(ix3,jx2);
1561 dy32 = _mm_sub_ps(iy3,jy2);
1562 dz32 = _mm_sub_ps(iz3,jz2);
1563 dx33 = _mm_sub_ps(ix3,jx3);
1564 dy33 = _mm_sub_ps(iy3,jy3);
1565 dz33 = _mm_sub_ps(iz3,jz3);
1567 /* Calculate squared distance and things based on it */
1568 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1569 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1570 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1571 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1572 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1573 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1574 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1575 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1576 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1577 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1579 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1580 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1581 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1582 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1583 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1584 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1585 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1586 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1587 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1588 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1590 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1591 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1592 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1593 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1594 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1595 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1596 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1597 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1598 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1600 fjx0 = _mm_setzero_ps();
1601 fjy0 = _mm_setzero_ps();
1602 fjz0 = _mm_setzero_ps();
1603 fjx1 = _mm_setzero_ps();
1604 fjy1 = _mm_setzero_ps();
1605 fjz1 = _mm_setzero_ps();
1606 fjx2 = _mm_setzero_ps();
1607 fjy2 = _mm_setzero_ps();
1608 fjz2 = _mm_setzero_ps();
1609 fjx3 = _mm_setzero_ps();
1610 fjy3 = _mm_setzero_ps();
1611 fjz3 = _mm_setzero_ps();
1613 /**************************
1614 * CALCULATE INTERACTIONS *
1615 **************************/
1617 r00 = _mm_mul_ps(rsq00,rinv00);
1618 r00 = _mm_andnot_ps(dummy_mask,r00);
1620 /* Calculate table index by multiplying r with table scale and truncate to integer */
1621 rt = _mm_mul_ps(r00,vftabscale);
1622 vfitab = _mm_cvttps_epi32(rt);
1624 vfeps = _mm_frcz_ps(rt);
1626 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1628 twovfeps = _mm_add_ps(vfeps,vfeps);
1629 vfitab = _mm_slli_epi32(vfitab,3);
1631 /* CUBIC SPLINE TABLE DISPERSION */
1632 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1633 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1634 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1635 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1636 _MM_TRANSPOSE4_PS(Y,F,G,H);
1637 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1638 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1639 fvdw6 = _mm_mul_ps(c6_00,FF);
1641 /* CUBIC SPLINE TABLE REPULSION */
1642 vfitab = _mm_add_epi32(vfitab,ifour);
1643 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1644 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1645 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1646 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1647 _MM_TRANSPOSE4_PS(Y,F,G,H);
1648 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1649 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1650 fvdw12 = _mm_mul_ps(c12_00,FF);
1651 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1655 fscal = _mm_andnot_ps(dummy_mask,fscal);
1657 /* Update vectorial force */
1658 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1659 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1660 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1662 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1663 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1664 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1666 /**************************
1667 * CALCULATE INTERACTIONS *
1668 **************************/
1670 /* COULOMB ELECTROSTATICS */
1671 velec = _mm_mul_ps(qq11,rinv11);
1672 felec = _mm_mul_ps(velec,rinvsq11);
1676 fscal = _mm_andnot_ps(dummy_mask,fscal);
1678 /* Update vectorial force */
1679 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1680 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1681 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1683 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1684 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1685 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1687 /**************************
1688 * CALCULATE INTERACTIONS *
1689 **************************/
1691 /* COULOMB ELECTROSTATICS */
1692 velec = _mm_mul_ps(qq12,rinv12);
1693 felec = _mm_mul_ps(velec,rinvsq12);
1697 fscal = _mm_andnot_ps(dummy_mask,fscal);
1699 /* Update vectorial force */
1700 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1701 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1702 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1704 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1705 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1706 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1708 /**************************
1709 * CALCULATE INTERACTIONS *
1710 **************************/
1712 /* COULOMB ELECTROSTATICS */
1713 velec = _mm_mul_ps(qq13,rinv13);
1714 felec = _mm_mul_ps(velec,rinvsq13);
1718 fscal = _mm_andnot_ps(dummy_mask,fscal);
1720 /* Update vectorial force */
1721 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1722 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1723 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1725 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1726 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1727 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 /* COULOMB ELECTROSTATICS */
1734 velec = _mm_mul_ps(qq21,rinv21);
1735 felec = _mm_mul_ps(velec,rinvsq21);
1739 fscal = _mm_andnot_ps(dummy_mask,fscal);
1741 /* Update vectorial force */
1742 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1743 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1744 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1746 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1747 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1748 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1750 /**************************
1751 * CALCULATE INTERACTIONS *
1752 **************************/
1754 /* COULOMB ELECTROSTATICS */
1755 velec = _mm_mul_ps(qq22,rinv22);
1756 felec = _mm_mul_ps(velec,rinvsq22);
1760 fscal = _mm_andnot_ps(dummy_mask,fscal);
1762 /* Update vectorial force */
1763 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1764 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1765 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1767 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1768 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1769 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1771 /**************************
1772 * CALCULATE INTERACTIONS *
1773 **************************/
1775 /* COULOMB ELECTROSTATICS */
1776 velec = _mm_mul_ps(qq23,rinv23);
1777 felec = _mm_mul_ps(velec,rinvsq23);
1781 fscal = _mm_andnot_ps(dummy_mask,fscal);
1783 /* Update vectorial force */
1784 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1785 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1786 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1788 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1789 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1790 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1792 /**************************
1793 * CALCULATE INTERACTIONS *
1794 **************************/
1796 /* COULOMB ELECTROSTATICS */
1797 velec = _mm_mul_ps(qq31,rinv31);
1798 felec = _mm_mul_ps(velec,rinvsq31);
1802 fscal = _mm_andnot_ps(dummy_mask,fscal);
1804 /* Update vectorial force */
1805 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1806 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1807 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1809 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1810 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1811 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1813 /**************************
1814 * CALCULATE INTERACTIONS *
1815 **************************/
1817 /* COULOMB ELECTROSTATICS */
1818 velec = _mm_mul_ps(qq32,rinv32);
1819 felec = _mm_mul_ps(velec,rinvsq32);
1823 fscal = _mm_andnot_ps(dummy_mask,fscal);
1825 /* Update vectorial force */
1826 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1827 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1828 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1830 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1831 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1832 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 /* COULOMB ELECTROSTATICS */
1839 velec = _mm_mul_ps(qq33,rinv33);
1840 felec = _mm_mul_ps(velec,rinvsq33);
1844 fscal = _mm_andnot_ps(dummy_mask,fscal);
1846 /* Update vectorial force */
1847 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1848 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1849 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1851 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1852 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1853 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1855 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1856 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1857 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1858 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1860 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1861 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1862 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1864 /* Inner loop uses 325 flops */
1867 /* End of innermost loop */
1869 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1870 f+i_coord_offset,fshift+i_shift_offset);
1872 /* Increment number of inner iterations */
1873 inneriter += j_index_end - j_index_start;
1875 /* Outer loop uses 24 flops */
1878 /* Increment number of outer iterations */
1881 /* Update outer/inner flops */
1883 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*325);