2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
50 #include "kernelutil_x86_avx_128_fma_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_double
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: CubicSplineTable
56 * Geometry: Water4-Particle
57 * Calculate force/pot: PotentialAndForce
/*
 * Potential-and-force nonbonded kernel for a four-site water i molecule
 * against single j particles, using 128-bit double-precision SIMD with FMA
 * (_mm_macc_pd) intrinsics. Both interactions come from one combined cubic
 * spline table (kernel_data->table_elec_vdw):
 *   - i site 0 with j: tabulated dispersion and repulsion only (c6/c12),
 *   - i sites 1-3 with j: tabulated electrostatics only (iq1..iq3 times jq0).
 *
 * Inputs read in the visible code:
 *   nlist       - jindex and shift arrays; iinr[0] for the water parameters.
 *   fr          - shift_vec, fshift, epsfac and ntype.
 *   mdatoms     - chargeA and typeA.
 *   kernel_data - table data and scale; energygrp_elec/energygrp_vdw receive
 *                 the accumulated potentials (so it is used although it is
 *                 tagged gmx_unused).
 *   nrnb        - receives the flop count through inc_nrnb().
 *   xx, ff      - NOTE(review): not referenced in the visible lines; the
 *                 kernel works on x and f, presumably flat views of these
 *                 arrays - TODO confirm.
 *
 * NOTE(review): in this view of the file no assignment is visible for nri,
 * x, f, jnrA, jnrB, fscal, ggid, vdwparam, outeriter or the initial value of
 * inneriter, and the per-iteration update of inr is also not visible. Confirm
 * against the complete generated kernel before relying on the statements
 * that consume them.
 *
 * NOTE(review): the suffix comment below speaks of four SIMD positions; a
 * __m128d holds two doubles, which matches the A/B pair of j entries
 * handled per inner iteration.
 */
60 nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_avx_128_fma_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90 int vdwjidx0A,vdwjidx0B;
91 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
94 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
95 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
96 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
103 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
105 __m128i ifour = _mm_set1_epi32(4);
106 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
108 __m128d dummy_mask,cutoff_mask;
109 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
110 __m128d one = _mm_set1_pd(1.0);
111 __m128d two = _mm_set1_pd(2.0);
117 jindex = nlist->jindex;
119 shiftidx = nlist->shift;
121 shiftvec = fr->shift_vec[0];
122 fshift = fr->fshift[0];
123 facel = _mm_set1_pd(fr->epsfac);
124 charge = mdatoms->chargeA;
125 nvdwtype = fr->ntype;
127 vdwtype = mdatoms->typeA;
129 vftab = kernel_data->table_elec_vdw->data;
130 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
132 /* Setup water-specific parameters */
/* The water parameters are taken once from the first i entry (iinr[0]) and
 * reused for every outer iteration; presumably all i molecules in the list
 * share one topology - TODO confirm. Site charges are pre-multiplied by
 * facel (fr->epsfac). */
133 inr = nlist->iinr[0];
134 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
135 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
136 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
/* Offset into vdwparam for the type of i site 0; the factor 2 matches the
 * two values (c6_00, c12_00) loaded per type pair further down. */
137 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
139 /* Avoid stupid compiler warnings */
147 /* Start outer loop over neighborlists */
148 for(iidx=0; iidx<nri; iidx++)
150 /* Load shift vector for this list */
151 i_shift_offset = DIM*shiftidx[iidx];
153 /* Load limits for loop over neighbors */
154 j_index_start = jindex[iidx];
155 j_index_end = jindex[iidx+1];
157 /* Get outer coordinate index */
/* NOTE(review): the per-iteration assignment of inr is not visible in this
 * view; as shown, inr still holds nlist->iinr[0]. */
159 i_coord_offset = DIM*inr;
161 /* Load i particle coords and add shift vector */
162 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
163 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Clear the force accumulators of the four i sites for this list entry. */
165 fix0 = _mm_setzero_pd();
166 fiy0 = _mm_setzero_pd();
167 fiz0 = _mm_setzero_pd();
168 fix1 = _mm_setzero_pd();
169 fiy1 = _mm_setzero_pd();
170 fiz1 = _mm_setzero_pd();
171 fix2 = _mm_setzero_pd();
172 fiy2 = _mm_setzero_pd();
173 fiz2 = _mm_setzero_pd();
174 fix3 = _mm_setzero_pd();
175 fiy3 = _mm_setzero_pd();
176 fiz3 = _mm_setzero_pd();
178 /* Reset potential sums */
179 velecsum = _mm_setzero_pd();
180 vvdwsum = _mm_setzero_pd();
182 /* Start inner kernel loop */
/* Two j entries (A and B) are processed per pass; the bound j_index_end-1
 * leaves an odd trailing entry to the single-j section further down. */
183 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
186 /* Get j neighbor index, and coordinate index */
189 j_coord_offsetA = DIM*jnrA;
190 j_coord_offsetB = DIM*jnrB;
192 /* load j atom coordinates */
193 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
196 /* Calculate displacement vector */
197 dx00 = _mm_sub_pd(ix0,jx0);
198 dy00 = _mm_sub_pd(iy0,jy0);
199 dz00 = _mm_sub_pd(iz0,jz0);
200 dx10 = _mm_sub_pd(ix1,jx0);
201 dy10 = _mm_sub_pd(iy1,jy0);
202 dz10 = _mm_sub_pd(iz1,jz0);
203 dx20 = _mm_sub_pd(ix2,jx0);
204 dy20 = _mm_sub_pd(iy2,jy0);
205 dz20 = _mm_sub_pd(iz2,jz0);
206 dx30 = _mm_sub_pd(ix3,jx0);
207 dy30 = _mm_sub_pd(iy3,jy0);
208 dz30 = _mm_sub_pd(iz3,jz0);
210 /* Calculate squared distance and things based on it */
211 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
212 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
213 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
214 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
216 rinv00 = gmx_mm_invsqrt_pd(rsq00);
217 rinv10 = gmx_mm_invsqrt_pd(rsq10);
218 rinv20 = gmx_mm_invsqrt_pd(rsq20);
219 rinv30 = gmx_mm_invsqrt_pd(rsq30);
221 /* Load parameters for j particles */
222 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
223 vdwjidx0A = 2*vdwtype[jnrA+0];
224 vdwjidx0B = 2*vdwtype[jnrB+0];
226 fjx0 = _mm_setzero_pd();
227 fjy0 = _mm_setzero_pd();
228 fjz0 = _mm_setzero_pd();
230 /**************************
231 * CALCULATE INTERACTIONS *
232 **************************/
/* Pair 0-0: VdW only. r is recovered as rsq * (1/sqrt(rsq)). */
234 r00 = _mm_mul_pd(rsq00,rinv00);
236 /* Compute parameters for interactions between i and j atoms */
237 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
238 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
240 /* Calculate table index by multiplying r with table scale and truncate to integer */
241 rt = _mm_mul_pd(r00,vftabscale);
242 vfitab = _mm_cvttpd_epi32(rt);
/* NOTE(review): vfeps is assigned twice in a row; presumably these are the
 * two arms of a preprocessor conditional that is not visible here. Both
 * forms are meant to yield the fractional part of rt. */
244 vfeps = _mm_frcz_pd(rt);
246 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
248 twovfeps = _mm_add_pd(vfeps,vfeps);
/* (vfitab + 2*vfitab) shifted left by 2 = 12*vfitab: twelve table values
 * per point (electrostatics at +0, dispersion at +4, repulsion at +8, see
 * the ifour increments). */
249 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
251 /* CUBIC SPLINE TABLE DISPERSION */
252 vfitab = _mm_add_epi32(vfitab,ifour);
/* Each load fetches two consecutive table values for one j lane; the
 * transpose presumably regroups them so that Y,F (then G,H) each hold one
 * coefficient for both lanes - TODO confirm against the macro. */
253 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
254 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
255 GMX_MM_TRANSPOSE2_PD(Y,F);
256 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
257 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
258 GMX_MM_TRANSPOSE2_PD(G,H);
/* With _mm_macc_pd(a,b,c) = a*b+c: Fp = F + eps*(G + eps*H),
 * VV = Y + eps*Fp, FF = Fp + eps*(G + 2*eps*H). */
259 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
260 VV = _mm_macc_pd(vfeps,Fp,Y);
261 vvdw6 = _mm_mul_pd(c6_00,VV);
262 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
263 fvdw6 = _mm_mul_pd(c6_00,FF);
265 /* CUBIC SPLINE TABLE REPULSION */
266 vfitab = _mm_add_epi32(vfitab,ifour);
267 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
268 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
269 GMX_MM_TRANSPOSE2_PD(Y,F);
270 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
271 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
272 GMX_MM_TRANSPOSE2_PD(G,H);
273 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
274 VV = _mm_macc_pd(vfeps,Fp,Y);
275 vvdw12 = _mm_mul_pd(c12_00,VV);
276 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
277 fvdw12 = _mm_mul_pd(c12_00,FF);
278 vvdw = _mm_add_pd(vvdw12,vvdw6);
/* XOR with signbit flips the sign of both lanes:
 * fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00. */
279 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
281 /* Update potential sum for this i atom from the interaction with this j atom. */
282 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
286 /* Update vectorial force */
/* NOTE(review): fscal is consumed here, but no assignment from fvdw is
 * visible in this view of the file. */
287 fix0 = _mm_macc_pd(dx00,fscal,fix0);
288 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
289 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
/* The same contribution is summed for the j particle and handed to
 * gmx_mm_decrement_1rvec_2ptr_swizzle_pd() after all four i sites. */
291 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
292 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
293 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
/* Pair 1-0: electrostatics only. The same pattern repeats for i sites 2
 * and 3 below. */
299 r10 = _mm_mul_pd(rsq10,rinv10);
301 /* Compute parameters for interactions between i and j atoms */
302 qq10 = _mm_mul_pd(iq1,jq0);
304 /* Calculate table index by multiplying r with table scale and truncate to integer */
305 rt = _mm_mul_pd(r10,vftabscale);
306 vfitab = _mm_cvttpd_epi32(rt);
308 vfeps = _mm_frcz_pd(rt);
310 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
312 twovfeps = _mm_add_pd(vfeps,vfeps);
313 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
315 /* CUBIC SPLINE TABLE ELECTROSTATICS */
/* No ifour increment here: the electrostatics coefficients are read from
 * offset 0 of each table point. */
316 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
317 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
318 GMX_MM_TRANSPOSE2_PD(Y,F);
319 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
320 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
321 GMX_MM_TRANSPOSE2_PD(G,H);
322 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
323 VV = _mm_macc_pd(vfeps,Fp,Y);
324 velec = _mm_mul_pd(qq10,VV);
325 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
326 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
328 /* Update potential sum for this i atom from the interaction with this j atom. */
329 velecsum = _mm_add_pd(velecsum,velec);
333 /* Update vectorial force */
/* NOTE(review): as above, no assignment of fscal from felec is visible. */
334 fix1 = _mm_macc_pd(dx10,fscal,fix1);
335 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
336 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
338 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
339 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
340 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
342 /**************************
343 * CALCULATE INTERACTIONS *
344 **************************/
346 r20 = _mm_mul_pd(rsq20,rinv20);
348 /* Compute parameters for interactions between i and j atoms */
349 qq20 = _mm_mul_pd(iq2,jq0);
351 /* Calculate table index by multiplying r with table scale and truncate to integer */
352 rt = _mm_mul_pd(r20,vftabscale);
353 vfitab = _mm_cvttpd_epi32(rt);
355 vfeps = _mm_frcz_pd(rt);
357 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
359 twovfeps = _mm_add_pd(vfeps,vfeps);
360 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
362 /* CUBIC SPLINE TABLE ELECTROSTATICS */
363 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
364 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
365 GMX_MM_TRANSPOSE2_PD(Y,F);
366 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
367 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
368 GMX_MM_TRANSPOSE2_PD(G,H);
369 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
370 VV = _mm_macc_pd(vfeps,Fp,Y);
371 velec = _mm_mul_pd(qq20,VV);
372 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
373 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
375 /* Update potential sum for this i atom from the interaction with this j atom. */
376 velecsum = _mm_add_pd(velecsum,velec);
380 /* Update vectorial force */
381 fix2 = _mm_macc_pd(dx20,fscal,fix2);
382 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
383 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
385 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
386 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
387 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
389 /**************************
390 * CALCULATE INTERACTIONS *
391 **************************/
393 r30 = _mm_mul_pd(rsq30,rinv30);
395 /* Compute parameters for interactions between i and j atoms */
396 qq30 = _mm_mul_pd(iq3,jq0);
398 /* Calculate table index by multiplying r with table scale and truncate to integer */
399 rt = _mm_mul_pd(r30,vftabscale);
400 vfitab = _mm_cvttpd_epi32(rt);
402 vfeps = _mm_frcz_pd(rt);
404 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
406 twovfeps = _mm_add_pd(vfeps,vfeps);
407 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
409 /* CUBIC SPLINE TABLE ELECTROSTATICS */
410 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
411 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
412 GMX_MM_TRANSPOSE2_PD(Y,F);
413 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
414 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
415 GMX_MM_TRANSPOSE2_PD(G,H);
416 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
417 VV = _mm_macc_pd(vfeps,Fp,Y);
418 velec = _mm_mul_pd(qq30,VV);
419 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
420 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
422 /* Update potential sum for this i atom from the interaction with this j atom. */
423 velecsum = _mm_add_pd(velecsum,velec);
427 /* Update vectorial force */
428 fix3 = _mm_macc_pd(dx30,fscal,fix3);
429 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
430 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
432 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
433 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
434 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
/* Apply the j force summed over all four i sites to entries A and B of f. */
436 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
438 /* Inner loop uses 200 flops */
/* Single-j section: only entry A exists, so lane 1 of the table operands is
 * filled with zeros and lane 1 of the results is cleared before use.
 * NOTE(review): the condition guarding this section and the load of jnrA
 * are not visible in this view of the file. */
445 j_coord_offsetA = DIM*jnrA;
447 /* load j atom coordinates */
448 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
451 /* Calculate displacement vector */
452 dx00 = _mm_sub_pd(ix0,jx0);
453 dy00 = _mm_sub_pd(iy0,jy0);
454 dz00 = _mm_sub_pd(iz0,jz0);
455 dx10 = _mm_sub_pd(ix1,jx0);
456 dy10 = _mm_sub_pd(iy1,jy0);
457 dz10 = _mm_sub_pd(iz1,jz0);
458 dx20 = _mm_sub_pd(ix2,jx0);
459 dy20 = _mm_sub_pd(iy2,jy0);
460 dz20 = _mm_sub_pd(iz2,jz0);
461 dx30 = _mm_sub_pd(ix3,jx0);
462 dy30 = _mm_sub_pd(iy3,jy0);
463 dz30 = _mm_sub_pd(iz3,jz0);
465 /* Calculate squared distance and things based on it */
466 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
467 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
468 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
469 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
471 rinv00 = gmx_mm_invsqrt_pd(rsq00);
472 rinv10 = gmx_mm_invsqrt_pd(rsq10);
473 rinv20 = gmx_mm_invsqrt_pd(rsq20);
474 rinv30 = gmx_mm_invsqrt_pd(rsq30);
476 /* Load parameters for j particles */
/* _mm_load_sd places the single charge in lane 0 and zero in lane 1. */
477 jq0 = _mm_load_sd(charge+jnrA+0);
478 vdwjidx0A = 2*vdwtype[jnrA+0];
480 fjx0 = _mm_setzero_pd();
481 fjy0 = _mm_setzero_pd();
482 fjz0 = _mm_setzero_pd();
484 /**************************
485 * CALCULATE INTERACTIONS *
486 **************************/
488 r00 = _mm_mul_pd(rsq00,rinv00);
490 /* Compute parameters for interactions between i and j atoms */
491 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
493 /* Calculate table index by multiplying r with table scale and truncate to integer */
494 rt = _mm_mul_pd(r00,vftabscale);
495 vfitab = _mm_cvttpd_epi32(rt);
497 vfeps = _mm_frcz_pd(rt);
499 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
501 twovfeps = _mm_add_pd(vfeps,vfeps);
502 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
504 /* CUBIC SPLINE TABLE DISPERSION */
505 vfitab = _mm_add_epi32(vfitab,ifour);
/* Only table index 0 is read; zeros stand in for the absent second lane. */
506 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
507 F = _mm_setzero_pd();
508 GMX_MM_TRANSPOSE2_PD(Y,F);
509 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
510 H = _mm_setzero_pd();
511 GMX_MM_TRANSPOSE2_PD(G,H);
512 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
513 VV = _mm_macc_pd(vfeps,Fp,Y);
514 vvdw6 = _mm_mul_pd(c6_00,VV);
515 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
516 fvdw6 = _mm_mul_pd(c6_00,FF);
518 /* CUBIC SPLINE TABLE REPULSION */
519 vfitab = _mm_add_epi32(vfitab,ifour);
520 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
521 F = _mm_setzero_pd();
522 GMX_MM_TRANSPOSE2_PD(Y,F);
523 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
524 H = _mm_setzero_pd();
525 GMX_MM_TRANSPOSE2_PD(G,H);
526 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
527 VV = _mm_macc_pd(vfeps,Fp,Y);
528 vvdw12 = _mm_mul_pd(c12_00,VV);
529 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
530 fvdw12 = _mm_mul_pd(c12_00,FF);
531 vvdw = _mm_add_pd(vvdw12,vvdw6);
532 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
534 /* Update potential sum for this i atom from the interaction with this j atom. */
/* Keep lane 0 only so the unused lane adds nothing to the energy sum. */
535 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
536 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
/* Likewise clear lane 1 of the scalar force before it scales the
 * displacement vectors. */
540 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
542 /* Update vectorial force */
543 fix0 = _mm_macc_pd(dx00,fscal,fix0);
544 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
545 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
547 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
548 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
549 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
551 /**************************
552 * CALCULATE INTERACTIONS *
553 **************************/
555 r10 = _mm_mul_pd(rsq10,rinv10);
557 /* Compute parameters for interactions between i and j atoms */
558 qq10 = _mm_mul_pd(iq1,jq0);
560 /* Calculate table index by multiplying r with table scale and truncate to integer */
561 rt = _mm_mul_pd(r10,vftabscale);
562 vfitab = _mm_cvttpd_epi32(rt);
564 vfeps = _mm_frcz_pd(rt);
566 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
568 twovfeps = _mm_add_pd(vfeps,vfeps);
569 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
571 /* CUBIC SPLINE TABLE ELECTROSTATICS */
572 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
573 F = _mm_setzero_pd();
574 GMX_MM_TRANSPOSE2_PD(Y,F);
575 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
576 H = _mm_setzero_pd();
577 GMX_MM_TRANSPOSE2_PD(G,H);
578 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
579 VV = _mm_macc_pd(vfeps,Fp,Y);
580 velec = _mm_mul_pd(qq10,VV);
581 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
582 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
584 /* Update potential sum for this i atom from the interaction with this j atom. */
585 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
586 velecsum = _mm_add_pd(velecsum,velec);
590 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
592 /* Update vectorial force */
593 fix1 = _mm_macc_pd(dx10,fscal,fix1);
594 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
595 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
597 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
598 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
599 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 r20 = _mm_mul_pd(rsq20,rinv20);
607 /* Compute parameters for interactions between i and j atoms */
608 qq20 = _mm_mul_pd(iq2,jq0);
610 /* Calculate table index by multiplying r with table scale and truncate to integer */
611 rt = _mm_mul_pd(r20,vftabscale);
612 vfitab = _mm_cvttpd_epi32(rt);
614 vfeps = _mm_frcz_pd(rt);
616 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
618 twovfeps = _mm_add_pd(vfeps,vfeps);
619 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
621 /* CUBIC SPLINE TABLE ELECTROSTATICS */
622 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
623 F = _mm_setzero_pd();
624 GMX_MM_TRANSPOSE2_PD(Y,F);
625 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
626 H = _mm_setzero_pd();
627 GMX_MM_TRANSPOSE2_PD(G,H);
628 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
629 VV = _mm_macc_pd(vfeps,Fp,Y);
630 velec = _mm_mul_pd(qq20,VV);
631 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
632 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
634 /* Update potential sum for this i atom from the interaction with this j atom. */
635 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
636 velecsum = _mm_add_pd(velecsum,velec);
640 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
642 /* Update vectorial force */
643 fix2 = _mm_macc_pd(dx20,fscal,fix2);
644 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
645 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
647 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
648 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
649 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
651 /**************************
652 * CALCULATE INTERACTIONS *
653 **************************/
655 r30 = _mm_mul_pd(rsq30,rinv30);
657 /* Compute parameters for interactions between i and j atoms */
658 qq30 = _mm_mul_pd(iq3,jq0);
660 /* Calculate table index by multiplying r with table scale and truncate to integer */
661 rt = _mm_mul_pd(r30,vftabscale);
662 vfitab = _mm_cvttpd_epi32(rt);
664 vfeps = _mm_frcz_pd(rt);
666 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
668 twovfeps = _mm_add_pd(vfeps,vfeps);
669 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
671 /* CUBIC SPLINE TABLE ELECTROSTATICS */
672 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
673 F = _mm_setzero_pd();
674 GMX_MM_TRANSPOSE2_PD(Y,F);
675 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
676 H = _mm_setzero_pd();
677 GMX_MM_TRANSPOSE2_PD(G,H);
678 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
679 VV = _mm_macc_pd(vfeps,Fp,Y);
680 velec = _mm_mul_pd(qq30,VV);
681 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
682 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
684 /* Update potential sum for this i atom from the interaction with this j atom. */
685 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
686 velecsum = _mm_add_pd(velecsum,velec);
690 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
692 /* Update vectorial force */
693 fix3 = _mm_macc_pd(dx30,fscal,fix3);
694 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
695 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
697 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
698 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
699 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
701 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
703 /* Inner loop uses 200 flops */
706 /* End of innermost loop */
/* Hand the accumulated i forces to the helper, which is given both the
 * force array entry of this molecule and the matching shift-force entry. */
708 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
709 f+i_coord_offset,fshift+i_shift_offset);
712 /* Update potential energies */
/* NOTE(review): ggid selects the energy group slot; its assignment is not
 * visible in this view of the file. */
713 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
714 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
716 /* Increment number of inner iterations */
717 inneriter += j_index_end - j_index_start;
719 /* Outer loop uses 26 flops */
722 /* Increment number of outer iterations */
725 /* Update outer/inner flops */
/* Flop accounting uses the per-iteration costs quoted in the loop comments:
 * 26 per outer and 200 per inner iteration. */
727 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*200);
730 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_double
731 * Electrostatics interaction: CubicSplineTable
732 * VdW interaction: CubicSplineTable
733 * Geometry: Water4-Particle
734 * Calculate force/pot: Force
737 nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_avx_128_fma_double
738 (t_nblist * gmx_restrict nlist,
739 rvec * gmx_restrict xx,
740 rvec * gmx_restrict ff,
741 t_forcerec * gmx_restrict fr,
742 t_mdatoms * gmx_restrict mdatoms,
743 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
744 t_nrnb * gmx_restrict nrnb)
746 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
747 * just 0 for non-waters.
748 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
749 * jnr indices corresponding to data put in the four positions in the SIMD register.
751 int i_shift_offset,i_coord_offset,outeriter,inneriter;
752 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
754 int j_coord_offsetA,j_coord_offsetB;
755 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
757 real *shiftvec,*fshift,*x,*f;
758 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
760 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
762 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
764 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
766 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
767 int vdwjidx0A,vdwjidx0B;
768 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
769 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
770 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
771 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
772 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
773 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
776 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
779 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
780 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
782 __m128i ifour = _mm_set1_epi32(4);
783 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
785 __m128d dummy_mask,cutoff_mask;
786 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
787 __m128d one = _mm_set1_pd(1.0);
788 __m128d two = _mm_set1_pd(2.0);
794 jindex = nlist->jindex;
796 shiftidx = nlist->shift;
798 shiftvec = fr->shift_vec[0];
799 fshift = fr->fshift[0];
800 facel = _mm_set1_pd(fr->epsfac);
801 charge = mdatoms->chargeA;
802 nvdwtype = fr->ntype;
804 vdwtype = mdatoms->typeA;
806 vftab = kernel_data->table_elec_vdw->data;
807 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
809 /* Setup water-specific parameters */
810 inr = nlist->iinr[0];
811 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
812 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
813 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
814 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
816 /* Avoid stupid compiler warnings */
824 /* Start outer loop over neighborlists */
825 for(iidx=0; iidx<nri; iidx++)
827 /* Load shift vector for this list */
828 i_shift_offset = DIM*shiftidx[iidx];
830 /* Load limits for loop over neighbors */
831 j_index_start = jindex[iidx];
832 j_index_end = jindex[iidx+1];
834 /* Get outer coordinate index */
836 i_coord_offset = DIM*inr;
838 /* Load i particle coords and add shift vector */
839 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
840 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
842 fix0 = _mm_setzero_pd();
843 fiy0 = _mm_setzero_pd();
844 fiz0 = _mm_setzero_pd();
845 fix1 = _mm_setzero_pd();
846 fiy1 = _mm_setzero_pd();
847 fiz1 = _mm_setzero_pd();
848 fix2 = _mm_setzero_pd();
849 fiy2 = _mm_setzero_pd();
850 fiz2 = _mm_setzero_pd();
851 fix3 = _mm_setzero_pd();
852 fiy3 = _mm_setzero_pd();
853 fiz3 = _mm_setzero_pd();
855 /* Start inner kernel loop */
856 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
859 /* Get j neighbor index, and coordinate index */
862 j_coord_offsetA = DIM*jnrA;
863 j_coord_offsetB = DIM*jnrB;
865 /* load j atom coordinates */
866 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
869 /* Calculate displacement vector */
870 dx00 = _mm_sub_pd(ix0,jx0);
871 dy00 = _mm_sub_pd(iy0,jy0);
872 dz00 = _mm_sub_pd(iz0,jz0);
873 dx10 = _mm_sub_pd(ix1,jx0);
874 dy10 = _mm_sub_pd(iy1,jy0);
875 dz10 = _mm_sub_pd(iz1,jz0);
876 dx20 = _mm_sub_pd(ix2,jx0);
877 dy20 = _mm_sub_pd(iy2,jy0);
878 dz20 = _mm_sub_pd(iz2,jz0);
879 dx30 = _mm_sub_pd(ix3,jx0);
880 dy30 = _mm_sub_pd(iy3,jy0);
881 dz30 = _mm_sub_pd(iz3,jz0);
883 /* Calculate squared distance and things based on it */
884 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
885 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
886 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
887 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
889 rinv00 = gmx_mm_invsqrt_pd(rsq00);
890 rinv10 = gmx_mm_invsqrt_pd(rsq10);
891 rinv20 = gmx_mm_invsqrt_pd(rsq20);
892 rinv30 = gmx_mm_invsqrt_pd(rsq30);
894 /* Load parameters for j particles */
895 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
896 vdwjidx0A = 2*vdwtype[jnrA+0];
897 vdwjidx0B = 2*vdwtype[jnrB+0];
899 fjx0 = _mm_setzero_pd();
900 fjy0 = _mm_setzero_pd();
901 fjz0 = _mm_setzero_pd();
903 /**************************
904 * CALCULATE INTERACTIONS *
905 **************************/
907 r00 = _mm_mul_pd(rsq00,rinv00);
909 /* Compute parameters for interactions between i and j atoms */
910 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
911 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
913 /* Calculate table index by multiplying r with table scale and truncate to integer */
914 rt = _mm_mul_pd(r00,vftabscale);
915 vfitab = _mm_cvttpd_epi32(rt);
917 vfeps = _mm_frcz_pd(rt);
919 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
921 twovfeps = _mm_add_pd(vfeps,vfeps);
922 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
924 /* CUBIC SPLINE TABLE DISPERSION */
925 vfitab = _mm_add_epi32(vfitab,ifour);
926 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
927 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
928 GMX_MM_TRANSPOSE2_PD(Y,F);
929 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
930 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
931 GMX_MM_TRANSPOSE2_PD(G,H);
932 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
933 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
934 fvdw6 = _mm_mul_pd(c6_00,FF);
936 /* CUBIC SPLINE TABLE REPULSION */
937 vfitab = _mm_add_epi32(vfitab,ifour);
938 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
939 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
940 GMX_MM_TRANSPOSE2_PD(Y,F);
941 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
942 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
943 GMX_MM_TRANSPOSE2_PD(G,H);
944 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
945 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
946 fvdw12 = _mm_mul_pd(c12_00,FF);
947 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
951 /* Update vectorial force */
952 fix0 = _mm_macc_pd(dx00,fscal,fix0);
953 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
954 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
956 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
957 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
958 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
960 /**************************
961 * CALCULATE INTERACTIONS *
962 **************************/
964 r10 = _mm_mul_pd(rsq10,rinv10);
966 /* Compute parameters for interactions between i and j atoms */
967 qq10 = _mm_mul_pd(iq1,jq0);
969 /* Calculate table index by multiplying r with table scale and truncate to integer */
970 rt = _mm_mul_pd(r10,vftabscale);
971 vfitab = _mm_cvttpd_epi32(rt);
973 vfeps = _mm_frcz_pd(rt);
975 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
977 twovfeps = _mm_add_pd(vfeps,vfeps);
978 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
980 /* CUBIC SPLINE TABLE ELECTROSTATICS */
981 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
982 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
983 GMX_MM_TRANSPOSE2_PD(Y,F);
984 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
985 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
986 GMX_MM_TRANSPOSE2_PD(G,H);
987 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
988 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
989 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
993 /* Update vectorial force */
994 fix1 = _mm_macc_pd(dx10,fscal,fix1);
995 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
996 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
998 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
999 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1000 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1002 /**************************
1003 * CALCULATE INTERACTIONS *
1004 **************************/
1006 r20 = _mm_mul_pd(rsq20,rinv20);
1008 /* Compute parameters for interactions between i and j atoms */
1009 qq20 = _mm_mul_pd(iq2,jq0);
1011 /* Calculate table index by multiplying r with table scale and truncate to integer */
1012 rt = _mm_mul_pd(r20,vftabscale);
1013 vfitab = _mm_cvttpd_epi32(rt);
1015 vfeps = _mm_frcz_pd(rt);
1017 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1019 twovfeps = _mm_add_pd(vfeps,vfeps);
1020 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1022 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1023 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1024 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1025 GMX_MM_TRANSPOSE2_PD(Y,F);
1026 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1027 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1028 GMX_MM_TRANSPOSE2_PD(G,H);
1029 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1030 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1031 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1035 /* Update vectorial force */
1036 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1037 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1038 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1040 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1041 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1042 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1044 /**************************
1045 * CALCULATE INTERACTIONS *
1046 **************************/
1048 r30 = _mm_mul_pd(rsq30,rinv30);
1050 /* Compute parameters for interactions between i and j atoms */
1051 qq30 = _mm_mul_pd(iq3,jq0);
1053 /* Calculate table index by multiplying r with table scale and truncate to integer */
1054 rt = _mm_mul_pd(r30,vftabscale);
1055 vfitab = _mm_cvttpd_epi32(rt);
1057 vfeps = _mm_frcz_pd(rt);
1059 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1061 twovfeps = _mm_add_pd(vfeps,vfeps);
1062 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1064 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1065 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1066 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1067 GMX_MM_TRANSPOSE2_PD(Y,F);
1068 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1069 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1070 GMX_MM_TRANSPOSE2_PD(G,H);
1071 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1072 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1073 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
1077 /* Update vectorial force */
1078 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1079 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1080 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1082 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1083 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1084 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1086 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1088 /* Inner loop uses 180 flops */
1091 if(jidx<j_index_end)
1095 j_coord_offsetA = DIM*jnrA;
1097 /* load j atom coordinates */
1098 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1101 /* Calculate displacement vector */
1102 dx00 = _mm_sub_pd(ix0,jx0);
1103 dy00 = _mm_sub_pd(iy0,jy0);
1104 dz00 = _mm_sub_pd(iz0,jz0);
1105 dx10 = _mm_sub_pd(ix1,jx0);
1106 dy10 = _mm_sub_pd(iy1,jy0);
1107 dz10 = _mm_sub_pd(iz1,jz0);
1108 dx20 = _mm_sub_pd(ix2,jx0);
1109 dy20 = _mm_sub_pd(iy2,jy0);
1110 dz20 = _mm_sub_pd(iz2,jz0);
1111 dx30 = _mm_sub_pd(ix3,jx0);
1112 dy30 = _mm_sub_pd(iy3,jy0);
1113 dz30 = _mm_sub_pd(iz3,jz0);
1115 /* Calculate squared distance and things based on it */
1116 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1117 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1118 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1119 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1121 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1122 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1123 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1124 rinv30 = gmx_mm_invsqrt_pd(rsq30);
1126 /* Load parameters for j particles */
1127 jq0 = _mm_load_sd(charge+jnrA+0);
1128 vdwjidx0A = 2*vdwtype[jnrA+0];
1130 fjx0 = _mm_setzero_pd();
1131 fjy0 = _mm_setzero_pd();
1132 fjz0 = _mm_setzero_pd();
1134 /**************************
1135 * CALCULATE INTERACTIONS *
1136 **************************/
1138 r00 = _mm_mul_pd(rsq00,rinv00);
1140 /* Compute parameters for interactions between i and j atoms */
1141 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1143 /* Calculate table index by multiplying r with table scale and truncate to integer */
1144 rt = _mm_mul_pd(r00,vftabscale);
1145 vfitab = _mm_cvttpd_epi32(rt);
1147 vfeps = _mm_frcz_pd(rt);
1149 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1151 twovfeps = _mm_add_pd(vfeps,vfeps);
1152 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1154 /* CUBIC SPLINE TABLE DISPERSION */
1155 vfitab = _mm_add_epi32(vfitab,ifour);
1156 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1157 F = _mm_setzero_pd();
1158 GMX_MM_TRANSPOSE2_PD(Y,F);
1159 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1160 H = _mm_setzero_pd();
1161 GMX_MM_TRANSPOSE2_PD(G,H);
1162 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1163 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1164 fvdw6 = _mm_mul_pd(c6_00,FF);
1166 /* CUBIC SPLINE TABLE REPULSION */
1167 vfitab = _mm_add_epi32(vfitab,ifour);
1168 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1169 F = _mm_setzero_pd();
1170 GMX_MM_TRANSPOSE2_PD(Y,F);
1171 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1172 H = _mm_setzero_pd();
1173 GMX_MM_TRANSPOSE2_PD(G,H);
1174 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1175 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1176 fvdw12 = _mm_mul_pd(c12_00,FF);
1177 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1181 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1183 /* Update vectorial force */
1184 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1185 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1186 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1188 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1189 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1190 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1192 /**************************
1193 * CALCULATE INTERACTIONS *
1194 **************************/
1196 r10 = _mm_mul_pd(rsq10,rinv10);
1198 /* Compute parameters for interactions between i and j atoms */
1199 qq10 = _mm_mul_pd(iq1,jq0);
1201 /* Calculate table index by multiplying r with table scale and truncate to integer */
1202 rt = _mm_mul_pd(r10,vftabscale);
1203 vfitab = _mm_cvttpd_epi32(rt);
1205 vfeps = _mm_frcz_pd(rt);
1207 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1209 twovfeps = _mm_add_pd(vfeps,vfeps);
1210 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1212 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1213 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1214 F = _mm_setzero_pd();
1215 GMX_MM_TRANSPOSE2_PD(Y,F);
1216 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1217 H = _mm_setzero_pd();
1218 GMX_MM_TRANSPOSE2_PD(G,H);
1219 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1220 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1221 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1225 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1227 /* Update vectorial force */
1228 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1229 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1230 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1232 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1233 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1234 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1236 /**************************
1237 * CALCULATE INTERACTIONS *
1238 **************************/
1240 r20 = _mm_mul_pd(rsq20,rinv20);
1242 /* Compute parameters for interactions between i and j atoms */
1243 qq20 = _mm_mul_pd(iq2,jq0);
1245 /* Calculate table index by multiplying r with table scale and truncate to integer */
1246 rt = _mm_mul_pd(r20,vftabscale);
1247 vfitab = _mm_cvttpd_epi32(rt);
1249 vfeps = _mm_frcz_pd(rt);
1251 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1253 twovfeps = _mm_add_pd(vfeps,vfeps);
1254 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1256 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1257 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1258 F = _mm_setzero_pd();
1259 GMX_MM_TRANSPOSE2_PD(Y,F);
1260 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1261 H = _mm_setzero_pd();
1262 GMX_MM_TRANSPOSE2_PD(G,H);
1263 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1264 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1265 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1269 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1271 /* Update vectorial force */
1272 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1273 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1274 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1276 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1277 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1278 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1280 /**************************
1281 * CALCULATE INTERACTIONS *
1282 **************************/
1284 r30 = _mm_mul_pd(rsq30,rinv30);
1286 /* Compute parameters for interactions between i and j atoms */
1287 qq30 = _mm_mul_pd(iq3,jq0);
1289 /* Calculate table index by multiplying r with table scale and truncate to integer */
1290 rt = _mm_mul_pd(r30,vftabscale);
1291 vfitab = _mm_cvttpd_epi32(rt);
1293 vfeps = _mm_frcz_pd(rt);
1295 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1297 twovfeps = _mm_add_pd(vfeps,vfeps);
1298 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1300 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1301 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1302 F = _mm_setzero_pd();
1303 GMX_MM_TRANSPOSE2_PD(Y,F);
1304 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1305 H = _mm_setzero_pd();
1306 GMX_MM_TRANSPOSE2_PD(G,H);
1307 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1308 FF = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1309 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
1313 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1315 /* Update vectorial force */
1316 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1317 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1318 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1320 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1321 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1322 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1324 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1326 /* Inner loop uses 180 flops */
1329 /* End of innermost loop */
1331 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1332 f+i_coord_offset,fshift+i_shift_offset);
1334 /* Increment number of inner iterations */
1335 inneriter += j_index_end - j_index_start;
1337 /* Outer loop uses 24 flops */
1340 /* Increment number of outer iterations */
1343 /* Update outer/inner flops */
1345 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*180);