2 * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_double.h"
34 #include "kernelutil_x86_sse4_1_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse4_1_double
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: CubicSplineTable
40 * Geometry: Particle-Particle
41 * Calculate force/pot: PotentialAndForce
/*
 * Coulomb electrostatics plus cubic-spline-table Van der Waals kernel for
 * single particle-particle pairs; this variant evaluates both the potential
 * and the force.
 *
 * Inputs read by the visible code: nlist->jindex and nlist->shift;
 * fr->shift_vec, fr->fshift, fr->epsfac and fr->ntype; mdatoms->chargeA and
 * mdatoms->typeA; kernel_data->table_vdw (data and scale).
 * Results leave through f and fshift (force helpers), through
 * kernel_data->energygrp_elec+ggid and kernel_data->energygrp_vdw+ggid
 * (gmx_mm_update_1pot_pd), and through the flop counter passed to inc_nrnb.
 *
 * NOTE(review): several statements this code depends on are not visible in
 * this view (for example the assignments of nri, inr, ggid, jnrA, jnrB, x, f
 * and vdwparam, and the declarations of charge, vdwtype, vftab and vfitab).
 * Presumably x and f alias xx and ff - confirm against the full file.
 *
 * NOTE(review): a __m128d holds two doubles, so the A and B suffixes map to
 * the two SIMD lanes; the wording about four positions in the comment below
 * looks like a leftover from a single-precision variant - confirm.
 */
44 nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_sse4_1_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68 int vdwjidx0A,vdwjidx0B;
69 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
70 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
71 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
74 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
77 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
78 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
80 __m128i ifour = _mm_set1_epi32(4);
81 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
83 __m128d dummy_mask,cutoff_mask;
84 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
85 __m128d one = _mm_set1_pd(1.0);
86 __m128d two = _mm_set1_pd(2.0);
92 jindex = nlist->jindex;
94 shiftidx = nlist->shift;
96 shiftvec = fr->shift_vec[0];
97 fshift = fr->fshift[0];
98 facel = _mm_set1_pd(fr->epsfac);
99 charge = mdatoms->chargeA;
100 nvdwtype = fr->ntype;
102 vdwtype = mdatoms->typeA;
104 vftab = kernel_data->table_vdw->data;
105 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
107 /* Avoid stupid compiler warnings */
115 /* Start outer loop over neighborlists */
116 for(iidx=0; iidx<nri; iidx++)
118 /* Load shift vector for this list */
119 i_shift_offset = DIM*shiftidx[iidx];
121 /* Load limits for loop over neighbors */
122 j_index_start = jindex[iidx];
123 j_index_end = jindex[iidx+1];
125 /* Get outer coordinate index */
127 i_coord_offset = DIM*inr;
129 /* Load i particle coords and add shift vector */
130 gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
132 fix0 = _mm_setzero_pd();
133 fiy0 = _mm_setzero_pd();
134 fiz0 = _mm_setzero_pd();
136 /* Load parameters for i particles */
/* iq0 = epsfac * charge[inr]; _mm_load1_pd broadcasts the charge to both lanes */
137 iq0 = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
/* Offset of the i-type row in vdwparam; the factor 2 matches the (c6,c12) pair loaded per type pair below */
138 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
140 /* Reset potential sums */
141 velecsum = _mm_setzero_pd();
142 vvdwsum = _mm_setzero_pd();
144 /* Start inner kernel loop */
/* Unrolled by two: the bound j_index_end-1 with step 2 only admits full pairs of j entries (lanes A and B). */
/* NOTE(review): a trailing odd entry appears to be handled by the single-j code further down; its guard is not visible here */
145 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
148 /* Get j neighbor index, and coordinate index */
151 j_coord_offsetA = DIM*jnrA;
152 j_coord_offsetB = DIM*jnrB;
154 /* load j atom coordinates */
155 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
158 /* Calculate displacement vector */
159 dx00 = _mm_sub_pd(ix0,jx0);
160 dy00 = _mm_sub_pd(iy0,jy0);
161 dz00 = _mm_sub_pd(iz0,jz0);
163 /* Calculate squared distance and things based on it */
164 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
166 rinv00 = gmx_mm_invsqrt_pd(rsq00);
168 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
170 /* Load parameters for j particles */
171 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
172 vdwjidx0A = 2*vdwtype[jnrA+0];
173 vdwjidx0B = 2*vdwtype[jnrB+0];
175 /**************************
176 * CALCULATE INTERACTIONS *
177 **************************/
/* r = rsq * rinv (assuming gmx_mm_invsqrt_pd returns 1/sqrt of its argument - TODO confirm) */
179 r00 = _mm_mul_pd(rsq00,rinv00);
181 /* Compute parameters for interactions between i and j atoms */
182 qq00 = _mm_mul_pd(iq0,jq0);
183 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
184 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
186 /* Calculate table index by multiplying r with table scale and truncate to integer */
187 rt = _mm_mul_pd(r00,vftabscale);
188 vfitab = _mm_cvttpd_epi32(rt);
/* vfeps is the fractional part rt - floor(rt), i.e. the position inside the table interval */
189 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
/* Shift by 3 multiplies the index by 8: the code reads Y,F,G,H for dispersion at +0..3 and for repulsion at +4..7 */
190 vfitab = _mm_slli_epi32(vfitab,3);
192 /* COULOMB ELECTROSTATICS */
193 velec = _mm_mul_pd(qq00,rinv00);
194 felec = _mm_mul_pd(velec,rinvsq00);
196 /* CUBIC SPLINE TABLE DISPERSION */
/* Each _mm_load_pd fetches two adjacent table doubles for one lane (element 0 of vfitab for A, element 1 for B); it requires 16-byte aligned addresses */
/* NOTE(review): GMX_MM_TRANSPOSE2_PD presumably regroups the loads so that Y holds Y of both lanes and F holds F of both lanes - confirm */
197 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
198 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
199 GMX_MM_TRANSPOSE2_PD(Y,F);
200 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
201 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
202 GMX_MM_TRANSPOSE2_PD(G,H);
/* Spline evaluation: Fp = F + eps*(G + eps*H); VV = Y + eps*Fp; FF = Fp + eps*(G + 2*eps*H) */
203 Heps = _mm_mul_pd(vfeps,H);
204 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
205 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
206 vvdw6 = _mm_mul_pd(c6_00,VV);
207 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
208 fvdw6 = _mm_mul_pd(c6_00,FF);
210 /* CUBIC SPLINE TABLE REPULSION */
/* Advance 4 doubles to the repulsion entries of the same table point */
211 vfitab = _mm_add_epi32(vfitab,ifour);
212 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
213 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
214 GMX_MM_TRANSPOSE2_PD(Y,F);
215 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
216 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
217 GMX_MM_TRANSPOSE2_PD(G,H);
218 Heps = _mm_mul_pd(vfeps,H);
219 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
220 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
221 vvdw12 = _mm_mul_pd(c12_00,VV);
222 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
223 fvdw12 = _mm_mul_pd(c12_00,FF);
224 vvdw = _mm_add_pd(vvdw12,vvdw6);
/* fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00; the XOR with signbit flips the sign bit of both lanes */
225 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
227 /* Update potential sum for this i atom from the interaction with this j atom. */
228 velecsum = _mm_add_pd(velecsum,velec);
229 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
/* Combined scalar force factor; it multiplies the displacement components below */
231 fscal = _mm_add_pd(felec,fvdw);
233 /* Calculate temporary vectorial force */
234 tx = _mm_mul_pd(fscal,dx00);
235 ty = _mm_mul_pd(fscal,dy00);
236 tz = _mm_mul_pd(fscal,dz00);
238 /* Update vectorial force */
239 fix0 = _mm_add_pd(fix0,tx);
240 fiy0 = _mm_add_pd(fiy0,ty);
241 fiz0 = _mm_add_pd(fiz0,tz);
/* NOTE(review): presumably subtracts (tx,ty,tz) from the forces of both j atoms (action/reaction) - helper not visible here */
243 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
245 /* Inner loop uses 63 flops */
/* Single-j path: from here on only jnrA and the 1ptr helpers are used, so only lane A carries real data */
252 j_coord_offsetA = DIM*jnrA;
254 /* load j atom coordinates */
255 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
258 /* Calculate displacement vector */
259 dx00 = _mm_sub_pd(ix0,jx0);
260 dy00 = _mm_sub_pd(iy0,jy0);
261 dz00 = _mm_sub_pd(iz0,jz0);
263 /* Calculate squared distance and things based on it */
264 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
266 rinv00 = gmx_mm_invsqrt_pd(rsq00);
268 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
270 /* Load parameters for j particles */
/* _mm_load_sd puts the charge in the low lane and zeroes the high lane */
271 jq0 = _mm_load_sd(charge+jnrA+0);
272 vdwjidx0A = 2*vdwtype[jnrA+0];
274 /**************************
275 * CALCULATE INTERACTIONS *
276 **************************/
278 r00 = _mm_mul_pd(rsq00,rinv00);
280 /* Compute parameters for interactions between i and j atoms */
281 qq00 = _mm_mul_pd(iq0,jq0);
282 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
284 /* Calculate table index by multiplying r with table scale and truncate to integer */
285 rt = _mm_mul_pd(r00,vftabscale);
286 vfitab = _mm_cvttpd_epi32(rt);
287 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
288 vfitab = _mm_slli_epi32(vfitab,3);
290 /* COULOMB ELECTROSTATICS */
291 velec = _mm_mul_pd(qq00,rinv00);
292 felec = _mm_mul_pd(velec,rinvsq00);
294 /* CUBIC SPLINE TABLE DISPERSION */
/* Only element 0 of vfitab is used; zero vectors stand in for the second lane table loads */
295 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
296 F = _mm_setzero_pd();
297 GMX_MM_TRANSPOSE2_PD(Y,F);
298 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
299 H = _mm_setzero_pd();
300 GMX_MM_TRANSPOSE2_PD(G,H);
301 Heps = _mm_mul_pd(vfeps,H);
302 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
303 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
304 vvdw6 = _mm_mul_pd(c6_00,VV);
305 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
306 fvdw6 = _mm_mul_pd(c6_00,FF);
308 /* CUBIC SPLINE TABLE REPULSION */
309 vfitab = _mm_add_epi32(vfitab,ifour);
310 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
311 F = _mm_setzero_pd();
312 GMX_MM_TRANSPOSE2_PD(Y,F);
313 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
314 H = _mm_setzero_pd();
315 GMX_MM_TRANSPOSE2_PD(G,H);
316 Heps = _mm_mul_pd(vfeps,H);
317 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
318 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
319 vvdw12 = _mm_mul_pd(c12_00,VV);
320 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
321 fvdw12 = _mm_mul_pd(c12_00,FF);
322 vvdw = _mm_add_pd(vvdw12,vvdw6);
323 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
325 /* Update potential sum for this i atom from the interaction with this j atom. */
/* _mm_unpacklo_pd with a zero vector keeps the low lane and zeroes the high lane, so the unused lane adds nothing to the sums */
326 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
327 velecsum = _mm_add_pd(velecsum,velec);
328 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
329 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
331 fscal = _mm_add_pd(felec,fvdw);
/* Zero the high lane of fscal so tx,ty,tz are zero for the unused lane */
333 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
335 /* Calculate temporary vectorial force */
336 tx = _mm_mul_pd(fscal,dx00);
337 ty = _mm_mul_pd(fscal,dy00);
338 tz = _mm_mul_pd(fscal,dz00);
340 /* Update vectorial force */
341 fix0 = _mm_add_pd(fix0,tx);
342 fiy0 = _mm_add_pd(fiy0,ty);
343 fiz0 = _mm_add_pd(fiz0,tz);
345 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
347 /* Inner loop uses 63 flops */
350 /* End of innermost loop */
/* NOTE(review): presumably reduces the lanes of fix0,fiy0,fiz0 and adds them to the i-atom force and the shift force - confirm against the helper */
352 gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
353 f+i_coord_offset,fshift+i_shift_offset);
356 /* Update potential energies */
357 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
358 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
360 /* Increment number of inner iterations */
/* Counts every j entry of this list, including one handled by the single-j path */
361 inneriter += j_index_end - j_index_start;
363 /* Outer loop uses 9 flops */
366 /* Increment number of outer iterations */
369 /* Update outer/inner flops */
/* Flop estimate: 9 per outer iteration plus 63 per inner iteration (the outeriter update is not visible in this view) */
371 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*63);
374 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse4_1_double
375 * Electrostatics interaction: Coulomb
376 * VdW interaction: CubicSplineTable
377 * Geometry: Particle-Particle
378 * Calculate force/pot: Force
/*
 * Coulomb electrostatics plus cubic-spline-table Van der Waals kernel for
 * single particle-particle pairs; this variant evaluates forces only (no
 * potential sums are accumulated in the visible code).
 *
 * Inputs read by the visible code: nlist->jindex and nlist->shift;
 * fr->shift_vec, fr->fshift, fr->epsfac and fr->ntype; mdatoms->chargeA and
 * mdatoms->typeA; kernel_data->table_vdw (data and scale).
 * Results leave through f and fshift (force helpers) and through the flop
 * counter passed to inc_nrnb.
 *
 * NOTE(review): several statements this code depends on are not visible in
 * this view (for example the assignments of nri, inr, jnrA, jnrB, x, f and
 * vdwparam, and the declarations of charge, vdwtype, vftab and vfitab).
 * Presumably x and f alias xx and ff - confirm against the full file.
 *
 * NOTE(review): a __m128d holds two doubles, so the A and B suffixes map to
 * the two SIMD lanes; the wording about four positions in the comment below
 * looks like a leftover from a single-precision variant - confirm.
 */
381 nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse4_1_double
382 (t_nblist * gmx_restrict nlist,
383 rvec * gmx_restrict xx,
384 rvec * gmx_restrict ff,
385 t_forcerec * gmx_restrict fr,
386 t_mdatoms * gmx_restrict mdatoms,
387 nb_kernel_data_t * gmx_restrict kernel_data,
388 t_nrnb * gmx_restrict nrnb)
390 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
391 * just 0 for non-waters.
392 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
393 * jnr indices corresponding to data put in the four positions in the SIMD register.
395 int i_shift_offset,i_coord_offset,outeriter,inneriter;
396 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
398 int j_coord_offsetA,j_coord_offsetB;
399 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
401 real *shiftvec,*fshift,*x,*f;
402 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
404 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
405 int vdwjidx0A,vdwjidx0B;
406 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
407 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
408 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
411 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
414 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
415 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
417 __m128i ifour = _mm_set1_epi32(4);
418 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
420 __m128d dummy_mask,cutoff_mask;
421 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
422 __m128d one = _mm_set1_pd(1.0);
423 __m128d two = _mm_set1_pd(2.0);
429 jindex = nlist->jindex;
431 shiftidx = nlist->shift;
433 shiftvec = fr->shift_vec[0];
434 fshift = fr->fshift[0];
435 facel = _mm_set1_pd(fr->epsfac);
436 charge = mdatoms->chargeA;
437 nvdwtype = fr->ntype;
439 vdwtype = mdatoms->typeA;
441 vftab = kernel_data->table_vdw->data;
442 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
444 /* Avoid stupid compiler warnings */
452 /* Start outer loop over neighborlists */
453 for(iidx=0; iidx<nri; iidx++)
455 /* Load shift vector for this list */
456 i_shift_offset = DIM*shiftidx[iidx];
458 /* Load limits for loop over neighbors */
459 j_index_start = jindex[iidx];
460 j_index_end = jindex[iidx+1];
462 /* Get outer coordinate index */
464 i_coord_offset = DIM*inr;
466 /* Load i particle coords and add shift vector */
467 gmx_mm_load_shift_and_1rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
469 fix0 = _mm_setzero_pd();
470 fiy0 = _mm_setzero_pd();
471 fiz0 = _mm_setzero_pd();
473 /* Load parameters for i particles */
/* iq0 = epsfac * charge[inr]; _mm_load1_pd broadcasts the charge to both lanes */
474 iq0 = _mm_mul_pd(facel,_mm_load1_pd(charge+inr+0));
/* Offset of the i-type row in vdwparam; the factor 2 matches the (c6,c12) pair loaded per type pair below */
475 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
477 /* Start inner kernel loop */
/* Unrolled by two: the bound j_index_end-1 with step 2 only admits full pairs of j entries (lanes A and B). */
/* NOTE(review): a trailing odd entry appears to be handled by the single-j code further down; its guard is not visible here */
478 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
481 /* Get j neighbor index, and coordinate index */
484 j_coord_offsetA = DIM*jnrA;
485 j_coord_offsetB = DIM*jnrB;
487 /* load j atom coordinates */
488 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
491 /* Calculate displacement vector */
492 dx00 = _mm_sub_pd(ix0,jx0);
493 dy00 = _mm_sub_pd(iy0,jy0);
494 dz00 = _mm_sub_pd(iz0,jz0);
496 /* Calculate squared distance and things based on it */
497 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
499 rinv00 = gmx_mm_invsqrt_pd(rsq00);
501 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
503 /* Load parameters for j particles */
504 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
505 vdwjidx0A = 2*vdwtype[jnrA+0];
506 vdwjidx0B = 2*vdwtype[jnrB+0];
508 /**************************
509 * CALCULATE INTERACTIONS *
510 **************************/
/* r = rsq * rinv (assuming gmx_mm_invsqrt_pd returns 1/sqrt of its argument - TODO confirm) */
512 r00 = _mm_mul_pd(rsq00,rinv00);
514 /* Compute parameters for interactions between i and j atoms */
515 qq00 = _mm_mul_pd(iq0,jq0);
516 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
517 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
519 /* Calculate table index by multiplying r with table scale and truncate to integer */
520 rt = _mm_mul_pd(r00,vftabscale);
521 vfitab = _mm_cvttpd_epi32(rt);
/* vfeps is the fractional part rt - floor(rt), i.e. the position inside the table interval */
522 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
/* Shift by 3 multiplies the index by 8: the code reads Y,F,G,H for dispersion at +0..3 and for repulsion at +4..7 */
523 vfitab = _mm_slli_epi32(vfitab,3);
525 /* COULOMB ELECTROSTATICS */
/* velec is only an intermediate here: felec = qq * rinv * rinvsq */
526 velec = _mm_mul_pd(qq00,rinv00);
527 felec = _mm_mul_pd(velec,rinvsq00);
529 /* CUBIC SPLINE TABLE DISPERSION */
/* Each _mm_load_pd fetches two adjacent table doubles for one lane (element 0 of vfitab for A, element 1 for B); it requires 16-byte aligned addresses */
/* NOTE(review): GMX_MM_TRANSPOSE2_PD presumably regroups the loads so that Y holds Y of both lanes and F holds F of both lanes - confirm */
530 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
531 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
532 GMX_MM_TRANSPOSE2_PD(Y,F);
533 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
534 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
535 GMX_MM_TRANSPOSE2_PD(G,H);
/* Force part of the spline only: Fp = F + eps*(G + eps*H); FF = Fp + eps*(G + 2*eps*H). Y is loaded but not used afterwards */
536 Heps = _mm_mul_pd(vfeps,H);
537 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
538 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
539 fvdw6 = _mm_mul_pd(c6_00,FF);
541 /* CUBIC SPLINE TABLE REPULSION */
/* Advance 4 doubles to the repulsion entries of the same table point */
542 vfitab = _mm_add_epi32(vfitab,ifour);
543 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
544 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
545 GMX_MM_TRANSPOSE2_PD(Y,F);
546 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
547 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
548 GMX_MM_TRANSPOSE2_PD(G,H);
549 Heps = _mm_mul_pd(vfeps,H);
550 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
551 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
552 fvdw12 = _mm_mul_pd(c12_00,FF);
/* fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00; the XOR with signbit flips the sign bit of both lanes */
553 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
/* Combined scalar force factor; it multiplies the displacement components below */
555 fscal = _mm_add_pd(felec,fvdw);
557 /* Calculate temporary vectorial force */
558 tx = _mm_mul_pd(fscal,dx00);
559 ty = _mm_mul_pd(fscal,dy00);
560 tz = _mm_mul_pd(fscal,dz00);
562 /* Update vectorial force */
563 fix0 = _mm_add_pd(fix0,tx);
564 fiy0 = _mm_add_pd(fiy0,ty);
565 fiz0 = _mm_add_pd(fiz0,tz);
/* NOTE(review): presumably subtracts (tx,ty,tz) from the forces of both j atoms (action/reaction) - helper not visible here */
567 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,tx,ty,tz);
569 /* Inner loop uses 54 flops */
/* Single-j path: from here on only jnrA and the 1ptr helpers are used, so only lane A carries real data */
576 j_coord_offsetA = DIM*jnrA;
578 /* load j atom coordinates */
579 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
582 /* Calculate displacement vector */
583 dx00 = _mm_sub_pd(ix0,jx0);
584 dy00 = _mm_sub_pd(iy0,jy0);
585 dz00 = _mm_sub_pd(iz0,jz0);
587 /* Calculate squared distance and things based on it */
588 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
590 rinv00 = gmx_mm_invsqrt_pd(rsq00);
592 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
594 /* Load parameters for j particles */
/* _mm_load_sd puts the charge in the low lane and zeroes the high lane */
595 jq0 = _mm_load_sd(charge+jnrA+0);
596 vdwjidx0A = 2*vdwtype[jnrA+0];
598 /**************************
599 * CALCULATE INTERACTIONS *
600 **************************/
602 r00 = _mm_mul_pd(rsq00,rinv00);
604 /* Compute parameters for interactions between i and j atoms */
605 qq00 = _mm_mul_pd(iq0,jq0);
606 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
608 /* Calculate table index by multiplying r with table scale and truncate to integer */
609 rt = _mm_mul_pd(r00,vftabscale);
610 vfitab = _mm_cvttpd_epi32(rt);
611 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
612 vfitab = _mm_slli_epi32(vfitab,3);
614 /* COULOMB ELECTROSTATICS */
615 velec = _mm_mul_pd(qq00,rinv00);
616 felec = _mm_mul_pd(velec,rinvsq00);
618 /* CUBIC SPLINE TABLE DISPERSION */
/* Only element 0 of vfitab is used; zero vectors stand in for the second lane table loads */
619 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
620 F = _mm_setzero_pd();
621 GMX_MM_TRANSPOSE2_PD(Y,F);
622 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
623 H = _mm_setzero_pd();
624 GMX_MM_TRANSPOSE2_PD(G,H);
625 Heps = _mm_mul_pd(vfeps,H);
626 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
627 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
628 fvdw6 = _mm_mul_pd(c6_00,FF);
630 /* CUBIC SPLINE TABLE REPULSION */
631 vfitab = _mm_add_epi32(vfitab,ifour);
632 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
633 F = _mm_setzero_pd();
634 GMX_MM_TRANSPOSE2_PD(Y,F);
635 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
636 H = _mm_setzero_pd();
637 GMX_MM_TRANSPOSE2_PD(G,H);
638 Heps = _mm_mul_pd(vfeps,H);
639 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
640 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
641 fvdw12 = _mm_mul_pd(c12_00,FF);
642 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
644 fscal = _mm_add_pd(felec,fvdw);
/* _mm_unpacklo_pd with a zero vector keeps the low lane and zeroes the high lane, so tx,ty,tz are zero for the unused lane */
646 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
648 /* Calculate temporary vectorial force */
649 tx = _mm_mul_pd(fscal,dx00);
650 ty = _mm_mul_pd(fscal,dy00);
651 tz = _mm_mul_pd(fscal,dz00);
653 /* Update vectorial force */
654 fix0 = _mm_add_pd(fix0,tx);
655 fiy0 = _mm_add_pd(fiy0,ty);
656 fiz0 = _mm_add_pd(fiz0,tz);
658 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,tx,ty,tz);
660 /* Inner loop uses 54 flops */
663 /* End of innermost loop */
/* NOTE(review): presumably reduces the lanes of fix0,fiy0,fiz0 and adds them to the i-atom force and the shift force - confirm against the helper */
665 gmx_mm_update_iforce_1atom_swizzle_pd(fix0,fiy0,fiz0,
666 f+i_coord_offset,fshift+i_shift_offset);
668 /* Increment number of inner iterations */
/* Counts every j entry of this list, including one handled by the single-j path */
669 inneriter += j_index_end - j_index_start;
671 /* Outer loop uses 7 flops */
674 /* Increment number of outer iterations */
677 /* Update outer/inner flops */
/* Flop estimate: 7 per outer iteration plus 54 per inner iteration (the outeriter update is not visible in this view) */
679 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*54);