2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single
38 * Electrostatics interaction: GeneralizedBorn
39 * VdW interaction: CubicSplineTable
40 * Geometry: Particle-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecGB_VdwCSTab_GeomP1P1_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
69 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
70 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
71 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
74 __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
75 __m128 minushalf = _mm_set1_ps(-0.5);
76 real *invsqrta,*dvda,*gbtab;
78 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
81 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
82 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
84 __m128i ifour = _mm_set1_epi32(4);
85 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
87 __m128 dummy_mask,cutoff_mask;
88 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
89 __m128 one = _mm_set1_ps(1.0);
90 __m128 two = _mm_set1_ps(2.0);
96 jindex = nlist->jindex;
98 shiftidx = nlist->shift;
100 shiftvec = fr->shift_vec[0];
101 fshift = fr->fshift[0];
102 facel = _mm_set1_ps(fr->epsfac);
103 charge = mdatoms->chargeA;
104 nvdwtype = fr->ntype;
106 vdwtype = mdatoms->typeA;
108 vftab = kernel_data->table_vdw->data;
109 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
111 invsqrta = fr->invsqrta;
113 gbtabscale = _mm_set1_ps(fr->gbtab.scale);
114 gbtab = fr->gbtab.data;
115 gbinvepsdiff = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
117 /* Avoid stupid compiler warnings */
118 jnrA = jnrB = jnrC = jnrD = 0;
127 /* Start outer loop over neighborlists */
128 for(iidx=0; iidx<nri; iidx++)
130 /* Load shift vector for this list */
131 i_shift_offset = DIM*shiftidx[iidx];
132 shX = shiftvec[i_shift_offset+XX];
133 shY = shiftvec[i_shift_offset+YY];
134 shZ = shiftvec[i_shift_offset+ZZ];
136 /* Load limits for loop over neighbors */
137 j_index_start = jindex[iidx];
138 j_index_end = jindex[iidx+1];
140 /* Get outer coordinate index */
142 i_coord_offset = DIM*inr;
144 /* Load i particle coords and add shift vector */
145 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
146 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
147 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
149 fix0 = _mm_setzero_ps();
150 fiy0 = _mm_setzero_ps();
151 fiz0 = _mm_setzero_ps();
153 /* Load parameters for i particles */
154 iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
155 isai0 = _mm_load1_ps(invsqrta+inr+0);
156 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
158 /* Reset potential sums */
159 velecsum = _mm_setzero_ps();
160 vgbsum = _mm_setzero_ps();
161 vvdwsum = _mm_setzero_ps();
162 dvdasum = _mm_setzero_ps();
164 /* Start inner kernel loop */
165 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
168 /* Get j neighbor index, and coordinate index */
174 j_coord_offsetA = DIM*jnrA;
175 j_coord_offsetB = DIM*jnrB;
176 j_coord_offsetC = DIM*jnrC;
177 j_coord_offsetD = DIM*jnrD;
179 /* load j atom coordinates */
180 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
181 x+j_coord_offsetC,x+j_coord_offsetD,
184 /* Calculate displacement vector */
185 dx00 = _mm_sub_ps(ix0,jx0);
186 dy00 = _mm_sub_ps(iy0,jy0);
187 dz00 = _mm_sub_ps(iz0,jz0);
189 /* Calculate squared distance and things based on it */
190 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
192 rinv00 = gmx_mm_invsqrt_ps(rsq00);
194 /* Load parameters for j particles */
195 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
196 charge+jnrC+0,charge+jnrD+0);
197 isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
198 invsqrta+jnrC+0,invsqrta+jnrD+0);
199 vdwjidx0A = 2*vdwtype[jnrA+0];
200 vdwjidx0B = 2*vdwtype[jnrB+0];
201 vdwjidx0C = 2*vdwtype[jnrC+0];
202 vdwjidx0D = 2*vdwtype[jnrD+0];
204 /**************************
205 * CALCULATE INTERACTIONS *
206 **************************/
208 r00 = _mm_mul_ps(rsq00,rinv00);
210 /* Compute parameters for interactions between i and j atoms */
211 qq00 = _mm_mul_ps(iq0,jq0);
212 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
213 vdwparam+vdwioffset0+vdwjidx0B,
214 vdwparam+vdwioffset0+vdwjidx0C,
215 vdwparam+vdwioffset0+vdwjidx0D,
218 /* Calculate table index by multiplying r with table scale and truncate to integer */
219 rt = _mm_mul_ps(r00,vftabscale);
220 vfitab = _mm_cvttps_epi32(rt);
221 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
222 vfitab = _mm_slli_epi32(vfitab,3);
224 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
225 isaprod = _mm_mul_ps(isai0,isaj0);
226 gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
227 gbscale = _mm_mul_ps(isaprod,gbtabscale);
228 dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
230 /* Calculate generalized born table index - this is a separate table from the normal one,
231 * but we use the same procedure by multiplying r with scale and truncating to integer.
233 rt = _mm_mul_ps(r00,gbscale);
234 gbitab = _mm_cvttps_epi32(rt);
235 gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
236 gbitab = _mm_slli_epi32(gbitab,2);
238 Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
239 F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
240 G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
241 H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
242 _MM_TRANSPOSE4_PS(Y,F,G,H);
243 Heps = _mm_mul_ps(gbeps,H);
244 Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
245 VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
246 vgb = _mm_mul_ps(gbqqfactor,VV);
248 FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
249 fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
250 dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
251 dvdasum = _mm_add_ps(dvdasum,dvdatmp);
252 gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
253 _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
254 velec = _mm_mul_ps(qq00,rinv00);
255 felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
257 /* CUBIC SPLINE TABLE DISPERSION */
258 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
259 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
260 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
261 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
262 _MM_TRANSPOSE4_PS(Y,F,G,H);
263 Heps = _mm_mul_ps(vfeps,H);
264 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
265 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
266 vvdw6 = _mm_mul_ps(c6_00,VV);
267 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
268 fvdw6 = _mm_mul_ps(c6_00,FF);
270 /* CUBIC SPLINE TABLE REPULSION */
271 vfitab = _mm_add_epi32(vfitab,ifour);
272 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
273 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
274 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
275 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
276 _MM_TRANSPOSE4_PS(Y,F,G,H);
277 Heps = _mm_mul_ps(vfeps,H);
278 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
279 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
280 vvdw12 = _mm_mul_ps(c12_00,VV);
281 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
282 fvdw12 = _mm_mul_ps(c12_00,FF);
283 vvdw = _mm_add_ps(vvdw12,vvdw6);
284 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
286 /* Update potential sum for this i atom from the interaction with this j atom. */
287 velecsum = _mm_add_ps(velecsum,velec);
288 vgbsum = _mm_add_ps(vgbsum,vgb);
289 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
291 fscal = _mm_add_ps(felec,fvdw);
293 /* Calculate temporary vectorial force */
294 tx = _mm_mul_ps(fscal,dx00);
295 ty = _mm_mul_ps(fscal,dy00);
296 tz = _mm_mul_ps(fscal,dz00);
298 /* Update vectorial force */
299 fix0 = _mm_add_ps(fix0,tx);
300 fiy0 = _mm_add_ps(fiy0,ty);
301 fiz0 = _mm_add_ps(fiz0,tz);
303 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
304 f+j_coord_offsetC,f+j_coord_offsetD,
307 /* Inner loop uses 92 flops */
313 /* Get j neighbor index, and coordinate index */
319 /* Sign of each element will be negative for non-real atoms.
320 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
321 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
323 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
324 jnrA = (jnrA>=0) ? jnrA : 0;
325 jnrB = (jnrB>=0) ? jnrB : 0;
326 jnrC = (jnrC>=0) ? jnrC : 0;
327 jnrD = (jnrD>=0) ? jnrD : 0;
329 j_coord_offsetA = DIM*jnrA;
330 j_coord_offsetB = DIM*jnrB;
331 j_coord_offsetC = DIM*jnrC;
332 j_coord_offsetD = DIM*jnrD;
334 /* load j atom coordinates */
335 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
336 x+j_coord_offsetC,x+j_coord_offsetD,
339 /* Calculate displacement vector */
340 dx00 = _mm_sub_ps(ix0,jx0);
341 dy00 = _mm_sub_ps(iy0,jy0);
342 dz00 = _mm_sub_ps(iz0,jz0);
344 /* Calculate squared distance and things based on it */
345 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
347 rinv00 = gmx_mm_invsqrt_ps(rsq00);
349 /* Load parameters for j particles */
350 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
351 charge+jnrC+0,charge+jnrD+0);
352 isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
353 invsqrta+jnrC+0,invsqrta+jnrD+0);
354 vdwjidx0A = 2*vdwtype[jnrA+0];
355 vdwjidx0B = 2*vdwtype[jnrB+0];
356 vdwjidx0C = 2*vdwtype[jnrC+0];
357 vdwjidx0D = 2*vdwtype[jnrD+0];
359 /**************************
360 * CALCULATE INTERACTIONS *
361 **************************/
363 r00 = _mm_mul_ps(rsq00,rinv00);
364 r00 = _mm_andnot_ps(dummy_mask,r00);
366 /* Compute parameters for interactions between i and j atoms */
367 qq00 = _mm_mul_ps(iq0,jq0);
368 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
369 vdwparam+vdwioffset0+vdwjidx0B,
370 vdwparam+vdwioffset0+vdwjidx0C,
371 vdwparam+vdwioffset0+vdwjidx0D,
374 /* Calculate table index by multiplying r with table scale and truncate to integer */
375 rt = _mm_mul_ps(r00,vftabscale);
376 vfitab = _mm_cvttps_epi32(rt);
377 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
378 vfitab = _mm_slli_epi32(vfitab,3);
380 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
381 isaprod = _mm_mul_ps(isai0,isaj0);
382 gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
383 gbscale = _mm_mul_ps(isaprod,gbtabscale);
384 dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
386 /* Calculate generalized born table index - this is a separate table from the normal one,
387 * but we use the same procedure by multiplying r with scale and truncating to integer.
389 rt = _mm_mul_ps(r00,gbscale);
390 gbitab = _mm_cvttps_epi32(rt);
391 gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
392 gbitab = _mm_slli_epi32(gbitab,2);
394 Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
395 F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
396 G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
397 H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
398 _MM_TRANSPOSE4_PS(Y,F,G,H);
399 Heps = _mm_mul_ps(gbeps,H);
400 Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
401 VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
402 vgb = _mm_mul_ps(gbqqfactor,VV);
404 FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
405 fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
406 dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
407 dvdasum = _mm_add_ps(dvdasum,dvdatmp);
408 gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
409 _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
410 velec = _mm_mul_ps(qq00,rinv00);
411 felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
413 /* CUBIC SPLINE TABLE DISPERSION */
414 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
415 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
416 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
417 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
418 _MM_TRANSPOSE4_PS(Y,F,G,H);
419 Heps = _mm_mul_ps(vfeps,H);
420 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
421 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
422 vvdw6 = _mm_mul_ps(c6_00,VV);
423 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
424 fvdw6 = _mm_mul_ps(c6_00,FF);
426 /* CUBIC SPLINE TABLE REPULSION */
427 vfitab = _mm_add_epi32(vfitab,ifour);
428 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
429 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
430 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
431 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
432 _MM_TRANSPOSE4_PS(Y,F,G,H);
433 Heps = _mm_mul_ps(vfeps,H);
434 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
435 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
436 vvdw12 = _mm_mul_ps(c12_00,VV);
437 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
438 fvdw12 = _mm_mul_ps(c12_00,FF);
439 vvdw = _mm_add_ps(vvdw12,vvdw6);
440 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
442 /* Update potential sum for this i atom from the interaction with this j atom. */
443 velec = _mm_andnot_ps(dummy_mask,velec);
444 velecsum = _mm_add_ps(velecsum,velec);
445 vgb = _mm_andnot_ps(dummy_mask,vgb);
446 vgbsum = _mm_add_ps(vgbsum,vgb);
447 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
448 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
450 fscal = _mm_add_ps(felec,fvdw);
452 fscal = _mm_andnot_ps(dummy_mask,fscal);
454 /* Calculate temporary vectorial force */
455 tx = _mm_mul_ps(fscal,dx00);
456 ty = _mm_mul_ps(fscal,dy00);
457 tz = _mm_mul_ps(fscal,dz00);
459 /* Update vectorial force */
460 fix0 = _mm_add_ps(fix0,tx);
461 fiy0 = _mm_add_ps(fiy0,ty);
462 fiz0 = _mm_add_ps(fiz0,tz);
464 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
465 f+j_coord_offsetC,f+j_coord_offsetD,
468 /* Inner loop uses 93 flops */
471 /* End of innermost loop */
473 gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
474 f+i_coord_offset,fshift+i_shift_offset);
477 /* Update potential energies */
478 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
479 gmx_mm_update_1pot_ps(vgbsum,kernel_data->energygrp_polarization+ggid);
480 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
481 dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
482 gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
484 /* Increment number of inner iterations */
485 inneriter += j_index_end - j_index_start;
487 /* Outer loop uses 13 flops */
490 /* Increment number of outer iterations */
493 /* Update outer/inner flops */
495 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*13 + inneriter*93);
498 * Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single
499 * Electrostatics interaction: GeneralizedBorn
500 * VdW interaction: CubicSplineTable
501 * Geometry: Particle-Particle
502 * Calculate force/pot: Force
505 nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single
506 (t_nblist * gmx_restrict nlist,
507 rvec * gmx_restrict xx,
508 rvec * gmx_restrict ff,
509 t_forcerec * gmx_restrict fr,
510 t_mdatoms * gmx_restrict mdatoms,
511 nb_kernel_data_t * gmx_restrict kernel_data,
512 t_nrnb * gmx_restrict nrnb)
514 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
515 * just 0 for non-waters.
516 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
517 * jnr indices corresponding to data put in the four positions in the SIMD register.
519 int i_shift_offset,i_coord_offset,outeriter,inneriter;
520 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
521 int jnrA,jnrB,jnrC,jnrD;
522 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
523 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
524 real shX,shY,shZ,rcutoff_scalar;
525 real *shiftvec,*fshift,*x,*f;
526 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
528 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
529 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
530 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
531 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
532 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
535 __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
536 __m128 minushalf = _mm_set1_ps(-0.5);
537 real *invsqrta,*dvda,*gbtab;
539 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
542 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
543 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
545 __m128i ifour = _mm_set1_epi32(4);
546 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
548 __m128 dummy_mask,cutoff_mask;
549 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
550 __m128 one = _mm_set1_ps(1.0);
551 __m128 two = _mm_set1_ps(2.0);
557 jindex = nlist->jindex;
559 shiftidx = nlist->shift;
561 shiftvec = fr->shift_vec[0];
562 fshift = fr->fshift[0];
563 facel = _mm_set1_ps(fr->epsfac);
564 charge = mdatoms->chargeA;
565 nvdwtype = fr->ntype;
567 vdwtype = mdatoms->typeA;
569 vftab = kernel_data->table_vdw->data;
570 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
572 invsqrta = fr->invsqrta;
574 gbtabscale = _mm_set1_ps(fr->gbtab.scale);
575 gbtab = fr->gbtab.data;
576 gbinvepsdiff = _mm_set1_ps((1.0/fr->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
578 /* Avoid stupid compiler warnings */
579 jnrA = jnrB = jnrC = jnrD = 0;
588 /* Start outer loop over neighborlists */
589 for(iidx=0; iidx<nri; iidx++)
591 /* Load shift vector for this list */
592 i_shift_offset = DIM*shiftidx[iidx];
593 shX = shiftvec[i_shift_offset+XX];
594 shY = shiftvec[i_shift_offset+YY];
595 shZ = shiftvec[i_shift_offset+ZZ];
597 /* Load limits for loop over neighbors */
598 j_index_start = jindex[iidx];
599 j_index_end = jindex[iidx+1];
601 /* Get outer coordinate index */
603 i_coord_offset = DIM*inr;
605 /* Load i particle coords and add shift vector */
606 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
607 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
608 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
610 fix0 = _mm_setzero_ps();
611 fiy0 = _mm_setzero_ps();
612 fiz0 = _mm_setzero_ps();
614 /* Load parameters for i particles */
615 iq0 = _mm_mul_ps(facel,_mm_load1_ps(charge+inr+0));
616 isai0 = _mm_load1_ps(invsqrta+inr+0);
617 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
619 dvdasum = _mm_setzero_ps();
621 /* Start inner kernel loop */
622 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
625 /* Get j neighbor index, and coordinate index */
631 j_coord_offsetA = DIM*jnrA;
632 j_coord_offsetB = DIM*jnrB;
633 j_coord_offsetC = DIM*jnrC;
634 j_coord_offsetD = DIM*jnrD;
636 /* load j atom coordinates */
637 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
638 x+j_coord_offsetC,x+j_coord_offsetD,
641 /* Calculate displacement vector */
642 dx00 = _mm_sub_ps(ix0,jx0);
643 dy00 = _mm_sub_ps(iy0,jy0);
644 dz00 = _mm_sub_ps(iz0,jz0);
646 /* Calculate squared distance and things based on it */
647 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
649 rinv00 = gmx_mm_invsqrt_ps(rsq00);
651 /* Load parameters for j particles */
652 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
653 charge+jnrC+0,charge+jnrD+0);
654 isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
655 invsqrta+jnrC+0,invsqrta+jnrD+0);
656 vdwjidx0A = 2*vdwtype[jnrA+0];
657 vdwjidx0B = 2*vdwtype[jnrB+0];
658 vdwjidx0C = 2*vdwtype[jnrC+0];
659 vdwjidx0D = 2*vdwtype[jnrD+0];
661 /**************************
662 * CALCULATE INTERACTIONS *
663 **************************/
665 r00 = _mm_mul_ps(rsq00,rinv00);
667 /* Compute parameters for interactions between i and j atoms */
668 qq00 = _mm_mul_ps(iq0,jq0);
669 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
670 vdwparam+vdwioffset0+vdwjidx0B,
671 vdwparam+vdwioffset0+vdwjidx0C,
672 vdwparam+vdwioffset0+vdwjidx0D,
675 /* Calculate table index by multiplying r with table scale and truncate to integer */
676 rt = _mm_mul_ps(r00,vftabscale);
677 vfitab = _mm_cvttps_epi32(rt);
678 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
679 vfitab = _mm_slli_epi32(vfitab,3);
681 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
682 isaprod = _mm_mul_ps(isai0,isaj0);
683 gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
684 gbscale = _mm_mul_ps(isaprod,gbtabscale);
685 dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
687 /* Calculate generalized born table index - this is a separate table from the normal one,
688 * but we use the same procedure by multiplying r with scale and truncating to integer.
690 rt = _mm_mul_ps(r00,gbscale);
691 gbitab = _mm_cvttps_epi32(rt);
692 gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
693 gbitab = _mm_slli_epi32(gbitab,2);
695 Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
696 F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
697 G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
698 H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
699 _MM_TRANSPOSE4_PS(Y,F,G,H);
700 Heps = _mm_mul_ps(gbeps,H);
701 Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
702 VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
703 vgb = _mm_mul_ps(gbqqfactor,VV);
705 FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
706 fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
707 dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
708 dvdasum = _mm_add_ps(dvdasum,dvdatmp);
709 gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
710 _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
711 velec = _mm_mul_ps(qq00,rinv00);
712 felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
714 /* CUBIC SPLINE TABLE DISPERSION */
715 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
716 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
717 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
718 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
719 _MM_TRANSPOSE4_PS(Y,F,G,H);
720 Heps = _mm_mul_ps(vfeps,H);
721 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
722 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
723 fvdw6 = _mm_mul_ps(c6_00,FF);
725 /* CUBIC SPLINE TABLE REPULSION */
726 vfitab = _mm_add_epi32(vfitab,ifour);
727 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
728 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
729 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
730 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
731 _MM_TRANSPOSE4_PS(Y,F,G,H);
732 Heps = _mm_mul_ps(vfeps,H);
733 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
734 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
735 fvdw12 = _mm_mul_ps(c12_00,FF);
736 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
738 fscal = _mm_add_ps(felec,fvdw);
740 /* Calculate temporary vectorial force */
741 tx = _mm_mul_ps(fscal,dx00);
742 ty = _mm_mul_ps(fscal,dy00);
743 tz = _mm_mul_ps(fscal,dz00);
745 /* Update vectorial force */
746 fix0 = _mm_add_ps(fix0,tx);
747 fiy0 = _mm_add_ps(fiy0,ty);
748 fiz0 = _mm_add_ps(fiz0,tz);
750 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
751 f+j_coord_offsetC,f+j_coord_offsetD,
754 /* Inner loop uses 82 flops */
760 /* Get j neighbor index, and coordinate index */
766 /* Sign of each element will be negative for non-real atoms.
767 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
768 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
770 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
771 jnrA = (jnrA>=0) ? jnrA : 0;
772 jnrB = (jnrB>=0) ? jnrB : 0;
773 jnrC = (jnrC>=0) ? jnrC : 0;
774 jnrD = (jnrD>=0) ? jnrD : 0;
776 j_coord_offsetA = DIM*jnrA;
777 j_coord_offsetB = DIM*jnrB;
778 j_coord_offsetC = DIM*jnrC;
779 j_coord_offsetD = DIM*jnrD;
781 /* load j atom coordinates */
782 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
783 x+j_coord_offsetC,x+j_coord_offsetD,
786 /* Calculate displacement vector */
787 dx00 = _mm_sub_ps(ix0,jx0);
788 dy00 = _mm_sub_ps(iy0,jy0);
789 dz00 = _mm_sub_ps(iz0,jz0);
791 /* Calculate squared distance and things based on it */
792 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
794 rinv00 = gmx_mm_invsqrt_ps(rsq00);
796 /* Load parameters for j particles */
797 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
798 charge+jnrC+0,charge+jnrD+0);
799 isaj0 = gmx_mm_load_4real_swizzle_ps(invsqrta+jnrA+0,invsqrta+jnrB+0,
800 invsqrta+jnrC+0,invsqrta+jnrD+0);
801 vdwjidx0A = 2*vdwtype[jnrA+0];
802 vdwjidx0B = 2*vdwtype[jnrB+0];
803 vdwjidx0C = 2*vdwtype[jnrC+0];
804 vdwjidx0D = 2*vdwtype[jnrD+0];
806 /**************************
807 * CALCULATE INTERACTIONS *
808 **************************/
810 r00 = _mm_mul_ps(rsq00,rinv00);
811 r00 = _mm_andnot_ps(dummy_mask,r00);
813 /* Compute parameters for interactions between i and j atoms */
814 qq00 = _mm_mul_ps(iq0,jq0);
815 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
816 vdwparam+vdwioffset0+vdwjidx0B,
817 vdwparam+vdwioffset0+vdwjidx0C,
818 vdwparam+vdwioffset0+vdwjidx0D,
821 /* Calculate table index by multiplying r with table scale and truncate to integer */
822 rt = _mm_mul_ps(r00,vftabscale);
823 vfitab = _mm_cvttps_epi32(rt);
824 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
825 vfitab = _mm_slli_epi32(vfitab,3);
827 /* GENERALIZED BORN AND COULOMB ELECTROSTATICS */
828 isaprod = _mm_mul_ps(isai0,isaj0);
829 gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
830 gbscale = _mm_mul_ps(isaprod,gbtabscale);
831 dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
833 /* Calculate generalized born table index - this is a separate table from the normal one,
834 * but we use the same procedure by multiplying r with scale and truncating to integer.
836 rt = _mm_mul_ps(r00,gbscale);
837 gbitab = _mm_cvttps_epi32(rt);
838 gbeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(gbitab));
839 gbitab = _mm_slli_epi32(gbitab,2);
841 Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
842 F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
843 G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
844 H = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,3) );
845 _MM_TRANSPOSE4_PS(Y,F,G,H);
846 Heps = _mm_mul_ps(gbeps,H);
847 Fp = _mm_add_ps(F,_mm_mul_ps(gbeps,_mm_add_ps(G,Heps)));
848 VV = _mm_add_ps(Y,_mm_mul_ps(gbeps,Fp));
849 vgb = _mm_mul_ps(gbqqfactor,VV);
851 FF = _mm_add_ps(Fp,_mm_mul_ps(gbeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
852 fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
853 dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
854 dvdasum = _mm_add_ps(dvdasum,dvdatmp);
855 gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
856 _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
857 velec = _mm_mul_ps(qq00,rinv00);
858 felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
860 /* CUBIC SPLINE TABLE DISPERSION */
861 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
862 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
863 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
864 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
865 _MM_TRANSPOSE4_PS(Y,F,G,H);
866 Heps = _mm_mul_ps(vfeps,H);
867 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
868 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
869 fvdw6 = _mm_mul_ps(c6_00,FF);
871 /* CUBIC SPLINE TABLE REPULSION */
872 vfitab = _mm_add_epi32(vfitab,ifour);
873 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
874 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
875 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
876 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
877 _MM_TRANSPOSE4_PS(Y,F,G,H);
878 Heps = _mm_mul_ps(vfeps,H);
879 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
880 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
881 fvdw12 = _mm_mul_ps(c12_00,FF);
882 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
884 fscal = _mm_add_ps(felec,fvdw);
886 fscal = _mm_andnot_ps(dummy_mask,fscal);
888 /* Calculate temporary vectorial force */
889 tx = _mm_mul_ps(fscal,dx00);
890 ty = _mm_mul_ps(fscal,dy00);
891 tz = _mm_mul_ps(fscal,dz00);
893 /* Update vectorial force */
894 fix0 = _mm_add_ps(fix0,tx);
895 fiy0 = _mm_add_ps(fiy0,ty);
896 fiz0 = _mm_add_ps(fiz0,tz);
898 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
899 f+j_coord_offsetC,f+j_coord_offsetD,
902 /* Inner loop uses 83 flops */
905 /* End of innermost loop */
907 gmx_mm_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
908 f+i_coord_offset,fshift+i_shift_offset);
910 dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai0,isai0));
911 gmx_mm_update_1pot_ps(dvdasum,dvda+inr);
913 /* Increment number of inner iterations */
914 inneriter += j_index_end - j_index_start;
916 /* Outer loop uses 10 flops */
919 /* Increment number of outer iterations */
922 /* Update outer/inner flops */
924 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*83);