2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse2_double.h"
48 #include "kernelutil_x86_sse2_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_double
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water4-Particle
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse2_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
91 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
92 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
93 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
94 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
97 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
100 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
101 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
103 __m128i ifour = _mm_set1_epi32(4);
104 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
106 __m128d dummy_mask,cutoff_mask;
107 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
108 __m128d one = _mm_set1_pd(1.0);
109 __m128d two = _mm_set1_pd(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm_set1_pd(fr->epsfac);
122 charge = mdatoms->chargeA;
123 nvdwtype = fr->ntype;
125 vdwtype = mdatoms->typeA;
127 vftab = kernel_data->table_elec_vdw->data;
128 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
130 /* Setup water-specific parameters */
131 inr = nlist->iinr[0];
132 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
133 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
134 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
135 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
137 /* Avoid stupid compiler warnings */
145 /* Start outer loop over neighborlists */
146 for(iidx=0; iidx<nri; iidx++)
148 /* Load shift vector for this list */
149 i_shift_offset = DIM*shiftidx[iidx];
151 /* Load limits for loop over neighbors */
152 j_index_start = jindex[iidx];
153 j_index_end = jindex[iidx+1];
155 /* Get outer coordinate index */
157 i_coord_offset = DIM*inr;
159 /* Load i particle coords and add shift vector */
160 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
161 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
163 fix0 = _mm_setzero_pd();
164 fiy0 = _mm_setzero_pd();
165 fiz0 = _mm_setzero_pd();
166 fix1 = _mm_setzero_pd();
167 fiy1 = _mm_setzero_pd();
168 fiz1 = _mm_setzero_pd();
169 fix2 = _mm_setzero_pd();
170 fiy2 = _mm_setzero_pd();
171 fiz2 = _mm_setzero_pd();
172 fix3 = _mm_setzero_pd();
173 fiy3 = _mm_setzero_pd();
174 fiz3 = _mm_setzero_pd();
176 /* Reset potential sums */
177 velecsum = _mm_setzero_pd();
178 vvdwsum = _mm_setzero_pd();
180 /* Start inner kernel loop */
181 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
184 /* Get j neighbor index, and coordinate index */
187 j_coord_offsetA = DIM*jnrA;
188 j_coord_offsetB = DIM*jnrB;
190 /* load j atom coordinates */
191 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
194 /* Calculate displacement vector */
195 dx00 = _mm_sub_pd(ix0,jx0);
196 dy00 = _mm_sub_pd(iy0,jy0);
197 dz00 = _mm_sub_pd(iz0,jz0);
198 dx10 = _mm_sub_pd(ix1,jx0);
199 dy10 = _mm_sub_pd(iy1,jy0);
200 dz10 = _mm_sub_pd(iz1,jz0);
201 dx20 = _mm_sub_pd(ix2,jx0);
202 dy20 = _mm_sub_pd(iy2,jy0);
203 dz20 = _mm_sub_pd(iz2,jz0);
204 dx30 = _mm_sub_pd(ix3,jx0);
205 dy30 = _mm_sub_pd(iy3,jy0);
206 dz30 = _mm_sub_pd(iz3,jz0);
208 /* Calculate squared distance and things based on it */
209 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
210 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
211 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
212 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
214 rinv00 = gmx_mm_invsqrt_pd(rsq00);
215 rinv10 = gmx_mm_invsqrt_pd(rsq10);
216 rinv20 = gmx_mm_invsqrt_pd(rsq20);
217 rinv30 = gmx_mm_invsqrt_pd(rsq30);
219 /* Load parameters for j particles */
220 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
221 vdwjidx0A = 2*vdwtype[jnrA+0];
222 vdwjidx0B = 2*vdwtype[jnrB+0];
224 fjx0 = _mm_setzero_pd();
225 fjy0 = _mm_setzero_pd();
226 fjz0 = _mm_setzero_pd();
228 /**************************
229 * CALCULATE INTERACTIONS *
230 **************************/
232 r00 = _mm_mul_pd(rsq00,rinv00);
234 /* Compute parameters for interactions between i and j atoms */
235 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
236 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
238 /* Calculate table index by multiplying r with table scale and truncate to integer */
239 rt = _mm_mul_pd(r00,vftabscale);
240 vfitab = _mm_cvttpd_epi32(rt);
241 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
242 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
244 /* CUBIC SPLINE TABLE DISPERSION */
245 vfitab = _mm_add_epi32(vfitab,ifour);
246 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
247 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
248 GMX_MM_TRANSPOSE2_PD(Y,F);
249 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
250 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
251 GMX_MM_TRANSPOSE2_PD(G,H);
252 Heps = _mm_mul_pd(vfeps,H);
253 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
254 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
255 vvdw6 = _mm_mul_pd(c6_00,VV);
256 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
257 fvdw6 = _mm_mul_pd(c6_00,FF);
259 /* CUBIC SPLINE TABLE REPULSION */
260 vfitab = _mm_add_epi32(vfitab,ifour);
261 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
262 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
263 GMX_MM_TRANSPOSE2_PD(Y,F);
264 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
265 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
266 GMX_MM_TRANSPOSE2_PD(G,H);
267 Heps = _mm_mul_pd(vfeps,H);
268 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
269 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
270 vvdw12 = _mm_mul_pd(c12_00,VV);
271 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
272 fvdw12 = _mm_mul_pd(c12_00,FF);
273 vvdw = _mm_add_pd(vvdw12,vvdw6);
274 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
276 /* Update potential sum for this i atom from the interaction with this j atom. */
277 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
281 /* Calculate temporary vectorial force */
282 tx = _mm_mul_pd(fscal,dx00);
283 ty = _mm_mul_pd(fscal,dy00);
284 tz = _mm_mul_pd(fscal,dz00);
286 /* Update vectorial force */
287 fix0 = _mm_add_pd(fix0,tx);
288 fiy0 = _mm_add_pd(fiy0,ty);
289 fiz0 = _mm_add_pd(fiz0,tz);
291 fjx0 = _mm_add_pd(fjx0,tx);
292 fjy0 = _mm_add_pd(fjy0,ty);
293 fjz0 = _mm_add_pd(fjz0,tz);
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 r10 = _mm_mul_pd(rsq10,rinv10);
301 /* Compute parameters for interactions between i and j atoms */
302 qq10 = _mm_mul_pd(iq1,jq0);
304 /* Calculate table index by multiplying r with table scale and truncate to integer */
305 rt = _mm_mul_pd(r10,vftabscale);
306 vfitab = _mm_cvttpd_epi32(rt);
307 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
308 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
310 /* CUBIC SPLINE TABLE ELECTROSTATICS */
311 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
312 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
313 GMX_MM_TRANSPOSE2_PD(Y,F);
314 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
315 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
316 GMX_MM_TRANSPOSE2_PD(G,H);
317 Heps = _mm_mul_pd(vfeps,H);
318 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
319 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
320 velec = _mm_mul_pd(qq10,VV);
321 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
322 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
324 /* Update potential sum for this i atom from the interaction with this j atom. */
325 velecsum = _mm_add_pd(velecsum,velec);
329 /* Calculate temporary vectorial force */
330 tx = _mm_mul_pd(fscal,dx10);
331 ty = _mm_mul_pd(fscal,dy10);
332 tz = _mm_mul_pd(fscal,dz10);
334 /* Update vectorial force */
335 fix1 = _mm_add_pd(fix1,tx);
336 fiy1 = _mm_add_pd(fiy1,ty);
337 fiz1 = _mm_add_pd(fiz1,tz);
339 fjx0 = _mm_add_pd(fjx0,tx);
340 fjy0 = _mm_add_pd(fjy0,ty);
341 fjz0 = _mm_add_pd(fjz0,tz);
343 /**************************
344 * CALCULATE INTERACTIONS *
345 **************************/
347 r20 = _mm_mul_pd(rsq20,rinv20);
349 /* Compute parameters for interactions between i and j atoms */
350 qq20 = _mm_mul_pd(iq2,jq0);
352 /* Calculate table index by multiplying r with table scale and truncate to integer */
353 rt = _mm_mul_pd(r20,vftabscale);
354 vfitab = _mm_cvttpd_epi32(rt);
355 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
356 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
358 /* CUBIC SPLINE TABLE ELECTROSTATICS */
359 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
360 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
361 GMX_MM_TRANSPOSE2_PD(Y,F);
362 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
363 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
364 GMX_MM_TRANSPOSE2_PD(G,H);
365 Heps = _mm_mul_pd(vfeps,H);
366 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
367 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
368 velec = _mm_mul_pd(qq20,VV);
369 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
370 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
372 /* Update potential sum for this i atom from the interaction with this j atom. */
373 velecsum = _mm_add_pd(velecsum,velec);
377 /* Calculate temporary vectorial force */
378 tx = _mm_mul_pd(fscal,dx20);
379 ty = _mm_mul_pd(fscal,dy20);
380 tz = _mm_mul_pd(fscal,dz20);
382 /* Update vectorial force */
383 fix2 = _mm_add_pd(fix2,tx);
384 fiy2 = _mm_add_pd(fiy2,ty);
385 fiz2 = _mm_add_pd(fiz2,tz);
387 fjx0 = _mm_add_pd(fjx0,tx);
388 fjy0 = _mm_add_pd(fjy0,ty);
389 fjz0 = _mm_add_pd(fjz0,tz);
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
395 r30 = _mm_mul_pd(rsq30,rinv30);
397 /* Compute parameters for interactions between i and j atoms */
398 qq30 = _mm_mul_pd(iq3,jq0);
400 /* Calculate table index by multiplying r with table scale and truncate to integer */
401 rt = _mm_mul_pd(r30,vftabscale);
402 vfitab = _mm_cvttpd_epi32(rt);
403 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
404 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
406 /* CUBIC SPLINE TABLE ELECTROSTATICS */
407 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
408 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
409 GMX_MM_TRANSPOSE2_PD(Y,F);
410 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
411 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
412 GMX_MM_TRANSPOSE2_PD(G,H);
413 Heps = _mm_mul_pd(vfeps,H);
414 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
415 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
416 velec = _mm_mul_pd(qq30,VV);
417 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
418 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
420 /* Update potential sum for this i atom from the interaction with this j atom. */
421 velecsum = _mm_add_pd(velecsum,velec);
425 /* Calculate temporary vectorial force */
426 tx = _mm_mul_pd(fscal,dx30);
427 ty = _mm_mul_pd(fscal,dy30);
428 tz = _mm_mul_pd(fscal,dz30);
430 /* Update vectorial force */
431 fix3 = _mm_add_pd(fix3,tx);
432 fiy3 = _mm_add_pd(fiy3,ty);
433 fiz3 = _mm_add_pd(fiz3,tz);
435 fjx0 = _mm_add_pd(fjx0,tx);
436 fjy0 = _mm_add_pd(fjy0,ty);
437 fjz0 = _mm_add_pd(fjz0,tz);
439 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
441 /* Inner loop uses 188 flops */
448 j_coord_offsetA = DIM*jnrA;
450 /* load j atom coordinates */
451 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
454 /* Calculate displacement vector */
455 dx00 = _mm_sub_pd(ix0,jx0);
456 dy00 = _mm_sub_pd(iy0,jy0);
457 dz00 = _mm_sub_pd(iz0,jz0);
458 dx10 = _mm_sub_pd(ix1,jx0);
459 dy10 = _mm_sub_pd(iy1,jy0);
460 dz10 = _mm_sub_pd(iz1,jz0);
461 dx20 = _mm_sub_pd(ix2,jx0);
462 dy20 = _mm_sub_pd(iy2,jy0);
463 dz20 = _mm_sub_pd(iz2,jz0);
464 dx30 = _mm_sub_pd(ix3,jx0);
465 dy30 = _mm_sub_pd(iy3,jy0);
466 dz30 = _mm_sub_pd(iz3,jz0);
468 /* Calculate squared distance and things based on it */
469 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
470 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
471 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
472 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
474 rinv00 = gmx_mm_invsqrt_pd(rsq00);
475 rinv10 = gmx_mm_invsqrt_pd(rsq10);
476 rinv20 = gmx_mm_invsqrt_pd(rsq20);
477 rinv30 = gmx_mm_invsqrt_pd(rsq30);
479 /* Load parameters for j particles */
480 jq0 = _mm_load_sd(charge+jnrA+0);
481 vdwjidx0A = 2*vdwtype[jnrA+0];
483 fjx0 = _mm_setzero_pd();
484 fjy0 = _mm_setzero_pd();
485 fjz0 = _mm_setzero_pd();
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
491 r00 = _mm_mul_pd(rsq00,rinv00);
493 /* Compute parameters for interactions between i and j atoms */
494 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
496 /* Calculate table index by multiplying r with table scale and truncate to integer */
497 rt = _mm_mul_pd(r00,vftabscale);
498 vfitab = _mm_cvttpd_epi32(rt);
499 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
500 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
502 /* CUBIC SPLINE TABLE DISPERSION */
503 vfitab = _mm_add_epi32(vfitab,ifour);
504 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
505 F = _mm_setzero_pd();
506 GMX_MM_TRANSPOSE2_PD(Y,F);
507 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
508 H = _mm_setzero_pd();
509 GMX_MM_TRANSPOSE2_PD(G,H);
510 Heps = _mm_mul_pd(vfeps,H);
511 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
512 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
513 vvdw6 = _mm_mul_pd(c6_00,VV);
514 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
515 fvdw6 = _mm_mul_pd(c6_00,FF);
517 /* CUBIC SPLINE TABLE REPULSION */
518 vfitab = _mm_add_epi32(vfitab,ifour);
519 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
520 F = _mm_setzero_pd();
521 GMX_MM_TRANSPOSE2_PD(Y,F);
522 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
523 H = _mm_setzero_pd();
524 GMX_MM_TRANSPOSE2_PD(G,H);
525 Heps = _mm_mul_pd(vfeps,H);
526 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
527 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
528 vvdw12 = _mm_mul_pd(c12_00,VV);
529 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
530 fvdw12 = _mm_mul_pd(c12_00,FF);
531 vvdw = _mm_add_pd(vvdw12,vvdw6);
532 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
536 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
540 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
542 /* Calculate temporary vectorial force */
543 tx = _mm_mul_pd(fscal,dx00);
544 ty = _mm_mul_pd(fscal,dy00);
545 tz = _mm_mul_pd(fscal,dz00);
547 /* Update vectorial force */
548 fix0 = _mm_add_pd(fix0,tx);
549 fiy0 = _mm_add_pd(fiy0,ty);
550 fiz0 = _mm_add_pd(fiz0,tz);
552 fjx0 = _mm_add_pd(fjx0,tx);
553 fjy0 = _mm_add_pd(fjy0,ty);
554 fjz0 = _mm_add_pd(fjz0,tz);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 r10 = _mm_mul_pd(rsq10,rinv10);
562 /* Compute parameters for interactions between i and j atoms */
563 qq10 = _mm_mul_pd(iq1,jq0);
565 /* Calculate table index by multiplying r with table scale and truncate to integer */
566 rt = _mm_mul_pd(r10,vftabscale);
567 vfitab = _mm_cvttpd_epi32(rt);
568 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
569 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
571 /* CUBIC SPLINE TABLE ELECTROSTATICS */
572 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
573 F = _mm_setzero_pd();
574 GMX_MM_TRANSPOSE2_PD(Y,F);
575 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
576 H = _mm_setzero_pd();
577 GMX_MM_TRANSPOSE2_PD(G,H);
578 Heps = _mm_mul_pd(vfeps,H);
579 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
580 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
581 velec = _mm_mul_pd(qq10,VV);
582 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
583 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
585 /* Update potential sum for this i atom from the interaction with this j atom. */
586 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
587 velecsum = _mm_add_pd(velecsum,velec);
591 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
593 /* Calculate temporary vectorial force */
594 tx = _mm_mul_pd(fscal,dx10);
595 ty = _mm_mul_pd(fscal,dy10);
596 tz = _mm_mul_pd(fscal,dz10);
598 /* Update vectorial force */
599 fix1 = _mm_add_pd(fix1,tx);
600 fiy1 = _mm_add_pd(fiy1,ty);
601 fiz1 = _mm_add_pd(fiz1,tz);
603 fjx0 = _mm_add_pd(fjx0,tx);
604 fjy0 = _mm_add_pd(fjy0,ty);
605 fjz0 = _mm_add_pd(fjz0,tz);
607 /**************************
608 * CALCULATE INTERACTIONS *
609 **************************/
611 r20 = _mm_mul_pd(rsq20,rinv20);
613 /* Compute parameters for interactions between i and j atoms */
614 qq20 = _mm_mul_pd(iq2,jq0);
616 /* Calculate table index by multiplying r with table scale and truncate to integer */
617 rt = _mm_mul_pd(r20,vftabscale);
618 vfitab = _mm_cvttpd_epi32(rt);
619 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
620 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
622 /* CUBIC SPLINE TABLE ELECTROSTATICS */
623 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
624 F = _mm_setzero_pd();
625 GMX_MM_TRANSPOSE2_PD(Y,F);
626 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
627 H = _mm_setzero_pd();
628 GMX_MM_TRANSPOSE2_PD(G,H);
629 Heps = _mm_mul_pd(vfeps,H);
630 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
631 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
632 velec = _mm_mul_pd(qq20,VV);
633 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
634 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
636 /* Update potential sum for this i atom from the interaction with this j atom. */
637 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
638 velecsum = _mm_add_pd(velecsum,velec);
642 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
644 /* Calculate temporary vectorial force */
645 tx = _mm_mul_pd(fscal,dx20);
646 ty = _mm_mul_pd(fscal,dy20);
647 tz = _mm_mul_pd(fscal,dz20);
649 /* Update vectorial force */
650 fix2 = _mm_add_pd(fix2,tx);
651 fiy2 = _mm_add_pd(fiy2,ty);
652 fiz2 = _mm_add_pd(fiz2,tz);
654 fjx0 = _mm_add_pd(fjx0,tx);
655 fjy0 = _mm_add_pd(fjy0,ty);
656 fjz0 = _mm_add_pd(fjz0,tz);
658 /**************************
659 * CALCULATE INTERACTIONS *
660 **************************/
662 r30 = _mm_mul_pd(rsq30,rinv30);
664 /* Compute parameters for interactions between i and j atoms */
665 qq30 = _mm_mul_pd(iq3,jq0);
667 /* Calculate table index by multiplying r with table scale and truncate to integer */
668 rt = _mm_mul_pd(r30,vftabscale);
669 vfitab = _mm_cvttpd_epi32(rt);
670 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
671 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
673 /* CUBIC SPLINE TABLE ELECTROSTATICS */
674 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
675 F = _mm_setzero_pd();
676 GMX_MM_TRANSPOSE2_PD(Y,F);
677 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
678 H = _mm_setzero_pd();
679 GMX_MM_TRANSPOSE2_PD(G,H);
680 Heps = _mm_mul_pd(vfeps,H);
681 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
682 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
683 velec = _mm_mul_pd(qq30,VV);
684 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
685 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
687 /* Update potential sum for this i atom from the interaction with this j atom. */
688 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
689 velecsum = _mm_add_pd(velecsum,velec);
693 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
695 /* Calculate temporary vectorial force */
696 tx = _mm_mul_pd(fscal,dx30);
697 ty = _mm_mul_pd(fscal,dy30);
698 tz = _mm_mul_pd(fscal,dz30);
700 /* Update vectorial force */
701 fix3 = _mm_add_pd(fix3,tx);
702 fiy3 = _mm_add_pd(fiy3,ty);
703 fiz3 = _mm_add_pd(fiz3,tz);
705 fjx0 = _mm_add_pd(fjx0,tx);
706 fjy0 = _mm_add_pd(fjy0,ty);
707 fjz0 = _mm_add_pd(fjz0,tz);
709 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
711 /* Inner loop uses 188 flops */
714 /* End of innermost loop */
716 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
717 f+i_coord_offset,fshift+i_shift_offset);
720 /* Update potential energies */
721 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
722 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
724 /* Increment number of inner iterations */
725 inneriter += j_index_end - j_index_start;
727 /* Outer loop uses 26 flops */
730 /* Increment number of outer iterations */
733 /* Update outer/inner flops */
735 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*188);
738 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_double
739 * Electrostatics interaction: CubicSplineTable
740 * VdW interaction: CubicSplineTable
741 * Geometry: Water4-Particle
742 * Calculate force/pot: Force
745 nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_double
746 (t_nblist * gmx_restrict nlist,
747 rvec * gmx_restrict xx,
748 rvec * gmx_restrict ff,
749 t_forcerec * gmx_restrict fr,
750 t_mdatoms * gmx_restrict mdatoms,
751 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
752 t_nrnb * gmx_restrict nrnb)
754 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
755 * just 0 for non-waters.
756 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
757 * jnr indices corresponding to data put in the four positions in the SIMD register.
759 int i_shift_offset,i_coord_offset,outeriter,inneriter;
760 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
762 int j_coord_offsetA,j_coord_offsetB;
763 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
765 real *shiftvec,*fshift,*x,*f;
766 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
768 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
770 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
772 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
774 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
775 int vdwjidx0A,vdwjidx0B;
776 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
777 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
778 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
779 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
780 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
781 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
784 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
787 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
788 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
790 __m128i ifour = _mm_set1_epi32(4);
791 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
793 __m128d dummy_mask,cutoff_mask;
794 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
795 __m128d one = _mm_set1_pd(1.0);
796 __m128d two = _mm_set1_pd(2.0);
802 jindex = nlist->jindex;
804 shiftidx = nlist->shift;
806 shiftvec = fr->shift_vec[0];
807 fshift = fr->fshift[0];
808 facel = _mm_set1_pd(fr->epsfac);
809 charge = mdatoms->chargeA;
810 nvdwtype = fr->ntype;
812 vdwtype = mdatoms->typeA;
814 vftab = kernel_data->table_elec_vdw->data;
815 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
817 /* Setup water-specific parameters */
818 inr = nlist->iinr[0];
819 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
820 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
821 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
822 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
824 /* Avoid stupid compiler warnings */
832 /* Start outer loop over neighborlists */
833 for(iidx=0; iidx<nri; iidx++)
835 /* Load shift vector for this list */
836 i_shift_offset = DIM*shiftidx[iidx];
838 /* Load limits for loop over neighbors */
839 j_index_start = jindex[iidx];
840 j_index_end = jindex[iidx+1];
842 /* Get outer coordinate index */
844 i_coord_offset = DIM*inr;
846 /* Load i particle coords and add shift vector */
847 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
848 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
850 fix0 = _mm_setzero_pd();
851 fiy0 = _mm_setzero_pd();
852 fiz0 = _mm_setzero_pd();
853 fix1 = _mm_setzero_pd();
854 fiy1 = _mm_setzero_pd();
855 fiz1 = _mm_setzero_pd();
856 fix2 = _mm_setzero_pd();
857 fiy2 = _mm_setzero_pd();
858 fiz2 = _mm_setzero_pd();
859 fix3 = _mm_setzero_pd();
860 fiy3 = _mm_setzero_pd();
861 fiz3 = _mm_setzero_pd();
863 /* Start inner kernel loop */
864 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
867 /* Get j neighbor index, and coordinate index */
870 j_coord_offsetA = DIM*jnrA;
871 j_coord_offsetB = DIM*jnrB;
873 /* load j atom coordinates */
874 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
877 /* Calculate displacement vector */
878 dx00 = _mm_sub_pd(ix0,jx0);
879 dy00 = _mm_sub_pd(iy0,jy0);
880 dz00 = _mm_sub_pd(iz0,jz0);
881 dx10 = _mm_sub_pd(ix1,jx0);
882 dy10 = _mm_sub_pd(iy1,jy0);
883 dz10 = _mm_sub_pd(iz1,jz0);
884 dx20 = _mm_sub_pd(ix2,jx0);
885 dy20 = _mm_sub_pd(iy2,jy0);
886 dz20 = _mm_sub_pd(iz2,jz0);
887 dx30 = _mm_sub_pd(ix3,jx0);
888 dy30 = _mm_sub_pd(iy3,jy0);
889 dz30 = _mm_sub_pd(iz3,jz0);
891 /* Calculate squared distance and things based on it */
892 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
893 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
894 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
895 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
897 rinv00 = gmx_mm_invsqrt_pd(rsq00);
898 rinv10 = gmx_mm_invsqrt_pd(rsq10);
899 rinv20 = gmx_mm_invsqrt_pd(rsq20);
900 rinv30 = gmx_mm_invsqrt_pd(rsq30);
902 /* Load parameters for j particles */
903 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
904 vdwjidx0A = 2*vdwtype[jnrA+0];
905 vdwjidx0B = 2*vdwtype[jnrB+0];
907 fjx0 = _mm_setzero_pd();
908 fjy0 = _mm_setzero_pd();
909 fjz0 = _mm_setzero_pd();
911 /**************************
912 * CALCULATE INTERACTIONS *
913 **************************/
915 r00 = _mm_mul_pd(rsq00,rinv00);
917 /* Compute parameters for interactions between i and j atoms */
918 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
919 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
921 /* Calculate table index by multiplying r with table scale and truncate to integer */
922 rt = _mm_mul_pd(r00,vftabscale);
923 vfitab = _mm_cvttpd_epi32(rt);
924 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
925 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
927 /* CUBIC SPLINE TABLE DISPERSION */
928 vfitab = _mm_add_epi32(vfitab,ifour);
929 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
930 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
931 GMX_MM_TRANSPOSE2_PD(Y,F);
932 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
933 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
934 GMX_MM_TRANSPOSE2_PD(G,H);
935 Heps = _mm_mul_pd(vfeps,H);
936 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
937 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
938 fvdw6 = _mm_mul_pd(c6_00,FF);
940 /* CUBIC SPLINE TABLE REPULSION */
941 vfitab = _mm_add_epi32(vfitab,ifour);
942 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
943 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
944 GMX_MM_TRANSPOSE2_PD(Y,F);
945 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
946 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
947 GMX_MM_TRANSPOSE2_PD(G,H);
948 Heps = _mm_mul_pd(vfeps,H);
949 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
950 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
951 fvdw12 = _mm_mul_pd(c12_00,FF);
952 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
956 /* Calculate temporary vectorial force */
957 tx = _mm_mul_pd(fscal,dx00);
958 ty = _mm_mul_pd(fscal,dy00);
959 tz = _mm_mul_pd(fscal,dz00);
961 /* Update vectorial force */
962 fix0 = _mm_add_pd(fix0,tx);
963 fiy0 = _mm_add_pd(fiy0,ty);
964 fiz0 = _mm_add_pd(fiz0,tz);
966 fjx0 = _mm_add_pd(fjx0,tx);
967 fjy0 = _mm_add_pd(fjy0,ty);
968 fjz0 = _mm_add_pd(fjz0,tz);
970 /**************************
971 * CALCULATE INTERACTIONS *
972 **************************/
974 r10 = _mm_mul_pd(rsq10,rinv10);
976 /* Compute parameters for interactions between i and j atoms */
977 qq10 = _mm_mul_pd(iq1,jq0);
979 /* Calculate table index by multiplying r with table scale and truncate to integer */
980 rt = _mm_mul_pd(r10,vftabscale);
981 vfitab = _mm_cvttpd_epi32(rt);
982 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
983 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
985 /* CUBIC SPLINE TABLE ELECTROSTATICS */
986 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
987 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
988 GMX_MM_TRANSPOSE2_PD(Y,F);
989 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
990 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
991 GMX_MM_TRANSPOSE2_PD(G,H);
992 Heps = _mm_mul_pd(vfeps,H);
993 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
994 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
995 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
999 /* Calculate temporary vectorial force */
1000 tx = _mm_mul_pd(fscal,dx10);
1001 ty = _mm_mul_pd(fscal,dy10);
1002 tz = _mm_mul_pd(fscal,dz10);
1004 /* Update vectorial force */
1005 fix1 = _mm_add_pd(fix1,tx);
1006 fiy1 = _mm_add_pd(fiy1,ty);
1007 fiz1 = _mm_add_pd(fiz1,tz);
1009 fjx0 = _mm_add_pd(fjx0,tx);
1010 fjy0 = _mm_add_pd(fjy0,ty);
1011 fjz0 = _mm_add_pd(fjz0,tz);
1013 /**************************
1014 * CALCULATE INTERACTIONS *
1015 **************************/
1017 r20 = _mm_mul_pd(rsq20,rinv20);
1019 /* Compute parameters for interactions between i and j atoms */
1020 qq20 = _mm_mul_pd(iq2,jq0);
1022 /* Calculate table index by multiplying r with table scale and truncate to integer */
1023 rt = _mm_mul_pd(r20,vftabscale);
1024 vfitab = _mm_cvttpd_epi32(rt);
1025 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1026 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1028 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1029 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1030 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1031 GMX_MM_TRANSPOSE2_PD(Y,F);
1032 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1033 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1034 GMX_MM_TRANSPOSE2_PD(G,H);
1035 Heps = _mm_mul_pd(vfeps,H);
1036 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1037 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1038 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1042 /* Calculate temporary vectorial force */
1043 tx = _mm_mul_pd(fscal,dx20);
1044 ty = _mm_mul_pd(fscal,dy20);
1045 tz = _mm_mul_pd(fscal,dz20);
1047 /* Update vectorial force */
1048 fix2 = _mm_add_pd(fix2,tx);
1049 fiy2 = _mm_add_pd(fiy2,ty);
1050 fiz2 = _mm_add_pd(fiz2,tz);
1052 fjx0 = _mm_add_pd(fjx0,tx);
1053 fjy0 = _mm_add_pd(fjy0,ty);
1054 fjz0 = _mm_add_pd(fjz0,tz);
1056 /**************************
1057 * CALCULATE INTERACTIONS *
1058 **************************/
1060 r30 = _mm_mul_pd(rsq30,rinv30);
1062 /* Compute parameters for interactions between i and j atoms */
1063 qq30 = _mm_mul_pd(iq3,jq0);
1065 /* Calculate table index by multiplying r with table scale and truncate to integer */
1066 rt = _mm_mul_pd(r30,vftabscale);
1067 vfitab = _mm_cvttpd_epi32(rt);
1068 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1069 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1071 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1072 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1073 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1074 GMX_MM_TRANSPOSE2_PD(Y,F);
1075 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1076 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1077 GMX_MM_TRANSPOSE2_PD(G,H);
1078 Heps = _mm_mul_pd(vfeps,H);
1079 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1080 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1081 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
1085 /* Calculate temporary vectorial force */
1086 tx = _mm_mul_pd(fscal,dx30);
1087 ty = _mm_mul_pd(fscal,dy30);
1088 tz = _mm_mul_pd(fscal,dz30);
1090 /* Update vectorial force */
1091 fix3 = _mm_add_pd(fix3,tx);
1092 fiy3 = _mm_add_pd(fiy3,ty);
1093 fiz3 = _mm_add_pd(fiz3,tz);
1095 fjx0 = _mm_add_pd(fjx0,tx);
1096 fjy0 = _mm_add_pd(fjy0,ty);
1097 fjz0 = _mm_add_pd(fjz0,tz);
1099 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1101 /* Inner loop uses 168 flops */
1104 if(jidx<j_index_end)
1108 j_coord_offsetA = DIM*jnrA;
1110 /* load j atom coordinates */
1111 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1114 /* Calculate displacement vector */
1115 dx00 = _mm_sub_pd(ix0,jx0);
1116 dy00 = _mm_sub_pd(iy0,jy0);
1117 dz00 = _mm_sub_pd(iz0,jz0);
1118 dx10 = _mm_sub_pd(ix1,jx0);
1119 dy10 = _mm_sub_pd(iy1,jy0);
1120 dz10 = _mm_sub_pd(iz1,jz0);
1121 dx20 = _mm_sub_pd(ix2,jx0);
1122 dy20 = _mm_sub_pd(iy2,jy0);
1123 dz20 = _mm_sub_pd(iz2,jz0);
1124 dx30 = _mm_sub_pd(ix3,jx0);
1125 dy30 = _mm_sub_pd(iy3,jy0);
1126 dz30 = _mm_sub_pd(iz3,jz0);
1128 /* Calculate squared distance and things based on it */
1129 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1130 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1131 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1132 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1134 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1135 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1136 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1137 rinv30 = gmx_mm_invsqrt_pd(rsq30);
1139 /* Load parameters for j particles */
1140 jq0 = _mm_load_sd(charge+jnrA+0);
1141 vdwjidx0A = 2*vdwtype[jnrA+0];
1143 fjx0 = _mm_setzero_pd();
1144 fjy0 = _mm_setzero_pd();
1145 fjz0 = _mm_setzero_pd();
1147 /**************************
1148 * CALCULATE INTERACTIONS *
1149 **************************/
1151 r00 = _mm_mul_pd(rsq00,rinv00);
1153 /* Compute parameters for interactions between i and j atoms */
1154 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1156 /* Calculate table index by multiplying r with table scale and truncate to integer */
1157 rt = _mm_mul_pd(r00,vftabscale);
1158 vfitab = _mm_cvttpd_epi32(rt);
1159 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1160 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1162 /* CUBIC SPLINE TABLE DISPERSION */
1163 vfitab = _mm_add_epi32(vfitab,ifour);
1164 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1165 F = _mm_setzero_pd();
1166 GMX_MM_TRANSPOSE2_PD(Y,F);
1167 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1168 H = _mm_setzero_pd();
1169 GMX_MM_TRANSPOSE2_PD(G,H);
1170 Heps = _mm_mul_pd(vfeps,H);
1171 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1172 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1173 fvdw6 = _mm_mul_pd(c6_00,FF);
1175 /* CUBIC SPLINE TABLE REPULSION */
1176 vfitab = _mm_add_epi32(vfitab,ifour);
1177 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1178 F = _mm_setzero_pd();
1179 GMX_MM_TRANSPOSE2_PD(Y,F);
1180 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1181 H = _mm_setzero_pd();
1182 GMX_MM_TRANSPOSE2_PD(G,H);
1183 Heps = _mm_mul_pd(vfeps,H);
1184 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1185 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1186 fvdw12 = _mm_mul_pd(c12_00,FF);
1187 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1191 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1193 /* Calculate temporary vectorial force */
1194 tx = _mm_mul_pd(fscal,dx00);
1195 ty = _mm_mul_pd(fscal,dy00);
1196 tz = _mm_mul_pd(fscal,dz00);
1198 /* Update vectorial force */
1199 fix0 = _mm_add_pd(fix0,tx);
1200 fiy0 = _mm_add_pd(fiy0,ty);
1201 fiz0 = _mm_add_pd(fiz0,tz);
1203 fjx0 = _mm_add_pd(fjx0,tx);
1204 fjy0 = _mm_add_pd(fjy0,ty);
1205 fjz0 = _mm_add_pd(fjz0,tz);
1207 /**************************
1208 * CALCULATE INTERACTIONS *
1209 **************************/
1211 r10 = _mm_mul_pd(rsq10,rinv10);
1213 /* Compute parameters for interactions between i and j atoms */
1214 qq10 = _mm_mul_pd(iq1,jq0);
1216 /* Calculate table index by multiplying r with table scale and truncate to integer */
1217 rt = _mm_mul_pd(r10,vftabscale);
1218 vfitab = _mm_cvttpd_epi32(rt);
1219 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1220 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1222 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1223 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1224 F = _mm_setzero_pd();
1225 GMX_MM_TRANSPOSE2_PD(Y,F);
1226 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1227 H = _mm_setzero_pd();
1228 GMX_MM_TRANSPOSE2_PD(G,H);
1229 Heps = _mm_mul_pd(vfeps,H);
1230 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1231 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1232 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1236 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1238 /* Calculate temporary vectorial force */
1239 tx = _mm_mul_pd(fscal,dx10);
1240 ty = _mm_mul_pd(fscal,dy10);
1241 tz = _mm_mul_pd(fscal,dz10);
1243 /* Update vectorial force */
1244 fix1 = _mm_add_pd(fix1,tx);
1245 fiy1 = _mm_add_pd(fiy1,ty);
1246 fiz1 = _mm_add_pd(fiz1,tz);
1248 fjx0 = _mm_add_pd(fjx0,tx);
1249 fjy0 = _mm_add_pd(fjy0,ty);
1250 fjz0 = _mm_add_pd(fjz0,tz);
1252 /**************************
1253 * CALCULATE INTERACTIONS *
1254 **************************/
1256 r20 = _mm_mul_pd(rsq20,rinv20);
1258 /* Compute parameters for interactions between i and j atoms */
1259 qq20 = _mm_mul_pd(iq2,jq0);
1261 /* Calculate table index by multiplying r with table scale and truncate to integer */
1262 rt = _mm_mul_pd(r20,vftabscale);
1263 vfitab = _mm_cvttpd_epi32(rt);
1264 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1265 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1267 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1268 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1269 F = _mm_setzero_pd();
1270 GMX_MM_TRANSPOSE2_PD(Y,F);
1271 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1272 H = _mm_setzero_pd();
1273 GMX_MM_TRANSPOSE2_PD(G,H);
1274 Heps = _mm_mul_pd(vfeps,H);
1275 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1276 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1277 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1281 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1283 /* Calculate temporary vectorial force */
1284 tx = _mm_mul_pd(fscal,dx20);
1285 ty = _mm_mul_pd(fscal,dy20);
1286 tz = _mm_mul_pd(fscal,dz20);
1288 /* Update vectorial force */
1289 fix2 = _mm_add_pd(fix2,tx);
1290 fiy2 = _mm_add_pd(fiy2,ty);
1291 fiz2 = _mm_add_pd(fiz2,tz);
1293 fjx0 = _mm_add_pd(fjx0,tx);
1294 fjy0 = _mm_add_pd(fjy0,ty);
1295 fjz0 = _mm_add_pd(fjz0,tz);
1297 /**************************
1298 * CALCULATE INTERACTIONS *
1299 **************************/
1301 r30 = _mm_mul_pd(rsq30,rinv30);
1303 /* Compute parameters for interactions between i and j atoms */
1304 qq30 = _mm_mul_pd(iq3,jq0);
1306 /* Calculate table index by multiplying r with table scale and truncate to integer */
1307 rt = _mm_mul_pd(r30,vftabscale);
1308 vfitab = _mm_cvttpd_epi32(rt);
1309 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1310 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1312 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1313 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1314 F = _mm_setzero_pd();
1315 GMX_MM_TRANSPOSE2_PD(Y,F);
1316 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1317 H = _mm_setzero_pd();
1318 GMX_MM_TRANSPOSE2_PD(G,H);
1319 Heps = _mm_mul_pd(vfeps,H);
1320 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1321 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1322 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
1326 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1328 /* Calculate temporary vectorial force */
1329 tx = _mm_mul_pd(fscal,dx30);
1330 ty = _mm_mul_pd(fscal,dy30);
1331 tz = _mm_mul_pd(fscal,dz30);
1333 /* Update vectorial force */
1334 fix3 = _mm_add_pd(fix3,tx);
1335 fiy3 = _mm_add_pd(fiy3,ty);
1336 fiz3 = _mm_add_pd(fiz3,tz);
1338 fjx0 = _mm_add_pd(fjx0,tx);
1339 fjy0 = _mm_add_pd(fjy0,ty);
1340 fjz0 = _mm_add_pd(fjz0,tz);
1342 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1344 /* Inner loop uses 168 flops */
1347 /* End of innermost loop */
1349 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1350 f+i_coord_offset,fshift+i_shift_offset);
1352 /* Increment number of inner iterations */
1353 inneriter += j_index_end - j_index_start;
1355 /* Outer loop uses 24 flops */
1358 /* Increment number of outer iterations */
1361 /* Update outer/inner flops */
1363 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*168);