2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LJEwald
54 * Geometry: Water4-Particle
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwLJEw_GeomW4P1_VF_sse2_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
96 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
97 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
100 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
103 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
104 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
109 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
111 __m128 one_half = _mm_set1_ps(0.5);
112 __m128 minus_one = _mm_set1_ps(-1.0);
114 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
116 __m128 dummy_mask,cutoff_mask;
117 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118 __m128 one = _mm_set1_ps(1.0);
119 __m128 two = _mm_set1_ps(2.0);
125 jindex = nlist->jindex;
127 shiftidx = nlist->shift;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm_set1_ps(fr->epsfac);
132 charge = mdatoms->chargeA;
133 nvdwtype = fr->ntype;
135 vdwtype = mdatoms->typeA;
136 vdwgridparam = fr->ljpme_c6grid;
137 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
138 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
139 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
141 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
142 ewtab = fr->ic->tabq_coul_FDV0;
143 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
144 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
146 /* Setup water-specific parameters */
147 inr = nlist->iinr[0];
148 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
149 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
150 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
151 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
153 /* Avoid stupid compiler warnings */
154 jnrA = jnrB = jnrC = jnrD = 0;
163 for(iidx=0;iidx<4*DIM;iidx++)
168 /* Start outer loop over neighborlists */
169 for(iidx=0; iidx<nri; iidx++)
171 /* Load shift vector for this list */
172 i_shift_offset = DIM*shiftidx[iidx];
174 /* Load limits for loop over neighbors */
175 j_index_start = jindex[iidx];
176 j_index_end = jindex[iidx+1];
178 /* Get outer coordinate index */
180 i_coord_offset = DIM*inr;
182 /* Load i particle coords and add shift vector */
183 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
184 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
186 fix0 = _mm_setzero_ps();
187 fiy0 = _mm_setzero_ps();
188 fiz0 = _mm_setzero_ps();
189 fix1 = _mm_setzero_ps();
190 fiy1 = _mm_setzero_ps();
191 fiz1 = _mm_setzero_ps();
192 fix2 = _mm_setzero_ps();
193 fiy2 = _mm_setzero_ps();
194 fiz2 = _mm_setzero_ps();
195 fix3 = _mm_setzero_ps();
196 fiy3 = _mm_setzero_ps();
197 fiz3 = _mm_setzero_ps();
199 /* Reset potential sums */
200 velecsum = _mm_setzero_ps();
201 vvdwsum = _mm_setzero_ps();
203 /* Start inner kernel loop */
204 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
207 /* Get j neighbor index, and coordinate index */
212 j_coord_offsetA = DIM*jnrA;
213 j_coord_offsetB = DIM*jnrB;
214 j_coord_offsetC = DIM*jnrC;
215 j_coord_offsetD = DIM*jnrD;
217 /* load j atom coordinates */
218 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
219 x+j_coord_offsetC,x+j_coord_offsetD,
222 /* Calculate displacement vector */
223 dx00 = _mm_sub_ps(ix0,jx0);
224 dy00 = _mm_sub_ps(iy0,jy0);
225 dz00 = _mm_sub_ps(iz0,jz0);
226 dx10 = _mm_sub_ps(ix1,jx0);
227 dy10 = _mm_sub_ps(iy1,jy0);
228 dz10 = _mm_sub_ps(iz1,jz0);
229 dx20 = _mm_sub_ps(ix2,jx0);
230 dy20 = _mm_sub_ps(iy2,jy0);
231 dz20 = _mm_sub_ps(iz2,jz0);
232 dx30 = _mm_sub_ps(ix3,jx0);
233 dy30 = _mm_sub_ps(iy3,jy0);
234 dz30 = _mm_sub_ps(iz3,jz0);
236 /* Calculate squared distance and things based on it */
237 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
238 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
239 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
240 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
242 rinv00 = gmx_mm_invsqrt_ps(rsq00);
243 rinv10 = gmx_mm_invsqrt_ps(rsq10);
244 rinv20 = gmx_mm_invsqrt_ps(rsq20);
245 rinv30 = gmx_mm_invsqrt_ps(rsq30);
247 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
248 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
249 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
250 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
252 /* Load parameters for j particles */
253 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
254 charge+jnrC+0,charge+jnrD+0);
255 vdwjidx0A = 2*vdwtype[jnrA+0];
256 vdwjidx0B = 2*vdwtype[jnrB+0];
257 vdwjidx0C = 2*vdwtype[jnrC+0];
258 vdwjidx0D = 2*vdwtype[jnrD+0];
260 fjx0 = _mm_setzero_ps();
261 fjy0 = _mm_setzero_ps();
262 fjz0 = _mm_setzero_ps();
264 /**************************
265 * CALCULATE INTERACTIONS *
266 **************************/
268 r00 = _mm_mul_ps(rsq00,rinv00);
270 /* Compute parameters for interactions between i and j atoms */
271 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
272 vdwparam+vdwioffset0+vdwjidx0B,
273 vdwparam+vdwioffset0+vdwjidx0C,
274 vdwparam+vdwioffset0+vdwjidx0D,
276 c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
277 vdwgridparam+vdwioffset0+vdwjidx0B,
278 vdwgridparam+vdwioffset0+vdwjidx0C,
279 vdwgridparam+vdwioffset0+vdwjidx0D);
281 /* Analytical LJ-PME */
282 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
283 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
284 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
285 exponent = gmx_simd_exp_r(ewcljrsq);
286 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
287 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
288 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
289 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
290 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
291 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
292 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
293 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
295 /* Update potential sum for this i atom from the interaction with this j atom. */
296 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
300 /* Calculate temporary vectorial force */
301 tx = _mm_mul_ps(fscal,dx00);
302 ty = _mm_mul_ps(fscal,dy00);
303 tz = _mm_mul_ps(fscal,dz00);
305 /* Update vectorial force */
306 fix0 = _mm_add_ps(fix0,tx);
307 fiy0 = _mm_add_ps(fiy0,ty);
308 fiz0 = _mm_add_ps(fiz0,tz);
310 fjx0 = _mm_add_ps(fjx0,tx);
311 fjy0 = _mm_add_ps(fjy0,ty);
312 fjz0 = _mm_add_ps(fjz0,tz);
314 /**************************
315 * CALCULATE INTERACTIONS *
316 **************************/
318 r10 = _mm_mul_ps(rsq10,rinv10);
320 /* Compute parameters for interactions between i and j atoms */
321 qq10 = _mm_mul_ps(iq1,jq0);
323 /* EWALD ELECTROSTATICS */
325 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
326 ewrt = _mm_mul_ps(r10,ewtabscale);
327 ewitab = _mm_cvttps_epi32(ewrt);
328 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
329 ewitab = _mm_slli_epi32(ewitab,2);
330 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
331 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
332 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
333 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
334 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
335 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
336 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
337 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
338 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
340 /* Update potential sum for this i atom from the interaction with this j atom. */
341 velecsum = _mm_add_ps(velecsum,velec);
345 /* Calculate temporary vectorial force */
346 tx = _mm_mul_ps(fscal,dx10);
347 ty = _mm_mul_ps(fscal,dy10);
348 tz = _mm_mul_ps(fscal,dz10);
350 /* Update vectorial force */
351 fix1 = _mm_add_ps(fix1,tx);
352 fiy1 = _mm_add_ps(fiy1,ty);
353 fiz1 = _mm_add_ps(fiz1,tz);
355 fjx0 = _mm_add_ps(fjx0,tx);
356 fjy0 = _mm_add_ps(fjy0,ty);
357 fjz0 = _mm_add_ps(fjz0,tz);
359 /**************************
360 * CALCULATE INTERACTIONS *
361 **************************/
363 r20 = _mm_mul_ps(rsq20,rinv20);
365 /* Compute parameters for interactions between i and j atoms */
366 qq20 = _mm_mul_ps(iq2,jq0);
368 /* EWALD ELECTROSTATICS */
370 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
371 ewrt = _mm_mul_ps(r20,ewtabscale);
372 ewitab = _mm_cvttps_epi32(ewrt);
373 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
374 ewitab = _mm_slli_epi32(ewitab,2);
375 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
376 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
377 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
378 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
379 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
380 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
381 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
382 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
383 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
385 /* Update potential sum for this i atom from the interaction with this j atom. */
386 velecsum = _mm_add_ps(velecsum,velec);
390 /* Calculate temporary vectorial force */
391 tx = _mm_mul_ps(fscal,dx20);
392 ty = _mm_mul_ps(fscal,dy20);
393 tz = _mm_mul_ps(fscal,dz20);
395 /* Update vectorial force */
396 fix2 = _mm_add_ps(fix2,tx);
397 fiy2 = _mm_add_ps(fiy2,ty);
398 fiz2 = _mm_add_ps(fiz2,tz);
400 fjx0 = _mm_add_ps(fjx0,tx);
401 fjy0 = _mm_add_ps(fjy0,ty);
402 fjz0 = _mm_add_ps(fjz0,tz);
404 /**************************
405 * CALCULATE INTERACTIONS *
406 **************************/
408 r30 = _mm_mul_ps(rsq30,rinv30);
410 /* Compute parameters for interactions between i and j atoms */
411 qq30 = _mm_mul_ps(iq3,jq0);
413 /* EWALD ELECTROSTATICS */
415 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
416 ewrt = _mm_mul_ps(r30,ewtabscale);
417 ewitab = _mm_cvttps_epi32(ewrt);
418 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
419 ewitab = _mm_slli_epi32(ewitab,2);
420 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
421 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
422 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
423 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
424 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
425 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
426 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
427 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
428 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velecsum = _mm_add_ps(velecsum,velec);
435 /* Calculate temporary vectorial force */
436 tx = _mm_mul_ps(fscal,dx30);
437 ty = _mm_mul_ps(fscal,dy30);
438 tz = _mm_mul_ps(fscal,dz30);
440 /* Update vectorial force */
441 fix3 = _mm_add_ps(fix3,tx);
442 fiy3 = _mm_add_ps(fiy3,ty);
443 fiz3 = _mm_add_ps(fiz3,tz);
445 fjx0 = _mm_add_ps(fjx0,tx);
446 fjy0 = _mm_add_ps(fjy0,ty);
447 fjz0 = _mm_add_ps(fjz0,tz);
449 fjptrA = f+j_coord_offsetA;
450 fjptrB = f+j_coord_offsetB;
451 fjptrC = f+j_coord_offsetC;
452 fjptrD = f+j_coord_offsetD;
454 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
456 /* Inner loop uses 174 flops */
462 /* Get j neighbor index, and coordinate index */
463 jnrlistA = jjnr[jidx];
464 jnrlistB = jjnr[jidx+1];
465 jnrlistC = jjnr[jidx+2];
466 jnrlistD = jjnr[jidx+3];
467 /* Sign of each element will be negative for non-real atoms.
468 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
469 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
471 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
472 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
473 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
474 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
475 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
476 j_coord_offsetA = DIM*jnrA;
477 j_coord_offsetB = DIM*jnrB;
478 j_coord_offsetC = DIM*jnrC;
479 j_coord_offsetD = DIM*jnrD;
481 /* load j atom coordinates */
482 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
483 x+j_coord_offsetC,x+j_coord_offsetD,
486 /* Calculate displacement vector */
487 dx00 = _mm_sub_ps(ix0,jx0);
488 dy00 = _mm_sub_ps(iy0,jy0);
489 dz00 = _mm_sub_ps(iz0,jz0);
490 dx10 = _mm_sub_ps(ix1,jx0);
491 dy10 = _mm_sub_ps(iy1,jy0);
492 dz10 = _mm_sub_ps(iz1,jz0);
493 dx20 = _mm_sub_ps(ix2,jx0);
494 dy20 = _mm_sub_ps(iy2,jy0);
495 dz20 = _mm_sub_ps(iz2,jz0);
496 dx30 = _mm_sub_ps(ix3,jx0);
497 dy30 = _mm_sub_ps(iy3,jy0);
498 dz30 = _mm_sub_ps(iz3,jz0);
500 /* Calculate squared distance and things based on it */
501 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
502 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
503 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
504 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
506 rinv00 = gmx_mm_invsqrt_ps(rsq00);
507 rinv10 = gmx_mm_invsqrt_ps(rsq10);
508 rinv20 = gmx_mm_invsqrt_ps(rsq20);
509 rinv30 = gmx_mm_invsqrt_ps(rsq30);
511 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
512 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
513 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
514 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
516 /* Load parameters for j particles */
517 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
518 charge+jnrC+0,charge+jnrD+0);
519 vdwjidx0A = 2*vdwtype[jnrA+0];
520 vdwjidx0B = 2*vdwtype[jnrB+0];
521 vdwjidx0C = 2*vdwtype[jnrC+0];
522 vdwjidx0D = 2*vdwtype[jnrD+0];
524 fjx0 = _mm_setzero_ps();
525 fjy0 = _mm_setzero_ps();
526 fjz0 = _mm_setzero_ps();
528 /**************************
529 * CALCULATE INTERACTIONS *
530 **************************/
532 r00 = _mm_mul_ps(rsq00,rinv00);
533 r00 = _mm_andnot_ps(dummy_mask,r00);
535 /* Compute parameters for interactions between i and j atoms */
536 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
537 vdwparam+vdwioffset0+vdwjidx0B,
538 vdwparam+vdwioffset0+vdwjidx0C,
539 vdwparam+vdwioffset0+vdwjidx0D,
541 c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
542 vdwgridparam+vdwioffset0+vdwjidx0B,
543 vdwgridparam+vdwioffset0+vdwjidx0C,
544 vdwgridparam+vdwioffset0+vdwjidx0D);
546 /* Analytical LJ-PME */
547 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
548 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
549 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
550 exponent = gmx_simd_exp_r(ewcljrsq);
551 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
552 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
553 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
554 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
555 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
556 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
557 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
558 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
560 /* Update potential sum for this i atom from the interaction with this j atom. */
561 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
562 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
566 fscal = _mm_andnot_ps(dummy_mask,fscal);
568 /* Calculate temporary vectorial force */
569 tx = _mm_mul_ps(fscal,dx00);
570 ty = _mm_mul_ps(fscal,dy00);
571 tz = _mm_mul_ps(fscal,dz00);
573 /* Update vectorial force */
574 fix0 = _mm_add_ps(fix0,tx);
575 fiy0 = _mm_add_ps(fiy0,ty);
576 fiz0 = _mm_add_ps(fiz0,tz);
578 fjx0 = _mm_add_ps(fjx0,tx);
579 fjy0 = _mm_add_ps(fjy0,ty);
580 fjz0 = _mm_add_ps(fjz0,tz);
582 /**************************
583 * CALCULATE INTERACTIONS *
584 **************************/
586 r10 = _mm_mul_ps(rsq10,rinv10);
587 r10 = _mm_andnot_ps(dummy_mask,r10);
589 /* Compute parameters for interactions between i and j atoms */
590 qq10 = _mm_mul_ps(iq1,jq0);
592 /* EWALD ELECTROSTATICS */
594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
595 ewrt = _mm_mul_ps(r10,ewtabscale);
596 ewitab = _mm_cvttps_epi32(ewrt);
597 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
598 ewitab = _mm_slli_epi32(ewitab,2);
599 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
600 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
601 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
602 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
603 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
604 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
605 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
606 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
607 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velec = _mm_andnot_ps(dummy_mask,velec);
611 velecsum = _mm_add_ps(velecsum,velec);
615 fscal = _mm_andnot_ps(dummy_mask,fscal);
617 /* Calculate temporary vectorial force */
618 tx = _mm_mul_ps(fscal,dx10);
619 ty = _mm_mul_ps(fscal,dy10);
620 tz = _mm_mul_ps(fscal,dz10);
622 /* Update vectorial force */
623 fix1 = _mm_add_ps(fix1,tx);
624 fiy1 = _mm_add_ps(fiy1,ty);
625 fiz1 = _mm_add_ps(fiz1,tz);
627 fjx0 = _mm_add_ps(fjx0,tx);
628 fjy0 = _mm_add_ps(fjy0,ty);
629 fjz0 = _mm_add_ps(fjz0,tz);
631 /**************************
632 * CALCULATE INTERACTIONS *
633 **************************/
635 r20 = _mm_mul_ps(rsq20,rinv20);
636 r20 = _mm_andnot_ps(dummy_mask,r20);
638 /* Compute parameters for interactions between i and j atoms */
639 qq20 = _mm_mul_ps(iq2,jq0);
641 /* EWALD ELECTROSTATICS */
643 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
644 ewrt = _mm_mul_ps(r20,ewtabscale);
645 ewitab = _mm_cvttps_epi32(ewrt);
646 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
647 ewitab = _mm_slli_epi32(ewitab,2);
648 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
649 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
650 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
651 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
652 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
653 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
654 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
655 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
656 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
658 /* Update potential sum for this i atom from the interaction with this j atom. */
659 velec = _mm_andnot_ps(dummy_mask,velec);
660 velecsum = _mm_add_ps(velecsum,velec);
664 fscal = _mm_andnot_ps(dummy_mask,fscal);
666 /* Calculate temporary vectorial force */
667 tx = _mm_mul_ps(fscal,dx20);
668 ty = _mm_mul_ps(fscal,dy20);
669 tz = _mm_mul_ps(fscal,dz20);
671 /* Update vectorial force */
672 fix2 = _mm_add_ps(fix2,tx);
673 fiy2 = _mm_add_ps(fiy2,ty);
674 fiz2 = _mm_add_ps(fiz2,tz);
676 fjx0 = _mm_add_ps(fjx0,tx);
677 fjy0 = _mm_add_ps(fjy0,ty);
678 fjz0 = _mm_add_ps(fjz0,tz);
680 /**************************
681 * CALCULATE INTERACTIONS *
682 **************************/
684 r30 = _mm_mul_ps(rsq30,rinv30);
685 r30 = _mm_andnot_ps(dummy_mask,r30);
687 /* Compute parameters for interactions between i and j atoms */
688 qq30 = _mm_mul_ps(iq3,jq0);
690 /* EWALD ELECTROSTATICS */
692 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
693 ewrt = _mm_mul_ps(r30,ewtabscale);
694 ewitab = _mm_cvttps_epi32(ewrt);
695 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
696 ewitab = _mm_slli_epi32(ewitab,2);
697 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
698 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
699 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
700 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
701 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
702 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
703 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
704 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
705 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
707 /* Update potential sum for this i atom from the interaction with this j atom. */
708 velec = _mm_andnot_ps(dummy_mask,velec);
709 velecsum = _mm_add_ps(velecsum,velec);
713 fscal = _mm_andnot_ps(dummy_mask,fscal);
715 /* Calculate temporary vectorial force */
716 tx = _mm_mul_ps(fscal,dx30);
717 ty = _mm_mul_ps(fscal,dy30);
718 tz = _mm_mul_ps(fscal,dz30);
720 /* Update vectorial force */
721 fix3 = _mm_add_ps(fix3,tx);
722 fiy3 = _mm_add_ps(fiy3,ty);
723 fiz3 = _mm_add_ps(fiz3,tz);
725 fjx0 = _mm_add_ps(fjx0,tx);
726 fjy0 = _mm_add_ps(fjy0,ty);
727 fjz0 = _mm_add_ps(fjz0,tz);
729 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
730 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
731 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
732 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
734 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
736 /* Inner loop uses 178 flops */
739 /* End of innermost loop */
741 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
742 f+i_coord_offset,fshift+i_shift_offset);
745 /* Update potential energies */
746 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
747 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
749 /* Increment number of inner iterations */
750 inneriter += j_index_end - j_index_start;
752 /* Outer loop uses 26 flops */
755 /* Increment number of outer iterations */
758 /* Update outer/inner flops */
760 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*178);
763 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_single
764 * Electrostatics interaction: Ewald
765 * VdW interaction: LJEwald
766 * Geometry: Water4-Particle
767 * Calculate force/pot: Force
770 nb_kernel_ElecEw_VdwLJEw_GeomW4P1_F_sse2_single
771 (t_nblist * gmx_restrict nlist,
772 rvec * gmx_restrict xx,
773 rvec * gmx_restrict ff,
774 t_forcerec * gmx_restrict fr,
775 t_mdatoms * gmx_restrict mdatoms,
776 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
777 t_nrnb * gmx_restrict nrnb)
779 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
780 * just 0 for non-waters.
781 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
782 * jnr indices corresponding to data put in the four positions in the SIMD register.
784 int i_shift_offset,i_coord_offset,outeriter,inneriter;
785 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
786 int jnrA,jnrB,jnrC,jnrD;
787 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
788 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
789 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
791 real *shiftvec,*fshift,*x,*f;
792 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
794 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
796 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
798 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
800 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
802 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
803 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
804 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
805 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
806 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
807 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
808 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
809 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
812 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
815 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
816 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
821 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
823 __m128 one_half = _mm_set1_ps(0.5);
824 __m128 minus_one = _mm_set1_ps(-1.0);
826 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
828 __m128 dummy_mask,cutoff_mask;
829 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
830 __m128 one = _mm_set1_ps(1.0);
831 __m128 two = _mm_set1_ps(2.0);
837 jindex = nlist->jindex;
839 shiftidx = nlist->shift;
841 shiftvec = fr->shift_vec[0];
842 fshift = fr->fshift[0];
843 facel = _mm_set1_ps(fr->epsfac);
844 charge = mdatoms->chargeA;
845 nvdwtype = fr->ntype;
847 vdwtype = mdatoms->typeA;
848 vdwgridparam = fr->ljpme_c6grid;
849 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
850 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
851 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
853 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
854 ewtab = fr->ic->tabq_coul_F;
855 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
856 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
858 /* Setup water-specific parameters */
859 inr = nlist->iinr[0];
860 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
861 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
862 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
863 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
865 /* Avoid stupid compiler warnings */
866 jnrA = jnrB = jnrC = jnrD = 0;
875 for(iidx=0;iidx<4*DIM;iidx++)
880 /* Start outer loop over neighborlists */
881 for(iidx=0; iidx<nri; iidx++)
883 /* Load shift vector for this list */
884 i_shift_offset = DIM*shiftidx[iidx];
886 /* Load limits for loop over neighbors */
887 j_index_start = jindex[iidx];
888 j_index_end = jindex[iidx+1];
890 /* Get outer coordinate index */
892 i_coord_offset = DIM*inr;
894 /* Load i particle coords and add shift vector */
895 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
896 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
898 fix0 = _mm_setzero_ps();
899 fiy0 = _mm_setzero_ps();
900 fiz0 = _mm_setzero_ps();
901 fix1 = _mm_setzero_ps();
902 fiy1 = _mm_setzero_ps();
903 fiz1 = _mm_setzero_ps();
904 fix2 = _mm_setzero_ps();
905 fiy2 = _mm_setzero_ps();
906 fiz2 = _mm_setzero_ps();
907 fix3 = _mm_setzero_ps();
908 fiy3 = _mm_setzero_ps();
909 fiz3 = _mm_setzero_ps();
911 /* Start inner kernel loop */
912 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
915 /* Get j neighbor index, and coordinate index */
920 j_coord_offsetA = DIM*jnrA;
921 j_coord_offsetB = DIM*jnrB;
922 j_coord_offsetC = DIM*jnrC;
923 j_coord_offsetD = DIM*jnrD;
925 /* load j atom coordinates */
926 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
927 x+j_coord_offsetC,x+j_coord_offsetD,
930 /* Calculate displacement vector */
931 dx00 = _mm_sub_ps(ix0,jx0);
932 dy00 = _mm_sub_ps(iy0,jy0);
933 dz00 = _mm_sub_ps(iz0,jz0);
934 dx10 = _mm_sub_ps(ix1,jx0);
935 dy10 = _mm_sub_ps(iy1,jy0);
936 dz10 = _mm_sub_ps(iz1,jz0);
937 dx20 = _mm_sub_ps(ix2,jx0);
938 dy20 = _mm_sub_ps(iy2,jy0);
939 dz20 = _mm_sub_ps(iz2,jz0);
940 dx30 = _mm_sub_ps(ix3,jx0);
941 dy30 = _mm_sub_ps(iy3,jy0);
942 dz30 = _mm_sub_ps(iz3,jz0);
944 /* Calculate squared distance and things based on it */
945 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
946 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
947 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
948 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
950 rinv00 = gmx_mm_invsqrt_ps(rsq00);
951 rinv10 = gmx_mm_invsqrt_ps(rsq10);
952 rinv20 = gmx_mm_invsqrt_ps(rsq20);
953 rinv30 = gmx_mm_invsqrt_ps(rsq30);
955 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
956 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
957 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
958 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
960 /* Load parameters for j particles */
961 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
962 charge+jnrC+0,charge+jnrD+0);
963 vdwjidx0A = 2*vdwtype[jnrA+0];
964 vdwjidx0B = 2*vdwtype[jnrB+0];
965 vdwjidx0C = 2*vdwtype[jnrC+0];
966 vdwjidx0D = 2*vdwtype[jnrD+0];
968 fjx0 = _mm_setzero_ps();
969 fjy0 = _mm_setzero_ps();
970 fjz0 = _mm_setzero_ps();
972 /**************************
973 * CALCULATE INTERACTIONS *
974 **************************/
976 r00 = _mm_mul_ps(rsq00,rinv00);
978 /* Compute parameters for interactions between i and j atoms */
979 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
980 vdwparam+vdwioffset0+vdwjidx0B,
981 vdwparam+vdwioffset0+vdwjidx0C,
982 vdwparam+vdwioffset0+vdwjidx0D,
984 c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
985 vdwgridparam+vdwioffset0+vdwjidx0B,
986 vdwgridparam+vdwioffset0+vdwjidx0C,
987 vdwgridparam+vdwioffset0+vdwjidx0D);
989 /* Analytical LJ-PME */
990 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
991 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
992 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
993 exponent = gmx_simd_exp_r(ewcljrsq);
994 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
995 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
996 /* f6A = 6 * C6grid * (1 - poly) */
997 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
998 /* f6B = C6grid * exponent * beta^6 */
999 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1000 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1001 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1005 /* Calculate temporary vectorial force */
1006 tx = _mm_mul_ps(fscal,dx00);
1007 ty = _mm_mul_ps(fscal,dy00);
1008 tz = _mm_mul_ps(fscal,dz00);
1010 /* Update vectorial force */
1011 fix0 = _mm_add_ps(fix0,tx);
1012 fiy0 = _mm_add_ps(fiy0,ty);
1013 fiz0 = _mm_add_ps(fiz0,tz);
1015 fjx0 = _mm_add_ps(fjx0,tx);
1016 fjy0 = _mm_add_ps(fjy0,ty);
1017 fjz0 = _mm_add_ps(fjz0,tz);
1019 /**************************
1020 * CALCULATE INTERACTIONS *
1021 **************************/
1023 r10 = _mm_mul_ps(rsq10,rinv10);
1025 /* Compute parameters for interactions between i and j atoms */
1026 qq10 = _mm_mul_ps(iq1,jq0);
1028 /* EWALD ELECTROSTATICS */
1030 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031 ewrt = _mm_mul_ps(r10,ewtabscale);
1032 ewitab = _mm_cvttps_epi32(ewrt);
1033 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1034 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1035 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1037 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1038 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1042 /* Calculate temporary vectorial force */
1043 tx = _mm_mul_ps(fscal,dx10);
1044 ty = _mm_mul_ps(fscal,dy10);
1045 tz = _mm_mul_ps(fscal,dz10);
1047 /* Update vectorial force */
1048 fix1 = _mm_add_ps(fix1,tx);
1049 fiy1 = _mm_add_ps(fiy1,ty);
1050 fiz1 = _mm_add_ps(fiz1,tz);
1052 fjx0 = _mm_add_ps(fjx0,tx);
1053 fjy0 = _mm_add_ps(fjy0,ty);
1054 fjz0 = _mm_add_ps(fjz0,tz);
1056 /**************************
1057 * CALCULATE INTERACTIONS *
1058 **************************/
1060 r20 = _mm_mul_ps(rsq20,rinv20);
1062 /* Compute parameters for interactions between i and j atoms */
1063 qq20 = _mm_mul_ps(iq2,jq0);
1065 /* EWALD ELECTROSTATICS */
1067 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1068 ewrt = _mm_mul_ps(r20,ewtabscale);
1069 ewitab = _mm_cvttps_epi32(ewrt);
1070 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1071 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1072 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1074 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1075 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1079 /* Calculate temporary vectorial force */
1080 tx = _mm_mul_ps(fscal,dx20);
1081 ty = _mm_mul_ps(fscal,dy20);
1082 tz = _mm_mul_ps(fscal,dz20);
1084 /* Update vectorial force */
1085 fix2 = _mm_add_ps(fix2,tx);
1086 fiy2 = _mm_add_ps(fiy2,ty);
1087 fiz2 = _mm_add_ps(fiz2,tz);
1089 fjx0 = _mm_add_ps(fjx0,tx);
1090 fjy0 = _mm_add_ps(fjy0,ty);
1091 fjz0 = _mm_add_ps(fjz0,tz);
1093 /**************************
1094 * CALCULATE INTERACTIONS *
1095 **************************/
1097 r30 = _mm_mul_ps(rsq30,rinv30);
1099 /* Compute parameters for interactions between i and j atoms */
1100 qq30 = _mm_mul_ps(iq3,jq0);
1102 /* EWALD ELECTROSTATICS */
1104 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1105 ewrt = _mm_mul_ps(r30,ewtabscale);
1106 ewitab = _mm_cvttps_epi32(ewrt);
1107 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1108 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1109 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1111 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1112 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1116 /* Calculate temporary vectorial force */
1117 tx = _mm_mul_ps(fscal,dx30);
1118 ty = _mm_mul_ps(fscal,dy30);
1119 tz = _mm_mul_ps(fscal,dz30);
1121 /* Update vectorial force */
1122 fix3 = _mm_add_ps(fix3,tx);
1123 fiy3 = _mm_add_ps(fiy3,ty);
1124 fiz3 = _mm_add_ps(fiz3,tz);
1126 fjx0 = _mm_add_ps(fjx0,tx);
1127 fjy0 = _mm_add_ps(fjy0,ty);
1128 fjz0 = _mm_add_ps(fjz0,tz);
1130 fjptrA = f+j_coord_offsetA;
1131 fjptrB = f+j_coord_offsetB;
1132 fjptrC = f+j_coord_offsetC;
1133 fjptrD = f+j_coord_offsetD;
1135 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1137 /* Inner loop uses 154 flops */
1140 if(jidx<j_index_end)
1143 /* Get j neighbor index, and coordinate index */
1144 jnrlistA = jjnr[jidx];
1145 jnrlistB = jjnr[jidx+1];
1146 jnrlistC = jjnr[jidx+2];
1147 jnrlistD = jjnr[jidx+3];
1148 /* Sign of each element will be negative for non-real atoms.
1149 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1150 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1152 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1153 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1154 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1155 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1156 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1157 j_coord_offsetA = DIM*jnrA;
1158 j_coord_offsetB = DIM*jnrB;
1159 j_coord_offsetC = DIM*jnrC;
1160 j_coord_offsetD = DIM*jnrD;
1162 /* load j atom coordinates */
1163 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1164 x+j_coord_offsetC,x+j_coord_offsetD,
1167 /* Calculate displacement vector */
1168 dx00 = _mm_sub_ps(ix0,jx0);
1169 dy00 = _mm_sub_ps(iy0,jy0);
1170 dz00 = _mm_sub_ps(iz0,jz0);
1171 dx10 = _mm_sub_ps(ix1,jx0);
1172 dy10 = _mm_sub_ps(iy1,jy0);
1173 dz10 = _mm_sub_ps(iz1,jz0);
1174 dx20 = _mm_sub_ps(ix2,jx0);
1175 dy20 = _mm_sub_ps(iy2,jy0);
1176 dz20 = _mm_sub_ps(iz2,jz0);
1177 dx30 = _mm_sub_ps(ix3,jx0);
1178 dy30 = _mm_sub_ps(iy3,jy0);
1179 dz30 = _mm_sub_ps(iz3,jz0);
1181 /* Calculate squared distance and things based on it */
1182 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1183 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1184 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1185 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1187 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1188 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1189 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1190 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1192 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1193 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1194 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1195 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1197 /* Load parameters for j particles */
1198 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1199 charge+jnrC+0,charge+jnrD+0);
1200 vdwjidx0A = 2*vdwtype[jnrA+0];
1201 vdwjidx0B = 2*vdwtype[jnrB+0];
1202 vdwjidx0C = 2*vdwtype[jnrC+0];
1203 vdwjidx0D = 2*vdwtype[jnrD+0];
1205 fjx0 = _mm_setzero_ps();
1206 fjy0 = _mm_setzero_ps();
1207 fjz0 = _mm_setzero_ps();
1209 /**************************
1210 * CALCULATE INTERACTIONS *
1211 **************************/
1213 r00 = _mm_mul_ps(rsq00,rinv00);
1214 r00 = _mm_andnot_ps(dummy_mask,r00);
1216 /* Compute parameters for interactions between i and j atoms */
1217 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1218 vdwparam+vdwioffset0+vdwjidx0B,
1219 vdwparam+vdwioffset0+vdwjidx0C,
1220 vdwparam+vdwioffset0+vdwjidx0D,
1222 c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
1223 vdwgridparam+vdwioffset0+vdwjidx0B,
1224 vdwgridparam+vdwioffset0+vdwjidx0C,
1225 vdwgridparam+vdwioffset0+vdwjidx0D);
1227 /* Analytical LJ-PME */
1228 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1229 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1230 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1231 exponent = gmx_simd_exp_r(ewcljrsq);
1232 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1233 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1234 /* f6A = 6 * C6grid * (1 - poly) */
1235 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1236 /* f6B = C6grid * exponent * beta^6 */
1237 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1238 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1239 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1243 fscal = _mm_andnot_ps(dummy_mask,fscal);
1245 /* Calculate temporary vectorial force */
1246 tx = _mm_mul_ps(fscal,dx00);
1247 ty = _mm_mul_ps(fscal,dy00);
1248 tz = _mm_mul_ps(fscal,dz00);
1250 /* Update vectorial force */
1251 fix0 = _mm_add_ps(fix0,tx);
1252 fiy0 = _mm_add_ps(fiy0,ty);
1253 fiz0 = _mm_add_ps(fiz0,tz);
1255 fjx0 = _mm_add_ps(fjx0,tx);
1256 fjy0 = _mm_add_ps(fjy0,ty);
1257 fjz0 = _mm_add_ps(fjz0,tz);
1259 /**************************
1260 * CALCULATE INTERACTIONS *
1261 **************************/
1263 r10 = _mm_mul_ps(rsq10,rinv10);
1264 r10 = _mm_andnot_ps(dummy_mask,r10);
1266 /* Compute parameters for interactions between i and j atoms */
1267 qq10 = _mm_mul_ps(iq1,jq0);
1269 /* EWALD ELECTROSTATICS */
1271 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1272 ewrt = _mm_mul_ps(r10,ewtabscale);
1273 ewitab = _mm_cvttps_epi32(ewrt);
1274 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1275 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1276 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1278 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1279 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1283 fscal = _mm_andnot_ps(dummy_mask,fscal);
1285 /* Calculate temporary vectorial force */
1286 tx = _mm_mul_ps(fscal,dx10);
1287 ty = _mm_mul_ps(fscal,dy10);
1288 tz = _mm_mul_ps(fscal,dz10);
1290 /* Update vectorial force */
1291 fix1 = _mm_add_ps(fix1,tx);
1292 fiy1 = _mm_add_ps(fiy1,ty);
1293 fiz1 = _mm_add_ps(fiz1,tz);
1295 fjx0 = _mm_add_ps(fjx0,tx);
1296 fjy0 = _mm_add_ps(fjy0,ty);
1297 fjz0 = _mm_add_ps(fjz0,tz);
1299 /**************************
1300 * CALCULATE INTERACTIONS *
1301 **************************/
1303 r20 = _mm_mul_ps(rsq20,rinv20);
1304 r20 = _mm_andnot_ps(dummy_mask,r20);
1306 /* Compute parameters for interactions between i and j atoms */
1307 qq20 = _mm_mul_ps(iq2,jq0);
1309 /* EWALD ELECTROSTATICS */
1311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1312 ewrt = _mm_mul_ps(r20,ewtabscale);
1313 ewitab = _mm_cvttps_epi32(ewrt);
1314 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1315 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1316 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1318 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1319 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1323 fscal = _mm_andnot_ps(dummy_mask,fscal);
1325 /* Calculate temporary vectorial force */
1326 tx = _mm_mul_ps(fscal,dx20);
1327 ty = _mm_mul_ps(fscal,dy20);
1328 tz = _mm_mul_ps(fscal,dz20);
1330 /* Update vectorial force */
1331 fix2 = _mm_add_ps(fix2,tx);
1332 fiy2 = _mm_add_ps(fiy2,ty);
1333 fiz2 = _mm_add_ps(fiz2,tz);
1335 fjx0 = _mm_add_ps(fjx0,tx);
1336 fjy0 = _mm_add_ps(fjy0,ty);
1337 fjz0 = _mm_add_ps(fjz0,tz);
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
1343 r30 = _mm_mul_ps(rsq30,rinv30);
1344 r30 = _mm_andnot_ps(dummy_mask,r30);
1346 /* Compute parameters for interactions between i and j atoms */
1347 qq30 = _mm_mul_ps(iq3,jq0);
1349 /* EWALD ELECTROSTATICS */
1351 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1352 ewrt = _mm_mul_ps(r30,ewtabscale);
1353 ewitab = _mm_cvttps_epi32(ewrt);
1354 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1355 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1356 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1358 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1359 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1363 fscal = _mm_andnot_ps(dummy_mask,fscal);
1365 /* Calculate temporary vectorial force */
1366 tx = _mm_mul_ps(fscal,dx30);
1367 ty = _mm_mul_ps(fscal,dy30);
1368 tz = _mm_mul_ps(fscal,dz30);
1370 /* Update vectorial force */
1371 fix3 = _mm_add_ps(fix3,tx);
1372 fiy3 = _mm_add_ps(fiy3,ty);
1373 fiz3 = _mm_add_ps(fiz3,tz);
1375 fjx0 = _mm_add_ps(fjx0,tx);
1376 fjy0 = _mm_add_ps(fjy0,ty);
1377 fjz0 = _mm_add_ps(fjz0,tz);
1379 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1380 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1381 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1382 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1384 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1386 /* Inner loop uses 158 flops */
1389 /* End of innermost loop */
1391 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1392 f+i_coord_offset,fshift+i_shift_offset);
1394 /* Increment number of inner iterations */
1395 inneriter += j_index_end - j_index_start;
1397 /* Outer loop uses 24 flops */
1400 /* Increment number of outer iterations */
1403 /* Update outer/inner flops */
1405 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*158);