2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
80 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
81 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
82 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
83 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
86 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
89 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
90 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
91 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
92 real rswitch_scalar,d_scalar;
93 __m128 dummy_mask,cutoff_mask;
94 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
95 __m128 one = _mm_set1_ps(1.0);
96 __m128 two = _mm_set1_ps(2.0);
102 jindex = nlist->jindex;
104 shiftidx = nlist->shift;
106 shiftvec = fr->shift_vec[0];
107 fshift = fr->fshift[0];
108 facel = _mm_set1_ps(fr->epsfac);
109 charge = mdatoms->chargeA;
110 krf = _mm_set1_ps(fr->ic->k_rf);
111 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
112 crf = _mm_set1_ps(fr->ic->c_rf);
113 nvdwtype = fr->ntype;
115 vdwtype = mdatoms->typeA;
117 /* Setup water-specific parameters */
118 inr = nlist->iinr[0];
119 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
120 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
121 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
122 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
124 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
125 rcutoff_scalar = fr->rcoulomb;
126 rcutoff = _mm_set1_ps(rcutoff_scalar);
127 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
129 rswitch_scalar = fr->rvdw_switch;
130 rswitch = _mm_set1_ps(rswitch_scalar);
131 /* Setup switch parameters */
132 d_scalar = rcutoff_scalar-rswitch_scalar;
133 d = _mm_set1_ps(d_scalar);
134 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
135 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
136 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
137 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
138 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
139 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
141 /* Avoid stupid compiler warnings */
142 jnrA = jnrB = jnrC = jnrD = 0;
151 for(iidx=0;iidx<4*DIM;iidx++)
156 /* Start outer loop over neighborlists */
157 for(iidx=0; iidx<nri; iidx++)
159 /* Load shift vector for this list */
160 i_shift_offset = DIM*shiftidx[iidx];
162 /* Load limits for loop over neighbors */
163 j_index_start = jindex[iidx];
164 j_index_end = jindex[iidx+1];
166 /* Get outer coordinate index */
168 i_coord_offset = DIM*inr;
170 /* Load i particle coords and add shift vector */
171 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
172 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
174 fix0 = _mm_setzero_ps();
175 fiy0 = _mm_setzero_ps();
176 fiz0 = _mm_setzero_ps();
177 fix1 = _mm_setzero_ps();
178 fiy1 = _mm_setzero_ps();
179 fiz1 = _mm_setzero_ps();
180 fix2 = _mm_setzero_ps();
181 fiy2 = _mm_setzero_ps();
182 fiz2 = _mm_setzero_ps();
183 fix3 = _mm_setzero_ps();
184 fiy3 = _mm_setzero_ps();
185 fiz3 = _mm_setzero_ps();
187 /* Reset potential sums */
188 velecsum = _mm_setzero_ps();
189 vvdwsum = _mm_setzero_ps();
191 /* Start inner kernel loop */
192 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
195 /* Get j neighbor index, and coordinate index */
200 j_coord_offsetA = DIM*jnrA;
201 j_coord_offsetB = DIM*jnrB;
202 j_coord_offsetC = DIM*jnrC;
203 j_coord_offsetD = DIM*jnrD;
205 /* load j atom coordinates */
206 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
207 x+j_coord_offsetC,x+j_coord_offsetD,
210 /* Calculate displacement vector */
211 dx00 = _mm_sub_ps(ix0,jx0);
212 dy00 = _mm_sub_ps(iy0,jy0);
213 dz00 = _mm_sub_ps(iz0,jz0);
214 dx10 = _mm_sub_ps(ix1,jx0);
215 dy10 = _mm_sub_ps(iy1,jy0);
216 dz10 = _mm_sub_ps(iz1,jz0);
217 dx20 = _mm_sub_ps(ix2,jx0);
218 dy20 = _mm_sub_ps(iy2,jy0);
219 dz20 = _mm_sub_ps(iz2,jz0);
220 dx30 = _mm_sub_ps(ix3,jx0);
221 dy30 = _mm_sub_ps(iy3,jy0);
222 dz30 = _mm_sub_ps(iz3,jz0);
224 /* Calculate squared distance and things based on it */
225 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
226 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
227 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
228 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
230 rinv00 = gmx_mm_invsqrt_ps(rsq00);
231 rinv10 = gmx_mm_invsqrt_ps(rsq10);
232 rinv20 = gmx_mm_invsqrt_ps(rsq20);
233 rinv30 = gmx_mm_invsqrt_ps(rsq30);
235 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
236 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
237 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
238 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
240 /* Load parameters for j particles */
241 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
242 charge+jnrC+0,charge+jnrD+0);
243 vdwjidx0A = 2*vdwtype[jnrA+0];
244 vdwjidx0B = 2*vdwtype[jnrB+0];
245 vdwjidx0C = 2*vdwtype[jnrC+0];
246 vdwjidx0D = 2*vdwtype[jnrD+0];
248 fjx0 = _mm_setzero_ps();
249 fjy0 = _mm_setzero_ps();
250 fjz0 = _mm_setzero_ps();
252 /**************************
253 * CALCULATE INTERACTIONS *
254 **************************/
256 if (gmx_mm_any_lt(rsq00,rcutoff2))
259 r00 = _mm_mul_ps(rsq00,rinv00);
261 /* Compute parameters for interactions between i and j atoms */
262 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
263 vdwparam+vdwioffset0+vdwjidx0B,
264 vdwparam+vdwioffset0+vdwjidx0C,
265 vdwparam+vdwioffset0+vdwjidx0D,
268 /* LENNARD-JONES DISPERSION/REPULSION */
270 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
271 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
272 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
273 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
274 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
276 d = _mm_sub_ps(r00,rswitch);
277 d = _mm_max_ps(d,_mm_setzero_ps());
278 d2 = _mm_mul_ps(d,d);
279 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
281 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
283 /* Evaluate switch function */
284 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
285 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
286 vvdw = _mm_mul_ps(vvdw,sw);
287 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
289 /* Update potential sum for this i atom from the interaction with this j atom. */
290 vvdw = _mm_and_ps(vvdw,cutoff_mask);
291 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
295 fscal = _mm_and_ps(fscal,cutoff_mask);
297 /* Update vectorial force */
298 fix0 = _mm_macc_ps(dx00,fscal,fix0);
299 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
300 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
302 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
303 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
304 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 if (gmx_mm_any_lt(rsq10,rcutoff2))
315 /* Compute parameters for interactions between i and j atoms */
316 qq10 = _mm_mul_ps(iq1,jq0);
318 /* REACTION-FIELD ELECTROSTATICS */
319 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_macc_ps(krf,rsq10,rinv10),crf));
320 felec = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
322 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
324 /* Update potential sum for this i atom from the interaction with this j atom. */
325 velec = _mm_and_ps(velec,cutoff_mask);
326 velecsum = _mm_add_ps(velecsum,velec);
330 fscal = _mm_and_ps(fscal,cutoff_mask);
332 /* Update vectorial force */
333 fix1 = _mm_macc_ps(dx10,fscal,fix1);
334 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
335 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
337 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
338 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
339 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
343 /**************************
344 * CALCULATE INTERACTIONS *
345 **************************/
347 if (gmx_mm_any_lt(rsq20,rcutoff2))
350 /* Compute parameters for interactions between i and j atoms */
351 qq20 = _mm_mul_ps(iq2,jq0);
353 /* REACTION-FIELD ELECTROSTATICS */
354 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_macc_ps(krf,rsq20,rinv20),crf));
355 felec = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
357 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
359 /* Update potential sum for this i atom from the interaction with this j atom. */
360 velec = _mm_and_ps(velec,cutoff_mask);
361 velecsum = _mm_add_ps(velecsum,velec);
365 fscal = _mm_and_ps(fscal,cutoff_mask);
367 /* Update vectorial force */
368 fix2 = _mm_macc_ps(dx20,fscal,fix2);
369 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
370 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
372 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
373 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
374 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 if (gmx_mm_any_lt(rsq30,rcutoff2))
385 /* Compute parameters for interactions between i and j atoms */
386 qq30 = _mm_mul_ps(iq3,jq0);
388 /* REACTION-FIELD ELECTROSTATICS */
389 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_macc_ps(krf,rsq30,rinv30),crf));
390 felec = _mm_mul_ps(qq30,_mm_msub_ps(rinv30,rinvsq30,krf2));
392 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
394 /* Update potential sum for this i atom from the interaction with this j atom. */
395 velec = _mm_and_ps(velec,cutoff_mask);
396 velecsum = _mm_add_ps(velecsum,velec);
400 fscal = _mm_and_ps(fscal,cutoff_mask);
402 /* Update vectorial force */
403 fix3 = _mm_macc_ps(dx30,fscal,fix3);
404 fiy3 = _mm_macc_ps(dy30,fscal,fiy3);
405 fiz3 = _mm_macc_ps(dz30,fscal,fiz3);
407 fjx0 = _mm_macc_ps(dx30,fscal,fjx0);
408 fjy0 = _mm_macc_ps(dy30,fscal,fjy0);
409 fjz0 = _mm_macc_ps(dz30,fscal,fjz0);
413 fjptrA = f+j_coord_offsetA;
414 fjptrB = f+j_coord_offsetB;
415 fjptrC = f+j_coord_offsetC;
416 fjptrD = f+j_coord_offsetD;
418 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
420 /* Inner loop uses 179 flops */
426 /* Get j neighbor index, and coordinate index */
427 jnrlistA = jjnr[jidx];
428 jnrlistB = jjnr[jidx+1];
429 jnrlistC = jjnr[jidx+2];
430 jnrlistD = jjnr[jidx+3];
431 /* Sign of each element will be negative for non-real atoms.
432 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
433 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
435 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
436 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
437 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
438 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
439 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
440 j_coord_offsetA = DIM*jnrA;
441 j_coord_offsetB = DIM*jnrB;
442 j_coord_offsetC = DIM*jnrC;
443 j_coord_offsetD = DIM*jnrD;
445 /* load j atom coordinates */
446 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
447 x+j_coord_offsetC,x+j_coord_offsetD,
450 /* Calculate displacement vector */
451 dx00 = _mm_sub_ps(ix0,jx0);
452 dy00 = _mm_sub_ps(iy0,jy0);
453 dz00 = _mm_sub_ps(iz0,jz0);
454 dx10 = _mm_sub_ps(ix1,jx0);
455 dy10 = _mm_sub_ps(iy1,jy0);
456 dz10 = _mm_sub_ps(iz1,jz0);
457 dx20 = _mm_sub_ps(ix2,jx0);
458 dy20 = _mm_sub_ps(iy2,jy0);
459 dz20 = _mm_sub_ps(iz2,jz0);
460 dx30 = _mm_sub_ps(ix3,jx0);
461 dy30 = _mm_sub_ps(iy3,jy0);
462 dz30 = _mm_sub_ps(iz3,jz0);
464 /* Calculate squared distance and things based on it */
465 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
466 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
467 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
468 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
470 rinv00 = gmx_mm_invsqrt_ps(rsq00);
471 rinv10 = gmx_mm_invsqrt_ps(rsq10);
472 rinv20 = gmx_mm_invsqrt_ps(rsq20);
473 rinv30 = gmx_mm_invsqrt_ps(rsq30);
475 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
476 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
477 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
478 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
480 /* Load parameters for j particles */
481 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
482 charge+jnrC+0,charge+jnrD+0);
483 vdwjidx0A = 2*vdwtype[jnrA+0];
484 vdwjidx0B = 2*vdwtype[jnrB+0];
485 vdwjidx0C = 2*vdwtype[jnrC+0];
486 vdwjidx0D = 2*vdwtype[jnrD+0];
488 fjx0 = _mm_setzero_ps();
489 fjy0 = _mm_setzero_ps();
490 fjz0 = _mm_setzero_ps();
492 /**************************
493 * CALCULATE INTERACTIONS *
494 **************************/
496 if (gmx_mm_any_lt(rsq00,rcutoff2))
499 r00 = _mm_mul_ps(rsq00,rinv00);
500 r00 = _mm_andnot_ps(dummy_mask,r00);
502 /* Compute parameters for interactions between i and j atoms */
503 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
504 vdwparam+vdwioffset0+vdwjidx0B,
505 vdwparam+vdwioffset0+vdwjidx0C,
506 vdwparam+vdwioffset0+vdwjidx0D,
509 /* LENNARD-JONES DISPERSION/REPULSION */
511 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
512 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
513 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
514 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
515 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
517 d = _mm_sub_ps(r00,rswitch);
518 d = _mm_max_ps(d,_mm_setzero_ps());
519 d2 = _mm_mul_ps(d,d);
520 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
522 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
524 /* Evaluate switch function */
525 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
526 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
527 vvdw = _mm_mul_ps(vvdw,sw);
528 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
530 /* Update potential sum for this i atom from the interaction with this j atom. */
531 vvdw = _mm_and_ps(vvdw,cutoff_mask);
532 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
533 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
537 fscal = _mm_and_ps(fscal,cutoff_mask);
539 fscal = _mm_andnot_ps(dummy_mask,fscal);
541 /* Update vectorial force */
542 fix0 = _mm_macc_ps(dx00,fscal,fix0);
543 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
544 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
546 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
547 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
548 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
552 /**************************
553 * CALCULATE INTERACTIONS *
554 **************************/
556 if (gmx_mm_any_lt(rsq10,rcutoff2))
559 /* Compute parameters for interactions between i and j atoms */
560 qq10 = _mm_mul_ps(iq1,jq0);
562 /* REACTION-FIELD ELECTROSTATICS */
563 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_macc_ps(krf,rsq10,rinv10),crf));
564 felec = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
566 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velec = _mm_and_ps(velec,cutoff_mask);
570 velec = _mm_andnot_ps(dummy_mask,velec);
571 velecsum = _mm_add_ps(velecsum,velec);
575 fscal = _mm_and_ps(fscal,cutoff_mask);
577 fscal = _mm_andnot_ps(dummy_mask,fscal);
579 /* Update vectorial force */
580 fix1 = _mm_macc_ps(dx10,fscal,fix1);
581 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
582 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
584 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
585 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
586 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
590 /**************************
591 * CALCULATE INTERACTIONS *
592 **************************/
594 if (gmx_mm_any_lt(rsq20,rcutoff2))
597 /* Compute parameters for interactions between i and j atoms */
598 qq20 = _mm_mul_ps(iq2,jq0);
600 /* REACTION-FIELD ELECTROSTATICS */
601 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_macc_ps(krf,rsq20,rinv20),crf));
602 felec = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
604 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
606 /* Update potential sum for this i atom from the interaction with this j atom. */
607 velec = _mm_and_ps(velec,cutoff_mask);
608 velec = _mm_andnot_ps(dummy_mask,velec);
609 velecsum = _mm_add_ps(velecsum,velec);
613 fscal = _mm_and_ps(fscal,cutoff_mask);
615 fscal = _mm_andnot_ps(dummy_mask,fscal);
617 /* Update vectorial force */
618 fix2 = _mm_macc_ps(dx20,fscal,fix2);
619 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
620 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
622 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
623 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
624 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
628 /**************************
629 * CALCULATE INTERACTIONS *
630 **************************/
632 if (gmx_mm_any_lt(rsq30,rcutoff2))
635 /* Compute parameters for interactions between i and j atoms */
636 qq30 = _mm_mul_ps(iq3,jq0);
638 /* REACTION-FIELD ELECTROSTATICS */
639 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_macc_ps(krf,rsq30,rinv30),crf));
640 felec = _mm_mul_ps(qq30,_mm_msub_ps(rinv30,rinvsq30,krf2));
642 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
644 /* Update potential sum for this i atom from the interaction with this j atom. */
645 velec = _mm_and_ps(velec,cutoff_mask);
646 velec = _mm_andnot_ps(dummy_mask,velec);
647 velecsum = _mm_add_ps(velecsum,velec);
651 fscal = _mm_and_ps(fscal,cutoff_mask);
653 fscal = _mm_andnot_ps(dummy_mask,fscal);
655 /* Update vectorial force */
656 fix3 = _mm_macc_ps(dx30,fscal,fix3);
657 fiy3 = _mm_macc_ps(dy30,fscal,fiy3);
658 fiz3 = _mm_macc_ps(dz30,fscal,fiz3);
660 fjx0 = _mm_macc_ps(dx30,fscal,fjx0);
661 fjy0 = _mm_macc_ps(dy30,fscal,fjy0);
662 fjz0 = _mm_macc_ps(dz30,fscal,fjz0);
666 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
667 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
668 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
669 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
671 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
673 /* Inner loop uses 180 flops */
676 /* End of innermost loop */
678 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
679 f+i_coord_offset,fshift+i_shift_offset);
682 /* Update potential energies */
683 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
684 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
686 /* Increment number of inner iterations */
687 inneriter += j_index_end - j_index_start;
689 /* Outer loop uses 26 flops */
692 /* Increment number of outer iterations */
695 /* Update outer/inner flops */
697 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*180);
700 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
701 * Electrostatics interaction: ReactionField
702 * VdW interaction: LennardJones
703 * Geometry: Water4-Particle
704 * Calculate force/pot: Force
707 nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_avx_128_fma_single
708 (t_nblist * gmx_restrict nlist,
709 rvec * gmx_restrict xx,
710 rvec * gmx_restrict ff,
711 t_forcerec * gmx_restrict fr,
712 t_mdatoms * gmx_restrict mdatoms,
713 nb_kernel_data_t * gmx_restrict kernel_data,
714 t_nrnb * gmx_restrict nrnb)
716 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
717 * just 0 for non-waters.
718 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
719 * jnr indices corresponding to data put in the four positions in the SIMD register.
721 int i_shift_offset,i_coord_offset,outeriter,inneriter;
722 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
723 int jnrA,jnrB,jnrC,jnrD;
724 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
725 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
726 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
728 real *shiftvec,*fshift,*x,*f;
729 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
731 __m128 fscal,rcutoff,rcutoff2,jidxall;
733 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
735 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
737 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
739 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
740 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
741 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
742 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
743 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
744 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
745 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
746 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
749 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
752 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
753 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
754 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
755 real rswitch_scalar,d_scalar;
756 __m128 dummy_mask,cutoff_mask;
757 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
758 __m128 one = _mm_set1_ps(1.0);
759 __m128 two = _mm_set1_ps(2.0);
765 jindex = nlist->jindex;
767 shiftidx = nlist->shift;
769 shiftvec = fr->shift_vec[0];
770 fshift = fr->fshift[0];
771 facel = _mm_set1_ps(fr->epsfac);
772 charge = mdatoms->chargeA;
773 krf = _mm_set1_ps(fr->ic->k_rf);
774 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
775 crf = _mm_set1_ps(fr->ic->c_rf);
776 nvdwtype = fr->ntype;
778 vdwtype = mdatoms->typeA;
780 /* Setup water-specific parameters */
781 inr = nlist->iinr[0];
782 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
783 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
784 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
785 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
787 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
788 rcutoff_scalar = fr->rcoulomb;
789 rcutoff = _mm_set1_ps(rcutoff_scalar);
790 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
792 rswitch_scalar = fr->rvdw_switch;
793 rswitch = _mm_set1_ps(rswitch_scalar);
794 /* Setup switch parameters */
795 d_scalar = rcutoff_scalar-rswitch_scalar;
796 d = _mm_set1_ps(d_scalar);
797 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
798 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
799 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
800 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
801 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
802 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
804 /* Avoid stupid compiler warnings */
805 jnrA = jnrB = jnrC = jnrD = 0;
814 for(iidx=0;iidx<4*DIM;iidx++)
819 /* Start outer loop over neighborlists */
820 for(iidx=0; iidx<nri; iidx++)
822 /* Load shift vector for this list */
823 i_shift_offset = DIM*shiftidx[iidx];
825 /* Load limits for loop over neighbors */
826 j_index_start = jindex[iidx];
827 j_index_end = jindex[iidx+1];
829 /* Get outer coordinate index */
831 i_coord_offset = DIM*inr;
833 /* Load i particle coords and add shift vector */
834 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
835 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
837 fix0 = _mm_setzero_ps();
838 fiy0 = _mm_setzero_ps();
839 fiz0 = _mm_setzero_ps();
840 fix1 = _mm_setzero_ps();
841 fiy1 = _mm_setzero_ps();
842 fiz1 = _mm_setzero_ps();
843 fix2 = _mm_setzero_ps();
844 fiy2 = _mm_setzero_ps();
845 fiz2 = _mm_setzero_ps();
846 fix3 = _mm_setzero_ps();
847 fiy3 = _mm_setzero_ps();
848 fiz3 = _mm_setzero_ps();
850 /* Start inner kernel loop */
851 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
854 /* Get j neighbor index, and coordinate index */
859 j_coord_offsetA = DIM*jnrA;
860 j_coord_offsetB = DIM*jnrB;
861 j_coord_offsetC = DIM*jnrC;
862 j_coord_offsetD = DIM*jnrD;
864 /* load j atom coordinates */
865 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
866 x+j_coord_offsetC,x+j_coord_offsetD,
869 /* Calculate displacement vector */
870 dx00 = _mm_sub_ps(ix0,jx0);
871 dy00 = _mm_sub_ps(iy0,jy0);
872 dz00 = _mm_sub_ps(iz0,jz0);
873 dx10 = _mm_sub_ps(ix1,jx0);
874 dy10 = _mm_sub_ps(iy1,jy0);
875 dz10 = _mm_sub_ps(iz1,jz0);
876 dx20 = _mm_sub_ps(ix2,jx0);
877 dy20 = _mm_sub_ps(iy2,jy0);
878 dz20 = _mm_sub_ps(iz2,jz0);
879 dx30 = _mm_sub_ps(ix3,jx0);
880 dy30 = _mm_sub_ps(iy3,jy0);
881 dz30 = _mm_sub_ps(iz3,jz0);
883 /* Calculate squared distance and things based on it */
884 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
885 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
886 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
887 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
889 rinv00 = gmx_mm_invsqrt_ps(rsq00);
890 rinv10 = gmx_mm_invsqrt_ps(rsq10);
891 rinv20 = gmx_mm_invsqrt_ps(rsq20);
892 rinv30 = gmx_mm_invsqrt_ps(rsq30);
894 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
895 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
896 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
897 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
899 /* Load parameters for j particles */
900 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
901 charge+jnrC+0,charge+jnrD+0);
902 vdwjidx0A = 2*vdwtype[jnrA+0];
903 vdwjidx0B = 2*vdwtype[jnrB+0];
904 vdwjidx0C = 2*vdwtype[jnrC+0];
905 vdwjidx0D = 2*vdwtype[jnrD+0];
907 fjx0 = _mm_setzero_ps();
908 fjy0 = _mm_setzero_ps();
909 fjz0 = _mm_setzero_ps();
911 /**************************
912 * CALCULATE INTERACTIONS *
913 **************************/
915 if (gmx_mm_any_lt(rsq00,rcutoff2))
918 r00 = _mm_mul_ps(rsq00,rinv00);
920 /* Compute parameters for interactions between i and j atoms */
921 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
922 vdwparam+vdwioffset0+vdwjidx0B,
923 vdwparam+vdwioffset0+vdwjidx0C,
924 vdwparam+vdwioffset0+vdwjidx0D,
927 /* LENNARD-JONES DISPERSION/REPULSION */
929 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
930 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
931 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
932 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
933 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
935 d = _mm_sub_ps(r00,rswitch);
936 d = _mm_max_ps(d,_mm_setzero_ps());
937 d2 = _mm_mul_ps(d,d);
938 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
940 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
942 /* Evaluate switch function */
943 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
944 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
945 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
949 fscal = _mm_and_ps(fscal,cutoff_mask);
951 /* Update vectorial force */
952 fix0 = _mm_macc_ps(dx00,fscal,fix0);
953 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
954 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
956 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
957 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
958 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
962 /**************************
963 * CALCULATE INTERACTIONS *
964 **************************/
966 if (gmx_mm_any_lt(rsq10,rcutoff2))
969 /* Compute parameters for interactions between i and j atoms */
970 qq10 = _mm_mul_ps(iq1,jq0);
972 /* REACTION-FIELD ELECTROSTATICS */
973 felec = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
975 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
979 fscal = _mm_and_ps(fscal,cutoff_mask);
981 /* Update vectorial force */
982 fix1 = _mm_macc_ps(dx10,fscal,fix1);
983 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
984 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
986 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
987 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
988 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
992 /**************************
993 * CALCULATE INTERACTIONS *
994 **************************/
996 if (gmx_mm_any_lt(rsq20,rcutoff2))
999 /* Compute parameters for interactions between i and j atoms */
1000 qq20 = _mm_mul_ps(iq2,jq0);
1002 /* REACTION-FIELD ELECTROSTATICS */
1003 felec = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
1005 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1009 fscal = _mm_and_ps(fscal,cutoff_mask);
1011 /* Update vectorial force */
1012 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1013 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1014 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1016 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1017 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1018 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1022 /**************************
1023 * CALCULATE INTERACTIONS *
1024 **************************/
1026 if (gmx_mm_any_lt(rsq30,rcutoff2))
1029 /* Compute parameters for interactions between i and j atoms */
1030 qq30 = _mm_mul_ps(iq3,jq0);
1032 /* REACTION-FIELD ELECTROSTATICS */
1033 felec = _mm_mul_ps(qq30,_mm_msub_ps(rinv30,rinvsq30,krf2));
1035 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1039 fscal = _mm_and_ps(fscal,cutoff_mask);
1041 /* Update vectorial force */
1042 fix3 = _mm_macc_ps(dx30,fscal,fix3);
1043 fiy3 = _mm_macc_ps(dy30,fscal,fiy3);
1044 fiz3 = _mm_macc_ps(dz30,fscal,fiz3);
1046 fjx0 = _mm_macc_ps(dx30,fscal,fjx0);
1047 fjy0 = _mm_macc_ps(dy30,fscal,fjy0);
1048 fjz0 = _mm_macc_ps(dz30,fscal,fjz0);
1052 fjptrA = f+j_coord_offsetA;
1053 fjptrB = f+j_coord_offsetB;
1054 fjptrC = f+j_coord_offsetC;
1055 fjptrD = f+j_coord_offsetD;
1057 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1059 /* Inner loop uses 158 flops */
1062 if(jidx<j_index_end)
1065 /* Get j neighbor index, and coordinate index */
1066 jnrlistA = jjnr[jidx];
1067 jnrlistB = jjnr[jidx+1];
1068 jnrlistC = jjnr[jidx+2];
1069 jnrlistD = jjnr[jidx+3];
1070 /* Sign of each element will be negative for non-real atoms.
1071 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1072 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1074 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1075 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1076 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1077 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1078 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1079 j_coord_offsetA = DIM*jnrA;
1080 j_coord_offsetB = DIM*jnrB;
1081 j_coord_offsetC = DIM*jnrC;
1082 j_coord_offsetD = DIM*jnrD;
1084 /* load j atom coordinates */
1085 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1086 x+j_coord_offsetC,x+j_coord_offsetD,
1089 /* Calculate displacement vector */
1090 dx00 = _mm_sub_ps(ix0,jx0);
1091 dy00 = _mm_sub_ps(iy0,jy0);
1092 dz00 = _mm_sub_ps(iz0,jz0);
1093 dx10 = _mm_sub_ps(ix1,jx0);
1094 dy10 = _mm_sub_ps(iy1,jy0);
1095 dz10 = _mm_sub_ps(iz1,jz0);
1096 dx20 = _mm_sub_ps(ix2,jx0);
1097 dy20 = _mm_sub_ps(iy2,jy0);
1098 dz20 = _mm_sub_ps(iz2,jz0);
1099 dx30 = _mm_sub_ps(ix3,jx0);
1100 dy30 = _mm_sub_ps(iy3,jy0);
1101 dz30 = _mm_sub_ps(iz3,jz0);
1103 /* Calculate squared distance and things based on it */
1104 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1105 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1106 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1107 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1109 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1110 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1111 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1112 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1114 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1115 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1116 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1117 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1119 /* Load parameters for j particles */
1120 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1121 charge+jnrC+0,charge+jnrD+0);
1122 vdwjidx0A = 2*vdwtype[jnrA+0];
1123 vdwjidx0B = 2*vdwtype[jnrB+0];
1124 vdwjidx0C = 2*vdwtype[jnrC+0];
1125 vdwjidx0D = 2*vdwtype[jnrD+0];
1127 fjx0 = _mm_setzero_ps();
1128 fjy0 = _mm_setzero_ps();
1129 fjz0 = _mm_setzero_ps();
1131 /**************************
1132 * CALCULATE INTERACTIONS *
1133 **************************/
1135 if (gmx_mm_any_lt(rsq00,rcutoff2))
1138 r00 = _mm_mul_ps(rsq00,rinv00);
1139 r00 = _mm_andnot_ps(dummy_mask,r00);
1141 /* Compute parameters for interactions between i and j atoms */
1142 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1143 vdwparam+vdwioffset0+vdwjidx0B,
1144 vdwparam+vdwioffset0+vdwjidx0C,
1145 vdwparam+vdwioffset0+vdwjidx0D,
1148 /* LENNARD-JONES DISPERSION/REPULSION */
1150 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1151 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1152 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1153 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1154 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1156 d = _mm_sub_ps(r00,rswitch);
1157 d = _mm_max_ps(d,_mm_setzero_ps());
1158 d2 = _mm_mul_ps(d,d);
1159 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1161 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1163 /* Evaluate switch function */
1164 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1165 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1166 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1170 fscal = _mm_and_ps(fscal,cutoff_mask);
1172 fscal = _mm_andnot_ps(dummy_mask,fscal);
1174 /* Update vectorial force */
1175 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1176 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1177 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1179 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1180 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1181 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1185 /**************************
1186 * CALCULATE INTERACTIONS *
1187 **************************/
1189 if (gmx_mm_any_lt(rsq10,rcutoff2))
1192 /* Compute parameters for interactions between i and j atoms */
1193 qq10 = _mm_mul_ps(iq1,jq0);
1195 /* REACTION-FIELD ELECTROSTATICS */
1196 felec = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
1198 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1202 fscal = _mm_and_ps(fscal,cutoff_mask);
1204 fscal = _mm_andnot_ps(dummy_mask,fscal);
1206 /* Update vectorial force */
1207 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1208 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1209 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1211 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1212 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1213 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1217 /**************************
1218 * CALCULATE INTERACTIONS *
1219 **************************/
1221 if (gmx_mm_any_lt(rsq20,rcutoff2))
1224 /* Compute parameters for interactions between i and j atoms */
1225 qq20 = _mm_mul_ps(iq2,jq0);
1227 /* REACTION-FIELD ELECTROSTATICS */
1228 felec = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
1230 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1234 fscal = _mm_and_ps(fscal,cutoff_mask);
1236 fscal = _mm_andnot_ps(dummy_mask,fscal);
1238 /* Update vectorial force */
1239 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1240 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1241 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1243 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1244 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1245 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1249 /**************************
1250 * CALCULATE INTERACTIONS *
1251 **************************/
1253 if (gmx_mm_any_lt(rsq30,rcutoff2))
1256 /* Compute parameters for interactions between i and j atoms */
1257 qq30 = _mm_mul_ps(iq3,jq0);
1259 /* REACTION-FIELD ELECTROSTATICS */
1260 felec = _mm_mul_ps(qq30,_mm_msub_ps(rinv30,rinvsq30,krf2));
1262 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1266 fscal = _mm_and_ps(fscal,cutoff_mask);
1268 fscal = _mm_andnot_ps(dummy_mask,fscal);
1270 /* Update vectorial force */
1271 fix3 = _mm_macc_ps(dx30,fscal,fix3);
1272 fiy3 = _mm_macc_ps(dy30,fscal,fiy3);
1273 fiz3 = _mm_macc_ps(dz30,fscal,fiz3);
1275 fjx0 = _mm_macc_ps(dx30,fscal,fjx0);
1276 fjy0 = _mm_macc_ps(dy30,fscal,fjy0);
1277 fjz0 = _mm_macc_ps(dz30,fscal,fjz0);
1281 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1282 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1283 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1284 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1286 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1288 /* Inner loop uses 159 flops */
1291 /* End of innermost loop */
1293 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1294 f+i_coord_offset,fshift+i_shift_offset);
1296 /* Increment number of inner iterations */
1297 inneriter += j_index_end - j_index_start;
1299 /* Outer loop uses 24 flops */
1302 /* Increment number of outer iterations */
1305 /* Update outer/inner flops */
1307 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*159);