2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
/*
 * Water3-Water3 nonbonded kernel (potential and force): reaction-field
 * electrostatics for all nine site pairs, plus Lennard-Jones with a
 * potential switch for the 0-0 site pair only. Both interactions use one
 * cutoff (fr->rcoulomb); the switch acts between fr->rvdw_switch and that
 * cutoff.
 *
 * Parameters, as used by the statements visible here:
 *   nlist       - neighbor list; jindex, shift and iinr[0] are read.
 *   xx, ff      - coordinate and force arrays; no direct use is visible
 *                 here, presumably the sources of the locals x and f
 *                 (TODO confirm).
 *   fr          - shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype,
 *                 rcoulomb and rvdw_switch are read.
 *   mdatoms     - chargeA and typeA are read.
 *   kernel_data - energygrp_elec+ggid and energygrp_vdw+ggid are passed to
 *                 gmx_mm_update_1pot_ps() together with the summed
 *                 potentials.
 *   nrnb        - flop counter, updated once through inc_nrnb().
 *
 * NOTE(review): several names used below (e.g. x, f, nri, jjnr, ggid,
 * scratch, vdwparam) have no visible assignment or declaration in this
 * listing, and the electrostatics-only pair sections show no assignment to
 * fscal before it is masked. Confirm those statements exist before relying
 * on the comments here, which describe only the visible statements.
 */
44 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
98 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
99 real rswitch_scalar,d_scalar;
100 __m128 dummy_mask,cutoff_mask;
101 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
102 __m128 one = _mm_set1_ps(1.0);
103 __m128 two = _mm_set1_ps(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm_set1_ps(fr->epsfac);
116 charge = mdatoms->chargeA;
117 krf = _mm_set1_ps(fr->ic->k_rf);
118 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
119 crf = _mm_set1_ps(fr->ic->c_rf);
120 nvdwtype = fr->ntype;
122 vdwtype = mdatoms->typeA;
124 /* Setup water-specific parameters */
125 inr = nlist->iinr[0];
126 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
127 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
128 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
129 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* The j-water charges and VdW type are read from the same atoms
 * (inr..inr+2) as the i-water, so in the code visible here every water is
 * given the parameters of the first i-water of the list. */
131 jq0 = _mm_set1_ps(charge[inr+0]);
132 jq1 = _mm_set1_ps(charge[inr+1]);
133 jq2 = _mm_set1_ps(charge[inr+2]);
134 vdwjidx0A = 2*vdwtype[inr+0];
/* Charge products for the nine site pairs; Lennard-Jones parameters are
 * fetched for the 0-0 pair only. */
135 qq00 = _mm_mul_ps(iq0,jq0);
136 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
137 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
138 qq01 = _mm_mul_ps(iq0,jq1);
139 qq02 = _mm_mul_ps(iq0,jq2);
140 qq10 = _mm_mul_ps(iq1,jq0);
141 qq11 = _mm_mul_ps(iq1,jq1);
142 qq12 = _mm_mul_ps(iq1,jq2);
143 qq20 = _mm_mul_ps(iq2,jq0);
144 qq21 = _mm_mul_ps(iq2,jq1);
145 qq22 = _mm_mul_ps(iq2,jq2);
147 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
148 rcutoff_scalar = fr->rcoulomb;
149 rcutoff = _mm_set1_ps(rcutoff_scalar);
150 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
152 rswitch_scalar = fr->rvdw_switch;
153 rswitch = _mm_set1_ps(rswitch_scalar);
154 /* Setup switch parameters */
155 d_scalar = rcutoff_scalar-rswitch_scalar;
156 d = _mm_set1_ps(d_scalar);
/* Polynomial coefficients used in the interaction code:
 *   sw(d)  = 1 + swV3 d^3 + swV4 d^4 + swV5 d^5
 *   dsw(d) = swF2 d^2 + swF3 d^3 + swF4 d^4   (the derivative of sw)
 * where d is r - rswitch clamped to >= 0. With these values sw equals 0 at
 * d = d_scalar, i.e. at the cutoff. The __m128 variable d set just above is
 * overwritten in the interaction code. */
157 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
158 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
159 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
160 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
161 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
162 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
164 /* Avoid stupid compiler warnings */
165 jnrA = jnrB = jnrC = jnrD = 0;
174 for(iidx=0;iidx<4*DIM;iidx++)
179 /* Start outer loop over neighborlists */
180 for(iidx=0; iidx<nri; iidx++)
182 /* Load shift vector for this list */
183 i_shift_offset = DIM*shiftidx[iidx];
185 /* Load limits for loop over neighbors */
186 j_index_start = jindex[iidx];
187 j_index_end = jindex[iidx+1];
189 /* Get outer coordinate index */
191 i_coord_offset = DIM*inr;
193 /* Load i particle coords and add shift vector */
194 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
195 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
197 fix0 = _mm_setzero_ps();
198 fiy0 = _mm_setzero_ps();
199 fiz0 = _mm_setzero_ps();
200 fix1 = _mm_setzero_ps();
201 fiy1 = _mm_setzero_ps();
202 fiz1 = _mm_setzero_ps();
203 fix2 = _mm_setzero_ps();
204 fiy2 = _mm_setzero_ps();
205 fiz2 = _mm_setzero_ps();
207 /* Reset potential sums */
208 velecsum = _mm_setzero_ps();
209 vvdwsum = _mm_setzero_ps();
211 /* Start inner kernel loop */
/* Unmasked loop: four j-waters per iteration, entered only while the fourth
 * list entry of the group (jjnr[jidx+3]) is non-negative. */
212 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215 /* Get j neighbor index, and coordinate index */
220 j_coord_offsetA = DIM*jnrA;
221 j_coord_offsetB = DIM*jnrB;
222 j_coord_offsetC = DIM*jnrC;
223 j_coord_offsetD = DIM*jnrD;
225 /* load j atom coordinates */
226 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
227 x+j_coord_offsetC,x+j_coord_offsetD,
228 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
230 /* Calculate displacement vector */
231 dx00 = _mm_sub_ps(ix0,jx0);
232 dy00 = _mm_sub_ps(iy0,jy0);
233 dz00 = _mm_sub_ps(iz0,jz0);
234 dx01 = _mm_sub_ps(ix0,jx1);
235 dy01 = _mm_sub_ps(iy0,jy1);
236 dz01 = _mm_sub_ps(iz0,jz1);
237 dx02 = _mm_sub_ps(ix0,jx2);
238 dy02 = _mm_sub_ps(iy0,jy2);
239 dz02 = _mm_sub_ps(iz0,jz2);
240 dx10 = _mm_sub_ps(ix1,jx0);
241 dy10 = _mm_sub_ps(iy1,jy0);
242 dz10 = _mm_sub_ps(iz1,jz0);
243 dx11 = _mm_sub_ps(ix1,jx1);
244 dy11 = _mm_sub_ps(iy1,jy1);
245 dz11 = _mm_sub_ps(iz1,jz1);
246 dx12 = _mm_sub_ps(ix1,jx2);
247 dy12 = _mm_sub_ps(iy1,jy2);
248 dz12 = _mm_sub_ps(iz1,jz2);
249 dx20 = _mm_sub_ps(ix2,jx0);
250 dy20 = _mm_sub_ps(iy2,jy0);
251 dz20 = _mm_sub_ps(iz2,jz0);
252 dx21 = _mm_sub_ps(ix2,jx1);
253 dy21 = _mm_sub_ps(iy2,jy1);
254 dz21 = _mm_sub_ps(iz2,jz1);
255 dx22 = _mm_sub_ps(ix2,jx2);
256 dy22 = _mm_sub_ps(iy2,jy2);
257 dz22 = _mm_sub_ps(iz2,jz2);
259 /* Calculate squared distance and things based on it */
260 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
261 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
262 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
263 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
264 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
265 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
266 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
267 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
268 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
270 rinv00 = gmx_mm_invsqrt_ps(rsq00);
271 rinv01 = gmx_mm_invsqrt_ps(rsq01);
272 rinv02 = gmx_mm_invsqrt_ps(rsq02);
273 rinv10 = gmx_mm_invsqrt_ps(rsq10);
274 rinv11 = gmx_mm_invsqrt_ps(rsq11);
275 rinv12 = gmx_mm_invsqrt_ps(rsq12);
276 rinv20 = gmx_mm_invsqrt_ps(rsq20);
277 rinv21 = gmx_mm_invsqrt_ps(rsq21);
278 rinv22 = gmx_mm_invsqrt_ps(rsq22);
280 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
281 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
282 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
283 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
284 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
285 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
286 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
287 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
288 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
290 fjx0 = _mm_setzero_ps();
291 fjy0 = _mm_setzero_ps();
292 fjz0 = _mm_setzero_ps();
293 fjx1 = _mm_setzero_ps();
294 fjy1 = _mm_setzero_ps();
295 fjz1 = _mm_setzero_ps();
296 fjx2 = _mm_setzero_ps();
297 fjy2 = _mm_setzero_ps();
298 fjz2 = _mm_setzero_ps();
300 /**************************
301 * CALCULATE INTERACTIONS *
302 **************************/
/* Site pair 0-0: reaction field plus switched Lennard-Jones.
 * gmx_mm_any_lt() presumably tests whether any SIMD lane is below the
 * cutoff (TODO confirm in the helper header); lanes outside the cutoff are
 * zeroed afterwards through cutoff_mask. */
304 if (gmx_mm_any_lt(rsq00,rcutoff2))
307 r00 = _mm_mul_ps(rsq00,rinv00);
309 /* REACTION-FIELD ELECTROSTATICS */
310 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_macc_ps(krf,rsq00,rinv00),crf));
311 felec = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
/* i.e. velec = qq (1/r + krf r^2 - crf), felec = qq (1/r^3 - 2 krf) */
313 /* LENNARD-JONES DISPERSION/REPULSION */
315 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
316 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
317 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
318 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
319 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
/* i.e. vvdw = vvdw12/12 - vvdw6/6, fvdw = (vvdw12 - vvdw6)/r^2 */
321 d = _mm_sub_ps(r00,rswitch);
322 d = _mm_max_ps(d,_mm_setzero_ps());
323 d2 = _mm_mul_ps(d,d);
324 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
326 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
328 /* Evaluate switch function */
329 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
330 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
331 vvdw = _mm_mul_ps(vvdw,sw);
332 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
334 /* Update potential sum for this i atom from the interaction with this j atom. */
335 velec = _mm_and_ps(velec,cutoff_mask);
336 velecsum = _mm_add_ps(velecsum,velec);
337 vvdw = _mm_and_ps(vvdw,cutoff_mask);
338 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
340 fscal = _mm_add_ps(felec,fvdw);
342 fscal = _mm_and_ps(fscal,cutoff_mask);
344 /* Update vectorial force */
345 fix0 = _mm_macc_ps(dx00,fscal,fix0);
346 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
347 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
/* The same products are accumulated for the j atoms; the fj sums are handed
 * to gmx_mm_decrement_3rvec_4ptr_swizzle_ps() after the last site pair. */
349 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
350 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
351 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
355 /**************************
356 * CALCULATE INTERACTIONS *
357 **************************/
/* Site pair 0-1 and the seven pairs that follow: electrostatics only. */
359 if (gmx_mm_any_lt(rsq01,rcutoff2))
362 /* REACTION-FIELD ELECTROSTATICS */
363 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_macc_ps(krf,rsq01,rinv01),crf));
364 felec = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
366 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
368 /* Update potential sum for this i atom from the interaction with this j atom. */
369 velec = _mm_and_ps(velec,cutoff_mask);
370 velecsum = _mm_add_ps(velecsum,velec);
374 fscal = _mm_and_ps(fscal,cutoff_mask);
376 /* Update vectorial force */
377 fix0 = _mm_macc_ps(dx01,fscal,fix0);
378 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
379 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
381 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
382 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
383 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
387 /**************************
388 * CALCULATE INTERACTIONS *
389 **************************/
391 if (gmx_mm_any_lt(rsq02,rcutoff2))
394 /* REACTION-FIELD ELECTROSTATICS */
395 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_macc_ps(krf,rsq02,rinv02),crf));
396 felec = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
398 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velec = _mm_and_ps(velec,cutoff_mask);
402 velecsum = _mm_add_ps(velecsum,velec);
406 fscal = _mm_and_ps(fscal,cutoff_mask);
408 /* Update vectorial force */
409 fix0 = _mm_macc_ps(dx02,fscal,fix0);
410 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
411 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
413 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
414 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
415 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 if (gmx_mm_any_lt(rsq10,rcutoff2))
426 /* REACTION-FIELD ELECTROSTATICS */
427 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_macc_ps(krf,rsq10,rinv10),crf));
428 felec = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
430 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velec = _mm_and_ps(velec,cutoff_mask);
434 velecsum = _mm_add_ps(velecsum,velec);
438 fscal = _mm_and_ps(fscal,cutoff_mask);
440 /* Update vectorial force */
441 fix1 = _mm_macc_ps(dx10,fscal,fix1);
442 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
443 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
445 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
446 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
447 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 if (gmx_mm_any_lt(rsq11,rcutoff2))
458 /* REACTION-FIELD ELECTROSTATICS */
459 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_macc_ps(krf,rsq11,rinv11),crf));
460 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
462 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
464 /* Update potential sum for this i atom from the interaction with this j atom. */
465 velec = _mm_and_ps(velec,cutoff_mask);
466 velecsum = _mm_add_ps(velecsum,velec);
470 fscal = _mm_and_ps(fscal,cutoff_mask);
472 /* Update vectorial force */
473 fix1 = _mm_macc_ps(dx11,fscal,fix1);
474 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
475 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
477 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
478 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
479 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
483 /**************************
484 * CALCULATE INTERACTIONS *
485 **************************/
487 if (gmx_mm_any_lt(rsq12,rcutoff2))
490 /* REACTION-FIELD ELECTROSTATICS */
491 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_macc_ps(krf,rsq12,rinv12),crf));
492 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
494 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velec = _mm_and_ps(velec,cutoff_mask);
498 velecsum = _mm_add_ps(velecsum,velec);
502 fscal = _mm_and_ps(fscal,cutoff_mask);
504 /* Update vectorial force */
505 fix1 = _mm_macc_ps(dx12,fscal,fix1);
506 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
507 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
509 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
510 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
511 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
515 /**************************
516 * CALCULATE INTERACTIONS *
517 **************************/
519 if (gmx_mm_any_lt(rsq20,rcutoff2))
522 /* REACTION-FIELD ELECTROSTATICS */
523 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_macc_ps(krf,rsq20,rinv20),crf));
524 felec = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
526 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
528 /* Update potential sum for this i atom from the interaction with this j atom. */
529 velec = _mm_and_ps(velec,cutoff_mask);
530 velecsum = _mm_add_ps(velecsum,velec);
534 fscal = _mm_and_ps(fscal,cutoff_mask);
536 /* Update vectorial force */
537 fix2 = _mm_macc_ps(dx20,fscal,fix2);
538 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
539 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
541 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
542 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
543 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
547 /**************************
548 * CALCULATE INTERACTIONS *
549 **************************/
551 if (gmx_mm_any_lt(rsq21,rcutoff2))
554 /* REACTION-FIELD ELECTROSTATICS */
555 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_macc_ps(krf,rsq21,rinv21),crf));
556 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
558 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
560 /* Update potential sum for this i atom from the interaction with this j atom. */
561 velec = _mm_and_ps(velec,cutoff_mask);
562 velecsum = _mm_add_ps(velecsum,velec);
566 fscal = _mm_and_ps(fscal,cutoff_mask);
568 /* Update vectorial force */
569 fix2 = _mm_macc_ps(dx21,fscal,fix2);
570 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
571 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
573 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
574 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
575 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 if (gmx_mm_any_lt(rsq22,rcutoff2))
586 /* REACTION-FIELD ELECTROSTATICS */
587 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_macc_ps(krf,rsq22,rinv22),crf));
588 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
590 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
592 /* Update potential sum for this i atom from the interaction with this j atom. */
593 velec = _mm_and_ps(velec,cutoff_mask);
594 velecsum = _mm_add_ps(velecsum,velec);
598 fscal = _mm_and_ps(fscal,cutoff_mask);
600 /* Update vectorial force */
601 fix2 = _mm_macc_ps(dx22,fscal,fix2);
602 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
603 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
605 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
606 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
607 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
611 fjptrA = f+j_coord_offsetA;
612 fjptrB = f+j_coord_offsetB;
613 fjptrC = f+j_coord_offsetC;
614 fjptrD = f+j_coord_offsetD;
616 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
617 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
619 /* Inner loop uses 385 flops */
/* Masked variant of the block above, for a group of four list entries in
 * which some entries are negative (dummy). Dummy entries are redirected to
 * atom index 0 for loading, and their energy and force contributions are
 * cleared with dummy_mask. */
625 /* Get j neighbor index, and coordinate index */
626 jnrlistA = jjnr[jidx];
627 jnrlistB = jjnr[jidx+1];
628 jnrlistC = jjnr[jidx+2];
629 jnrlistD = jjnr[jidx+3];
630 /* Sign of each element will be negative for non-real atoms.
631 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
632 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
634 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
635 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
636 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
637 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
638 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
639 j_coord_offsetA = DIM*jnrA;
640 j_coord_offsetB = DIM*jnrB;
641 j_coord_offsetC = DIM*jnrC;
642 j_coord_offsetD = DIM*jnrD;
644 /* load j atom coordinates */
645 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
646 x+j_coord_offsetC,x+j_coord_offsetD,
647 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
649 /* Calculate displacement vector */
650 dx00 = _mm_sub_ps(ix0,jx0);
651 dy00 = _mm_sub_ps(iy0,jy0);
652 dz00 = _mm_sub_ps(iz0,jz0);
653 dx01 = _mm_sub_ps(ix0,jx1);
654 dy01 = _mm_sub_ps(iy0,jy1);
655 dz01 = _mm_sub_ps(iz0,jz1);
656 dx02 = _mm_sub_ps(ix0,jx2);
657 dy02 = _mm_sub_ps(iy0,jy2);
658 dz02 = _mm_sub_ps(iz0,jz2);
659 dx10 = _mm_sub_ps(ix1,jx0);
660 dy10 = _mm_sub_ps(iy1,jy0);
661 dz10 = _mm_sub_ps(iz1,jz0);
662 dx11 = _mm_sub_ps(ix1,jx1);
663 dy11 = _mm_sub_ps(iy1,jy1);
664 dz11 = _mm_sub_ps(iz1,jz1);
665 dx12 = _mm_sub_ps(ix1,jx2);
666 dy12 = _mm_sub_ps(iy1,jy2);
667 dz12 = _mm_sub_ps(iz1,jz2);
668 dx20 = _mm_sub_ps(ix2,jx0);
669 dy20 = _mm_sub_ps(iy2,jy0);
670 dz20 = _mm_sub_ps(iz2,jz0);
671 dx21 = _mm_sub_ps(ix2,jx1);
672 dy21 = _mm_sub_ps(iy2,jy1);
673 dz21 = _mm_sub_ps(iz2,jz1);
674 dx22 = _mm_sub_ps(ix2,jx2);
675 dy22 = _mm_sub_ps(iy2,jy2);
676 dz22 = _mm_sub_ps(iz2,jz2);
678 /* Calculate squared distance and things based on it */
679 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
680 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
681 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
682 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
683 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
684 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
685 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
686 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
687 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
689 rinv00 = gmx_mm_invsqrt_ps(rsq00);
690 rinv01 = gmx_mm_invsqrt_ps(rsq01);
691 rinv02 = gmx_mm_invsqrt_ps(rsq02);
692 rinv10 = gmx_mm_invsqrt_ps(rsq10);
693 rinv11 = gmx_mm_invsqrt_ps(rsq11);
694 rinv12 = gmx_mm_invsqrt_ps(rsq12);
695 rinv20 = gmx_mm_invsqrt_ps(rsq20);
696 rinv21 = gmx_mm_invsqrt_ps(rsq21);
697 rinv22 = gmx_mm_invsqrt_ps(rsq22);
699 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
700 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
701 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
702 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
703 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
704 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
705 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
706 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
707 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
709 fjx0 = _mm_setzero_ps();
710 fjy0 = _mm_setzero_ps();
711 fjz0 = _mm_setzero_ps();
712 fjx1 = _mm_setzero_ps();
713 fjy1 = _mm_setzero_ps();
714 fjz1 = _mm_setzero_ps();
715 fjx2 = _mm_setzero_ps();
716 fjy2 = _mm_setzero_ps();
717 fjz2 = _mm_setzero_ps();
719 /**************************
720 * CALCULATE INTERACTIONS *
721 **************************/
723 if (gmx_mm_any_lt(rsq00,rcutoff2))
726 r00 = _mm_mul_ps(rsq00,rinv00);
/* r00 is zeroed in dummy lanes before it enters the switch polynomial. */
727 r00 = _mm_andnot_ps(dummy_mask,r00);
729 /* REACTION-FIELD ELECTROSTATICS */
730 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_macc_ps(krf,rsq00,rinv00),crf));
731 felec = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
733 /* LENNARD-JONES DISPERSION/REPULSION */
735 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
736 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
737 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
738 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
739 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
741 d = _mm_sub_ps(r00,rswitch);
742 d = _mm_max_ps(d,_mm_setzero_ps());
743 d2 = _mm_mul_ps(d,d);
744 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
746 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
748 /* Evaluate switch function */
749 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
750 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
751 vvdw = _mm_mul_ps(vvdw,sw);
752 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
754 /* Update potential sum for this i atom from the interaction with this j atom. */
755 velec = _mm_and_ps(velec,cutoff_mask);
756 velec = _mm_andnot_ps(dummy_mask,velec);
757 velecsum = _mm_add_ps(velecsum,velec);
758 vvdw = _mm_and_ps(vvdw,cutoff_mask);
759 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
760 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
762 fscal = _mm_add_ps(felec,fvdw);
764 fscal = _mm_and_ps(fscal,cutoff_mask);
766 fscal = _mm_andnot_ps(dummy_mask,fscal);
768 /* Update vectorial force */
769 fix0 = _mm_macc_ps(dx00,fscal,fix0);
770 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
771 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
773 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
774 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
775 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
779 /**************************
780 * CALCULATE INTERACTIONS *
781 **************************/
783 if (gmx_mm_any_lt(rsq01,rcutoff2))
786 /* REACTION-FIELD ELECTROSTATICS */
787 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_macc_ps(krf,rsq01,rinv01),crf));
788 felec = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
790 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
792 /* Update potential sum for this i atom from the interaction with this j atom. */
793 velec = _mm_and_ps(velec,cutoff_mask);
794 velec = _mm_andnot_ps(dummy_mask,velec);
795 velecsum = _mm_add_ps(velecsum,velec);
799 fscal = _mm_and_ps(fscal,cutoff_mask);
801 fscal = _mm_andnot_ps(dummy_mask,fscal);
803 /* Update vectorial force */
804 fix0 = _mm_macc_ps(dx01,fscal,fix0);
805 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
806 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
808 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
809 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
810 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
814 /**************************
815 * CALCULATE INTERACTIONS *
816 **************************/
818 if (gmx_mm_any_lt(rsq02,rcutoff2))
821 /* REACTION-FIELD ELECTROSTATICS */
822 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_macc_ps(krf,rsq02,rinv02),crf));
823 felec = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
825 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
827 /* Update potential sum for this i atom from the interaction with this j atom. */
828 velec = _mm_and_ps(velec,cutoff_mask);
829 velec = _mm_andnot_ps(dummy_mask,velec);
830 velecsum = _mm_add_ps(velecsum,velec);
834 fscal = _mm_and_ps(fscal,cutoff_mask);
836 fscal = _mm_andnot_ps(dummy_mask,fscal);
838 /* Update vectorial force */
839 fix0 = _mm_macc_ps(dx02,fscal,fix0);
840 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
841 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
843 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
844 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
845 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
849 /**************************
850 * CALCULATE INTERACTIONS *
851 **************************/
853 if (gmx_mm_any_lt(rsq10,rcutoff2))
856 /* REACTION-FIELD ELECTROSTATICS */
857 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_macc_ps(krf,rsq10,rinv10),crf));
858 felec = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
860 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
862 /* Update potential sum for this i atom from the interaction with this j atom. */
863 velec = _mm_and_ps(velec,cutoff_mask);
864 velec = _mm_andnot_ps(dummy_mask,velec);
865 velecsum = _mm_add_ps(velecsum,velec);
869 fscal = _mm_and_ps(fscal,cutoff_mask);
871 fscal = _mm_andnot_ps(dummy_mask,fscal);
873 /* Update vectorial force */
874 fix1 = _mm_macc_ps(dx10,fscal,fix1);
875 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
876 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
878 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
879 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
880 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
884 /**************************
885 * CALCULATE INTERACTIONS *
886 **************************/
888 if (gmx_mm_any_lt(rsq11,rcutoff2))
891 /* REACTION-FIELD ELECTROSTATICS */
892 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_macc_ps(krf,rsq11,rinv11),crf));
893 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
895 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
897 /* Update potential sum for this i atom from the interaction with this j atom. */
898 velec = _mm_and_ps(velec,cutoff_mask);
899 velec = _mm_andnot_ps(dummy_mask,velec);
900 velecsum = _mm_add_ps(velecsum,velec);
904 fscal = _mm_and_ps(fscal,cutoff_mask);
906 fscal = _mm_andnot_ps(dummy_mask,fscal);
908 /* Update vectorial force */
909 fix1 = _mm_macc_ps(dx11,fscal,fix1);
910 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
911 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
913 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
914 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
915 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
919 /**************************
920 * CALCULATE INTERACTIONS *
921 **************************/
923 if (gmx_mm_any_lt(rsq12,rcutoff2))
926 /* REACTION-FIELD ELECTROSTATICS */
927 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_macc_ps(krf,rsq12,rinv12),crf));
928 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
930 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
932 /* Update potential sum for this i atom from the interaction with this j atom. */
933 velec = _mm_and_ps(velec,cutoff_mask);
934 velec = _mm_andnot_ps(dummy_mask,velec);
935 velecsum = _mm_add_ps(velecsum,velec);
939 fscal = _mm_and_ps(fscal,cutoff_mask);
941 fscal = _mm_andnot_ps(dummy_mask,fscal);
943 /* Update vectorial force */
944 fix1 = _mm_macc_ps(dx12,fscal,fix1);
945 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
946 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
948 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
949 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
950 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
954 /**************************
955 * CALCULATE INTERACTIONS *
956 **************************/
958 if (gmx_mm_any_lt(rsq20,rcutoff2))
961 /* REACTION-FIELD ELECTROSTATICS */
962 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_macc_ps(krf,rsq20,rinv20),crf));
963 felec = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
965 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
967 /* Update potential sum for this i atom from the interaction with this j atom. */
968 velec = _mm_and_ps(velec,cutoff_mask);
969 velec = _mm_andnot_ps(dummy_mask,velec);
970 velecsum = _mm_add_ps(velecsum,velec);
974 fscal = _mm_and_ps(fscal,cutoff_mask);
976 fscal = _mm_andnot_ps(dummy_mask,fscal);
978 /* Update vectorial force */
979 fix2 = _mm_macc_ps(dx20,fscal,fix2);
980 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
981 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
983 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
984 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
985 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
989 /**************************
990 * CALCULATE INTERACTIONS *
991 **************************/
993 if (gmx_mm_any_lt(rsq21,rcutoff2))
996 /* REACTION-FIELD ELECTROSTATICS */
997 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_macc_ps(krf,rsq21,rinv21),crf));
998 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
1000 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1002 /* Update potential sum for this i atom from the interaction with this j atom. */
1003 velec = _mm_and_ps(velec,cutoff_mask);
1004 velec = _mm_andnot_ps(dummy_mask,velec);
1005 velecsum = _mm_add_ps(velecsum,velec);
1009 fscal = _mm_and_ps(fscal,cutoff_mask);
1011 fscal = _mm_andnot_ps(dummy_mask,fscal);
1013 /* Update vectorial force */
1014 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1015 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1016 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1018 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1019 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1020 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1024 /**************************
1025 * CALCULATE INTERACTIONS *
1026 **************************/
1028 if (gmx_mm_any_lt(rsq22,rcutoff2))
1031 /* REACTION-FIELD ELECTROSTATICS */
1032 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_macc_ps(krf,rsq22,rinv22),crf));
1033 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
1035 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1037 /* Update potential sum for this i atom from the interaction with this j atom. */
1038 velec = _mm_and_ps(velec,cutoff_mask);
1039 velec = _mm_andnot_ps(dummy_mask,velec);
1040 velecsum = _mm_add_ps(velecsum,velec);
1044 fscal = _mm_and_ps(fscal,cutoff_mask);
1046 fscal = _mm_andnot_ps(dummy_mask,fscal);
1048 /* Update vectorial force */
1049 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1050 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1051 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1053 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1054 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1055 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
/* Dummy entries send their j-force lanes (zero, since fscal was masked) to
 * scratch rather than to f. */
1059 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1060 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1061 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1062 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1064 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1065 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1067 /* Inner loop uses 386 flops */
1070 /* End of innermost loop */
1072 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1073 f+i_coord_offset,fshift+i_shift_offset);
1076 /* Update potential energies */
1077 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1078 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1080 /* Increment number of inner iterations */
/* Counted per list entry over the whole j range of this i-water. */
1081 inneriter += j_index_end - j_index_start;
1083 /* Outer loop uses 20 flops */
1086 /* Increment number of outer iterations */
1089 /* Update outer/inner flops */
/* Every inner iteration is charged at the masked-block figure of 386 flops. */
1091 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*386);
1094 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1095 * Electrostatics interaction: ReactionField (zeroed beyond the cutoff)
1096 * VdW interaction: LennardJones (switched between rvdw_switch and the cutoff)
1097 * Geometry: Water3-Water3
1098 * Calculate force/pot: Force
/*
 * Force-only kernel for pairs of three-site waters. Reaction-field
 * electrostatics is evaluated for all nine site pairs; switched
 * Lennard-Jones is evaluated for the 0-0 site pair only. No potential
 * energy is accumulated in the visible lines of this function.
 *
 * nlist       : neighbour list; jindex, shift and iinr[0] are read in the
 *               visible lines.
 * xx, ff      : presumably the coordinate and force arrays behind x and f
 *               (their assignments are not visible in this listing).
 * fr          : supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf,
 *               ntype, nbfp, rcoulomb and rvdw_switch.
 * mdatoms     : supplies chargeA and typeA.
 * kernel_data : not referenced in the visible lines of this function.
 * nrnb        : flop counter, updated once through inc_nrnb() at the end.
 */
1101 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1102 (t_nblist * gmx_restrict nlist,
1103 rvec * gmx_restrict xx,
1104 rvec * gmx_restrict ff,
1105 t_forcerec * gmx_restrict fr,
1106 t_mdatoms * gmx_restrict mdatoms,
1107 nb_kernel_data_t * gmx_restrict kernel_data,
1108 t_nrnb * gmx_restrict nrnb)
1110 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1111 * just 0 for non-waters.
1112 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1113 * jnr indices corresponding to data put in the four positions in the SIMD register.
1115 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1116 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1117 int jnrA,jnrB,jnrC,jnrD;
1118 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1119 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1120 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1121 real rcutoff_scalar;
1122 real *shiftvec,*fshift,*x,*f;
1123 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1124 real scratch[4*DIM];
1125 __m128 fscal,rcutoff,rcutoff2,jidxall;
1127 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1129 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1131 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1132 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1133 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1134 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1135 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1136 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1137 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1138 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1139 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1140 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1141 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1142 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1143 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1144 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1145 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1146 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1147 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1150 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1153 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1154 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1155 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1156 real rswitch_scalar,d_scalar;
1157 __m128 dummy_mask,cutoff_mask;
1158 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1159 __m128 one = _mm_set1_ps(1.0);
1160 __m128 two = _mm_set1_ps(2.0);
1166 jindex = nlist->jindex;
1168 shiftidx = nlist->shift;
1170 shiftvec = fr->shift_vec[0];
1171 fshift = fr->fshift[0];
1172 facel = _mm_set1_ps(fr->epsfac);
1173 charge = mdatoms->chargeA;
1174 krf = _mm_set1_ps(fr->ic->k_rf);
1175 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1176 crf = _mm_set1_ps(fr->ic->c_rf);
1177 nvdwtype = fr->ntype;
1178 vdwparam = fr->nbfp;
1179 vdwtype = mdatoms->typeA;
1181 /* Setup water-specific parameters */
1182 inr = nlist->iinr[0];
1183 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1184 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1185 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1186 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* The j-side charges and LJ type are also read from the first i molecule
 * (inr = nlist->iinr[0]), so the qq and c6/c12 constants are computed once
 * here rather than per pair. This presumably relies on every molecule in
 * the list sharing the same water parameters - TODO confirm.
 */
1188 jq0 = _mm_set1_ps(charge[inr+0]);
1189 jq1 = _mm_set1_ps(charge[inr+1]);
1190 jq2 = _mm_set1_ps(charge[inr+2]);
1191 vdwjidx0A = 2*vdwtype[inr+0];
1192 qq00 = _mm_mul_ps(iq0,jq0);
1193 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1194 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1195 qq01 = _mm_mul_ps(iq0,jq1);
1196 qq02 = _mm_mul_ps(iq0,jq2);
1197 qq10 = _mm_mul_ps(iq1,jq0);
1198 qq11 = _mm_mul_ps(iq1,jq1);
1199 qq12 = _mm_mul_ps(iq1,jq2);
1200 qq20 = _mm_mul_ps(iq2,jq0);
1201 qq21 = _mm_mul_ps(iq2,jq1);
1202 qq22 = _mm_mul_ps(iq2,jq2);
1204 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1205 rcutoff_scalar = fr->rcoulomb;
1206 rcutoff = _mm_set1_ps(rcutoff_scalar);
1207 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1209 rswitch_scalar = fr->rvdw_switch;
1210 rswitch = _mm_set1_ps(rswitch_scalar);
1211 /* Setup switch parameters */
1212 d_scalar = rcutoff_scalar-rswitch_scalar;
1213 d = _mm_set1_ps(d_scalar);
/* Coefficients of the switch polynomial, evaluated in the loops as
 * sw = 1 + d^3*(swV3 + d*swV4 + d^2*swV5), and of its derivative
 * dsw = d^2*(swF2 + d*swF3 + d^2*swF4). d_scalar is the width of the
 * switching region (cutoff minus rvdw_switch); the vector d set above is
 * overwritten per pair inside the loops.
 */
1214 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1215 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1216 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1217 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1218 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1219 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1221 /* Avoid stupid compiler warnings */
1222 jnrA = jnrB = jnrC = jnrD = 0;
1223 j_coord_offsetA = 0;
1224 j_coord_offsetB = 0;
1225 j_coord_offsetC = 0;
1226 j_coord_offsetD = 0;
/* Zero the scratch buffer; in the epilogue it is the store target for the
 * j forces of padding entries.
 */
1231 for(iidx=0;iidx<4*DIM;iidx++)
1233 scratch[iidx] = 0.0;
1236 /* Start outer loop over neighborlists */
1237 for(iidx=0; iidx<nri; iidx++)
1239 /* Load shift vector for this list */
1240 i_shift_offset = DIM*shiftidx[iidx];
1242 /* Load limits for loop over neighbors */
1243 j_index_start = jindex[iidx];
1244 j_index_end = jindex[iidx+1];
1246 /* Get outer coordinate index */
1248 i_coord_offset = DIM*inr;
1250 /* Load i particle coords and add shift vector */
1251 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1252 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1254 fix0 = _mm_setzero_ps();
1255 fiy0 = _mm_setzero_ps();
1256 fiz0 = _mm_setzero_ps();
1257 fix1 = _mm_setzero_ps();
1258 fiy1 = _mm_setzero_ps();
1259 fiz1 = _mm_setzero_ps();
1260 fix2 = _mm_setzero_ps();
1261 fiy2 = _mm_setzero_ps();
1262 fiz2 = _mm_setzero_ps();
1264 /* Start inner kernel loop */
/* Main loop: four j waters per iteration, entered only while the fourth
 * list entry of the group is non-negative. A remaining group that contains
 * negative (padding) entries is handled by the masked code after this loop.
 */
1265 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1268 /* Get j neighbor index, and coordinate index */
1270 jnrB = jjnr[jidx+1];
1271 jnrC = jjnr[jidx+2];
1272 jnrD = jjnr[jidx+3];
1273 j_coord_offsetA = DIM*jnrA;
1274 j_coord_offsetB = DIM*jnrB;
1275 j_coord_offsetC = DIM*jnrC;
1276 j_coord_offsetD = DIM*jnrD;
1278 /* load j atom coordinates */
1279 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1280 x+j_coord_offsetC,x+j_coord_offsetD,
1281 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1283 /* Calculate displacement vector */
1284 dx00 = _mm_sub_ps(ix0,jx0);
1285 dy00 = _mm_sub_ps(iy0,jy0);
1286 dz00 = _mm_sub_ps(iz0,jz0);
1287 dx01 = _mm_sub_ps(ix0,jx1);
1288 dy01 = _mm_sub_ps(iy0,jy1);
1289 dz01 = _mm_sub_ps(iz0,jz1);
1290 dx02 = _mm_sub_ps(ix0,jx2);
1291 dy02 = _mm_sub_ps(iy0,jy2);
1292 dz02 = _mm_sub_ps(iz0,jz2);
1293 dx10 = _mm_sub_ps(ix1,jx0);
1294 dy10 = _mm_sub_ps(iy1,jy0);
1295 dz10 = _mm_sub_ps(iz1,jz0);
1296 dx11 = _mm_sub_ps(ix1,jx1);
1297 dy11 = _mm_sub_ps(iy1,jy1);
1298 dz11 = _mm_sub_ps(iz1,jz1);
1299 dx12 = _mm_sub_ps(ix1,jx2);
1300 dy12 = _mm_sub_ps(iy1,jy2);
1301 dz12 = _mm_sub_ps(iz1,jz2);
1302 dx20 = _mm_sub_ps(ix2,jx0);
1303 dy20 = _mm_sub_ps(iy2,jy0);
1304 dz20 = _mm_sub_ps(iz2,jz0);
1305 dx21 = _mm_sub_ps(ix2,jx1);
1306 dy21 = _mm_sub_ps(iy2,jy1);
1307 dz21 = _mm_sub_ps(iz2,jz1);
1308 dx22 = _mm_sub_ps(ix2,jx2);
1309 dy22 = _mm_sub_ps(iy2,jy2);
1310 dz22 = _mm_sub_ps(iz2,jz2);
1312 /* Calculate squared distance and things based on it */
1313 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1314 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1315 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1316 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1317 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1318 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1319 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1320 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1321 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1323 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1324 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1325 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1326 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1327 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1328 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1329 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1330 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1331 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1333 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1334 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1335 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1336 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1337 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1338 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1339 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1340 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1341 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
/* Per-iteration accumulators for the three j sites; they are handed to the
 * decrement helper once, at the end of the iteration.
 */
1343 fjx0 = _mm_setzero_ps();
1344 fjy0 = _mm_setzero_ps();
1345 fjz0 = _mm_setzero_ps();
1346 fjx1 = _mm_setzero_ps();
1347 fjy1 = _mm_setzero_ps();
1348 fjz1 = _mm_setzero_ps();
1349 fjx2 = _mm_setzero_ps();
1350 fjy2 = _mm_setzero_ps();
1351 fjz2 = _mm_setzero_ps();
1353 /**************************
1354 * CALCULATE INTERACTIONS *
1355 **************************/
/* Site pair 0-0 is the only pair with Lennard-Jones in addition to the
 * reaction-field term. gmx_mm_any_lt presumably reports whether any lane
 * is inside the cutoff (TODO confirm); lanes outside it are zeroed with
 * cutoff_mask before the force update.
 */
1357 if (gmx_mm_any_lt(rsq00,rcutoff2))
1360 r00 = _mm_mul_ps(rsq00,rinv00);
1362 /* REACTION-FIELD ELECTROSTATICS */
1363 felec = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
1365 /* LENNARD-JONES DISPERSION/REPULSION */
1367 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1368 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1369 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1370 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1371 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1373 d = _mm_sub_ps(r00,rswitch);
/* Clamp d at zero so that sw = 1 and dsw = 0 for r below rswitch. */
1374 d = _mm_max_ps(d,_mm_setzero_ps());
1375 d2 = _mm_mul_ps(d,d);
1376 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1378 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1380 /* Evaluate switch function */
1381 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1382 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1383 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1385 fscal = _mm_add_ps(felec,fvdw);
1387 fscal = _mm_and_ps(fscal,cutoff_mask);
1389 /* Update vectorial force */
1390 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1391 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1392 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1394 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1395 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1396 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1400 /**************************
1401 * CALCULATE INTERACTIONS *
1402 **************************/
1404 if (gmx_mm_any_lt(rsq01,rcutoff2))
1407 /* REACTION-FIELD ELECTROSTATICS */
1408 felec = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
1410 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1414 fscal = _mm_and_ps(fscal,cutoff_mask);
1416 /* Update vectorial force */
1417 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1418 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1419 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1421 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1422 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1423 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1427 /**************************
1428 * CALCULATE INTERACTIONS *
1429 **************************/
1431 if (gmx_mm_any_lt(rsq02,rcutoff2))
1434 /* REACTION-FIELD ELECTROSTATICS */
1435 felec = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
1437 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1441 fscal = _mm_and_ps(fscal,cutoff_mask);
1443 /* Update vectorial force */
1444 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1445 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1446 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1448 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1449 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1450 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1454 /**************************
1455 * CALCULATE INTERACTIONS *
1456 **************************/
1458 if (gmx_mm_any_lt(rsq10,rcutoff2))
1461 /* REACTION-FIELD ELECTROSTATICS */
1462 felec = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
1464 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1468 fscal = _mm_and_ps(fscal,cutoff_mask);
1470 /* Update vectorial force */
1471 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1472 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1473 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1475 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1476 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1477 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1481 /**************************
1482 * CALCULATE INTERACTIONS *
1483 **************************/
1485 if (gmx_mm_any_lt(rsq11,rcutoff2))
1488 /* REACTION-FIELD ELECTROSTATICS */
1489 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
1491 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1495 fscal = _mm_and_ps(fscal,cutoff_mask);
1497 /* Update vectorial force */
1498 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1499 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1500 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1502 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1503 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1504 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1508 /**************************
1509 * CALCULATE INTERACTIONS *
1510 **************************/
1512 if (gmx_mm_any_lt(rsq12,rcutoff2))
1515 /* REACTION-FIELD ELECTROSTATICS */
1516 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
1518 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1522 fscal = _mm_and_ps(fscal,cutoff_mask);
1524 /* Update vectorial force */
1525 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1526 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1527 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1529 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1530 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1531 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1535 /**************************
1536 * CALCULATE INTERACTIONS *
1537 **************************/
1539 if (gmx_mm_any_lt(rsq20,rcutoff2))
1542 /* REACTION-FIELD ELECTROSTATICS */
1543 felec = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
1545 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1549 fscal = _mm_and_ps(fscal,cutoff_mask);
1551 /* Update vectorial force */
1552 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1553 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1554 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1556 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1557 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1558 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1562 /**************************
1563 * CALCULATE INTERACTIONS *
1564 **************************/
1566 if (gmx_mm_any_lt(rsq21,rcutoff2))
1569 /* REACTION-FIELD ELECTROSTATICS */
1570 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
1572 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1576 fscal = _mm_and_ps(fscal,cutoff_mask);
1578 /* Update vectorial force */
1579 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1580 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1581 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1583 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1584 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1585 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1589 /**************************
1590 * CALCULATE INTERACTIONS *
1591 **************************/
1593 if (gmx_mm_any_lt(rsq22,rcutoff2))
1596 /* REACTION-FIELD ELECTROSTATICS */
1597 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
1599 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1603 fscal = _mm_and_ps(fscal,cutoff_mask);
1605 /* Update vectorial force */
1606 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1607 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1608 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1610 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1611 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1612 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
/* Unlike the epilogue, the main loop stores the j forces directly into f
 * without the scratch redirection; its loop condition only admits groups
 * whose fourth entry is non-negative.
 */
1616 fjptrA = f+j_coord_offsetA;
1617 fjptrB = f+j_coord_offsetB;
1618 fjptrC = f+j_coord_offsetC;
1619 fjptrD = f+j_coord_offsetD;
1621 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1622 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1624 /* Inner loop uses 328 flops */
/* Epilogue for a final group of up to four list entries. Negative entries
 * are padding: they are clamped to index 0 for the coordinate load, their
 * fscal lanes are cleared with dummy_mask, and their j-force stores are
 * redirected to scratch.
 */
1627 if(jidx<j_index_end)
1630 /* Get j neighbor index, and coordinate index */
1631 jnrlistA = jjnr[jidx];
1632 jnrlistB = jjnr[jidx+1];
1633 jnrlistC = jjnr[jidx+2];
1634 jnrlistD = jjnr[jidx+3];
1635 /* Sign of each element will be negative for non-real atoms.
1636 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1637 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1639 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1640 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1641 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1642 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1643 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1644 j_coord_offsetA = DIM*jnrA;
1645 j_coord_offsetB = DIM*jnrB;
1646 j_coord_offsetC = DIM*jnrC;
1647 j_coord_offsetD = DIM*jnrD;
1649 /* load j atom coordinates */
1650 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1651 x+j_coord_offsetC,x+j_coord_offsetD,
1652 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1654 /* Calculate displacement vector */
1655 dx00 = _mm_sub_ps(ix0,jx0);
1656 dy00 = _mm_sub_ps(iy0,jy0);
1657 dz00 = _mm_sub_ps(iz0,jz0);
1658 dx01 = _mm_sub_ps(ix0,jx1);
1659 dy01 = _mm_sub_ps(iy0,jy1);
1660 dz01 = _mm_sub_ps(iz0,jz1);
1661 dx02 = _mm_sub_ps(ix0,jx2);
1662 dy02 = _mm_sub_ps(iy0,jy2);
1663 dz02 = _mm_sub_ps(iz0,jz2);
1664 dx10 = _mm_sub_ps(ix1,jx0);
1665 dy10 = _mm_sub_ps(iy1,jy0);
1666 dz10 = _mm_sub_ps(iz1,jz0);
1667 dx11 = _mm_sub_ps(ix1,jx1);
1668 dy11 = _mm_sub_ps(iy1,jy1);
1669 dz11 = _mm_sub_ps(iz1,jz1);
1670 dx12 = _mm_sub_ps(ix1,jx2);
1671 dy12 = _mm_sub_ps(iy1,jy2);
1672 dz12 = _mm_sub_ps(iz1,jz2);
1673 dx20 = _mm_sub_ps(ix2,jx0);
1674 dy20 = _mm_sub_ps(iy2,jy0);
1675 dz20 = _mm_sub_ps(iz2,jz0);
1676 dx21 = _mm_sub_ps(ix2,jx1);
1677 dy21 = _mm_sub_ps(iy2,jy1);
1678 dz21 = _mm_sub_ps(iz2,jz1);
1679 dx22 = _mm_sub_ps(ix2,jx2);
1680 dy22 = _mm_sub_ps(iy2,jy2);
1681 dz22 = _mm_sub_ps(iz2,jz2);
1683 /* Calculate squared distance and things based on it */
1684 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1685 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1686 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1687 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1688 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1689 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1690 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1691 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1692 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1694 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1695 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1696 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1697 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1698 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1699 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1700 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1701 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1702 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1704 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1705 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1706 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1707 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1708 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1709 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1710 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1711 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1712 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1714 fjx0 = _mm_setzero_ps();
1715 fjy0 = _mm_setzero_ps();
1716 fjz0 = _mm_setzero_ps();
1717 fjx1 = _mm_setzero_ps();
1718 fjy1 = _mm_setzero_ps();
1719 fjz1 = _mm_setzero_ps();
1720 fjx2 = _mm_setzero_ps();
1721 fjy2 = _mm_setzero_ps();
1722 fjz2 = _mm_setzero_ps();
1724 /**************************
1725 * CALCULATE INTERACTIONS *
1726 **************************/
1728 if (gmx_mm_any_lt(rsq00,rcutoff2))
1731 r00 = _mm_mul_ps(rsq00,rinv00);
/* Zero r00 in padding lanes before it enters the switch evaluation. */
1732 r00 = _mm_andnot_ps(dummy_mask,r00);
1734 /* REACTION-FIELD ELECTROSTATICS */
1735 felec = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
1737 /* LENNARD-JONES DISPERSION/REPULSION */
1739 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1740 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1741 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1742 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1743 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1745 d = _mm_sub_ps(r00,rswitch);
1746 d = _mm_max_ps(d,_mm_setzero_ps());
1747 d2 = _mm_mul_ps(d,d);
1748 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1750 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1752 /* Evaluate switch function */
1753 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1754 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1755 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1757 fscal = _mm_add_ps(felec,fvdw);
1759 fscal = _mm_and_ps(fscal,cutoff_mask);
1761 fscal = _mm_andnot_ps(dummy_mask,fscal);
1763 /* Update vectorial force */
1764 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1765 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1766 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1768 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1769 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1770 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1774 /**************************
1775 * CALCULATE INTERACTIONS *
1776 **************************/
1778 if (gmx_mm_any_lt(rsq01,rcutoff2))
1781 /* REACTION-FIELD ELECTROSTATICS */
1782 felec = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
1784 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1788 fscal = _mm_and_ps(fscal,cutoff_mask);
1790 fscal = _mm_andnot_ps(dummy_mask,fscal);
1792 /* Update vectorial force */
1793 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1794 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1795 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1797 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1798 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1799 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1803 /**************************
1804 * CALCULATE INTERACTIONS *
1805 **************************/
1807 if (gmx_mm_any_lt(rsq02,rcutoff2))
1810 /* REACTION-FIELD ELECTROSTATICS */
1811 felec = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
1813 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1817 fscal = _mm_and_ps(fscal,cutoff_mask);
1819 fscal = _mm_andnot_ps(dummy_mask,fscal);
1821 /* Update vectorial force */
1822 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1823 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1824 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1826 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1827 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1828 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1832 /**************************
1833 * CALCULATE INTERACTIONS *
1834 **************************/
1836 if (gmx_mm_any_lt(rsq10,rcutoff2))
1839 /* REACTION-FIELD ELECTROSTATICS */
1840 felec = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
1842 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1846 fscal = _mm_and_ps(fscal,cutoff_mask);
1848 fscal = _mm_andnot_ps(dummy_mask,fscal);
1850 /* Update vectorial force */
1851 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1852 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1853 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1855 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1856 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1857 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1861 /**************************
1862 * CALCULATE INTERACTIONS *
1863 **************************/
1865 if (gmx_mm_any_lt(rsq11,rcutoff2))
1868 /* REACTION-FIELD ELECTROSTATICS */
1869 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
1871 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1875 fscal = _mm_and_ps(fscal,cutoff_mask);
1877 fscal = _mm_andnot_ps(dummy_mask,fscal);
1879 /* Update vectorial force */
1880 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1881 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1882 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1884 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1885 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1886 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1890 /**************************
1891 * CALCULATE INTERACTIONS *
1892 **************************/
1894 if (gmx_mm_any_lt(rsq12,rcutoff2))
1897 /* REACTION-FIELD ELECTROSTATICS */
1898 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
1900 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1904 fscal = _mm_and_ps(fscal,cutoff_mask);
1906 fscal = _mm_andnot_ps(dummy_mask,fscal);
1908 /* Update vectorial force */
1909 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1910 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1911 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1913 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1914 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1915 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1919 /**************************
1920 * CALCULATE INTERACTIONS *
1921 **************************/
1923 if (gmx_mm_any_lt(rsq20,rcutoff2))
1926 /* REACTION-FIELD ELECTROSTATICS */
1927 felec = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
1929 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1933 fscal = _mm_and_ps(fscal,cutoff_mask);
1935 fscal = _mm_andnot_ps(dummy_mask,fscal);
1937 /* Update vectorial force */
1938 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1939 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1940 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1942 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1943 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1944 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1948 /**************************
1949 * CALCULATE INTERACTIONS *
1950 **************************/
1952 if (gmx_mm_any_lt(rsq21,rcutoff2))
1955 /* REACTION-FIELD ELECTROSTATICS */
1956 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
1958 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1962 fscal = _mm_and_ps(fscal,cutoff_mask);
1964 fscal = _mm_andnot_ps(dummy_mask,fscal);
1966 /* Update vectorial force */
1967 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1968 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1969 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1971 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1972 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1973 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1977 /**************************
1978 * CALCULATE INTERACTIONS *
1979 **************************/
1981 if (gmx_mm_any_lt(rsq22,rcutoff2))
1984 /* REACTION-FIELD ELECTROSTATICS */
1985 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
1987 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1991 fscal = _mm_and_ps(fscal,cutoff_mask);
1993 fscal = _mm_andnot_ps(dummy_mask,fscal);
1995 /* Update vectorial force */
1996 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1997 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1998 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2000 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2001 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2002 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2006 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2007 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2008 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2009 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2011 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2012 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2014 /* Inner loop uses 329 flops */
2017 /* End of innermost loop */
/* Hand the accumulated i-site forces to the helper together with the force
 * and shift-force destinations; presumably it reduces the four lanes and
 * adds the result to both - TODO confirm against the helper.
 */
2019 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2020 f+i_coord_offset,fshift+i_shift_offset);
2022 /* Increment number of inner iterations */
2023 inneriter += j_index_end - j_index_start;
2025 /* Outer loop uses 18 flops */
2028 /* Increment number of outer iterations */
2031 /* Update outer/inner flops */
/* inneriter counts list slots (j_index_end - j_index_start), and every slot
 * is charged the 329-flop figure of the masked epilogue rather than the
 * 328 flops quoted for the main loop.
 */
2033 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*329);