2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
/*
 * Water-water (three sites per molecule) nonbonded kernel that accumulates both
 * forces and potential energies, using __m128 registers with four j waters per
 * inner iteration. All nine site pairs get Ewald electrostatics through the
 * analytical PME correction (gmx_mm_pmecorrF_ps / gmx_mm_pmecorrV_ps); only the
 * 0-0 site pair also gets Lennard-Jones.
 *
 * nlist       : neighbor list; jindex, shift and iinr[0] are read from it below.
 * xx, ff      : not referenced in the statements shown here - presumably the
 *               sources of the x and f arrays (TODO confirm).
 * fr          : supplies shift_vec, fshift, epsfac, ntype and the fr->ic Ewald
 *               parameters (sh_ewald, ewaldcoeff, tabq_coul_FDV0, tabq_scale).
 * mdatoms     : supplies the charge array (chargeA) and the type array (typeA).
 * kernel_data : energygrp_elec+ggid and energygrp_vdw+ggid are handed to
 *               gmx_mm_update_1pot_ps together with the per-list potential sums.
 * nrnb        : passed to inc_nrnb at the end with the outer/inner flop estimate.
 *
 * NOTE(review): several names used below (e.g. x, f, nri, jjnr, scratch, charge,
 * vdwtype, vdwparam, ewtab) have no visible declaration or assignment in this
 * listing; check them against the complete generated file.
 */
44 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
99 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
100 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
102 __m128 dummy_mask,cutoff_mask;
103 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
104 __m128 one = _mm_set1_ps(1.0);
105 __m128 two = _mm_set1_ps(2.0);
111 jindex = nlist->jindex;
113 shiftidx = nlist->shift;
115 shiftvec = fr->shift_vec[0];
116 fshift = fr->fshift[0];
117 facel = _mm_set1_ps(fr->epsfac);
118 charge = mdatoms->chargeA;
119 nvdwtype = fr->ntype;
121 vdwtype = mdatoms->typeA;
123 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
124 beta = _mm_set1_ps(fr->ic->ewaldcoeff);
125 beta2 = _mm_mul_ps(beta,beta);
126 beta3 = _mm_mul_ps(beta,beta2);
127 ewtab = fr->ic->tabq_coul_FDV0;
128 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
129 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
134 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
135 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
136 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* The j-site charges are read from the same atoms (inr+0..2) as the i-site charges above */
138 jq0 = _mm_set1_ps(charge[inr+0]);
139 jq1 = _mm_set1_ps(charge[inr+1]);
140 jq2 = _mm_set1_ps(charge[inr+2]);
141 vdwjidx0A = 2*vdwtype[inr+0];
/* Charge products qqXY and the 0-0 LJ parameters are set once here, ahead of the outer loop */
142 qq00 = _mm_mul_ps(iq0,jq0);
143 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
144 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
145 qq01 = _mm_mul_ps(iq0,jq1);
146 qq02 = _mm_mul_ps(iq0,jq2);
147 qq10 = _mm_mul_ps(iq1,jq0);
148 qq11 = _mm_mul_ps(iq1,jq1);
149 qq12 = _mm_mul_ps(iq1,jq2);
150 qq20 = _mm_mul_ps(iq2,jq0);
151 qq21 = _mm_mul_ps(iq2,jq1);
152 qq22 = _mm_mul_ps(iq2,jq2);
154 /* Avoid stupid compiler warnings */
155 jnrA = jnrB = jnrC = jnrD = 0;
164 for(iidx=0;iidx<4*DIM;iidx++)
169 /* Start outer loop over neighborlists */
170 for(iidx=0; iidx<nri; iidx++)
172 /* Load shift vector for this list */
173 i_shift_offset = DIM*shiftidx[iidx];
175 /* Load limits for loop over neighbors */
176 j_index_start = jindex[iidx];
177 j_index_end = jindex[iidx+1];
179 /* Get outer coordinate index */
181 i_coord_offset = DIM*inr;
183 /* Load i particle coords and add shift vector */
184 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
185 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Per-list i-force accumulators, one x/y/z triple for each of the three i sites */
187 fix0 = _mm_setzero_ps();
188 fiy0 = _mm_setzero_ps();
189 fiz0 = _mm_setzero_ps();
190 fix1 = _mm_setzero_ps();
191 fiy1 = _mm_setzero_ps();
192 fiz1 = _mm_setzero_ps();
193 fix2 = _mm_setzero_ps();
194 fiy2 = _mm_setzero_ps();
195 fiz2 = _mm_setzero_ps();
197 /* Reset potential sums */
198 velecsum = _mm_setzero_ps();
199 vvdwsum = _mm_setzero_ps();
201 /* Start inner kernel loop */
/* Unmasked loop: continues while the fourth entry of the current group of four is non-negative */
202 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
205 /* Get j neighbor index, and coordinate index */
210 j_coord_offsetA = DIM*jnrA;
211 j_coord_offsetB = DIM*jnrB;
212 j_coord_offsetC = DIM*jnrC;
213 j_coord_offsetD = DIM*jnrD;
215 /* load j atom coordinates */
216 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
217 x+j_coord_offsetC,x+j_coord_offsetD,
218 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
220 /* Calculate displacement vector */
221 dx00 = _mm_sub_ps(ix0,jx0);
222 dy00 = _mm_sub_ps(iy0,jy0);
223 dz00 = _mm_sub_ps(iz0,jz0);
224 dx01 = _mm_sub_ps(ix0,jx1);
225 dy01 = _mm_sub_ps(iy0,jy1);
226 dz01 = _mm_sub_ps(iz0,jz1);
227 dx02 = _mm_sub_ps(ix0,jx2);
228 dy02 = _mm_sub_ps(iy0,jy2);
229 dz02 = _mm_sub_ps(iz0,jz2);
230 dx10 = _mm_sub_ps(ix1,jx0);
231 dy10 = _mm_sub_ps(iy1,jy0);
232 dz10 = _mm_sub_ps(iz1,jz0);
233 dx11 = _mm_sub_ps(ix1,jx1);
234 dy11 = _mm_sub_ps(iy1,jy1);
235 dz11 = _mm_sub_ps(iz1,jz1);
236 dx12 = _mm_sub_ps(ix1,jx2);
237 dy12 = _mm_sub_ps(iy1,jy2);
238 dz12 = _mm_sub_ps(iz1,jz2);
239 dx20 = _mm_sub_ps(ix2,jx0);
240 dy20 = _mm_sub_ps(iy2,jy0);
241 dz20 = _mm_sub_ps(iz2,jz0);
242 dx21 = _mm_sub_ps(ix2,jx1);
243 dy21 = _mm_sub_ps(iy2,jy1);
244 dz21 = _mm_sub_ps(iz2,jz1);
245 dx22 = _mm_sub_ps(ix2,jx2);
246 dy22 = _mm_sub_ps(iy2,jy2);
247 dz22 = _mm_sub_ps(iz2,jz2);
249 /* Calculate squared distance and things based on it */
250 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
251 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
252 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
253 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
254 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
255 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
256 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
257 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
258 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
260 rinv00 = gmx_mm_invsqrt_ps(rsq00);
261 rinv01 = gmx_mm_invsqrt_ps(rsq01);
262 rinv02 = gmx_mm_invsqrt_ps(rsq02);
263 rinv10 = gmx_mm_invsqrt_ps(rsq10);
264 rinv11 = gmx_mm_invsqrt_ps(rsq11);
265 rinv12 = gmx_mm_invsqrt_ps(rsq12);
266 rinv20 = gmx_mm_invsqrt_ps(rsq20);
267 rinv21 = gmx_mm_invsqrt_ps(rsq21);
268 rinv22 = gmx_mm_invsqrt_ps(rsq22);
270 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
271 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
272 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
273 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
274 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
275 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
276 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
277 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
278 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
/* j-force accumulators are cleared for every group of four j waters */
280 fjx0 = _mm_setzero_ps();
281 fjy0 = _mm_setzero_ps();
282 fjz0 = _mm_setzero_ps();
283 fjx1 = _mm_setzero_ps();
284 fjy1 = _mm_setzero_ps();
285 fjz1 = _mm_setzero_ps();
286 fjx2 = _mm_setzero_ps();
287 fjy2 = _mm_setzero_ps();
288 fjz2 = _mm_setzero_ps();
290 /**************************
291 * CALCULATE INTERACTIONS *
292 **************************/
/* Site pair i0-j0: Ewald electrostatics plus Lennard-Jones (the only pair with LJ) */
294 r00 = _mm_mul_ps(rsq00,rinv00);
296 /* EWALD ELECTROSTATICS */
298 /* Analytical PME correction */
299 zeta2 = _mm_mul_ps(beta2,rsq00);
300 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
301 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
/* felec = qq00*(pmecorrF*beta3 + rinv3), assuming _mm_macc_ps(a,b,c) is a*b+c */
302 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
303 felec = _mm_mul_ps(qq00,felec);
304 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
/* velec = qq00*(rinv00 - pmecorrV*beta), assuming _mm_nmacc_ps(a,b,c) is c-a*b */
305 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
306 velec = _mm_mul_ps(qq00,velec);
308 /* LENNARD-JONES DISPERSION/REPULSION */
310 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
311 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
312 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
/* vvdw = vvdw12/12 - vvdw6/6, assuming _mm_msub_ps(a,b,c) is a*b-c */
313 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
314 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
316 /* Update potential sum for this i atom from the interaction with this j atom. */
317 velecsum = _mm_add_ps(velecsum,velec);
318 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
320 fscal = _mm_add_ps(felec,fvdw);
322 /* Update vectorial force */
323 fix0 = _mm_macc_ps(dx00,fscal,fix0);
324 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
325 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
327 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
328 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
329 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
331 /**************************
332 * CALCULATE INTERACTIONS *
333 **************************/
/* Site pair i0-j1: electrostatics only */
335 r01 = _mm_mul_ps(rsq01,rinv01);
337 /* EWALD ELECTROSTATICS */
339 /* Analytical PME correction */
340 zeta2 = _mm_mul_ps(beta2,rsq01);
341 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
342 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
343 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
344 felec = _mm_mul_ps(qq01,felec);
345 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
346 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
347 velec = _mm_mul_ps(qq01,velec);
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 velecsum = _mm_add_ps(velecsum,velec);
/* NOTE(review): no assignment to fscal is visible in this pair block, so as listed the
 * update below would reuse the fscal from pair 0-0 rather than the felec just computed;
 * the same holds for the later pair blocks - verify against the complete file. */
354 /* Update vectorial force */
355 fix0 = _mm_macc_ps(dx01,fscal,fix0);
356 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
357 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
359 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
360 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
361 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
363 /**************************
364 * CALCULATE INTERACTIONS *
365 **************************/
/* Site pair i0-j2: electrostatics only */
367 r02 = _mm_mul_ps(rsq02,rinv02);
369 /* EWALD ELECTROSTATICS */
371 /* Analytical PME correction */
372 zeta2 = _mm_mul_ps(beta2,rsq02);
373 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
374 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
375 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
376 felec = _mm_mul_ps(qq02,felec);
377 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
378 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
379 velec = _mm_mul_ps(qq02,velec);
381 /* Update potential sum for this i atom from the interaction with this j atom. */
382 velecsum = _mm_add_ps(velecsum,velec);
386 /* Update vectorial force */
387 fix0 = _mm_macc_ps(dx02,fscal,fix0);
388 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
389 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
391 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
392 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
393 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
395 /**************************
396 * CALCULATE INTERACTIONS *
397 **************************/
/* Site pair i1-j0: electrostatics only */
399 r10 = _mm_mul_ps(rsq10,rinv10);
401 /* EWALD ELECTROSTATICS */
403 /* Analytical PME correction */
404 zeta2 = _mm_mul_ps(beta2,rsq10);
405 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
406 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
407 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
408 felec = _mm_mul_ps(qq10,felec);
409 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
410 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
411 velec = _mm_mul_ps(qq10,velec);
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velecsum = _mm_add_ps(velecsum,velec);
418 /* Update vectorial force */
419 fix1 = _mm_macc_ps(dx10,fscal,fix1);
420 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
421 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
423 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
424 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
425 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
427 /**************************
428 * CALCULATE INTERACTIONS *
429 **************************/
/* Site pair i1-j1: electrostatics only */
431 r11 = _mm_mul_ps(rsq11,rinv11);
433 /* EWALD ELECTROSTATICS */
435 /* Analytical PME correction */
436 zeta2 = _mm_mul_ps(beta2,rsq11);
437 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
438 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
439 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
440 felec = _mm_mul_ps(qq11,felec);
441 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
442 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
443 velec = _mm_mul_ps(qq11,velec);
445 /* Update potential sum for this i atom from the interaction with this j atom. */
446 velecsum = _mm_add_ps(velecsum,velec);
450 /* Update vectorial force */
451 fix1 = _mm_macc_ps(dx11,fscal,fix1);
452 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
453 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
455 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
456 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
457 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
459 /**************************
460 * CALCULATE INTERACTIONS *
461 **************************/
/* Site pair i1-j2: electrostatics only */
463 r12 = _mm_mul_ps(rsq12,rinv12);
465 /* EWALD ELECTROSTATICS */
467 /* Analytical PME correction */
468 zeta2 = _mm_mul_ps(beta2,rsq12);
469 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
470 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
471 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
472 felec = _mm_mul_ps(qq12,felec);
473 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
474 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
475 velec = _mm_mul_ps(qq12,velec);
477 /* Update potential sum for this i atom from the interaction with this j atom. */
478 velecsum = _mm_add_ps(velecsum,velec);
482 /* Update vectorial force */
483 fix1 = _mm_macc_ps(dx12,fscal,fix1);
484 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
485 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
487 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
488 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
489 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
/* Site pair i2-j0: electrostatics only */
495 r20 = _mm_mul_ps(rsq20,rinv20);
497 /* EWALD ELECTROSTATICS */
499 /* Analytical PME correction */
500 zeta2 = _mm_mul_ps(beta2,rsq20);
501 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
502 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
503 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
504 felec = _mm_mul_ps(qq20,felec);
505 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
506 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
507 velec = _mm_mul_ps(qq20,velec);
509 /* Update potential sum for this i atom from the interaction with this j atom. */
510 velecsum = _mm_add_ps(velecsum,velec);
514 /* Update vectorial force */
515 fix2 = _mm_macc_ps(dx20,fscal,fix2);
516 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
517 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
519 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
520 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
521 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
523 /**************************
524 * CALCULATE INTERACTIONS *
525 **************************/
/* Site pair i2-j1: electrostatics only */
527 r21 = _mm_mul_ps(rsq21,rinv21);
529 /* EWALD ELECTROSTATICS */
531 /* Analytical PME correction */
532 zeta2 = _mm_mul_ps(beta2,rsq21);
533 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
534 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
535 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
536 felec = _mm_mul_ps(qq21,felec);
537 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
538 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
539 velec = _mm_mul_ps(qq21,velec);
541 /* Update potential sum for this i atom from the interaction with this j atom. */
542 velecsum = _mm_add_ps(velecsum,velec);
546 /* Update vectorial force */
547 fix2 = _mm_macc_ps(dx21,fscal,fix2);
548 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
549 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
551 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
552 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
553 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
/* Site pair i2-j2: electrostatics only */
559 r22 = _mm_mul_ps(rsq22,rinv22);
561 /* EWALD ELECTROSTATICS */
563 /* Analytical PME correction */
564 zeta2 = _mm_mul_ps(beta2,rsq22);
565 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
566 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
567 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
568 felec = _mm_mul_ps(qq22,felec);
569 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
570 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
571 velec = _mm_mul_ps(qq22,velec);
573 /* Update potential sum for this i atom from the interaction with this j atom. */
574 velecsum = _mm_add_ps(velecsum,velec);
578 /* Update vectorial force */
579 fix2 = _mm_macc_ps(dx22,fscal,fix2);
580 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
581 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
583 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
584 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
585 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
/* Hand the accumulated j forces for the four j waters to the decrement helper */
587 fjptrA = f+j_coord_offsetA;
588 fjptrB = f+j_coord_offsetB;
589 fjptrC = f+j_coord_offsetC;
590 fjptrD = f+j_coord_offsetD;
592 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
593 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
595 /* Inner loop uses 273 flops */
/* Masked section: the same nine pair interactions, with dummy (negative) list entries
 * cleared through dummy_mask before they reach the potential and force sums. */
601 /* Get j neighbor index, and coordinate index */
602 jnrlistA = jjnr[jidx];
603 jnrlistB = jjnr[jidx+1];
604 jnrlistC = jjnr[jidx+2];
605 jnrlistD = jjnr[jidx+3];
606 /* Sign of each element will be negative for non-real atoms.
607 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
608 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
610 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
611 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
612 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
613 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
614 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
615 j_coord_offsetA = DIM*jnrA;
616 j_coord_offsetB = DIM*jnrB;
617 j_coord_offsetC = DIM*jnrC;
618 j_coord_offsetD = DIM*jnrD;
620 /* load j atom coordinates */
621 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
622 x+j_coord_offsetC,x+j_coord_offsetD,
623 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
625 /* Calculate displacement vector */
626 dx00 = _mm_sub_ps(ix0,jx0);
627 dy00 = _mm_sub_ps(iy0,jy0);
628 dz00 = _mm_sub_ps(iz0,jz0);
629 dx01 = _mm_sub_ps(ix0,jx1);
630 dy01 = _mm_sub_ps(iy0,jy1);
631 dz01 = _mm_sub_ps(iz0,jz1);
632 dx02 = _mm_sub_ps(ix0,jx2);
633 dy02 = _mm_sub_ps(iy0,jy2);
634 dz02 = _mm_sub_ps(iz0,jz2);
635 dx10 = _mm_sub_ps(ix1,jx0);
636 dy10 = _mm_sub_ps(iy1,jy0);
637 dz10 = _mm_sub_ps(iz1,jz0);
638 dx11 = _mm_sub_ps(ix1,jx1);
639 dy11 = _mm_sub_ps(iy1,jy1);
640 dz11 = _mm_sub_ps(iz1,jz1);
641 dx12 = _mm_sub_ps(ix1,jx2);
642 dy12 = _mm_sub_ps(iy1,jy2);
643 dz12 = _mm_sub_ps(iz1,jz2);
644 dx20 = _mm_sub_ps(ix2,jx0);
645 dy20 = _mm_sub_ps(iy2,jy0);
646 dz20 = _mm_sub_ps(iz2,jz0);
647 dx21 = _mm_sub_ps(ix2,jx1);
648 dy21 = _mm_sub_ps(iy2,jy1);
649 dz21 = _mm_sub_ps(iz2,jz1);
650 dx22 = _mm_sub_ps(ix2,jx2);
651 dy22 = _mm_sub_ps(iy2,jy2);
652 dz22 = _mm_sub_ps(iz2,jz2);
654 /* Calculate squared distance and things based on it */
655 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
656 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
657 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
658 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
659 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
660 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
661 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
662 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
663 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
665 rinv00 = gmx_mm_invsqrt_ps(rsq00);
666 rinv01 = gmx_mm_invsqrt_ps(rsq01);
667 rinv02 = gmx_mm_invsqrt_ps(rsq02);
668 rinv10 = gmx_mm_invsqrt_ps(rsq10);
669 rinv11 = gmx_mm_invsqrt_ps(rsq11);
670 rinv12 = gmx_mm_invsqrt_ps(rsq12);
671 rinv20 = gmx_mm_invsqrt_ps(rsq20);
672 rinv21 = gmx_mm_invsqrt_ps(rsq21);
673 rinv22 = gmx_mm_invsqrt_ps(rsq22);
675 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
676 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
677 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
678 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
679 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
680 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
681 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
682 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
683 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
685 fjx0 = _mm_setzero_ps();
686 fjy0 = _mm_setzero_ps();
687 fjz0 = _mm_setzero_ps();
688 fjx1 = _mm_setzero_ps();
689 fjy1 = _mm_setzero_ps();
690 fjz1 = _mm_setzero_ps();
691 fjx2 = _mm_setzero_ps();
692 fjy2 = _mm_setzero_ps();
693 fjz2 = _mm_setzero_ps();
695 /**************************
696 * CALCULATE INTERACTIONS *
697 **************************/
/* Site pair i0-j0 (masked): Ewald electrostatics plus Lennard-Jones */
699 r00 = _mm_mul_ps(rsq00,rinv00);
700 r00 = _mm_andnot_ps(dummy_mask,r00);
702 /* EWALD ELECTROSTATICS */
704 /* Analytical PME correction */
705 zeta2 = _mm_mul_ps(beta2,rsq00);
706 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
707 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
708 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
709 felec = _mm_mul_ps(qq00,felec);
710 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
711 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
712 velec = _mm_mul_ps(qq00,velec);
714 /* LENNARD-JONES DISPERSION/REPULSION */
716 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
717 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
718 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
719 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
720 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
722 /* Update potential sum for this i atom from the interaction with this j atom. */
723 velec = _mm_andnot_ps(dummy_mask,velec);
724 velecsum = _mm_add_ps(velecsum,velec);
725 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
726 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
728 fscal = _mm_add_ps(felec,fvdw);
/* Clear the force scalar in dummy lanes so they contribute nothing to the i and j forces */
730 fscal = _mm_andnot_ps(dummy_mask,fscal);
732 /* Update vectorial force */
733 fix0 = _mm_macc_ps(dx00,fscal,fix0);
734 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
735 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
737 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
738 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
739 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
741 /**************************
742 * CALCULATE INTERACTIONS *
743 **************************/
/* Site pair i0-j1 (masked): electrostatics only */
745 r01 = _mm_mul_ps(rsq01,rinv01);
746 r01 = _mm_andnot_ps(dummy_mask,r01);
748 /* EWALD ELECTROSTATICS */
750 /* Analytical PME correction */
751 zeta2 = _mm_mul_ps(beta2,rsq01);
752 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
753 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
754 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
755 felec = _mm_mul_ps(qq01,felec);
756 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
757 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
758 velec = _mm_mul_ps(qq01,velec);
760 /* Update potential sum for this i atom from the interaction with this j atom. */
761 velec = _mm_andnot_ps(dummy_mask,velec);
762 velecsum = _mm_add_ps(velecsum,velec);
/* NOTE(review): fscal is masked here without a visible fscal assignment in this pair
 * block (same for the later pair blocks) - verify against the complete file. */
766 fscal = _mm_andnot_ps(dummy_mask,fscal);
768 /* Update vectorial force */
769 fix0 = _mm_macc_ps(dx01,fscal,fix0);
770 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
771 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
773 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
774 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
775 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
777 /**************************
778 * CALCULATE INTERACTIONS *
779 **************************/
/* Site pair i0-j2 (masked): electrostatics only */
781 r02 = _mm_mul_ps(rsq02,rinv02);
782 r02 = _mm_andnot_ps(dummy_mask,r02);
784 /* EWALD ELECTROSTATICS */
786 /* Analytical PME correction */
787 zeta2 = _mm_mul_ps(beta2,rsq02);
788 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
789 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
790 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
791 felec = _mm_mul_ps(qq02,felec);
792 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
793 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
794 velec = _mm_mul_ps(qq02,velec);
796 /* Update potential sum for this i atom from the interaction with this j atom. */
797 velec = _mm_andnot_ps(dummy_mask,velec);
798 velecsum = _mm_add_ps(velecsum,velec);
802 fscal = _mm_andnot_ps(dummy_mask,fscal);
804 /* Update vectorial force */
805 fix0 = _mm_macc_ps(dx02,fscal,fix0);
806 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
807 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
809 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
810 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
811 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
813 /**************************
814 * CALCULATE INTERACTIONS *
815 **************************/
/* Site pair i1-j0 (masked): electrostatics only */
817 r10 = _mm_mul_ps(rsq10,rinv10);
818 r10 = _mm_andnot_ps(dummy_mask,r10);
820 /* EWALD ELECTROSTATICS */
822 /* Analytical PME correction */
823 zeta2 = _mm_mul_ps(beta2,rsq10);
824 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
825 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
826 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
827 felec = _mm_mul_ps(qq10,felec);
828 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
829 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
830 velec = _mm_mul_ps(qq10,velec);
832 /* Update potential sum for this i atom from the interaction with this j atom. */
833 velec = _mm_andnot_ps(dummy_mask,velec);
834 velecsum = _mm_add_ps(velecsum,velec);
838 fscal = _mm_andnot_ps(dummy_mask,fscal);
840 /* Update vectorial force */
841 fix1 = _mm_macc_ps(dx10,fscal,fix1);
842 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
843 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
845 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
846 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
847 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
849 /**************************
850 * CALCULATE INTERACTIONS *
851 **************************/
/* Site pair i1-j1 (masked): electrostatics only */
853 r11 = _mm_mul_ps(rsq11,rinv11);
854 r11 = _mm_andnot_ps(dummy_mask,r11);
856 /* EWALD ELECTROSTATICS */
858 /* Analytical PME correction */
859 zeta2 = _mm_mul_ps(beta2,rsq11);
860 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
861 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
862 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
863 felec = _mm_mul_ps(qq11,felec);
864 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
865 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
866 velec = _mm_mul_ps(qq11,velec);
868 /* Update potential sum for this i atom from the interaction with this j atom. */
869 velec = _mm_andnot_ps(dummy_mask,velec);
870 velecsum = _mm_add_ps(velecsum,velec);
874 fscal = _mm_andnot_ps(dummy_mask,fscal);
876 /* Update vectorial force */
877 fix1 = _mm_macc_ps(dx11,fscal,fix1);
878 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
879 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
881 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
882 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
883 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
885 /**************************
886 * CALCULATE INTERACTIONS *
887 **************************/
/* Site pair i1-j2 (masked): electrostatics only */
889 r12 = _mm_mul_ps(rsq12,rinv12);
890 r12 = _mm_andnot_ps(dummy_mask,r12);
892 /* EWALD ELECTROSTATICS */
894 /* Analytical PME correction */
895 zeta2 = _mm_mul_ps(beta2,rsq12);
896 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
897 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
898 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
899 felec = _mm_mul_ps(qq12,felec);
900 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
901 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
902 velec = _mm_mul_ps(qq12,velec);
904 /* Update potential sum for this i atom from the interaction with this j atom. */
905 velec = _mm_andnot_ps(dummy_mask,velec);
906 velecsum = _mm_add_ps(velecsum,velec);
910 fscal = _mm_andnot_ps(dummy_mask,fscal);
912 /* Update vectorial force */
913 fix1 = _mm_macc_ps(dx12,fscal,fix1);
914 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
915 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
917 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
918 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
919 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
921 /**************************
922 * CALCULATE INTERACTIONS *
923 **************************/
/* Site pair i2-j0 (masked): electrostatics only */
925 r20 = _mm_mul_ps(rsq20,rinv20);
926 r20 = _mm_andnot_ps(dummy_mask,r20);
928 /* EWALD ELECTROSTATICS */
930 /* Analytical PME correction */
931 zeta2 = _mm_mul_ps(beta2,rsq20);
932 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
933 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
934 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
935 felec = _mm_mul_ps(qq20,felec);
936 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
937 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
938 velec = _mm_mul_ps(qq20,velec);
940 /* Update potential sum for this i atom from the interaction with this j atom. */
941 velec = _mm_andnot_ps(dummy_mask,velec);
942 velecsum = _mm_add_ps(velecsum,velec);
946 fscal = _mm_andnot_ps(dummy_mask,fscal);
948 /* Update vectorial force */
949 fix2 = _mm_macc_ps(dx20,fscal,fix2);
950 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
951 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
953 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
954 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
955 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
957 /**************************
958 * CALCULATE INTERACTIONS *
959 **************************/
/* Site pair i2-j1 (masked): electrostatics only */
961 r21 = _mm_mul_ps(rsq21,rinv21);
962 r21 = _mm_andnot_ps(dummy_mask,r21);
964 /* EWALD ELECTROSTATICS */
966 /* Analytical PME correction */
967 zeta2 = _mm_mul_ps(beta2,rsq21);
968 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
969 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
970 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
971 felec = _mm_mul_ps(qq21,felec);
972 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
973 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
974 velec = _mm_mul_ps(qq21,velec);
976 /* Update potential sum for this i atom from the interaction with this j atom. */
977 velec = _mm_andnot_ps(dummy_mask,velec);
978 velecsum = _mm_add_ps(velecsum,velec);
982 fscal = _mm_andnot_ps(dummy_mask,fscal);
984 /* Update vectorial force */
985 fix2 = _mm_macc_ps(dx21,fscal,fix2);
986 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
987 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
989 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
990 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
991 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
993 /**************************
994 * CALCULATE INTERACTIONS *
995 **************************/
/* Site pair i2-j2 (masked): electrostatics only */
997 r22 = _mm_mul_ps(rsq22,rinv22);
998 r22 = _mm_andnot_ps(dummy_mask,r22);
1000 /* EWALD ELECTROSTATICS */
1002 /* Analytical PME correction */
1003 zeta2 = _mm_mul_ps(beta2,rsq22);
1004 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1005 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1006 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1007 felec = _mm_mul_ps(qq22,felec);
1008 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1009 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1010 velec = _mm_mul_ps(qq22,velec);
1012 /* Update potential sum for this i atom from the interaction with this j atom. */
1013 velec = _mm_andnot_ps(dummy_mask,velec);
1014 velecsum = _mm_add_ps(velecsum,velec);
1018 fscal = _mm_andnot_ps(dummy_mask,fscal);
1020 /* Update vectorial force */
1021 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1022 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1023 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1025 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1026 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1027 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
/* Dummy entries direct their force decrement to scratch instead of the real force array f */
1029 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1030 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1031 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1032 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1034 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1035 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1037 /* Inner loop uses 282 flops */
1040 /* End of innermost loop */
/* Pass the accumulated i-water forces on, together with the f and fshift slots of this list entry */
1042 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1043 f+i_coord_offset,fshift+i_shift_offset);
1046 /* Update potential energies */
1047 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1048 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1050 /* Increment number of inner iterations */
1051 inneriter += j_index_end - j_index_start;
1053 /* Outer loop uses 20 flops */
1056 /* Increment number of outer iterations */
1059 /* Update outer/inner flops */
/* Flop estimate uses the per-iteration costs quoted above: 20 per outer, 282 per inner */
1061 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*282);
1064 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_single
1065 * Electrostatics interaction: Ewald
1066 * VdW interaction: LennardJones
1067 * Geometry: Water3-Water3
1068 * Calculate force/pot: Force
/* Force-only variant of the Ewald + Lennard-Jones water-water kernel.
 * For each i water in the neighbor list it accumulates, four j waters at a
 * time, the Ewald electrostatic force (analytical PME correction) for all
 * nine site pairs and the Lennard-Jones force for the 0-0 site pair only.
 * No potential energy is accumulated in the code visible here.
 *
 * nlist        neighbor list; jindex, shift and iinr[0] are read below
 * xx, ff       coordinate and force arrays (presumably the sources of the x
 *              and f pointers used below; those assignments are not visible
 *              in this listing - TODO confirm)
 * fr           supplies shift_vec, fshift, epsfac, ntype, nbfp and the Ewald
 *              constants stored in fr->ic
 * mdatoms      supplies chargeA and typeA
 * kernel_data  not referenced in the visible force-only code
 * nrnb         flop counter, updated through inc_nrnb() at the very end
 */
1071 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_single
1072 (t_nblist * gmx_restrict nlist,
1073 rvec * gmx_restrict xx,
1074 rvec * gmx_restrict ff,
1075 t_forcerec * gmx_restrict fr,
1076 t_mdatoms * gmx_restrict mdatoms,
1077 nb_kernel_data_t * gmx_restrict kernel_data,
1078 t_nrnb * gmx_restrict nrnb)
1080 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1081 * just 0 for non-waters.
1082 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1083 * jnr indices corresponding to data put in the four positions in the SIMD register.
1085 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1086 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1087 int jnrA,jnrB,jnrC,jnrD;
1088 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1089 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1090 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1091 real rcutoff_scalar;
1092 real *shiftvec,*fshift,*x,*f;
1093 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1094 real scratch[4*DIM];
1095 __m128 fscal,rcutoff,rcutoff2,jidxall;
1097 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1099 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1101 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1102 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1103 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1104 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1105 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1106 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1107 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1108 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1109 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1110 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1111 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1112 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1113 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1114 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1115 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1116 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1117 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1120 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1123 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1124 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1126 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1127 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1129 __m128 dummy_mask,cutoff_mask;
1130 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1131 __m128 one = _mm_set1_ps(1.0);
1132 __m128 two = _mm_set1_ps(2.0);
1138 jindex = nlist->jindex;
1140 shiftidx = nlist->shift;
1142 shiftvec = fr->shift_vec[0];
1143 fshift = fr->fshift[0];
1144 facel = _mm_set1_ps(fr->epsfac);
1145 charge = mdatoms->chargeA;
1146 nvdwtype = fr->ntype;
1147 vdwparam = fr->nbfp;
1148 vdwtype = mdatoms->typeA;
1150 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1151 beta = _mm_set1_ps(fr->ic->ewaldcoeff);
1152 beta2 = _mm_mul_ps(beta,beta);
1153 beta3 = _mm_mul_ps(beta,beta2);
1154 ewtab = fr->ic->tabq_coul_F;
1155 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1156 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1158 /* Setup water-specific parameters */
/* Charges and the LJ type are read once, from the first i entry (iinr[0]).
 * The i charges are pre-multiplied by facel (fr->epsfac broadcast).
 */
1159 inr = nlist->iinr[0];
1160 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1161 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1162 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1163 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* The j-site charges and LJ type come from that same first i molecule, so
 * every water in the list is treated as having identical parameters.
 */
1165 jq0 = _mm_set1_ps(charge[inr+0]);
1166 jq1 = _mm_set1_ps(charge[inr+1]);
1167 jq2 = _mm_set1_ps(charge[inr+2]);
1168 vdwjidx0A = 2*vdwtype[inr+0];
/* Loop-invariant pair constants: nine charge products, plus c6/c12 for the
 * 0-0 site pair only (c6 at the table index, c12 at index+1).
 */
1169 qq00 = _mm_mul_ps(iq0,jq0);
1170 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1171 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1172 qq01 = _mm_mul_ps(iq0,jq1);
1173 qq02 = _mm_mul_ps(iq0,jq2);
1174 qq10 = _mm_mul_ps(iq1,jq0);
1175 qq11 = _mm_mul_ps(iq1,jq1);
1176 qq12 = _mm_mul_ps(iq1,jq2);
1177 qq20 = _mm_mul_ps(iq2,jq0);
1178 qq21 = _mm_mul_ps(iq2,jq1);
1179 qq22 = _mm_mul_ps(iq2,jq2);
1181 /* Avoid stupid compiler warnings */
1182 jnrA = jnrB = jnrC = jnrD = 0;
1183 j_coord_offsetA = 0;
1184 j_coord_offsetB = 0;
1185 j_coord_offsetC = 0;
1186 j_coord_offsetD = 0;
/* Zero the scratch buffer: it is the force target used for padded (negative)
 * j entries in the masked block after the inner loop.
 */
1191 for(iidx=0;iidx<4*DIM;iidx++)
1193 scratch[iidx] = 0.0;
1196 /* Start outer loop over neighborlists */
1197 for(iidx=0; iidx<nri; iidx++)
1199 /* Load shift vector for this list */
1200 i_shift_offset = DIM*shiftidx[iidx];
1202 /* Load limits for loop over neighbors */
1203 j_index_start = jindex[iidx];
1204 j_index_end = jindex[iidx+1];
1206 /* Get outer coordinate index */
/* NOTE(review): the per-iteration load of inr is not visible in this listing
 * (presumably inr = iinr[iidx]) - confirm against the generated file.
 */
1208 i_coord_offset = DIM*inr;
1210 /* Load i particle coords and add shift vector */
1211 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1212 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Reset the force accumulators for the three sites of this i water. */
1214 fix0 = _mm_setzero_ps();
1215 fiy0 = _mm_setzero_ps();
1216 fiz0 = _mm_setzero_ps();
1217 fix1 = _mm_setzero_ps();
1218 fiy1 = _mm_setzero_ps();
1219 fiz1 = _mm_setzero_ps();
1220 fix2 = _mm_setzero_ps();
1221 fiy2 = _mm_setzero_ps();
1222 fiz2 = _mm_setzero_ps();
1224 /* Start inner kernel loop */
/* Handles complete groups of four j entries only: it stops as soon as the
 * fourth entry of a group is negative. The remaining group, if any, is
 * processed by the masked block that follows the loop.
 */
1225 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1228 /* Get j neighbor index, and coordinate index */
/* NOTE(review): the load of jnrA is not visible in this listing (presumably
 * jnrA = jjnr[jidx]) - confirm against the generated file.
 */
1230 jnrB = jjnr[jidx+1];
1231 jnrC = jjnr[jidx+2];
1232 jnrD = jjnr[jidx+3];
1233 j_coord_offsetA = DIM*jnrA;
1234 j_coord_offsetB = DIM*jnrB;
1235 j_coord_offsetC = DIM*jnrC;
1236 j_coord_offsetD = DIM*jnrD;
1238 /* load j atom coordinates */
1239 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1240 x+j_coord_offsetC,x+j_coord_offsetD,
1241 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1243 /* Calculate displacement vector */
/* dXab = (i site a) - (j site b), one j water per SIMD lane. */
1244 dx00 = _mm_sub_ps(ix0,jx0);
1245 dy00 = _mm_sub_ps(iy0,jy0);
1246 dz00 = _mm_sub_ps(iz0,jz0);
1247 dx01 = _mm_sub_ps(ix0,jx1);
1248 dy01 = _mm_sub_ps(iy0,jy1);
1249 dz01 = _mm_sub_ps(iz0,jz1);
1250 dx02 = _mm_sub_ps(ix0,jx2);
1251 dy02 = _mm_sub_ps(iy0,jy2);
1252 dz02 = _mm_sub_ps(iz0,jz2);
1253 dx10 = _mm_sub_ps(ix1,jx0);
1254 dy10 = _mm_sub_ps(iy1,jy0);
1255 dz10 = _mm_sub_ps(iz1,jz0);
1256 dx11 = _mm_sub_ps(ix1,jx1);
1257 dy11 = _mm_sub_ps(iy1,jy1);
1258 dz11 = _mm_sub_ps(iz1,jz1);
1259 dx12 = _mm_sub_ps(ix1,jx2);
1260 dy12 = _mm_sub_ps(iy1,jy2);
1261 dz12 = _mm_sub_ps(iz1,jz2);
1262 dx20 = _mm_sub_ps(ix2,jx0);
1263 dy20 = _mm_sub_ps(iy2,jy0);
1264 dz20 = _mm_sub_ps(iz2,jz0);
1265 dx21 = _mm_sub_ps(ix2,jx1);
1266 dy21 = _mm_sub_ps(iy2,jy1);
1267 dz21 = _mm_sub_ps(iz2,jz1);
1268 dx22 = _mm_sub_ps(ix2,jx2);
1269 dy22 = _mm_sub_ps(iy2,jy2);
1270 dz22 = _mm_sub_ps(iz2,jz2);
1272 /* Calculate squared distance and things based on it */
1273 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1274 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1275 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1276 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1277 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1278 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1279 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1280 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1281 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1283 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1284 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1285 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1286 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1287 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1288 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1289 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1290 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1291 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1293 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1294 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1295 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1296 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1297 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1298 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1299 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1300 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1301 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
/* j-side force accumulators for this group of four j waters. */
1303 fjx0 = _mm_setzero_ps();
1304 fjy0 = _mm_setzero_ps();
1305 fjz0 = _mm_setzero_ps();
1306 fjx1 = _mm_setzero_ps();
1307 fjy1 = _mm_setzero_ps();
1308 fjz1 = _mm_setzero_ps();
1309 fjx2 = _mm_setzero_ps();
1310 fjy2 = _mm_setzero_ps();
1311 fjz2 = _mm_setzero_ps();
1313 /**************************
1314 * CALCULATE INTERACTIONS *
1315 **************************/
/* Pair i-site 0 / j-site 0: electrostatics plus Lennard-Jones.
 * r00 is not read again in the visible force-only code.
 */
1317 r00 = _mm_mul_ps(rsq00,rinv00);
1319 /* EWALD ELECTROSTATICS */
1321 /* Analytical PME correction */
/* felec = qq00 * (pmecorrF(beta2*rsq00) * beta3 + rinv3), assuming
 * _mm_macc_ps(a,b,c) evaluates a*b+c.
 */
1322 zeta2 = _mm_mul_ps(beta2,rsq00);
1323 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1324 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1325 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1326 felec = _mm_mul_ps(qq00,felec);
1328 /* LENNARD-JONES DISPERSION/REPULSION */
/* fvdw = (c12_00*rinvsix - c6_00) * rinvsix * rinvsq00, assuming
 * _mm_msub_ps(a,b,c) evaluates a*b-c.
 */
1330 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1331 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1333 fscal = _mm_add_ps(felec,fvdw);
1335 /* Update vectorial force */
1336 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1337 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1338 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
/* The same contribution is collected for the j site; it is handed to the
 * decrement call on f after all nine pairs have been processed.
 */
1340 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1341 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1342 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1344 /**************************
1345 * CALCULATE INTERACTIONS *
1346 **************************/
/* Pair i-site 0 / j-site 1: only felec is computed for this pair. */
1348 r01 = _mm_mul_ps(rsq01,rinv01);
1350 /* EWALD ELECTROSTATICS */
1352 /* Analytical PME correction */
1353 zeta2 = _mm_mul_ps(beta2,rsq01);
1354 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1355 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1356 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1357 felec = _mm_mul_ps(qq01,felec);
/* NOTE(review): no assignment to fscal is visible between the felec
 * computation and the force update for this pair, nor for the other
 * electrostatics-only pairs below (the embedded line numbers skip a line
 * here). Presumably fscal = felec - confirm against the generated file.
 */
1361 /* Update vectorial force */
1362 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1363 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1364 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1366 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1367 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1368 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1370 /**************************
1371 * CALCULATE INTERACTIONS *
1372 **************************/
/* Pair i-site 0 / j-site 2: only felec is computed for this pair. */
1374 r02 = _mm_mul_ps(rsq02,rinv02);
1376 /* EWALD ELECTROSTATICS */
1378 /* Analytical PME correction */
1379 zeta2 = _mm_mul_ps(beta2,rsq02);
1380 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1381 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1382 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1383 felec = _mm_mul_ps(qq02,felec);
1387 /* Update vectorial force */
1388 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1389 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1390 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1392 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1393 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1394 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1396 /**************************
1397 * CALCULATE INTERACTIONS *
1398 **************************/
/* Pair i-site 1 / j-site 0: only felec is computed for this pair. */
1400 r10 = _mm_mul_ps(rsq10,rinv10);
1402 /* EWALD ELECTROSTATICS */
1404 /* Analytical PME correction */
1405 zeta2 = _mm_mul_ps(beta2,rsq10);
1406 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1407 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1408 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1409 felec = _mm_mul_ps(qq10,felec);
1413 /* Update vectorial force */
1414 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1415 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1416 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1418 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1419 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1420 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1422 /**************************
1423 * CALCULATE INTERACTIONS *
1424 **************************/
/* Pair i-site 1 / j-site 1: only felec is computed for this pair. */
1426 r11 = _mm_mul_ps(rsq11,rinv11);
1428 /* EWALD ELECTROSTATICS */
1430 /* Analytical PME correction */
1431 zeta2 = _mm_mul_ps(beta2,rsq11);
1432 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1433 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1434 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1435 felec = _mm_mul_ps(qq11,felec);
1439 /* Update vectorial force */
1440 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1441 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1442 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1444 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1445 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1446 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1448 /**************************
1449 * CALCULATE INTERACTIONS *
1450 **************************/
/* Pair i-site 1 / j-site 2: only felec is computed for this pair. */
1452 r12 = _mm_mul_ps(rsq12,rinv12);
1454 /* EWALD ELECTROSTATICS */
1456 /* Analytical PME correction */
1457 zeta2 = _mm_mul_ps(beta2,rsq12);
1458 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1459 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1460 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1461 felec = _mm_mul_ps(qq12,felec);
1465 /* Update vectorial force */
1466 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1467 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1468 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1470 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1471 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1472 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1474 /**************************
1475 * CALCULATE INTERACTIONS *
1476 **************************/
/* Pair i-site 2 / j-site 0: only felec is computed for this pair. */
1478 r20 = _mm_mul_ps(rsq20,rinv20);
1480 /* EWALD ELECTROSTATICS */
1482 /* Analytical PME correction */
1483 zeta2 = _mm_mul_ps(beta2,rsq20);
1484 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1485 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1486 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1487 felec = _mm_mul_ps(qq20,felec);
1491 /* Update vectorial force */
1492 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1493 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1494 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1496 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1497 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1498 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1500 /**************************
1501 * CALCULATE INTERACTIONS *
1502 **************************/
/* Pair i-site 2 / j-site 1: only felec is computed for this pair. */
1504 r21 = _mm_mul_ps(rsq21,rinv21);
1506 /* EWALD ELECTROSTATICS */
1508 /* Analytical PME correction */
1509 zeta2 = _mm_mul_ps(beta2,rsq21);
1510 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1511 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1512 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1513 felec = _mm_mul_ps(qq21,felec);
1517 /* Update vectorial force */
1518 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1519 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1520 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1522 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1523 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1524 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1526 /**************************
1527 * CALCULATE INTERACTIONS *
1528 **************************/
/* Pair i-site 2 / j-site 2: only felec is computed for this pair. */
1530 r22 = _mm_mul_ps(rsq22,rinv22);
1532 /* EWALD ELECTROSTATICS */
1534 /* Analytical PME correction */
1535 zeta2 = _mm_mul_ps(beta2,rsq22);
1536 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1537 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1538 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1539 felec = _mm_mul_ps(qq22,felec);
1543 /* Update vectorial force */
1544 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1545 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1546 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1548 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1549 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1550 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
/* Every entry in this loop passed the non-negative check in the loop
 * condition or precedes one that did, so the j forces target f directly.
 */
1552 fjptrA = f+j_coord_offsetA;
1553 fjptrB = f+j_coord_offsetB;
1554 fjptrC = f+j_coord_offsetC;
1555 fjptrD = f+j_coord_offsetD;
1557 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1558 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1560 /* Inner loop uses 259 flops */
/* Masked remainder: runs once if the loop above stopped before j_index_end,
 * i.e. for a final group of four that contains negative (padding) entries.
 */
1563 if(jidx<j_index_end)
1566 /* Get j neighbor index, and coordinate index */
1567 jnrlistA = jjnr[jidx];
1568 jnrlistB = jjnr[jidx+1];
1569 jnrlistC = jjnr[jidx+2];
1570 jnrlistD = jjnr[jidx+3];
1571 /* Sign of each element will be negative for non-real atoms.
1572 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1573 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1575 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1576 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1577 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1578 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1579 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1580 j_coord_offsetA = DIM*jnrA;
1581 j_coord_offsetB = DIM*jnrB;
1582 j_coord_offsetC = DIM*jnrC;
1583 j_coord_offsetD = DIM*jnrD;
1585 /* load j atom coordinates */
1586 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1587 x+j_coord_offsetC,x+j_coord_offsetD,
1588 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1590 /* Calculate displacement vector */
1591 dx00 = _mm_sub_ps(ix0,jx0);
1592 dy00 = _mm_sub_ps(iy0,jy0);
1593 dz00 = _mm_sub_ps(iz0,jz0);
1594 dx01 = _mm_sub_ps(ix0,jx1);
1595 dy01 = _mm_sub_ps(iy0,jy1);
1596 dz01 = _mm_sub_ps(iz0,jz1);
1597 dx02 = _mm_sub_ps(ix0,jx2);
1598 dy02 = _mm_sub_ps(iy0,jy2);
1599 dz02 = _mm_sub_ps(iz0,jz2);
1600 dx10 = _mm_sub_ps(ix1,jx0);
1601 dy10 = _mm_sub_ps(iy1,jy0);
1602 dz10 = _mm_sub_ps(iz1,jz0);
1603 dx11 = _mm_sub_ps(ix1,jx1);
1604 dy11 = _mm_sub_ps(iy1,jy1);
1605 dz11 = _mm_sub_ps(iz1,jz1);
1606 dx12 = _mm_sub_ps(ix1,jx2);
1607 dy12 = _mm_sub_ps(iy1,jy2);
1608 dz12 = _mm_sub_ps(iz1,jz2);
1609 dx20 = _mm_sub_ps(ix2,jx0);
1610 dy20 = _mm_sub_ps(iy2,jy0);
1611 dz20 = _mm_sub_ps(iz2,jz0);
1612 dx21 = _mm_sub_ps(ix2,jx1);
1613 dy21 = _mm_sub_ps(iy2,jy1);
1614 dz21 = _mm_sub_ps(iz2,jz1);
1615 dx22 = _mm_sub_ps(ix2,jx2);
1616 dy22 = _mm_sub_ps(iy2,jy2);
1617 dz22 = _mm_sub_ps(iz2,jz2);
1619 /* Calculate squared distance and things based on it */
1620 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1621 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1622 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1623 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1624 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1625 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1626 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1627 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1628 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1630 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1631 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1632 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1633 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1634 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1635 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1636 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1637 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1638 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1640 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1641 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1642 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1643 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1644 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1645 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1646 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1647 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1648 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
/* j-side force accumulators for this final group. */
1650 fjx0 = _mm_setzero_ps();
1651 fjy0 = _mm_setzero_ps();
1652 fjz0 = _mm_setzero_ps();
1653 fjx1 = _mm_setzero_ps();
1654 fjy1 = _mm_setzero_ps();
1655 fjz1 = _mm_setzero_ps();
1656 fjx2 = _mm_setzero_ps();
1657 fjy2 = _mm_setzero_ps();
1658 fjz2 = _mm_setzero_ps();
1660 /**************************
1661 * CALCULATE INTERACTIONS *
1662 **************************/
/* Pair i-site 0 / j-site 0: electrostatics plus Lennard-Jones.
 * r00 is cleared in padded lanes; it is not read again in the visible code.
 */
1664 r00 = _mm_mul_ps(rsq00,rinv00);
1665 r00 = _mm_andnot_ps(dummy_mask,r00);
1667 /* EWALD ELECTROSTATICS */
1669 /* Analytical PME correction */
1670 zeta2 = _mm_mul_ps(beta2,rsq00);
1671 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1672 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1673 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1674 felec = _mm_mul_ps(qq00,felec);
1676 /* LENNARD-JONES DISPERSION/REPULSION */
1678 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1679 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1681 fscal = _mm_add_ps(felec,fvdw);
/* Zero the force scale in padded lanes so they add nothing to fi or fj. */
1683 fscal = _mm_andnot_ps(dummy_mask,fscal);
1685 /* Update vectorial force */
1686 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1687 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1688 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1690 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1691 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1692 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1694 /**************************
1695 * CALCULATE INTERACTIONS *
1696 **************************/
/* Pair i-site 0 / j-site 1: only felec is computed for this pair. */
1698 r01 = _mm_mul_ps(rsq01,rinv01);
1699 r01 = _mm_andnot_ps(dummy_mask,r01);
1701 /* EWALD ELECTROSTATICS */
1703 /* Analytical PME correction */
1704 zeta2 = _mm_mul_ps(beta2,rsq01);
1705 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1706 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1707 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1708 felec = _mm_mul_ps(qq01,felec);
/* NOTE(review): as in the unmasked loop, the assignment that loads fscal from
 * felec is not visible for this pair or the remaining electrostatics-only
 * pairs (presumably fscal = felec) - confirm against the generated file.
 */
1712 fscal = _mm_andnot_ps(dummy_mask,fscal);
1714 /* Update vectorial force */
1715 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1716 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1717 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1719 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1720 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1721 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1723 /**************************
1724 * CALCULATE INTERACTIONS *
1725 **************************/
/* Pair i-site 0 / j-site 2: only felec is computed for this pair. */
1727 r02 = _mm_mul_ps(rsq02,rinv02);
1728 r02 = _mm_andnot_ps(dummy_mask,r02);
1730 /* EWALD ELECTROSTATICS */
1732 /* Analytical PME correction */
1733 zeta2 = _mm_mul_ps(beta2,rsq02);
1734 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1735 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1736 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1737 felec = _mm_mul_ps(qq02,felec);
1741 fscal = _mm_andnot_ps(dummy_mask,fscal);
1743 /* Update vectorial force */
1744 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1745 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1746 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1748 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1749 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1750 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1752 /**************************
1753 * CALCULATE INTERACTIONS *
1754 **************************/
/* Pair i-site 1 / j-site 0: only felec is computed for this pair. */
1756 r10 = _mm_mul_ps(rsq10,rinv10);
1757 r10 = _mm_andnot_ps(dummy_mask,r10);
1759 /* EWALD ELECTROSTATICS */
1761 /* Analytical PME correction */
1762 zeta2 = _mm_mul_ps(beta2,rsq10);
1763 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1764 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1765 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1766 felec = _mm_mul_ps(qq10,felec);
1770 fscal = _mm_andnot_ps(dummy_mask,fscal);
1772 /* Update vectorial force */
1773 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1774 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1775 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1777 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1778 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1779 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1781 /**************************
1782 * CALCULATE INTERACTIONS *
1783 **************************/
/* Pair i-site 1 / j-site 1: only felec is computed for this pair. */
1785 r11 = _mm_mul_ps(rsq11,rinv11);
1786 r11 = _mm_andnot_ps(dummy_mask,r11);
1788 /* EWALD ELECTROSTATICS */
1790 /* Analytical PME correction */
1791 zeta2 = _mm_mul_ps(beta2,rsq11);
1792 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1793 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1794 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1795 felec = _mm_mul_ps(qq11,felec);
1799 fscal = _mm_andnot_ps(dummy_mask,fscal);
1801 /* Update vectorial force */
1802 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1803 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1804 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1806 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1807 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1808 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1810 /**************************
1811 * CALCULATE INTERACTIONS *
1812 **************************/
/* Pair i-site 1 / j-site 2: only felec is computed for this pair. */
1814 r12 = _mm_mul_ps(rsq12,rinv12);
1815 r12 = _mm_andnot_ps(dummy_mask,r12);
1817 /* EWALD ELECTROSTATICS */
1819 /* Analytical PME correction */
1820 zeta2 = _mm_mul_ps(beta2,rsq12);
1821 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1822 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1823 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1824 felec = _mm_mul_ps(qq12,felec);
1828 fscal = _mm_andnot_ps(dummy_mask,fscal);
1830 /* Update vectorial force */
1831 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1832 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1833 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1835 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1836 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1837 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1839 /**************************
1840 * CALCULATE INTERACTIONS *
1841 **************************/
/* Pair i-site 2 / j-site 0: only felec is computed for this pair. */
1843 r20 = _mm_mul_ps(rsq20,rinv20);
1844 r20 = _mm_andnot_ps(dummy_mask,r20);
1846 /* EWALD ELECTROSTATICS */
1848 /* Analytical PME correction */
1849 zeta2 = _mm_mul_ps(beta2,rsq20);
1850 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1851 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1852 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1853 felec = _mm_mul_ps(qq20,felec);
1857 fscal = _mm_andnot_ps(dummy_mask,fscal);
1859 /* Update vectorial force */
1860 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1861 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1862 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1864 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1865 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1866 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1868 /**************************
1869 * CALCULATE INTERACTIONS *
1870 **************************/
/* Pair i-site 2 / j-site 1: only felec is computed for this pair. */
1872 r21 = _mm_mul_ps(rsq21,rinv21);
1873 r21 = _mm_andnot_ps(dummy_mask,r21);
1875 /* EWALD ELECTROSTATICS */
1877 /* Analytical PME correction */
1878 zeta2 = _mm_mul_ps(beta2,rsq21);
1879 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1880 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1881 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1882 felec = _mm_mul_ps(qq21,felec);
1886 fscal = _mm_andnot_ps(dummy_mask,fscal);
1888 /* Update vectorial force */
1889 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1890 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1891 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1893 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1894 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1895 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1897 /**************************
1898 * CALCULATE INTERACTIONS *
1899 **************************/
/* Pair i-site 2 / j-site 2: only felec is computed for this pair. */
1901 r22 = _mm_mul_ps(rsq22,rinv22);
1902 r22 = _mm_andnot_ps(dummy_mask,r22);
1904 /* EWALD ELECTROSTATICS */
1906 /* Analytical PME correction */
1907 zeta2 = _mm_mul_ps(beta2,rsq22);
1908 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1909 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1910 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1911 felec = _mm_mul_ps(qq22,felec);
1915 fscal = _mm_andnot_ps(dummy_mask,fscal);
1917 /* Update vectorial force */
1918 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1919 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1920 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1922 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1923 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1924 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
/* Padded entries send their (already zeroed) j force to scratch instead of
 * f, so no real atom's force is touched on their behalf.
 */
1926 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1927 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1928 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1929 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1931 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1932 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1934 /* Inner loop uses 268 flops */
1937 /* End of innermost loop */
/* Hand the accumulated i-site forces to the helper together with the force
 * and shift-force destinations for this i water.
 */
1939 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1940 f+i_coord_offset,fshift+i_shift_offset);
1942 /* Increment number of inner iterations */
/* Counts j-list entries in [j_index_start, j_index_end), not SIMD passes. */
1943 inneriter += j_index_end - j_index_start;
1945 /* Outer loop uses 18 flops */
1948 /* Increment number of outer iterations */
/* NOTE(review): the outeriter increment (and the zero-initialisation of both
 * counters) is not visible in this listing - confirm in the generated file.
 */
1951 /* Update outer/inner flops */
/* Every inner entry is charged at the masked-block figure (268), although the
 * unmasked loop above is annotated with 259 flops.
 */
1953 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*268);