2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LJEwald
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84 __m128 fscal,rcutoff,rcutoff2,jidxall;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
114 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
117 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
118 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
130 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
131 __m128 one_half = _mm_set1_ps(0.5);
132 __m128 minus_one = _mm_set1_ps(-1.0);
134 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
135 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
137 __m128 dummy_mask,cutoff_mask;
138 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
139 __m128 one = _mm_set1_ps(1.0);
140 __m128 two = _mm_set1_ps(2.0);
146 jindex = nlist->jindex;
148 shiftidx = nlist->shift;
150 shiftvec = fr->shift_vec[0];
151 fshift = fr->fshift[0];
152 facel = _mm_set1_ps(fr->epsfac);
153 charge = mdatoms->chargeA;
154 nvdwtype = fr->ntype;
156 vdwtype = mdatoms->typeA;
157 vdwgridparam = fr->ljpme_c6grid;
158 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
159 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
160 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
162 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
163 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
164 beta2 = _mm_mul_ps(beta,beta);
165 beta3 = _mm_mul_ps(beta,beta2);
166 ewtab = fr->ic->tabq_coul_FDV0;
167 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
168 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
170 /* Setup water-specific parameters */
171 inr = nlist->iinr[0];
172 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
173 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
174 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
175 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
177 jq1 = _mm_set1_ps(charge[inr+1]);
178 jq2 = _mm_set1_ps(charge[inr+2]);
179 jq3 = _mm_set1_ps(charge[inr+3]);
180 vdwjidx0A = 2*vdwtype[inr+0];
181 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
182 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
183 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
184 qq11 = _mm_mul_ps(iq1,jq1);
185 qq12 = _mm_mul_ps(iq1,jq2);
186 qq13 = _mm_mul_ps(iq1,jq3);
187 qq21 = _mm_mul_ps(iq2,jq1);
188 qq22 = _mm_mul_ps(iq2,jq2);
189 qq23 = _mm_mul_ps(iq2,jq3);
190 qq31 = _mm_mul_ps(iq3,jq1);
191 qq32 = _mm_mul_ps(iq3,jq2);
192 qq33 = _mm_mul_ps(iq3,jq3);
194 /* Avoid stupid compiler warnings */
195 jnrA = jnrB = jnrC = jnrD = 0;
204 for(iidx=0;iidx<4*DIM;iidx++)
209 /* Start outer loop over neighborlists */
210 for(iidx=0; iidx<nri; iidx++)
212 /* Load shift vector for this list */
213 i_shift_offset = DIM*shiftidx[iidx];
215 /* Load limits for loop over neighbors */
216 j_index_start = jindex[iidx];
217 j_index_end = jindex[iidx+1];
219 /* Get outer coordinate index */
221 i_coord_offset = DIM*inr;
223 /* Load i particle coords and add shift vector */
224 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
225 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
227 fix0 = _mm_setzero_ps();
228 fiy0 = _mm_setzero_ps();
229 fiz0 = _mm_setzero_ps();
230 fix1 = _mm_setzero_ps();
231 fiy1 = _mm_setzero_ps();
232 fiz1 = _mm_setzero_ps();
233 fix2 = _mm_setzero_ps();
234 fiy2 = _mm_setzero_ps();
235 fiz2 = _mm_setzero_ps();
236 fix3 = _mm_setzero_ps();
237 fiy3 = _mm_setzero_ps();
238 fiz3 = _mm_setzero_ps();
240 /* Reset potential sums */
241 velecsum = _mm_setzero_ps();
242 vvdwsum = _mm_setzero_ps();
244 /* Start inner kernel loop */
245 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
248 /* Get j neighbor index, and coordinate index */
253 j_coord_offsetA = DIM*jnrA;
254 j_coord_offsetB = DIM*jnrB;
255 j_coord_offsetC = DIM*jnrC;
256 j_coord_offsetD = DIM*jnrD;
258 /* load j atom coordinates */
259 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
260 x+j_coord_offsetC,x+j_coord_offsetD,
261 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
262 &jy2,&jz2,&jx3,&jy3,&jz3);
264 /* Calculate displacement vector */
265 dx00 = _mm_sub_ps(ix0,jx0);
266 dy00 = _mm_sub_ps(iy0,jy0);
267 dz00 = _mm_sub_ps(iz0,jz0);
268 dx11 = _mm_sub_ps(ix1,jx1);
269 dy11 = _mm_sub_ps(iy1,jy1);
270 dz11 = _mm_sub_ps(iz1,jz1);
271 dx12 = _mm_sub_ps(ix1,jx2);
272 dy12 = _mm_sub_ps(iy1,jy2);
273 dz12 = _mm_sub_ps(iz1,jz2);
274 dx13 = _mm_sub_ps(ix1,jx3);
275 dy13 = _mm_sub_ps(iy1,jy3);
276 dz13 = _mm_sub_ps(iz1,jz3);
277 dx21 = _mm_sub_ps(ix2,jx1);
278 dy21 = _mm_sub_ps(iy2,jy1);
279 dz21 = _mm_sub_ps(iz2,jz1);
280 dx22 = _mm_sub_ps(ix2,jx2);
281 dy22 = _mm_sub_ps(iy2,jy2);
282 dz22 = _mm_sub_ps(iz2,jz2);
283 dx23 = _mm_sub_ps(ix2,jx3);
284 dy23 = _mm_sub_ps(iy2,jy3);
285 dz23 = _mm_sub_ps(iz2,jz3);
286 dx31 = _mm_sub_ps(ix3,jx1);
287 dy31 = _mm_sub_ps(iy3,jy1);
288 dz31 = _mm_sub_ps(iz3,jz1);
289 dx32 = _mm_sub_ps(ix3,jx2);
290 dy32 = _mm_sub_ps(iy3,jy2);
291 dz32 = _mm_sub_ps(iz3,jz2);
292 dx33 = _mm_sub_ps(ix3,jx3);
293 dy33 = _mm_sub_ps(iy3,jy3);
294 dz33 = _mm_sub_ps(iz3,jz3);
296 /* Calculate squared distance and things based on it */
297 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
298 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
299 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
300 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
301 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
302 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
303 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
304 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
305 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
306 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
308 rinv00 = gmx_mm_invsqrt_ps(rsq00);
309 rinv11 = gmx_mm_invsqrt_ps(rsq11);
310 rinv12 = gmx_mm_invsqrt_ps(rsq12);
311 rinv13 = gmx_mm_invsqrt_ps(rsq13);
312 rinv21 = gmx_mm_invsqrt_ps(rsq21);
313 rinv22 = gmx_mm_invsqrt_ps(rsq22);
314 rinv23 = gmx_mm_invsqrt_ps(rsq23);
315 rinv31 = gmx_mm_invsqrt_ps(rsq31);
316 rinv32 = gmx_mm_invsqrt_ps(rsq32);
317 rinv33 = gmx_mm_invsqrt_ps(rsq33);
319 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
320 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
321 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
322 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
323 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
324 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
325 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
326 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
327 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
328 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
330 fjx0 = _mm_setzero_ps();
331 fjy0 = _mm_setzero_ps();
332 fjz0 = _mm_setzero_ps();
333 fjx1 = _mm_setzero_ps();
334 fjy1 = _mm_setzero_ps();
335 fjz1 = _mm_setzero_ps();
336 fjx2 = _mm_setzero_ps();
337 fjy2 = _mm_setzero_ps();
338 fjz2 = _mm_setzero_ps();
339 fjx3 = _mm_setzero_ps();
340 fjy3 = _mm_setzero_ps();
341 fjz3 = _mm_setzero_ps();
343 /**************************
344 * CALCULATE INTERACTIONS *
345 **************************/
347 r00 = _mm_mul_ps(rsq00,rinv00);
349 /* Analytical LJ-PME */
350 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
351 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
352 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
353 exponent = gmx_simd_exp_r(ewcljrsq);
354 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
355 poly = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
356 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
357 vvdw6 = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
358 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
359 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
360 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
361 fvdw = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
363 /* Update potential sum for this i atom from the interaction with this j atom. */
364 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
368 /* Update vectorial force */
369 fix0 = _mm_macc_ps(dx00,fscal,fix0);
370 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
371 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
373 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
374 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
375 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
377 /**************************
378 * CALCULATE INTERACTIONS *
379 **************************/
381 r11 = _mm_mul_ps(rsq11,rinv11);
383 /* EWALD ELECTROSTATICS */
385 /* Analytical PME correction */
386 zeta2 = _mm_mul_ps(beta2,rsq11);
387 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
388 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
389 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
390 felec = _mm_mul_ps(qq11,felec);
391 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
392 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
393 velec = _mm_mul_ps(qq11,velec);
395 /* Update potential sum for this i atom from the interaction with this j atom. */
396 velecsum = _mm_add_ps(velecsum,velec);
400 /* Update vectorial force */
401 fix1 = _mm_macc_ps(dx11,fscal,fix1);
402 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
403 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
405 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
406 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
407 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 r12 = _mm_mul_ps(rsq12,rinv12);
415 /* EWALD ELECTROSTATICS */
417 /* Analytical PME correction */
418 zeta2 = _mm_mul_ps(beta2,rsq12);
419 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
420 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
421 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
422 felec = _mm_mul_ps(qq12,felec);
423 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
424 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
425 velec = _mm_mul_ps(qq12,velec);
427 /* Update potential sum for this i atom from the interaction with this j atom. */
428 velecsum = _mm_add_ps(velecsum,velec);
432 /* Update vectorial force */
433 fix1 = _mm_macc_ps(dx12,fscal,fix1);
434 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
435 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
437 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
438 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
439 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
441 /**************************
442 * CALCULATE INTERACTIONS *
443 **************************/
445 r13 = _mm_mul_ps(rsq13,rinv13);
447 /* EWALD ELECTROSTATICS */
449 /* Analytical PME correction */
450 zeta2 = _mm_mul_ps(beta2,rsq13);
451 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
452 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
453 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
454 felec = _mm_mul_ps(qq13,felec);
455 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
456 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
457 velec = _mm_mul_ps(qq13,velec);
459 /* Update potential sum for this i atom from the interaction with this j atom. */
460 velecsum = _mm_add_ps(velecsum,velec);
464 /* Update vectorial force */
465 fix1 = _mm_macc_ps(dx13,fscal,fix1);
466 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
467 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
469 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
470 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
471 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
473 /**************************
474 * CALCULATE INTERACTIONS *
475 **************************/
477 r21 = _mm_mul_ps(rsq21,rinv21);
479 /* EWALD ELECTROSTATICS */
481 /* Analytical PME correction */
482 zeta2 = _mm_mul_ps(beta2,rsq21);
483 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
484 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
485 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
486 felec = _mm_mul_ps(qq21,felec);
487 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
488 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
489 velec = _mm_mul_ps(qq21,velec);
491 /* Update potential sum for this i atom from the interaction with this j atom. */
492 velecsum = _mm_add_ps(velecsum,velec);
496 /* Update vectorial force */
497 fix2 = _mm_macc_ps(dx21,fscal,fix2);
498 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
499 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
501 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
502 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
503 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 r22 = _mm_mul_ps(rsq22,rinv22);
511 /* EWALD ELECTROSTATICS */
513 /* Analytical PME correction */
514 zeta2 = _mm_mul_ps(beta2,rsq22);
515 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
516 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
517 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
518 felec = _mm_mul_ps(qq22,felec);
519 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
520 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
521 velec = _mm_mul_ps(qq22,velec);
523 /* Update potential sum for this i atom from the interaction with this j atom. */
524 velecsum = _mm_add_ps(velecsum,velec);
528 /* Update vectorial force */
529 fix2 = _mm_macc_ps(dx22,fscal,fix2);
530 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
531 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
533 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
534 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
535 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
537 /**************************
538 * CALCULATE INTERACTIONS *
539 **************************/
541 r23 = _mm_mul_ps(rsq23,rinv23);
543 /* EWALD ELECTROSTATICS */
545 /* Analytical PME correction */
546 zeta2 = _mm_mul_ps(beta2,rsq23);
547 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
548 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
549 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
550 felec = _mm_mul_ps(qq23,felec);
551 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
552 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
553 velec = _mm_mul_ps(qq23,velec);
555 /* Update potential sum for this i atom from the interaction with this j atom. */
556 velecsum = _mm_add_ps(velecsum,velec);
560 /* Update vectorial force */
561 fix2 = _mm_macc_ps(dx23,fscal,fix2);
562 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
563 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
565 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
566 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
567 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
569 /**************************
570 * CALCULATE INTERACTIONS *
571 **************************/
573 r31 = _mm_mul_ps(rsq31,rinv31);
575 /* EWALD ELECTROSTATICS */
577 /* Analytical PME correction */
578 zeta2 = _mm_mul_ps(beta2,rsq31);
579 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
580 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
581 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
582 felec = _mm_mul_ps(qq31,felec);
583 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
584 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
585 velec = _mm_mul_ps(qq31,velec);
587 /* Update potential sum for this i atom from the interaction with this j atom. */
588 velecsum = _mm_add_ps(velecsum,velec);
592 /* Update vectorial force */
593 fix3 = _mm_macc_ps(dx31,fscal,fix3);
594 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
595 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
597 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
598 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
599 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 r32 = _mm_mul_ps(rsq32,rinv32);
607 /* EWALD ELECTROSTATICS */
609 /* Analytical PME correction */
610 zeta2 = _mm_mul_ps(beta2,rsq32);
611 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
612 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
613 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
614 felec = _mm_mul_ps(qq32,felec);
615 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
616 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
617 velec = _mm_mul_ps(qq32,velec);
619 /* Update potential sum for this i atom from the interaction with this j atom. */
620 velecsum = _mm_add_ps(velecsum,velec);
624 /* Update vectorial force */
625 fix3 = _mm_macc_ps(dx32,fscal,fix3);
626 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
627 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
629 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
630 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
631 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
633 /**************************
634 * CALCULATE INTERACTIONS *
635 **************************/
637 r33 = _mm_mul_ps(rsq33,rinv33);
639 /* EWALD ELECTROSTATICS */
641 /* Analytical PME correction */
642 zeta2 = _mm_mul_ps(beta2,rsq33);
643 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
644 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
645 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
646 felec = _mm_mul_ps(qq33,felec);
647 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
648 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
649 velec = _mm_mul_ps(qq33,velec);
651 /* Update potential sum for this i atom from the interaction with this j atom. */
652 velecsum = _mm_add_ps(velecsum,velec);
656 /* Update vectorial force */
657 fix3 = _mm_macc_ps(dx33,fscal,fix3);
658 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
659 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
661 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
662 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
663 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
665 fjptrA = f+j_coord_offsetA;
666 fjptrB = f+j_coord_offsetB;
667 fjptrC = f+j_coord_offsetC;
668 fjptrD = f+j_coord_offsetD;
670 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
671 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
672 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
674 /* Inner loop uses 314 flops */
680 /* Get j neighbor index, and coordinate index */
681 jnrlistA = jjnr[jidx];
682 jnrlistB = jjnr[jidx+1];
683 jnrlistC = jjnr[jidx+2];
684 jnrlistD = jjnr[jidx+3];
685 /* Sign of each element will be negative for non-real atoms.
686 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
687 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
689 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
690 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
691 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
692 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
693 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
694 j_coord_offsetA = DIM*jnrA;
695 j_coord_offsetB = DIM*jnrB;
696 j_coord_offsetC = DIM*jnrC;
697 j_coord_offsetD = DIM*jnrD;
699 /* load j atom coordinates */
700 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
701 x+j_coord_offsetC,x+j_coord_offsetD,
702 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
703 &jy2,&jz2,&jx3,&jy3,&jz3);
705 /* Calculate displacement vector */
706 dx00 = _mm_sub_ps(ix0,jx0);
707 dy00 = _mm_sub_ps(iy0,jy0);
708 dz00 = _mm_sub_ps(iz0,jz0);
709 dx11 = _mm_sub_ps(ix1,jx1);
710 dy11 = _mm_sub_ps(iy1,jy1);
711 dz11 = _mm_sub_ps(iz1,jz1);
712 dx12 = _mm_sub_ps(ix1,jx2);
713 dy12 = _mm_sub_ps(iy1,jy2);
714 dz12 = _mm_sub_ps(iz1,jz2);
715 dx13 = _mm_sub_ps(ix1,jx3);
716 dy13 = _mm_sub_ps(iy1,jy3);
717 dz13 = _mm_sub_ps(iz1,jz3);
718 dx21 = _mm_sub_ps(ix2,jx1);
719 dy21 = _mm_sub_ps(iy2,jy1);
720 dz21 = _mm_sub_ps(iz2,jz1);
721 dx22 = _mm_sub_ps(ix2,jx2);
722 dy22 = _mm_sub_ps(iy2,jy2);
723 dz22 = _mm_sub_ps(iz2,jz2);
724 dx23 = _mm_sub_ps(ix2,jx3);
725 dy23 = _mm_sub_ps(iy2,jy3);
726 dz23 = _mm_sub_ps(iz2,jz3);
727 dx31 = _mm_sub_ps(ix3,jx1);
728 dy31 = _mm_sub_ps(iy3,jy1);
729 dz31 = _mm_sub_ps(iz3,jz1);
730 dx32 = _mm_sub_ps(ix3,jx2);
731 dy32 = _mm_sub_ps(iy3,jy2);
732 dz32 = _mm_sub_ps(iz3,jz2);
733 dx33 = _mm_sub_ps(ix3,jx3);
734 dy33 = _mm_sub_ps(iy3,jy3);
735 dz33 = _mm_sub_ps(iz3,jz3);
737 /* Calculate squared distance and things based on it */
738 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
739 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
740 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
741 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
742 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
743 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
744 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
745 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
746 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
747 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
749 rinv00 = gmx_mm_invsqrt_ps(rsq00);
750 rinv11 = gmx_mm_invsqrt_ps(rsq11);
751 rinv12 = gmx_mm_invsqrt_ps(rsq12);
752 rinv13 = gmx_mm_invsqrt_ps(rsq13);
753 rinv21 = gmx_mm_invsqrt_ps(rsq21);
754 rinv22 = gmx_mm_invsqrt_ps(rsq22);
755 rinv23 = gmx_mm_invsqrt_ps(rsq23);
756 rinv31 = gmx_mm_invsqrt_ps(rsq31);
757 rinv32 = gmx_mm_invsqrt_ps(rsq32);
758 rinv33 = gmx_mm_invsqrt_ps(rsq33);
760 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
761 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
762 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
763 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
764 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
765 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
766 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
767 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
768 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
769 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
771 fjx0 = _mm_setzero_ps();
772 fjy0 = _mm_setzero_ps();
773 fjz0 = _mm_setzero_ps();
774 fjx1 = _mm_setzero_ps();
775 fjy1 = _mm_setzero_ps();
776 fjz1 = _mm_setzero_ps();
777 fjx2 = _mm_setzero_ps();
778 fjy2 = _mm_setzero_ps();
779 fjz2 = _mm_setzero_ps();
780 fjx3 = _mm_setzero_ps();
781 fjy3 = _mm_setzero_ps();
782 fjz3 = _mm_setzero_ps();
784 /**************************
785 * CALCULATE INTERACTIONS *
786 **************************/
788 r00 = _mm_mul_ps(rsq00,rinv00);
789 r00 = _mm_andnot_ps(dummy_mask,r00);
791 /* Analytical LJ-PME */
792 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
793 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
794 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
795 exponent = gmx_simd_exp_r(ewcljrsq);
796 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
797 poly = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
798 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
799 vvdw6 = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
800 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
801 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
802 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
803 fvdw = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
805 /* Update potential sum for this i atom from the interaction with this j atom. */
806 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
807 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
811 fscal = _mm_andnot_ps(dummy_mask,fscal);
813 /* Update vectorial force */
814 fix0 = _mm_macc_ps(dx00,fscal,fix0);
815 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
816 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
818 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
819 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
820 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
822 /**************************
823 * CALCULATE INTERACTIONS *
824 **************************/
826 r11 = _mm_mul_ps(rsq11,rinv11);
827 r11 = _mm_andnot_ps(dummy_mask,r11);
829 /* EWALD ELECTROSTATICS */
831 /* Analytical PME correction */
832 zeta2 = _mm_mul_ps(beta2,rsq11);
833 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
834 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
835 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
836 felec = _mm_mul_ps(qq11,felec);
837 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
838 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
839 velec = _mm_mul_ps(qq11,velec);
841 /* Update potential sum for this i atom from the interaction with this j atom. */
842 velec = _mm_andnot_ps(dummy_mask,velec);
843 velecsum = _mm_add_ps(velecsum,velec);
847 fscal = _mm_andnot_ps(dummy_mask,fscal);
849 /* Update vectorial force */
850 fix1 = _mm_macc_ps(dx11,fscal,fix1);
851 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
852 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
854 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
855 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
856 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
858 /**************************
859 * CALCULATE INTERACTIONS *
860 **************************/
862 r12 = _mm_mul_ps(rsq12,rinv12);
863 r12 = _mm_andnot_ps(dummy_mask,r12);
865 /* EWALD ELECTROSTATICS */
867 /* Analytical PME correction */
868 zeta2 = _mm_mul_ps(beta2,rsq12);
869 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
870 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
871 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
872 felec = _mm_mul_ps(qq12,felec);
873 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
874 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
875 velec = _mm_mul_ps(qq12,velec);
877 /* Update potential sum for this i atom from the interaction with this j atom. */
878 velec = _mm_andnot_ps(dummy_mask,velec);
879 velecsum = _mm_add_ps(velecsum,velec);
883 fscal = _mm_andnot_ps(dummy_mask,fscal);
885 /* Update vectorial force */
886 fix1 = _mm_macc_ps(dx12,fscal,fix1);
887 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
888 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
890 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
891 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
892 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
894 /**************************
895 * CALCULATE INTERACTIONS *
896 **************************/
898 r13 = _mm_mul_ps(rsq13,rinv13);
899 r13 = _mm_andnot_ps(dummy_mask,r13);
901 /* EWALD ELECTROSTATICS */
903 /* Analytical PME correction */
904 zeta2 = _mm_mul_ps(beta2,rsq13);
905 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
906 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
907 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
908 felec = _mm_mul_ps(qq13,felec);
909 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
910 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
911 velec = _mm_mul_ps(qq13,velec);
913 /* Update potential sum for this i atom from the interaction with this j atom. */
914 velec = _mm_andnot_ps(dummy_mask,velec);
915 velecsum = _mm_add_ps(velecsum,velec);
919 fscal = _mm_andnot_ps(dummy_mask,fscal);
921 /* Update vectorial force */
922 fix1 = _mm_macc_ps(dx13,fscal,fix1);
923 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
924 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
926 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
927 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
928 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
930 /**************************
931 * CALCULATE INTERACTIONS *
932 **************************/
934 r21 = _mm_mul_ps(rsq21,rinv21);
935 r21 = _mm_andnot_ps(dummy_mask,r21);
937 /* EWALD ELECTROSTATICS */
939 /* Analytical PME correction */
940 zeta2 = _mm_mul_ps(beta2,rsq21);
941 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
942 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
943 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
944 felec = _mm_mul_ps(qq21,felec);
945 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
946 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
947 velec = _mm_mul_ps(qq21,velec);
949 /* Update potential sum for this i atom from the interaction with this j atom. */
950 velec = _mm_andnot_ps(dummy_mask,velec);
951 velecsum = _mm_add_ps(velecsum,velec);
955 fscal = _mm_andnot_ps(dummy_mask,fscal);
957 /* Update vectorial force */
958 fix2 = _mm_macc_ps(dx21,fscal,fix2);
959 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
960 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
962 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
963 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
964 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
966 /**************************
967 * CALCULATE INTERACTIONS *
968 **************************/
970 r22 = _mm_mul_ps(rsq22,rinv22);
971 r22 = _mm_andnot_ps(dummy_mask,r22);
973 /* EWALD ELECTROSTATICS */
975 /* Analytical PME correction */
976 zeta2 = _mm_mul_ps(beta2,rsq22);
977 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
978 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
979 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
980 felec = _mm_mul_ps(qq22,felec);
981 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
982 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
983 velec = _mm_mul_ps(qq22,velec);
985 /* Update potential sum for this i atom from the interaction with this j atom. */
986 velec = _mm_andnot_ps(dummy_mask,velec);
987 velecsum = _mm_add_ps(velecsum,velec);
991 fscal = _mm_andnot_ps(dummy_mask,fscal);
993 /* Update vectorial force */
994 fix2 = _mm_macc_ps(dx22,fscal,fix2);
995 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
996 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
998 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
999 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1000 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1002 /**************************
1003 * CALCULATE INTERACTIONS *
1004 **************************/
1006 r23 = _mm_mul_ps(rsq23,rinv23);
1007 r23 = _mm_andnot_ps(dummy_mask,r23);
1009 /* EWALD ELECTROSTATICS */
1011 /* Analytical PME correction */
1012 zeta2 = _mm_mul_ps(beta2,rsq23);
1013 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
1014 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1015 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1016 felec = _mm_mul_ps(qq23,felec);
1017 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1018 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
1019 velec = _mm_mul_ps(qq23,velec);
1021 /* Update potential sum for this i atom from the interaction with this j atom. */
1022 velec = _mm_andnot_ps(dummy_mask,velec);
1023 velecsum = _mm_add_ps(velecsum,velec);
1027 fscal = _mm_andnot_ps(dummy_mask,fscal);
1029 /* Update vectorial force */
1030 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1031 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1032 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1034 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1035 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1036 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1038 /**************************
1039 * CALCULATE INTERACTIONS *
1040 **************************/
1042 r31 = _mm_mul_ps(rsq31,rinv31);
1043 r31 = _mm_andnot_ps(dummy_mask,r31);
1045 /* EWALD ELECTROSTATICS */
1047 /* Analytical PME correction */
1048 zeta2 = _mm_mul_ps(beta2,rsq31);
1049 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
1050 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1051 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1052 felec = _mm_mul_ps(qq31,felec);
1053 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1054 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
1055 velec = _mm_mul_ps(qq31,velec);
1057 /* Update potential sum for this i atom from the interaction with this j atom. */
1058 velec = _mm_andnot_ps(dummy_mask,velec);
1059 velecsum = _mm_add_ps(velecsum,velec);
1063 fscal = _mm_andnot_ps(dummy_mask,fscal);
1065 /* Update vectorial force */
1066 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1067 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1068 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1070 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1071 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1072 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1074 /**************************
1075 * CALCULATE INTERACTIONS *
1076 **************************/
1078 r32 = _mm_mul_ps(rsq32,rinv32);
1079 r32 = _mm_andnot_ps(dummy_mask,r32);
1081 /* EWALD ELECTROSTATICS */
1083 /* Analytical PME correction */
1084 zeta2 = _mm_mul_ps(beta2,rsq32);
1085 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
1086 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1087 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1088 felec = _mm_mul_ps(qq32,felec);
1089 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1090 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
1091 velec = _mm_mul_ps(qq32,velec);
1093 /* Update potential sum for this i atom from the interaction with this j atom. */
1094 velec = _mm_andnot_ps(dummy_mask,velec);
1095 velecsum = _mm_add_ps(velecsum,velec);
1099 fscal = _mm_andnot_ps(dummy_mask,fscal);
1101 /* Update vectorial force */
1102 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1103 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1104 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1106 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1107 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1108 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 r33 = _mm_mul_ps(rsq33,rinv33);
1115 r33 = _mm_andnot_ps(dummy_mask,r33);
1117 /* EWALD ELECTROSTATICS */
1119 /* Analytical PME correction */
1120 zeta2 = _mm_mul_ps(beta2,rsq33);
1121 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
1122 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1123 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1124 felec = _mm_mul_ps(qq33,felec);
1125 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1126 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
1127 velec = _mm_mul_ps(qq33,velec);
1129 /* Update potential sum for this i atom from the interaction with this j atom. */
1130 velec = _mm_andnot_ps(dummy_mask,velec);
1131 velecsum = _mm_add_ps(velecsum,velec);
1135 fscal = _mm_andnot_ps(dummy_mask,fscal);
1137 /* Update vectorial force */
1138 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1139 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1140 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1142 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1143 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1144 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1146 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1147 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1148 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1149 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1151 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1152 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1153 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1155 /* Inner loop uses 324 flops */
1158 /* End of innermost loop */
1160 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1161 f+i_coord_offset,fshift+i_shift_offset);
1164 /* Update potential energies */
1165 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1166 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1168 /* Increment number of inner iterations */
1169 inneriter += j_index_end - j_index_start;
1171 /* Outer loop uses 26 flops */
1174 /* Increment number of outer iterations */
1177 /* Update outer/inner flops */
1179 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*324);
1182 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_single
1183 * Electrostatics interaction: Ewald
1184 * VdW interaction: LJEwald
1185 * Geometry: Water4-Water4
1186 * Calculate force/pot: Force
1189 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_single
1190 (t_nblist * gmx_restrict nlist,
1191 rvec * gmx_restrict xx,
1192 rvec * gmx_restrict ff,
1193 t_forcerec * gmx_restrict fr,
1194 t_mdatoms * gmx_restrict mdatoms,
1195 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1196 t_nrnb * gmx_restrict nrnb)
1198 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1199 * just 0 for non-waters.
1200 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1201 * jnr indices corresponding to data put in the four positions in the SIMD register.
1203 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1204 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1205 int jnrA,jnrB,jnrC,jnrD;
1206 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1207 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1208 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1209 real rcutoff_scalar;
1210 real *shiftvec,*fshift,*x,*f;
1211 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1212 real scratch[4*DIM];
1213 __m128 fscal,rcutoff,rcutoff2,jidxall;
1215 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1217 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1219 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1221 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1222 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1223 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1224 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1225 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1226 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1227 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1228 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1229 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1230 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1231 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1232 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1233 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1234 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1235 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1236 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1237 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1238 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1239 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1240 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1243 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1246 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1247 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1259 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1260 __m128 one_half = _mm_set1_ps(0.5);
1261 __m128 minus_one = _mm_set1_ps(-1.0);
1263 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1264 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1266 __m128 dummy_mask,cutoff_mask;
1267 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1268 __m128 one = _mm_set1_ps(1.0);
1269 __m128 two = _mm_set1_ps(2.0);
1275 jindex = nlist->jindex;
1277 shiftidx = nlist->shift;
1279 shiftvec = fr->shift_vec[0];
1280 fshift = fr->fshift[0];
1281 facel = _mm_set1_ps(fr->epsfac);
1282 charge = mdatoms->chargeA;
1283 nvdwtype = fr->ntype;
1284 vdwparam = fr->nbfp;
1285 vdwtype = mdatoms->typeA;
1286 vdwgridparam = fr->ljpme_c6grid;
1287 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
1288 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
1289 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1291 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1292 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1293 beta2 = _mm_mul_ps(beta,beta);
1294 beta3 = _mm_mul_ps(beta,beta2);
1295 ewtab = fr->ic->tabq_coul_F;
1296 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1297 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1299 /* Setup water-specific parameters */
1300 inr = nlist->iinr[0];
1301 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1302 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1303 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1304 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1306 jq1 = _mm_set1_ps(charge[inr+1]);
1307 jq2 = _mm_set1_ps(charge[inr+2]);
1308 jq3 = _mm_set1_ps(charge[inr+3]);
1309 vdwjidx0A = 2*vdwtype[inr+0];
1310 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1311 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1312 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1313 qq11 = _mm_mul_ps(iq1,jq1);
1314 qq12 = _mm_mul_ps(iq1,jq2);
1315 qq13 = _mm_mul_ps(iq1,jq3);
1316 qq21 = _mm_mul_ps(iq2,jq1);
1317 qq22 = _mm_mul_ps(iq2,jq2);
1318 qq23 = _mm_mul_ps(iq2,jq3);
1319 qq31 = _mm_mul_ps(iq3,jq1);
1320 qq32 = _mm_mul_ps(iq3,jq2);
1321 qq33 = _mm_mul_ps(iq3,jq3);
1323 /* Avoid stupid compiler warnings */
1324 jnrA = jnrB = jnrC = jnrD = 0;
1325 j_coord_offsetA = 0;
1326 j_coord_offsetB = 0;
1327 j_coord_offsetC = 0;
1328 j_coord_offsetD = 0;
1333 for(iidx=0;iidx<4*DIM;iidx++)
1335 scratch[iidx] = 0.0;
1338 /* Start outer loop over neighborlists */
1339 for(iidx=0; iidx<nri; iidx++)
1341 /* Load shift vector for this list */
1342 i_shift_offset = DIM*shiftidx[iidx];
1344 /* Load limits for loop over neighbors */
1345 j_index_start = jindex[iidx];
1346 j_index_end = jindex[iidx+1];
1348 /* Get outer coordinate index */
1350 i_coord_offset = DIM*inr;
1352 /* Load i particle coords and add shift vector */
1353 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1354 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1356 fix0 = _mm_setzero_ps();
1357 fiy0 = _mm_setzero_ps();
1358 fiz0 = _mm_setzero_ps();
1359 fix1 = _mm_setzero_ps();
1360 fiy1 = _mm_setzero_ps();
1361 fiz1 = _mm_setzero_ps();
1362 fix2 = _mm_setzero_ps();
1363 fiy2 = _mm_setzero_ps();
1364 fiz2 = _mm_setzero_ps();
1365 fix3 = _mm_setzero_ps();
1366 fiy3 = _mm_setzero_ps();
1367 fiz3 = _mm_setzero_ps();
1369 /* Start inner kernel loop */
1370 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1373 /* Get j neighbor index, and coordinate index */
1375 jnrB = jjnr[jidx+1];
1376 jnrC = jjnr[jidx+2];
1377 jnrD = jjnr[jidx+3];
1378 j_coord_offsetA = DIM*jnrA;
1379 j_coord_offsetB = DIM*jnrB;
1380 j_coord_offsetC = DIM*jnrC;
1381 j_coord_offsetD = DIM*jnrD;
1383 /* load j atom coordinates */
1384 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1385 x+j_coord_offsetC,x+j_coord_offsetD,
1386 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1387 &jy2,&jz2,&jx3,&jy3,&jz3);
1389 /* Calculate displacement vector */
1390 dx00 = _mm_sub_ps(ix0,jx0);
1391 dy00 = _mm_sub_ps(iy0,jy0);
1392 dz00 = _mm_sub_ps(iz0,jz0);
1393 dx11 = _mm_sub_ps(ix1,jx1);
1394 dy11 = _mm_sub_ps(iy1,jy1);
1395 dz11 = _mm_sub_ps(iz1,jz1);
1396 dx12 = _mm_sub_ps(ix1,jx2);
1397 dy12 = _mm_sub_ps(iy1,jy2);
1398 dz12 = _mm_sub_ps(iz1,jz2);
1399 dx13 = _mm_sub_ps(ix1,jx3);
1400 dy13 = _mm_sub_ps(iy1,jy3);
1401 dz13 = _mm_sub_ps(iz1,jz3);
1402 dx21 = _mm_sub_ps(ix2,jx1);
1403 dy21 = _mm_sub_ps(iy2,jy1);
1404 dz21 = _mm_sub_ps(iz2,jz1);
1405 dx22 = _mm_sub_ps(ix2,jx2);
1406 dy22 = _mm_sub_ps(iy2,jy2);
1407 dz22 = _mm_sub_ps(iz2,jz2);
1408 dx23 = _mm_sub_ps(ix2,jx3);
1409 dy23 = _mm_sub_ps(iy2,jy3);
1410 dz23 = _mm_sub_ps(iz2,jz3);
1411 dx31 = _mm_sub_ps(ix3,jx1);
1412 dy31 = _mm_sub_ps(iy3,jy1);
1413 dz31 = _mm_sub_ps(iz3,jz1);
1414 dx32 = _mm_sub_ps(ix3,jx2);
1415 dy32 = _mm_sub_ps(iy3,jy2);
1416 dz32 = _mm_sub_ps(iz3,jz2);
1417 dx33 = _mm_sub_ps(ix3,jx3);
1418 dy33 = _mm_sub_ps(iy3,jy3);
1419 dz33 = _mm_sub_ps(iz3,jz3);
1421 /* Calculate squared distance and things based on it */
1422 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1423 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1424 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1425 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1426 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1427 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1428 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1429 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1430 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1431 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1433 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1434 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1435 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1436 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1437 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1438 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1439 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1440 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1441 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1442 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1444 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1445 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1446 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1447 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1448 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1449 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1450 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1451 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1452 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1453 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1455 fjx0 = _mm_setzero_ps();
1456 fjy0 = _mm_setzero_ps();
1457 fjz0 = _mm_setzero_ps();
1458 fjx1 = _mm_setzero_ps();
1459 fjy1 = _mm_setzero_ps();
1460 fjz1 = _mm_setzero_ps();
1461 fjx2 = _mm_setzero_ps();
1462 fjy2 = _mm_setzero_ps();
1463 fjz2 = _mm_setzero_ps();
1464 fjx3 = _mm_setzero_ps();
1465 fjy3 = _mm_setzero_ps();
1466 fjz3 = _mm_setzero_ps();
1468 /**************************
1469 * CALCULATE INTERACTIONS *
1470 **************************/
1472 r00 = _mm_mul_ps(rsq00,rinv00);
1474 /* Analytical LJ-PME */
1475 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1476 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1477 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1478 exponent = gmx_simd_exp_r(ewcljrsq);
1479 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1480 poly = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
1481 /* f6A = 6 * C6grid * (1 - poly) */
1482 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1483 /* f6B = C6grid * exponent * beta^6 */
1484 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1485 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1486 fvdw = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1490 /* Update vectorial force */
1491 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1492 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1493 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1495 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1496 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1497 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1499 /**************************
1500 * CALCULATE INTERACTIONS *
1501 **************************/
1503 r11 = _mm_mul_ps(rsq11,rinv11);
1505 /* EWALD ELECTROSTATICS */
1507 /* Analytical PME correction */
1508 zeta2 = _mm_mul_ps(beta2,rsq11);
1509 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1510 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1511 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1512 felec = _mm_mul_ps(qq11,felec);
1516 /* Update vectorial force */
1517 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1518 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1519 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1521 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1522 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1523 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1525 /**************************
1526 * CALCULATE INTERACTIONS *
1527 **************************/
1529 r12 = _mm_mul_ps(rsq12,rinv12);
1531 /* EWALD ELECTROSTATICS */
1533 /* Analytical PME correction */
1534 zeta2 = _mm_mul_ps(beta2,rsq12);
1535 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1536 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1537 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1538 felec = _mm_mul_ps(qq12,felec);
1542 /* Update vectorial force */
1543 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1544 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1545 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1547 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1548 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1549 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1551 /**************************
1552 * CALCULATE INTERACTIONS *
1553 **************************/
1555 r13 = _mm_mul_ps(rsq13,rinv13);
1557 /* EWALD ELECTROSTATICS */
1559 /* Analytical PME correction */
1560 zeta2 = _mm_mul_ps(beta2,rsq13);
1561 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
1562 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1563 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1564 felec = _mm_mul_ps(qq13,felec);
1568 /* Update vectorial force */
1569 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1570 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1571 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1573 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1574 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1575 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1577 /**************************
1578 * CALCULATE INTERACTIONS *
1579 **************************/
1581 r21 = _mm_mul_ps(rsq21,rinv21);
1583 /* EWALD ELECTROSTATICS */
1585 /* Analytical PME correction */
1586 zeta2 = _mm_mul_ps(beta2,rsq21);
1587 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1588 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1589 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1590 felec = _mm_mul_ps(qq21,felec);
1594 /* Update vectorial force */
1595 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1596 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1597 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1599 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1600 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1601 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1603 /**************************
1604 * CALCULATE INTERACTIONS *
1605 **************************/
1607 r22 = _mm_mul_ps(rsq22,rinv22);
1609 /* EWALD ELECTROSTATICS */
1611 /* Analytical PME correction */
1612 zeta2 = _mm_mul_ps(beta2,rsq22);
1613 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1614 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1615 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1616 felec = _mm_mul_ps(qq22,felec);
1620 /* Update vectorial force */
1621 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1622 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1623 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1625 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1626 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1627 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1629 /**************************
1630 * CALCULATE INTERACTIONS *
1631 **************************/
1633 r23 = _mm_mul_ps(rsq23,rinv23);
1635 /* EWALD ELECTROSTATICS */
1637 /* Analytical PME correction */
1638 zeta2 = _mm_mul_ps(beta2,rsq23);
1639 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
1640 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1641 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1642 felec = _mm_mul_ps(qq23,felec);
1646 /* Update vectorial force */
1647 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1648 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1649 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1651 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1652 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1653 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1655 /**************************
1656 * CALCULATE INTERACTIONS *
1657 **************************/
1659 r31 = _mm_mul_ps(rsq31,rinv31);
1661 /* EWALD ELECTROSTATICS */
1663 /* Analytical PME correction */
1664 zeta2 = _mm_mul_ps(beta2,rsq31);
1665 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
1666 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1667 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1668 felec = _mm_mul_ps(qq31,felec);
1672 /* Update vectorial force */
1673 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1674 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1675 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1677 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1678 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1679 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1681 /**************************
1682 * CALCULATE INTERACTIONS *
1683 **************************/
1685 r32 = _mm_mul_ps(rsq32,rinv32);
1687 /* EWALD ELECTROSTATICS */
1689 /* Analytical PME correction */
1690 zeta2 = _mm_mul_ps(beta2,rsq32);
1691 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
1692 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1693 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1694 felec = _mm_mul_ps(qq32,felec);
1698 /* Update vectorial force */
1699 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1700 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1701 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1703 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1704 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1705 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1707 /**************************
1708 * CALCULATE INTERACTIONS *
1709 **************************/
1711 r33 = _mm_mul_ps(rsq33,rinv33);
1713 /* EWALD ELECTROSTATICS */
1715 /* Analytical PME correction */
1716 zeta2 = _mm_mul_ps(beta2,rsq33);
1717 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
1718 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1719 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1720 felec = _mm_mul_ps(qq33,felec);
1724 /* Update vectorial force */
1725 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1726 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1727 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1729 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1730 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1731 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1733 fjptrA = f+j_coord_offsetA;
1734 fjptrB = f+j_coord_offsetB;
1735 fjptrC = f+j_coord_offsetC;
1736 fjptrD = f+j_coord_offsetD;
1738 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1739 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1740 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1742 /* Inner loop uses 302 flops */
1745 if(jidx<j_index_end)
1748 /* Get j neighbor index, and coordinate index */
1749 jnrlistA = jjnr[jidx];
1750 jnrlistB = jjnr[jidx+1];
1751 jnrlistC = jjnr[jidx+2];
1752 jnrlistD = jjnr[jidx+3];
1753 /* Sign of each element will be negative for non-real atoms.
1754 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1755 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1757 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1758 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1759 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1760 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1761 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1762 j_coord_offsetA = DIM*jnrA;
1763 j_coord_offsetB = DIM*jnrB;
1764 j_coord_offsetC = DIM*jnrC;
1765 j_coord_offsetD = DIM*jnrD;
1767 /* load j atom coordinates */
1768 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1769 x+j_coord_offsetC,x+j_coord_offsetD,
1770 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1771 &jy2,&jz2,&jx3,&jy3,&jz3);
1773 /* Calculate displacement vector */
1774 dx00 = _mm_sub_ps(ix0,jx0);
1775 dy00 = _mm_sub_ps(iy0,jy0);
1776 dz00 = _mm_sub_ps(iz0,jz0);
1777 dx11 = _mm_sub_ps(ix1,jx1);
1778 dy11 = _mm_sub_ps(iy1,jy1);
1779 dz11 = _mm_sub_ps(iz1,jz1);
1780 dx12 = _mm_sub_ps(ix1,jx2);
1781 dy12 = _mm_sub_ps(iy1,jy2);
1782 dz12 = _mm_sub_ps(iz1,jz2);
1783 dx13 = _mm_sub_ps(ix1,jx3);
1784 dy13 = _mm_sub_ps(iy1,jy3);
1785 dz13 = _mm_sub_ps(iz1,jz3);
1786 dx21 = _mm_sub_ps(ix2,jx1);
1787 dy21 = _mm_sub_ps(iy2,jy1);
1788 dz21 = _mm_sub_ps(iz2,jz1);
1789 dx22 = _mm_sub_ps(ix2,jx2);
1790 dy22 = _mm_sub_ps(iy2,jy2);
1791 dz22 = _mm_sub_ps(iz2,jz2);
1792 dx23 = _mm_sub_ps(ix2,jx3);
1793 dy23 = _mm_sub_ps(iy2,jy3);
1794 dz23 = _mm_sub_ps(iz2,jz3);
1795 dx31 = _mm_sub_ps(ix3,jx1);
1796 dy31 = _mm_sub_ps(iy3,jy1);
1797 dz31 = _mm_sub_ps(iz3,jz1);
1798 dx32 = _mm_sub_ps(ix3,jx2);
1799 dy32 = _mm_sub_ps(iy3,jy2);
1800 dz32 = _mm_sub_ps(iz3,jz2);
1801 dx33 = _mm_sub_ps(ix3,jx3);
1802 dy33 = _mm_sub_ps(iy3,jy3);
1803 dz33 = _mm_sub_ps(iz3,jz3);
1805 /* Calculate squared distance and things based on it */
1806 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1807 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1808 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1809 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1810 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1811 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1812 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1813 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1814 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1815 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1817 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1818 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1819 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1820 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1821 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1822 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1823 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1824 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1825 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1826 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1828 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1829 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1830 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1831 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1832 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1833 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1834 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1835 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1836 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1837 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1839 fjx0 = _mm_setzero_ps();
1840 fjy0 = _mm_setzero_ps();
1841 fjz0 = _mm_setzero_ps();
1842 fjx1 = _mm_setzero_ps();
1843 fjy1 = _mm_setzero_ps();
1844 fjz1 = _mm_setzero_ps();
1845 fjx2 = _mm_setzero_ps();
1846 fjy2 = _mm_setzero_ps();
1847 fjz2 = _mm_setzero_ps();
1848 fjx3 = _mm_setzero_ps();
1849 fjy3 = _mm_setzero_ps();
1850 fjz3 = _mm_setzero_ps();
1852 /**************************
1853 * CALCULATE INTERACTIONS *
1854 **************************/
1856 r00 = _mm_mul_ps(rsq00,rinv00);
1857 r00 = _mm_andnot_ps(dummy_mask,r00);
1859 /* Analytical LJ-PME */
1860 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1861 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1862 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1863 exponent = gmx_simd_exp_r(ewcljrsq);
1864 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1865 poly = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
1866 /* f6A = 6 * C6grid * (1 - poly) */
1867 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1868 /* f6B = C6grid * exponent * beta^6 */
1869 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1870 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1871 fvdw = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1875 fscal = _mm_andnot_ps(dummy_mask,fscal);
1877 /* Update vectorial force */
1878 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1879 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1880 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1882 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1883 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1884 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1886 /**************************
1887 * CALCULATE INTERACTIONS *
1888 **************************/
1890 r11 = _mm_mul_ps(rsq11,rinv11);
1891 r11 = _mm_andnot_ps(dummy_mask,r11);
1893 /* EWALD ELECTROSTATICS */
1895 /* Analytical PME correction */
1896 zeta2 = _mm_mul_ps(beta2,rsq11);
1897 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1898 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1899 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1900 felec = _mm_mul_ps(qq11,felec);
1904 fscal = _mm_andnot_ps(dummy_mask,fscal);
1906 /* Update vectorial force */
1907 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1908 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1909 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1911 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1912 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1913 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1915 /**************************
1916 * CALCULATE INTERACTIONS *
1917 **************************/
1919 r12 = _mm_mul_ps(rsq12,rinv12);
1920 r12 = _mm_andnot_ps(dummy_mask,r12);
1922 /* EWALD ELECTROSTATICS */
1924 /* Analytical PME correction */
1925 zeta2 = _mm_mul_ps(beta2,rsq12);
1926 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1927 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1928 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1929 felec = _mm_mul_ps(qq12,felec);
1933 fscal = _mm_andnot_ps(dummy_mask,fscal);
1935 /* Update vectorial force */
1936 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1937 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1938 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1940 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1941 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1942 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1944 /**************************
1945 * CALCULATE INTERACTIONS *
1946 **************************/
1948 r13 = _mm_mul_ps(rsq13,rinv13);
1949 r13 = _mm_andnot_ps(dummy_mask,r13);
1951 /* EWALD ELECTROSTATICS */
1953 /* Analytical PME correction */
1954 zeta2 = _mm_mul_ps(beta2,rsq13);
1955 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
1956 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1957 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1958 felec = _mm_mul_ps(qq13,felec);
1962 fscal = _mm_andnot_ps(dummy_mask,fscal);
1964 /* Update vectorial force */
1965 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1966 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1967 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1969 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1970 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1971 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1973 /**************************
1974 * CALCULATE INTERACTIONS *
1975 **************************/
1977 r21 = _mm_mul_ps(rsq21,rinv21);
1978 r21 = _mm_andnot_ps(dummy_mask,r21);
1980 /* EWALD ELECTROSTATICS */
1982 /* Analytical PME correction */
1983 zeta2 = _mm_mul_ps(beta2,rsq21);
1984 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1985 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1986 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1987 felec = _mm_mul_ps(qq21,felec);
1991 fscal = _mm_andnot_ps(dummy_mask,fscal);
1993 /* Update vectorial force */
1994 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1995 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1996 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1998 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1999 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2000 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2002 /**************************
2003 * CALCULATE INTERACTIONS *
2004 **************************/
2006 r22 = _mm_mul_ps(rsq22,rinv22);
2007 r22 = _mm_andnot_ps(dummy_mask,r22);
2009 /* EWALD ELECTROSTATICS */
2011 /* Analytical PME correction */
2012 zeta2 = _mm_mul_ps(beta2,rsq22);
2013 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2014 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2015 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2016 felec = _mm_mul_ps(qq22,felec);
2020 fscal = _mm_andnot_ps(dummy_mask,fscal);
2022 /* Update vectorial force */
2023 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2024 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2025 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2027 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2028 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2029 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2031 /**************************
2032 * CALCULATE INTERACTIONS *
2033 **************************/
2035 r23 = _mm_mul_ps(rsq23,rinv23);
2036 r23 = _mm_andnot_ps(dummy_mask,r23);
2038 /* EWALD ELECTROSTATICS */
2040 /* Analytical PME correction */
2041 zeta2 = _mm_mul_ps(beta2,rsq23);
2042 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
2043 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2044 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2045 felec = _mm_mul_ps(qq23,felec);
2049 fscal = _mm_andnot_ps(dummy_mask,fscal);
2051 /* Update vectorial force */
2052 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2053 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2054 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2056 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2057 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2058 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2060 /**************************
2061 * CALCULATE INTERACTIONS *
2062 **************************/
2064 r31 = _mm_mul_ps(rsq31,rinv31);
2065 r31 = _mm_andnot_ps(dummy_mask,r31);
2067 /* EWALD ELECTROSTATICS */
2069 /* Analytical PME correction */
2070 zeta2 = _mm_mul_ps(beta2,rsq31);
2071 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
2072 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2073 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2074 felec = _mm_mul_ps(qq31,felec);
2078 fscal = _mm_andnot_ps(dummy_mask,fscal);
2080 /* Update vectorial force */
2081 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2082 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2083 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2085 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2086 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2087 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2089 /**************************
2090 * CALCULATE INTERACTIONS *
2091 **************************/
2093 r32 = _mm_mul_ps(rsq32,rinv32);
2094 r32 = _mm_andnot_ps(dummy_mask,r32);
2096 /* EWALD ELECTROSTATICS */
2098 /* Analytical PME correction */
2099 zeta2 = _mm_mul_ps(beta2,rsq32);
2100 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
2101 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2102 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2103 felec = _mm_mul_ps(qq32,felec);
2107 fscal = _mm_andnot_ps(dummy_mask,fscal);
2109 /* Update vectorial force */
2110 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2111 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2112 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2114 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2115 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2116 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2118 /**************************
2119 * CALCULATE INTERACTIONS *
2120 **************************/
2122 r33 = _mm_mul_ps(rsq33,rinv33);
2123 r33 = _mm_andnot_ps(dummy_mask,r33);
2125 /* EWALD ELECTROSTATICS */
2127 /* Analytical PME correction */
2128 zeta2 = _mm_mul_ps(beta2,rsq33);
2129 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
2130 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2131 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2132 felec = _mm_mul_ps(qq33,felec);
2136 fscal = _mm_andnot_ps(dummy_mask,fscal);
2138 /* Update vectorial force */
2139 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2140 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2141 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2143 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2144 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2145 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2147 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2148 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2149 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2150 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2152 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2153 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2154 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2156 /* Inner loop uses 312 flops */
2159 /* End of innermost loop */
2161 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2162 f+i_coord_offset,fshift+i_shift_offset);
2164 /* Increment number of inner iterations */
2165 inneriter += j_index_end - j_index_start;
2167 /* Outer loop uses 24 flops */
2170 /* Increment number of outer iterations */
2173 /* Update outer/inner flops */
2175 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*312);