2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
118 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
119 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
121 __m128 dummy_mask,cutoff_mask;
122 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
123 __m128 one = _mm_set1_ps(1.0);
124 __m128 two = _mm_set1_ps(2.0);
130 jindex = nlist->jindex;
132 shiftidx = nlist->shift;
134 shiftvec = fr->shift_vec[0];
135 fshift = fr->fshift[0];
136 facel = _mm_set1_ps(fr->epsfac);
137 charge = mdatoms->chargeA;
138 nvdwtype = fr->ntype;
140 vdwtype = mdatoms->typeA;
142 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
143 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
144 beta2 = _mm_mul_ps(beta,beta);
145 beta3 = _mm_mul_ps(beta,beta2);
146 ewtab = fr->ic->tabq_coul_FDV0;
147 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
148 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
150 /* Setup water-specific parameters */
151 inr = nlist->iinr[0];
152 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
153 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
154 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
155 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
157 jq1 = _mm_set1_ps(charge[inr+1]);
158 jq2 = _mm_set1_ps(charge[inr+2]);
159 jq3 = _mm_set1_ps(charge[inr+3]);
160 vdwjidx0A = 2*vdwtype[inr+0];
161 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
162 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
163 qq11 = _mm_mul_ps(iq1,jq1);
164 qq12 = _mm_mul_ps(iq1,jq2);
165 qq13 = _mm_mul_ps(iq1,jq3);
166 qq21 = _mm_mul_ps(iq2,jq1);
167 qq22 = _mm_mul_ps(iq2,jq2);
168 qq23 = _mm_mul_ps(iq2,jq3);
169 qq31 = _mm_mul_ps(iq3,jq1);
170 qq32 = _mm_mul_ps(iq3,jq2);
171 qq33 = _mm_mul_ps(iq3,jq3);
173 /* Avoid stupid compiler warnings */
174 jnrA = jnrB = jnrC = jnrD = 0;
183 for(iidx=0;iidx<4*DIM;iidx++)
188 /* Start outer loop over neighborlists */
189 for(iidx=0; iidx<nri; iidx++)
191 /* Load shift vector for this list */
192 i_shift_offset = DIM*shiftidx[iidx];
194 /* Load limits for loop over neighbors */
195 j_index_start = jindex[iidx];
196 j_index_end = jindex[iidx+1];
198 /* Get outer coordinate index */
200 i_coord_offset = DIM*inr;
202 /* Load i particle coords and add shift vector */
203 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
204 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
206 fix0 = _mm_setzero_ps();
207 fiy0 = _mm_setzero_ps();
208 fiz0 = _mm_setzero_ps();
209 fix1 = _mm_setzero_ps();
210 fiy1 = _mm_setzero_ps();
211 fiz1 = _mm_setzero_ps();
212 fix2 = _mm_setzero_ps();
213 fiy2 = _mm_setzero_ps();
214 fiz2 = _mm_setzero_ps();
215 fix3 = _mm_setzero_ps();
216 fiy3 = _mm_setzero_ps();
217 fiz3 = _mm_setzero_ps();
219 /* Reset potential sums */
220 velecsum = _mm_setzero_ps();
221 vvdwsum = _mm_setzero_ps();
223 /* Start inner kernel loop */
224 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
227 /* Get j neighbor index, and coordinate index */
232 j_coord_offsetA = DIM*jnrA;
233 j_coord_offsetB = DIM*jnrB;
234 j_coord_offsetC = DIM*jnrC;
235 j_coord_offsetD = DIM*jnrD;
237 /* load j atom coordinates */
238 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
239 x+j_coord_offsetC,x+j_coord_offsetD,
240 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
241 &jy2,&jz2,&jx3,&jy3,&jz3);
243 /* Calculate displacement vector */
244 dx00 = _mm_sub_ps(ix0,jx0);
245 dy00 = _mm_sub_ps(iy0,jy0);
246 dz00 = _mm_sub_ps(iz0,jz0);
247 dx11 = _mm_sub_ps(ix1,jx1);
248 dy11 = _mm_sub_ps(iy1,jy1);
249 dz11 = _mm_sub_ps(iz1,jz1);
250 dx12 = _mm_sub_ps(ix1,jx2);
251 dy12 = _mm_sub_ps(iy1,jy2);
252 dz12 = _mm_sub_ps(iz1,jz2);
253 dx13 = _mm_sub_ps(ix1,jx3);
254 dy13 = _mm_sub_ps(iy1,jy3);
255 dz13 = _mm_sub_ps(iz1,jz3);
256 dx21 = _mm_sub_ps(ix2,jx1);
257 dy21 = _mm_sub_ps(iy2,jy1);
258 dz21 = _mm_sub_ps(iz2,jz1);
259 dx22 = _mm_sub_ps(ix2,jx2);
260 dy22 = _mm_sub_ps(iy2,jy2);
261 dz22 = _mm_sub_ps(iz2,jz2);
262 dx23 = _mm_sub_ps(ix2,jx3);
263 dy23 = _mm_sub_ps(iy2,jy3);
264 dz23 = _mm_sub_ps(iz2,jz3);
265 dx31 = _mm_sub_ps(ix3,jx1);
266 dy31 = _mm_sub_ps(iy3,jy1);
267 dz31 = _mm_sub_ps(iz3,jz1);
268 dx32 = _mm_sub_ps(ix3,jx2);
269 dy32 = _mm_sub_ps(iy3,jy2);
270 dz32 = _mm_sub_ps(iz3,jz2);
271 dx33 = _mm_sub_ps(ix3,jx3);
272 dy33 = _mm_sub_ps(iy3,jy3);
273 dz33 = _mm_sub_ps(iz3,jz3);
275 /* Calculate squared distance and things based on it */
276 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
277 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
278 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
279 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
280 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
281 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
282 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
283 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
284 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
285 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
287 rinv11 = gmx_mm_invsqrt_ps(rsq11);
288 rinv12 = gmx_mm_invsqrt_ps(rsq12);
289 rinv13 = gmx_mm_invsqrt_ps(rsq13);
290 rinv21 = gmx_mm_invsqrt_ps(rsq21);
291 rinv22 = gmx_mm_invsqrt_ps(rsq22);
292 rinv23 = gmx_mm_invsqrt_ps(rsq23);
293 rinv31 = gmx_mm_invsqrt_ps(rsq31);
294 rinv32 = gmx_mm_invsqrt_ps(rsq32);
295 rinv33 = gmx_mm_invsqrt_ps(rsq33);
297 rinvsq00 = gmx_mm_inv_ps(rsq00);
298 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
299 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
300 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
301 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
302 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
303 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
304 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
305 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
306 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
308 fjx0 = _mm_setzero_ps();
309 fjy0 = _mm_setzero_ps();
310 fjz0 = _mm_setzero_ps();
311 fjx1 = _mm_setzero_ps();
312 fjy1 = _mm_setzero_ps();
313 fjz1 = _mm_setzero_ps();
314 fjx2 = _mm_setzero_ps();
315 fjy2 = _mm_setzero_ps();
316 fjz2 = _mm_setzero_ps();
317 fjx3 = _mm_setzero_ps();
318 fjy3 = _mm_setzero_ps();
319 fjz3 = _mm_setzero_ps();
321 /**************************
322 * CALCULATE INTERACTIONS *
323 **************************/
325 /* LENNARD-JONES DISPERSION/REPULSION */
327 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
328 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
329 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
330 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
331 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
333 /* Update potential sum for this i atom from the interaction with this j atom. */
334 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
338 /* Update vectorial force */
339 fix0 = _mm_macc_ps(dx00,fscal,fix0);
340 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
341 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
343 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
344 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
345 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
347 /**************************
348 * CALCULATE INTERACTIONS *
349 **************************/
351 r11 = _mm_mul_ps(rsq11,rinv11);
353 /* EWALD ELECTROSTATICS */
355 /* Analytical PME correction */
356 zeta2 = _mm_mul_ps(beta2,rsq11);
357 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
358 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
359 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
360 felec = _mm_mul_ps(qq11,felec);
361 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
362 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
363 velec = _mm_mul_ps(qq11,velec);
365 /* Update potential sum for this i atom from the interaction with this j atom. */
366 velecsum = _mm_add_ps(velecsum,velec);
370 /* Update vectorial force */
371 fix1 = _mm_macc_ps(dx11,fscal,fix1);
372 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
373 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
375 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
376 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
377 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
379 /**************************
380 * CALCULATE INTERACTIONS *
381 **************************/
383 r12 = _mm_mul_ps(rsq12,rinv12);
385 /* EWALD ELECTROSTATICS */
387 /* Analytical PME correction */
388 zeta2 = _mm_mul_ps(beta2,rsq12);
389 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
390 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
391 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
392 felec = _mm_mul_ps(qq12,felec);
393 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
394 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
395 velec = _mm_mul_ps(qq12,velec);
397 /* Update potential sum for this i atom from the interaction with this j atom. */
398 velecsum = _mm_add_ps(velecsum,velec);
402 /* Update vectorial force */
403 fix1 = _mm_macc_ps(dx12,fscal,fix1);
404 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
405 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
407 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
408 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
409 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
411 /**************************
412 * CALCULATE INTERACTIONS *
413 **************************/
415 r13 = _mm_mul_ps(rsq13,rinv13);
417 /* EWALD ELECTROSTATICS */
419 /* Analytical PME correction */
420 zeta2 = _mm_mul_ps(beta2,rsq13);
421 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
422 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
423 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
424 felec = _mm_mul_ps(qq13,felec);
425 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
426 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
427 velec = _mm_mul_ps(qq13,velec);
429 /* Update potential sum for this i atom from the interaction with this j atom. */
430 velecsum = _mm_add_ps(velecsum,velec);
434 /* Update vectorial force */
435 fix1 = _mm_macc_ps(dx13,fscal,fix1);
436 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
437 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
439 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
440 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
441 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
443 /**************************
444 * CALCULATE INTERACTIONS *
445 **************************/
447 r21 = _mm_mul_ps(rsq21,rinv21);
449 /* EWALD ELECTROSTATICS */
451 /* Analytical PME correction */
452 zeta2 = _mm_mul_ps(beta2,rsq21);
453 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
454 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
455 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
456 felec = _mm_mul_ps(qq21,felec);
457 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
458 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
459 velec = _mm_mul_ps(qq21,velec);
461 /* Update potential sum for this i atom from the interaction with this j atom. */
462 velecsum = _mm_add_ps(velecsum,velec);
466 /* Update vectorial force */
467 fix2 = _mm_macc_ps(dx21,fscal,fix2);
468 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
469 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
471 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
472 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
473 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 r22 = _mm_mul_ps(rsq22,rinv22);
481 /* EWALD ELECTROSTATICS */
483 /* Analytical PME correction */
484 zeta2 = _mm_mul_ps(beta2,rsq22);
485 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
486 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
487 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
488 felec = _mm_mul_ps(qq22,felec);
489 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
490 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
491 velec = _mm_mul_ps(qq22,velec);
493 /* Update potential sum for this i atom from the interaction with this j atom. */
494 velecsum = _mm_add_ps(velecsum,velec);
498 /* Update vectorial force */
499 fix2 = _mm_macc_ps(dx22,fscal,fix2);
500 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
501 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
503 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
504 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
505 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
507 /**************************
508 * CALCULATE INTERACTIONS *
509 **************************/
511 r23 = _mm_mul_ps(rsq23,rinv23);
513 /* EWALD ELECTROSTATICS */
515 /* Analytical PME correction */
516 zeta2 = _mm_mul_ps(beta2,rsq23);
517 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
518 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
519 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
520 felec = _mm_mul_ps(qq23,felec);
521 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
522 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
523 velec = _mm_mul_ps(qq23,velec);
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum = _mm_add_ps(velecsum,velec);
530 /* Update vectorial force */
531 fix2 = _mm_macc_ps(dx23,fscal,fix2);
532 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
533 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
535 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
536 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
537 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
539 /**************************
540 * CALCULATE INTERACTIONS *
541 **************************/
543 r31 = _mm_mul_ps(rsq31,rinv31);
545 /* EWALD ELECTROSTATICS */
547 /* Analytical PME correction */
548 zeta2 = _mm_mul_ps(beta2,rsq31);
549 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
550 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
551 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
552 felec = _mm_mul_ps(qq31,felec);
553 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
554 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
555 velec = _mm_mul_ps(qq31,velec);
557 /* Update potential sum for this i atom from the interaction with this j atom. */
558 velecsum = _mm_add_ps(velecsum,velec);
562 /* Update vectorial force */
563 fix3 = _mm_macc_ps(dx31,fscal,fix3);
564 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
565 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
567 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
568 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
569 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
571 /**************************
572 * CALCULATE INTERACTIONS *
573 **************************/
575 r32 = _mm_mul_ps(rsq32,rinv32);
577 /* EWALD ELECTROSTATICS */
579 /* Analytical PME correction */
580 zeta2 = _mm_mul_ps(beta2,rsq32);
581 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
582 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
583 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
584 felec = _mm_mul_ps(qq32,felec);
585 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
586 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
587 velec = _mm_mul_ps(qq32,velec);
589 /* Update potential sum for this i atom from the interaction with this j atom. */
590 velecsum = _mm_add_ps(velecsum,velec);
594 /* Update vectorial force */
595 fix3 = _mm_macc_ps(dx32,fscal,fix3);
596 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
597 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
599 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
600 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
601 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
603 /**************************
604 * CALCULATE INTERACTIONS *
605 **************************/
607 r33 = _mm_mul_ps(rsq33,rinv33);
609 /* EWALD ELECTROSTATICS */
611 /* Analytical PME correction */
612 zeta2 = _mm_mul_ps(beta2,rsq33);
613 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
614 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
615 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
616 felec = _mm_mul_ps(qq33,felec);
617 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
618 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
619 velec = _mm_mul_ps(qq33,velec);
621 /* Update potential sum for this i atom from the interaction with this j atom. */
622 velecsum = _mm_add_ps(velecsum,velec);
626 /* Update vectorial force */
627 fix3 = _mm_macc_ps(dx33,fscal,fix3);
628 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
629 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
631 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
632 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
633 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
635 fjptrA = f+j_coord_offsetA;
636 fjptrB = f+j_coord_offsetB;
637 fjptrC = f+j_coord_offsetC;
638 fjptrD = f+j_coord_offsetD;
640 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
641 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
642 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
644 /* Inner loop uses 299 flops */
650 /* Get j neighbor index, and coordinate index */
651 jnrlistA = jjnr[jidx];
652 jnrlistB = jjnr[jidx+1];
653 jnrlistC = jjnr[jidx+2];
654 jnrlistD = jjnr[jidx+3];
655 /* Sign of each element will be negative for non-real atoms.
656 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
657 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
659 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
660 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
661 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
662 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
663 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
664 j_coord_offsetA = DIM*jnrA;
665 j_coord_offsetB = DIM*jnrB;
666 j_coord_offsetC = DIM*jnrC;
667 j_coord_offsetD = DIM*jnrD;
669 /* load j atom coordinates */
670 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
671 x+j_coord_offsetC,x+j_coord_offsetD,
672 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
673 &jy2,&jz2,&jx3,&jy3,&jz3);
675 /* Calculate displacement vector */
676 dx00 = _mm_sub_ps(ix0,jx0);
677 dy00 = _mm_sub_ps(iy0,jy0);
678 dz00 = _mm_sub_ps(iz0,jz0);
679 dx11 = _mm_sub_ps(ix1,jx1);
680 dy11 = _mm_sub_ps(iy1,jy1);
681 dz11 = _mm_sub_ps(iz1,jz1);
682 dx12 = _mm_sub_ps(ix1,jx2);
683 dy12 = _mm_sub_ps(iy1,jy2);
684 dz12 = _mm_sub_ps(iz1,jz2);
685 dx13 = _mm_sub_ps(ix1,jx3);
686 dy13 = _mm_sub_ps(iy1,jy3);
687 dz13 = _mm_sub_ps(iz1,jz3);
688 dx21 = _mm_sub_ps(ix2,jx1);
689 dy21 = _mm_sub_ps(iy2,jy1);
690 dz21 = _mm_sub_ps(iz2,jz1);
691 dx22 = _mm_sub_ps(ix2,jx2);
692 dy22 = _mm_sub_ps(iy2,jy2);
693 dz22 = _mm_sub_ps(iz2,jz2);
694 dx23 = _mm_sub_ps(ix2,jx3);
695 dy23 = _mm_sub_ps(iy2,jy3);
696 dz23 = _mm_sub_ps(iz2,jz3);
697 dx31 = _mm_sub_ps(ix3,jx1);
698 dy31 = _mm_sub_ps(iy3,jy1);
699 dz31 = _mm_sub_ps(iz3,jz1);
700 dx32 = _mm_sub_ps(ix3,jx2);
701 dy32 = _mm_sub_ps(iy3,jy2);
702 dz32 = _mm_sub_ps(iz3,jz2);
703 dx33 = _mm_sub_ps(ix3,jx3);
704 dy33 = _mm_sub_ps(iy3,jy3);
705 dz33 = _mm_sub_ps(iz3,jz3);
707 /* Calculate squared distance and things based on it */
708 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
709 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
710 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
711 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
712 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
713 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
714 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
715 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
716 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
717 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
719 rinv11 = gmx_mm_invsqrt_ps(rsq11);
720 rinv12 = gmx_mm_invsqrt_ps(rsq12);
721 rinv13 = gmx_mm_invsqrt_ps(rsq13);
722 rinv21 = gmx_mm_invsqrt_ps(rsq21);
723 rinv22 = gmx_mm_invsqrt_ps(rsq22);
724 rinv23 = gmx_mm_invsqrt_ps(rsq23);
725 rinv31 = gmx_mm_invsqrt_ps(rsq31);
726 rinv32 = gmx_mm_invsqrt_ps(rsq32);
727 rinv33 = gmx_mm_invsqrt_ps(rsq33);
729 rinvsq00 = gmx_mm_inv_ps(rsq00);
730 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
731 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
732 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
733 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
734 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
735 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
736 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
737 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
738 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
740 fjx0 = _mm_setzero_ps();
741 fjy0 = _mm_setzero_ps();
742 fjz0 = _mm_setzero_ps();
743 fjx1 = _mm_setzero_ps();
744 fjy1 = _mm_setzero_ps();
745 fjz1 = _mm_setzero_ps();
746 fjx2 = _mm_setzero_ps();
747 fjy2 = _mm_setzero_ps();
748 fjz2 = _mm_setzero_ps();
749 fjx3 = _mm_setzero_ps();
750 fjy3 = _mm_setzero_ps();
751 fjz3 = _mm_setzero_ps();
753 /**************************
754 * CALCULATE INTERACTIONS *
755 **************************/
757 /* LENNARD-JONES DISPERSION/REPULSION */
759 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
760 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
761 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
762 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
763 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
765 /* Update potential sum for this i atom from the interaction with this j atom. */
766 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
767 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
771 fscal = _mm_andnot_ps(dummy_mask,fscal);
773 /* Update vectorial force */
774 fix0 = _mm_macc_ps(dx00,fscal,fix0);
775 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
776 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
778 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
779 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
780 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
782 /**************************
783 * CALCULATE INTERACTIONS *
784 **************************/
786 r11 = _mm_mul_ps(rsq11,rinv11);
787 r11 = _mm_andnot_ps(dummy_mask,r11);
789 /* EWALD ELECTROSTATICS */
791 /* Analytical PME correction */
792 zeta2 = _mm_mul_ps(beta2,rsq11);
793 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
794 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
795 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
796 felec = _mm_mul_ps(qq11,felec);
797 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
798 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
799 velec = _mm_mul_ps(qq11,velec);
801 /* Update potential sum for this i atom from the interaction with this j atom. */
802 velec = _mm_andnot_ps(dummy_mask,velec);
803 velecsum = _mm_add_ps(velecsum,velec);
807 fscal = _mm_andnot_ps(dummy_mask,fscal);
809 /* Update vectorial force */
810 fix1 = _mm_macc_ps(dx11,fscal,fix1);
811 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
812 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
814 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
815 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
816 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
818 /**************************
819 * CALCULATE INTERACTIONS *
820 **************************/
822 r12 = _mm_mul_ps(rsq12,rinv12);
823 r12 = _mm_andnot_ps(dummy_mask,r12);
825 /* EWALD ELECTROSTATICS */
827 /* Analytical PME correction */
828 zeta2 = _mm_mul_ps(beta2,rsq12);
829 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
830 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
831 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
832 felec = _mm_mul_ps(qq12,felec);
833 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
834 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
835 velec = _mm_mul_ps(qq12,velec);
837 /* Update potential sum for this i atom from the interaction with this j atom. */
838 velec = _mm_andnot_ps(dummy_mask,velec);
839 velecsum = _mm_add_ps(velecsum,velec);
843 fscal = _mm_andnot_ps(dummy_mask,fscal);
845 /* Update vectorial force */
846 fix1 = _mm_macc_ps(dx12,fscal,fix1);
847 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
848 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
850 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
851 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
852 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
854 /**************************
855 * CALCULATE INTERACTIONS *
856 **************************/
858 r13 = _mm_mul_ps(rsq13,rinv13);
859 r13 = _mm_andnot_ps(dummy_mask,r13);
861 /* EWALD ELECTROSTATICS */
863 /* Analytical PME correction */
864 zeta2 = _mm_mul_ps(beta2,rsq13);
865 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
866 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
867 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
868 felec = _mm_mul_ps(qq13,felec);
869 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
870 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
871 velec = _mm_mul_ps(qq13,velec);
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm_andnot_ps(dummy_mask,velec);
875 velecsum = _mm_add_ps(velecsum,velec);
879 fscal = _mm_andnot_ps(dummy_mask,fscal);
881 /* Update vectorial force */
882 fix1 = _mm_macc_ps(dx13,fscal,fix1);
883 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
884 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
886 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
887 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
888 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
890 /**************************
891 * CALCULATE INTERACTIONS *
892 **************************/
894 r21 = _mm_mul_ps(rsq21,rinv21);
895 r21 = _mm_andnot_ps(dummy_mask,r21);
897 /* EWALD ELECTROSTATICS */
899 /* Analytical PME correction */
900 zeta2 = _mm_mul_ps(beta2,rsq21);
901 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
902 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
903 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
904 felec = _mm_mul_ps(qq21,felec);
905 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
906 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
907 velec = _mm_mul_ps(qq21,velec);
909 /* Update potential sum for this i atom from the interaction with this j atom. */
910 velec = _mm_andnot_ps(dummy_mask,velec);
911 velecsum = _mm_add_ps(velecsum,velec);
915 fscal = _mm_andnot_ps(dummy_mask,fscal);
917 /* Update vectorial force */
918 fix2 = _mm_macc_ps(dx21,fscal,fix2);
919 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
920 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
922 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
923 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
924 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
926 /**************************
927 * CALCULATE INTERACTIONS *
928 **************************/
930 r22 = _mm_mul_ps(rsq22,rinv22);
931 r22 = _mm_andnot_ps(dummy_mask,r22);
933 /* EWALD ELECTROSTATICS */
935 /* Analytical PME correction */
936 zeta2 = _mm_mul_ps(beta2,rsq22);
937 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
938 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
939 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
940 felec = _mm_mul_ps(qq22,felec);
941 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
942 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
943 velec = _mm_mul_ps(qq22,velec);
945 /* Update potential sum for this i atom from the interaction with this j atom. */
946 velec = _mm_andnot_ps(dummy_mask,velec);
947 velecsum = _mm_add_ps(velecsum,velec);
951 fscal = _mm_andnot_ps(dummy_mask,fscal);
953 /* Update vectorial force */
954 fix2 = _mm_macc_ps(dx22,fscal,fix2);
955 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
956 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
958 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
959 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
960 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
962 /**************************
963 * CALCULATE INTERACTIONS *
964 **************************/
966 r23 = _mm_mul_ps(rsq23,rinv23);
967 r23 = _mm_andnot_ps(dummy_mask,r23);
969 /* EWALD ELECTROSTATICS */
971 /* Analytical PME correction */
972 zeta2 = _mm_mul_ps(beta2,rsq23);
973 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
974 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
975 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
976 felec = _mm_mul_ps(qq23,felec);
977 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
978 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
979 velec = _mm_mul_ps(qq23,velec);
981 /* Update potential sum for this i atom from the interaction with this j atom. */
982 velec = _mm_andnot_ps(dummy_mask,velec);
983 velecsum = _mm_add_ps(velecsum,velec);
987 fscal = _mm_andnot_ps(dummy_mask,fscal);
989 /* Update vectorial force */
990 fix2 = _mm_macc_ps(dx23,fscal,fix2);
991 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
992 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
994 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
995 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
996 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
998 /**************************
999 * CALCULATE INTERACTIONS *
1000 **************************/
1002 r31 = _mm_mul_ps(rsq31,rinv31);
1003 r31 = _mm_andnot_ps(dummy_mask,r31);
1005 /* EWALD ELECTROSTATICS */
1007 /* Analytical PME correction */
1008 zeta2 = _mm_mul_ps(beta2,rsq31);
1009 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
1010 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1011 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1012 felec = _mm_mul_ps(qq31,felec);
1013 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1014 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
1015 velec = _mm_mul_ps(qq31,velec);
1017 /* Update potential sum for this i atom from the interaction with this j atom. */
1018 velec = _mm_andnot_ps(dummy_mask,velec);
1019 velecsum = _mm_add_ps(velecsum,velec);
1023 fscal = _mm_andnot_ps(dummy_mask,fscal);
1025 /* Update vectorial force */
1026 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1027 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1028 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1030 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1031 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1032 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1034 /**************************
1035 * CALCULATE INTERACTIONS *
1036 **************************/
1038 r32 = _mm_mul_ps(rsq32,rinv32);
1039 r32 = _mm_andnot_ps(dummy_mask,r32);
1041 /* EWALD ELECTROSTATICS */
1043 /* Analytical PME correction */
1044 zeta2 = _mm_mul_ps(beta2,rsq32);
1045 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
1046 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1047 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1048 felec = _mm_mul_ps(qq32,felec);
1049 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1050 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
1051 velec = _mm_mul_ps(qq32,velec);
1053 /* Update potential sum for this i atom from the interaction with this j atom. */
1054 velec = _mm_andnot_ps(dummy_mask,velec);
1055 velecsum = _mm_add_ps(velecsum,velec);
1059 fscal = _mm_andnot_ps(dummy_mask,fscal);
1061 /* Update vectorial force */
1062 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1063 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1064 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1066 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1067 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1068 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1070 /**************************
1071 * CALCULATE INTERACTIONS *
1072 **************************/
1074 r33 = _mm_mul_ps(rsq33,rinv33);
1075 r33 = _mm_andnot_ps(dummy_mask,r33);
1077 /* EWALD ELECTROSTATICS */
1079 /* Analytical PME correction */
1080 zeta2 = _mm_mul_ps(beta2,rsq33);
1081 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
1082 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1083 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1084 felec = _mm_mul_ps(qq33,felec);
1085 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1086 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
1087 velec = _mm_mul_ps(qq33,velec);
1089 /* Update potential sum for this i atom from the interaction with this j atom. */
1090 velec = _mm_andnot_ps(dummy_mask,velec);
1091 velecsum = _mm_add_ps(velecsum,velec);
1095 fscal = _mm_andnot_ps(dummy_mask,fscal);
1097 /* Update vectorial force */
1098 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1099 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1100 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1102 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1103 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1104 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1106 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1107 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1108 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1109 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1111 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1112 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1113 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1115 /* Inner loop uses 308 flops */
1118 /* End of innermost loop */
1120 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1121 f+i_coord_offset,fshift+i_shift_offset);
1124 /* Update potential energies */
1125 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1126 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1128 /* Increment number of inner iterations */
1129 inneriter += j_index_end - j_index_start;
1131 /* Outer loop uses 26 flops */
1134 /* Increment number of outer iterations */
1137 /* Update outer/inner flops */
1139 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*308);
1142 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_single
1143 * Electrostatics interaction: Ewald
1144 * VdW interaction: LennardJones
1145 * Geometry: Water4-Water4
1146 * Calculate force/pot: Force
1149 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_single
1150 (t_nblist * gmx_restrict nlist,
1151 rvec * gmx_restrict xx,
1152 rvec * gmx_restrict ff,
1153 t_forcerec * gmx_restrict fr,
1154 t_mdatoms * gmx_restrict mdatoms,
1155 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1156 t_nrnb * gmx_restrict nrnb)
1158 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1159 * just 0 for non-waters.
1160 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1161 * jnr indices corresponding to data put in the four positions in the SIMD register.
1163 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1164 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1165 int jnrA,jnrB,jnrC,jnrD;
1166 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1167 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1168 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1169 real rcutoff_scalar;
1170 real *shiftvec,*fshift,*x,*f;
1171 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1172 real scratch[4*DIM];
1173 __m128 fscal,rcutoff,rcutoff2,jidxall;
1175 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1177 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1179 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1181 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1182 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1183 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1184 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1185 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1186 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1187 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1188 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1189 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1190 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1191 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1192 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1193 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1194 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1195 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1196 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1197 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1198 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1199 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1200 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1203 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1206 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1207 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1209 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1210 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1212 __m128 dummy_mask,cutoff_mask;
1213 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1214 __m128 one = _mm_set1_ps(1.0);
1215 __m128 two = _mm_set1_ps(2.0);
1221 jindex = nlist->jindex;
1223 shiftidx = nlist->shift;
1225 shiftvec = fr->shift_vec[0];
1226 fshift = fr->fshift[0];
1227 facel = _mm_set1_ps(fr->epsfac);
1228 charge = mdatoms->chargeA;
1229 nvdwtype = fr->ntype;
1230 vdwparam = fr->nbfp;
1231 vdwtype = mdatoms->typeA;
1233 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1234 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1235 beta2 = _mm_mul_ps(beta,beta);
1236 beta3 = _mm_mul_ps(beta,beta2);
1237 ewtab = fr->ic->tabq_coul_F;
1238 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1239 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1241 /* Setup water-specific parameters */
1242 inr = nlist->iinr[0];
1243 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1244 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1245 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1246 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1248 jq1 = _mm_set1_ps(charge[inr+1]);
1249 jq2 = _mm_set1_ps(charge[inr+2]);
1250 jq3 = _mm_set1_ps(charge[inr+3]);
1251 vdwjidx0A = 2*vdwtype[inr+0];
1252 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1253 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1254 qq11 = _mm_mul_ps(iq1,jq1);
1255 qq12 = _mm_mul_ps(iq1,jq2);
1256 qq13 = _mm_mul_ps(iq1,jq3);
1257 qq21 = _mm_mul_ps(iq2,jq1);
1258 qq22 = _mm_mul_ps(iq2,jq2);
1259 qq23 = _mm_mul_ps(iq2,jq3);
1260 qq31 = _mm_mul_ps(iq3,jq1);
1261 qq32 = _mm_mul_ps(iq3,jq2);
1262 qq33 = _mm_mul_ps(iq3,jq3);
1264 /* Avoid stupid compiler warnings */
1265 jnrA = jnrB = jnrC = jnrD = 0;
1266 j_coord_offsetA = 0;
1267 j_coord_offsetB = 0;
1268 j_coord_offsetC = 0;
1269 j_coord_offsetD = 0;
1274 for(iidx=0;iidx<4*DIM;iidx++)
1276 scratch[iidx] = 0.0;
1279 /* Start outer loop over neighborlists */
1280 for(iidx=0; iidx<nri; iidx++)
1282 /* Load shift vector for this list */
1283 i_shift_offset = DIM*shiftidx[iidx];
1285 /* Load limits for loop over neighbors */
1286 j_index_start = jindex[iidx];
1287 j_index_end = jindex[iidx+1];
1289 /* Get outer coordinate index */
1291 i_coord_offset = DIM*inr;
1293 /* Load i particle coords and add shift vector */
1294 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1295 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1297 fix0 = _mm_setzero_ps();
1298 fiy0 = _mm_setzero_ps();
1299 fiz0 = _mm_setzero_ps();
1300 fix1 = _mm_setzero_ps();
1301 fiy1 = _mm_setzero_ps();
1302 fiz1 = _mm_setzero_ps();
1303 fix2 = _mm_setzero_ps();
1304 fiy2 = _mm_setzero_ps();
1305 fiz2 = _mm_setzero_ps();
1306 fix3 = _mm_setzero_ps();
1307 fiy3 = _mm_setzero_ps();
1308 fiz3 = _mm_setzero_ps();
1310 /* Start inner kernel loop */
1311 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1314 /* Get j neighbor index, and coordinate index */
1316 jnrB = jjnr[jidx+1];
1317 jnrC = jjnr[jidx+2];
1318 jnrD = jjnr[jidx+3];
1319 j_coord_offsetA = DIM*jnrA;
1320 j_coord_offsetB = DIM*jnrB;
1321 j_coord_offsetC = DIM*jnrC;
1322 j_coord_offsetD = DIM*jnrD;
1324 /* load j atom coordinates */
1325 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1326 x+j_coord_offsetC,x+j_coord_offsetD,
1327 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1328 &jy2,&jz2,&jx3,&jy3,&jz3);
1330 /* Calculate displacement vector */
1331 dx00 = _mm_sub_ps(ix0,jx0);
1332 dy00 = _mm_sub_ps(iy0,jy0);
1333 dz00 = _mm_sub_ps(iz0,jz0);
1334 dx11 = _mm_sub_ps(ix1,jx1);
1335 dy11 = _mm_sub_ps(iy1,jy1);
1336 dz11 = _mm_sub_ps(iz1,jz1);
1337 dx12 = _mm_sub_ps(ix1,jx2);
1338 dy12 = _mm_sub_ps(iy1,jy2);
1339 dz12 = _mm_sub_ps(iz1,jz2);
1340 dx13 = _mm_sub_ps(ix1,jx3);
1341 dy13 = _mm_sub_ps(iy1,jy3);
1342 dz13 = _mm_sub_ps(iz1,jz3);
1343 dx21 = _mm_sub_ps(ix2,jx1);
1344 dy21 = _mm_sub_ps(iy2,jy1);
1345 dz21 = _mm_sub_ps(iz2,jz1);
1346 dx22 = _mm_sub_ps(ix2,jx2);
1347 dy22 = _mm_sub_ps(iy2,jy2);
1348 dz22 = _mm_sub_ps(iz2,jz2);
1349 dx23 = _mm_sub_ps(ix2,jx3);
1350 dy23 = _mm_sub_ps(iy2,jy3);
1351 dz23 = _mm_sub_ps(iz2,jz3);
1352 dx31 = _mm_sub_ps(ix3,jx1);
1353 dy31 = _mm_sub_ps(iy3,jy1);
1354 dz31 = _mm_sub_ps(iz3,jz1);
1355 dx32 = _mm_sub_ps(ix3,jx2);
1356 dy32 = _mm_sub_ps(iy3,jy2);
1357 dz32 = _mm_sub_ps(iz3,jz2);
1358 dx33 = _mm_sub_ps(ix3,jx3);
1359 dy33 = _mm_sub_ps(iy3,jy3);
1360 dz33 = _mm_sub_ps(iz3,jz3);
1362 /* Calculate squared distance and things based on it */
1363 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1364 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1365 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1366 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1367 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1368 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1369 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1370 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1371 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1372 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1374 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1375 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1376 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1377 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1378 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1379 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1380 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1381 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1382 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1384 rinvsq00 = gmx_mm_inv_ps(rsq00);
1385 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1386 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1387 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1388 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1389 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1390 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1391 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1392 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1393 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1395 fjx0 = _mm_setzero_ps();
1396 fjy0 = _mm_setzero_ps();
1397 fjz0 = _mm_setzero_ps();
1398 fjx1 = _mm_setzero_ps();
1399 fjy1 = _mm_setzero_ps();
1400 fjz1 = _mm_setzero_ps();
1401 fjx2 = _mm_setzero_ps();
1402 fjy2 = _mm_setzero_ps();
1403 fjz2 = _mm_setzero_ps();
1404 fjx3 = _mm_setzero_ps();
1405 fjy3 = _mm_setzero_ps();
1406 fjz3 = _mm_setzero_ps();
1408 /**************************
1409 * CALCULATE INTERACTIONS *
1410 **************************/
1412 /* LENNARD-JONES DISPERSION/REPULSION */
1414 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1415 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1419 /* Update vectorial force */
1420 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1421 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1422 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1424 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1425 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1426 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1428 /**************************
1429 * CALCULATE INTERACTIONS *
1430 **************************/
1432 r11 = _mm_mul_ps(rsq11,rinv11);
1434 /* EWALD ELECTROSTATICS */
1436 /* Analytical PME correction */
1437 zeta2 = _mm_mul_ps(beta2,rsq11);
1438 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1439 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1440 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1441 felec = _mm_mul_ps(qq11,felec);
1445 /* Update vectorial force */
1446 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1447 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1448 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1450 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1451 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1452 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1454 /**************************
1455 * CALCULATE INTERACTIONS *
1456 **************************/
1458 r12 = _mm_mul_ps(rsq12,rinv12);
1460 /* EWALD ELECTROSTATICS */
1462 /* Analytical PME correction */
1463 zeta2 = _mm_mul_ps(beta2,rsq12);
1464 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1465 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1466 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1467 felec = _mm_mul_ps(qq12,felec);
1471 /* Update vectorial force */
1472 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1473 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1474 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1476 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1477 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1478 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1480 /**************************
1481 * CALCULATE INTERACTIONS *
1482 **************************/
1484 r13 = _mm_mul_ps(rsq13,rinv13);
1486 /* EWALD ELECTROSTATICS */
1488 /* Analytical PME correction */
1489 zeta2 = _mm_mul_ps(beta2,rsq13);
1490 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
1491 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1492 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1493 felec = _mm_mul_ps(qq13,felec);
1497 /* Update vectorial force */
1498 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1499 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1500 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1502 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1503 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1504 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1506 /**************************
1507 * CALCULATE INTERACTIONS *
1508 **************************/
1510 r21 = _mm_mul_ps(rsq21,rinv21);
1512 /* EWALD ELECTROSTATICS */
1514 /* Analytical PME correction */
1515 zeta2 = _mm_mul_ps(beta2,rsq21);
1516 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1517 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1518 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1519 felec = _mm_mul_ps(qq21,felec);
1523 /* Update vectorial force */
1524 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1525 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1526 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1528 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1529 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1530 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1532 /**************************
1533 * CALCULATE INTERACTIONS *
1534 **************************/
1536 r22 = _mm_mul_ps(rsq22,rinv22);
1538 /* EWALD ELECTROSTATICS */
1540 /* Analytical PME correction */
1541 zeta2 = _mm_mul_ps(beta2,rsq22);
1542 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1543 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1544 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1545 felec = _mm_mul_ps(qq22,felec);
1549 /* Update vectorial force */
1550 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1551 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1552 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1554 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1555 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1556 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1558 /**************************
1559 * CALCULATE INTERACTIONS *
1560 **************************/
1562 r23 = _mm_mul_ps(rsq23,rinv23);
1564 /* EWALD ELECTROSTATICS */
1566 /* Analytical PME correction */
1567 zeta2 = _mm_mul_ps(beta2,rsq23);
1568 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
1569 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1570 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1571 felec = _mm_mul_ps(qq23,felec);
1575 /* Update vectorial force */
1576 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1577 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1578 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1580 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1581 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1582 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1584 /**************************
1585 * CALCULATE INTERACTIONS *
1586 **************************/
1588 r31 = _mm_mul_ps(rsq31,rinv31);
1590 /* EWALD ELECTROSTATICS */
1592 /* Analytical PME correction */
1593 zeta2 = _mm_mul_ps(beta2,rsq31);
1594 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
1595 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1596 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1597 felec = _mm_mul_ps(qq31,felec);
1601 /* Update vectorial force */
1602 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1603 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1604 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1606 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1607 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1608 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1610 /**************************
1611 * CALCULATE INTERACTIONS *
1612 **************************/
1614 r32 = _mm_mul_ps(rsq32,rinv32);
1616 /* EWALD ELECTROSTATICS */
1618 /* Analytical PME correction */
1619 zeta2 = _mm_mul_ps(beta2,rsq32);
1620 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
1621 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1622 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1623 felec = _mm_mul_ps(qq32,felec);
1627 /* Update vectorial force */
1628 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1629 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1630 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1632 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1633 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1634 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1636 /**************************
1637 * CALCULATE INTERACTIONS *
1638 **************************/
1640 r33 = _mm_mul_ps(rsq33,rinv33);
1642 /* EWALD ELECTROSTATICS */
1644 /* Analytical PME correction */
1645 zeta2 = _mm_mul_ps(beta2,rsq33);
1646 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
1647 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1648 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1649 felec = _mm_mul_ps(qq33,felec);
1653 /* Update vectorial force */
1654 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1655 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1656 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1658 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1659 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1660 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1662 fjptrA = f+j_coord_offsetA;
1663 fjptrB = f+j_coord_offsetB;
1664 fjptrC = f+j_coord_offsetC;
1665 fjptrD = f+j_coord_offsetD;
1667 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1668 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1669 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1671 /* Inner loop uses 285 flops */
1674 if(jidx<j_index_end)
1677 /* Get j neighbor index, and coordinate index */
1678 jnrlistA = jjnr[jidx];
1679 jnrlistB = jjnr[jidx+1];
1680 jnrlistC = jjnr[jidx+2];
1681 jnrlistD = jjnr[jidx+3];
1682 /* Sign of each element will be negative for non-real atoms.
1683 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1684 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1686 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1687 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1688 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1689 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1690 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1691 j_coord_offsetA = DIM*jnrA;
1692 j_coord_offsetB = DIM*jnrB;
1693 j_coord_offsetC = DIM*jnrC;
1694 j_coord_offsetD = DIM*jnrD;
1696 /* load j atom coordinates */
1697 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1698 x+j_coord_offsetC,x+j_coord_offsetD,
1699 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1700 &jy2,&jz2,&jx3,&jy3,&jz3);
1702 /* Calculate displacement vector */
1703 dx00 = _mm_sub_ps(ix0,jx0);
1704 dy00 = _mm_sub_ps(iy0,jy0);
1705 dz00 = _mm_sub_ps(iz0,jz0);
1706 dx11 = _mm_sub_ps(ix1,jx1);
1707 dy11 = _mm_sub_ps(iy1,jy1);
1708 dz11 = _mm_sub_ps(iz1,jz1);
1709 dx12 = _mm_sub_ps(ix1,jx2);
1710 dy12 = _mm_sub_ps(iy1,jy2);
1711 dz12 = _mm_sub_ps(iz1,jz2);
1712 dx13 = _mm_sub_ps(ix1,jx3);
1713 dy13 = _mm_sub_ps(iy1,jy3);
1714 dz13 = _mm_sub_ps(iz1,jz3);
1715 dx21 = _mm_sub_ps(ix2,jx1);
1716 dy21 = _mm_sub_ps(iy2,jy1);
1717 dz21 = _mm_sub_ps(iz2,jz1);
1718 dx22 = _mm_sub_ps(ix2,jx2);
1719 dy22 = _mm_sub_ps(iy2,jy2);
1720 dz22 = _mm_sub_ps(iz2,jz2);
1721 dx23 = _mm_sub_ps(ix2,jx3);
1722 dy23 = _mm_sub_ps(iy2,jy3);
1723 dz23 = _mm_sub_ps(iz2,jz3);
1724 dx31 = _mm_sub_ps(ix3,jx1);
1725 dy31 = _mm_sub_ps(iy3,jy1);
1726 dz31 = _mm_sub_ps(iz3,jz1);
1727 dx32 = _mm_sub_ps(ix3,jx2);
1728 dy32 = _mm_sub_ps(iy3,jy2);
1729 dz32 = _mm_sub_ps(iz3,jz2);
1730 dx33 = _mm_sub_ps(ix3,jx3);
1731 dy33 = _mm_sub_ps(iy3,jy3);
1732 dz33 = _mm_sub_ps(iz3,jz3);
1734 /* Calculate squared distance and things based on it */
1735 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1736 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1737 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1738 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1739 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1740 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1741 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1742 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1743 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1744 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1746 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1747 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1748 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1749 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1750 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1751 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1752 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1753 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1754 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1756 rinvsq00 = gmx_mm_inv_ps(rsq00);
1757 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1758 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1759 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1760 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1761 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1762 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1763 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1764 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1765 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1767 fjx0 = _mm_setzero_ps();
1768 fjy0 = _mm_setzero_ps();
1769 fjz0 = _mm_setzero_ps();
1770 fjx1 = _mm_setzero_ps();
1771 fjy1 = _mm_setzero_ps();
1772 fjz1 = _mm_setzero_ps();
1773 fjx2 = _mm_setzero_ps();
1774 fjy2 = _mm_setzero_ps();
1775 fjz2 = _mm_setzero_ps();
1776 fjx3 = _mm_setzero_ps();
1777 fjy3 = _mm_setzero_ps();
1778 fjz3 = _mm_setzero_ps();
1780 /**************************
1781 * CALCULATE INTERACTIONS *
1782 **************************/
1784 /* LENNARD-JONES DISPERSION/REPULSION */
1786 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1787 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1791 fscal = _mm_andnot_ps(dummy_mask,fscal);
1793 /* Update vectorial force */
1794 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1795 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1796 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1798 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1799 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1800 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1802 /**************************
1803 * CALCULATE INTERACTIONS *
1804 **************************/
1806 r11 = _mm_mul_ps(rsq11,rinv11);
1807 r11 = _mm_andnot_ps(dummy_mask,r11);
1809 /* EWALD ELECTROSTATICS */
1811 /* Analytical PME correction */
1812 zeta2 = _mm_mul_ps(beta2,rsq11);
1813 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1814 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1815 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1816 felec = _mm_mul_ps(qq11,felec);
1820 fscal = _mm_andnot_ps(dummy_mask,fscal);
1822 /* Update vectorial force */
1823 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1824 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1825 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1827 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1828 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1829 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1831 /**************************
1832 * CALCULATE INTERACTIONS *
1833 **************************/
1835 r12 = _mm_mul_ps(rsq12,rinv12);
1836 r12 = _mm_andnot_ps(dummy_mask,r12);
1838 /* EWALD ELECTROSTATICS */
1840 /* Analytical PME correction */
1841 zeta2 = _mm_mul_ps(beta2,rsq12);
1842 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1843 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1844 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1845 felec = _mm_mul_ps(qq12,felec);
1849 fscal = _mm_andnot_ps(dummy_mask,fscal);
1851 /* Update vectorial force */
1852 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1853 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1854 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1856 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1857 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1858 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1860 /**************************
1861 * CALCULATE INTERACTIONS *
1862 **************************/
1864 r13 = _mm_mul_ps(rsq13,rinv13);
1865 r13 = _mm_andnot_ps(dummy_mask,r13);
1867 /* EWALD ELECTROSTATICS */
1869 /* Analytical PME correction */
1870 zeta2 = _mm_mul_ps(beta2,rsq13);
1871 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
1872 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1873 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1874 felec = _mm_mul_ps(qq13,felec);
1878 fscal = _mm_andnot_ps(dummy_mask,fscal);
1880 /* Update vectorial force */
1881 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1882 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1883 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1885 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1886 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1887 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1889 /**************************
1890 * CALCULATE INTERACTIONS *
1891 **************************/
1893 r21 = _mm_mul_ps(rsq21,rinv21);
1894 r21 = _mm_andnot_ps(dummy_mask,r21);
1896 /* EWALD ELECTROSTATICS */
1898 /* Analytical PME correction */
1899 zeta2 = _mm_mul_ps(beta2,rsq21);
1900 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1901 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1902 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1903 felec = _mm_mul_ps(qq21,felec);
1907 fscal = _mm_andnot_ps(dummy_mask,fscal);
1909 /* Update vectorial force */
1910 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1911 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1912 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1914 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1915 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1916 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1918 /**************************
1919 * CALCULATE INTERACTIONS *
1920 **************************/
1922 r22 = _mm_mul_ps(rsq22,rinv22);
1923 r22 = _mm_andnot_ps(dummy_mask,r22);
1925 /* EWALD ELECTROSTATICS */
1927 /* Analytical PME correction */
1928 zeta2 = _mm_mul_ps(beta2,rsq22);
1929 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1930 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1931 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1932 felec = _mm_mul_ps(qq22,felec);
1936 fscal = _mm_andnot_ps(dummy_mask,fscal);
1938 /* Update vectorial force */
1939 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1940 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1941 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1943 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1944 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1945 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1947 /**************************
1948 * CALCULATE INTERACTIONS *
1949 **************************/
1951 r23 = _mm_mul_ps(rsq23,rinv23);
1952 r23 = _mm_andnot_ps(dummy_mask,r23);
1954 /* EWALD ELECTROSTATICS */
1956 /* Analytical PME correction */
1957 zeta2 = _mm_mul_ps(beta2,rsq23);
1958 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
1959 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1960 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1961 felec = _mm_mul_ps(qq23,felec);
1965 fscal = _mm_andnot_ps(dummy_mask,fscal);
1967 /* Update vectorial force */
1968 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1969 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1970 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1972 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1973 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1974 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1976 /**************************
1977 * CALCULATE INTERACTIONS *
1978 **************************/
1980 r31 = _mm_mul_ps(rsq31,rinv31);
1981 r31 = _mm_andnot_ps(dummy_mask,r31);
1983 /* EWALD ELECTROSTATICS */
1985 /* Analytical PME correction */
1986 zeta2 = _mm_mul_ps(beta2,rsq31);
1987 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
1988 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1989 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1990 felec = _mm_mul_ps(qq31,felec);
1994 fscal = _mm_andnot_ps(dummy_mask,fscal);
1996 /* Update vectorial force */
1997 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1998 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1999 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2001 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2002 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2003 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2005 /**************************
2006 * CALCULATE INTERACTIONS *
2007 **************************/
2009 r32 = _mm_mul_ps(rsq32,rinv32);
2010 r32 = _mm_andnot_ps(dummy_mask,r32);
2012 /* EWALD ELECTROSTATICS */
2014 /* Analytical PME correction */
2015 zeta2 = _mm_mul_ps(beta2,rsq32);
2016 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
2017 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2018 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2019 felec = _mm_mul_ps(qq32,felec);
2023 fscal = _mm_andnot_ps(dummy_mask,fscal);
2025 /* Update vectorial force */
2026 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2027 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2028 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2030 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2031 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2032 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2034 /**************************
2035 * CALCULATE INTERACTIONS *
2036 **************************/
2038 r33 = _mm_mul_ps(rsq33,rinv33);
2039 r33 = _mm_andnot_ps(dummy_mask,r33);
2041 /* EWALD ELECTROSTATICS */
2043 /* Analytical PME correction */
2044 zeta2 = _mm_mul_ps(beta2,rsq33);
2045 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
2046 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2047 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2048 felec = _mm_mul_ps(qq33,felec);
2052 fscal = _mm_andnot_ps(dummy_mask,fscal);
2054 /* Update vectorial force */
2055 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2056 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2057 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2059 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2060 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2061 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2063 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2064 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2065 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2066 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2068 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2069 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2070 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2072 /* Inner loop uses 294 flops */
2075 /* End of innermost loop */
2077 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2078 f+i_coord_offset,fshift+i_shift_offset);
2080 /* Increment number of inner iterations */
2081 inneriter += j_index_end - j_index_start;
2083 /* Outer loop uses 24 flops */
2086 /* Increment number of outer iterations */
2089 /* Update outer/inner flops */
2091 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*294);