2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_single
 * Electrostatics interaction: Coulomb
 * VdW interaction:            LennardJones
 * Geometry:                   Water4-Water4
 * Calculate force/pot:        PotentialAndForce
 */
/*
 * Water4-Water4 kernel that accumulates both potentials and forces.
 * Site 0 of each water carries the Lennard-Jones interaction (c6_00/c12_00),
 * sites 1-3 carry the Coulomb interaction (charge products qq11..qq33).
 *
 * nlist        neighbor list; jindex, shift and iinr are read from it here
 * xx, ff       coordinate and force arrays.  NOTE(review): the statements that
 *              derive the local x / f pointers are not visible in this listing
 * fr           supplies shift_vec, fshift, ic->epsfac and ntype
 * mdatoms      supplies chargeA and typeA
 * kernel_data  energygrp_elec / energygrp_vdw receive the summed potentials
 * nrnb         flop counter, updated through inc_nrnb() as the last step
 *
 * NOTE(review): several names used below (fscal, x, f, nri, jjnr, ggid,
 * scratch, charge, vdwparam, outeriter, inneriter start values) have no
 * visible assignment or declaration in this listing, and inr is only set
 * from nlist->iinr[0].  Confirm against the kernel generator output before
 * relying on this text.
 */
57 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
97 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
111 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
114 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
115 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
116 __m128 dummy_mask,cutoff_mask;
117 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118 __m128 one = _mm_set1_ps(1.0);
119 __m128 two = _mm_set1_ps(2.0);
125 jindex = nlist->jindex;
127 shiftidx = nlist->shift;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm_set1_ps(fr->ic->epsfac);
132 charge = mdatoms->chargeA;
133 nvdwtype = fr->ntype;
135 vdwtype = mdatoms->typeA;
137 /* Setup water-specific parameters */
138 inr = nlist->iinr[0];
/* i-site charges for sites 1-3, each pre-multiplied by facel (fr->ic->epsfac) */
139 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
140 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
141 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
142 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* j-site charges and the j LJ type index are read from the same inr
 * (nlist->iinr[0]) as the i side, i.e. one molecule's parameters serve
 * both sides -- presumably because all waters in the list are identical. */
144 jq1 = _mm_set1_ps(charge[inr+1]);
145 jq2 = _mm_set1_ps(charge[inr+2]);
146 jq3 = _mm_set1_ps(charge[inr+3]);
147 vdwjidx0A = 2*vdwtype[inr+0];
148 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
149 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
/* Loop-invariant charge products for the nine site pairs (1..3) x (1..3) */
150 qq11 = _mm_mul_ps(iq1,jq1);
151 qq12 = _mm_mul_ps(iq1,jq2);
152 qq13 = _mm_mul_ps(iq1,jq3);
153 qq21 = _mm_mul_ps(iq2,jq1);
154 qq22 = _mm_mul_ps(iq2,jq2);
155 qq23 = _mm_mul_ps(iq2,jq3);
156 qq31 = _mm_mul_ps(iq3,jq1);
157 qq32 = _mm_mul_ps(iq3,jq2);
158 qq33 = _mm_mul_ps(iq3,jq3);
160 /* Avoid stupid compiler warnings */
161 jnrA = jnrB = jnrC = jnrD = 0;
170 for(iidx=0;iidx<4*DIM;iidx++)
175 /* Start outer loop over neighborlists */
176 for(iidx=0; iidx<nri; iidx++)
178 /* Load shift vector for this list */
179 i_shift_offset = DIM*shiftidx[iidx];
181 /* Load limits for loop over neighbors */
182 j_index_start = jindex[iidx];
183 j_index_end = jindex[iidx+1];
185 /* Get outer coordinate index */
187 i_coord_offset = DIM*inr;
189 /* Load i particle coords and add shift vector */
190 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
191 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
193 fix0 = _mm_setzero_ps();
194 fiy0 = _mm_setzero_ps();
195 fiz0 = _mm_setzero_ps();
196 fix1 = _mm_setzero_ps();
197 fiy1 = _mm_setzero_ps();
198 fiz1 = _mm_setzero_ps();
199 fix2 = _mm_setzero_ps();
200 fiy2 = _mm_setzero_ps();
201 fiz2 = _mm_setzero_ps();
202 fix3 = _mm_setzero_ps();
203 fiy3 = _mm_setzero_ps();
204 fiz3 = _mm_setzero_ps();
206 /* Reset potential sums */
207 velecsum = _mm_setzero_ps();
208 vvdwsum = _mm_setzero_ps();
210 /* Start inner kernel loop */
/* Steps four list entries at a time and continues only while the fourth
 * entry of the current quadruplet (jjnr[jidx+3]) is non-negative. */
211 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
214 /* Get j neighbor index, and coordinate index */
219 j_coord_offsetA = DIM*jnrA;
220 j_coord_offsetB = DIM*jnrB;
221 j_coord_offsetC = DIM*jnrC;
222 j_coord_offsetD = DIM*jnrD;
224 /* load j atom coordinates */
225 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
226 x+j_coord_offsetC,x+j_coord_offsetD,
227 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
228 &jy2,&jz2,&jx3,&jy3,&jz3);
230 /* Calculate displacement vector */
231 dx00 = _mm_sub_ps(ix0,jx0);
232 dy00 = _mm_sub_ps(iy0,jy0);
233 dz00 = _mm_sub_ps(iz0,jz0);
234 dx11 = _mm_sub_ps(ix1,jx1);
235 dy11 = _mm_sub_ps(iy1,jy1);
236 dz11 = _mm_sub_ps(iz1,jz1);
237 dx12 = _mm_sub_ps(ix1,jx2);
238 dy12 = _mm_sub_ps(iy1,jy2);
239 dz12 = _mm_sub_ps(iz1,jz2);
240 dx13 = _mm_sub_ps(ix1,jx3);
241 dy13 = _mm_sub_ps(iy1,jy3);
242 dz13 = _mm_sub_ps(iz1,jz3);
243 dx21 = _mm_sub_ps(ix2,jx1);
244 dy21 = _mm_sub_ps(iy2,jy1);
245 dz21 = _mm_sub_ps(iz2,jz1);
246 dx22 = _mm_sub_ps(ix2,jx2);
247 dy22 = _mm_sub_ps(iy2,jy2);
248 dz22 = _mm_sub_ps(iz2,jz2);
249 dx23 = _mm_sub_ps(ix2,jx3);
250 dy23 = _mm_sub_ps(iy2,jy3);
251 dz23 = _mm_sub_ps(iz2,jz3);
252 dx31 = _mm_sub_ps(ix3,jx1);
253 dy31 = _mm_sub_ps(iy3,jy1);
254 dz31 = _mm_sub_ps(iz3,jz1);
255 dx32 = _mm_sub_ps(ix3,jx2);
256 dy32 = _mm_sub_ps(iy3,jy2);
257 dz32 = _mm_sub_ps(iz3,jz2);
258 dx33 = _mm_sub_ps(ix3,jx3);
259 dy33 = _mm_sub_ps(iy3,jy3);
260 dz33 = _mm_sub_ps(iz3,jz3);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
264 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
265 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
266 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
267 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
268 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
269 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
270 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
271 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
272 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
/* The nine charged pairs go through the invsqrt helper (presumably 1/sqrt(rsq));
 * the result feeds both velec and, squared, felec below. */
274 rinv11 = avx128fma_invsqrt_f(rsq11);
275 rinv12 = avx128fma_invsqrt_f(rsq12);
276 rinv13 = avx128fma_invsqrt_f(rsq13);
277 rinv21 = avx128fma_invsqrt_f(rsq21);
278 rinv22 = avx128fma_invsqrt_f(rsq22);
279 rinv23 = avx128fma_invsqrt_f(rsq23);
280 rinv31 = avx128fma_invsqrt_f(rsq31);
281 rinv32 = avx128fma_invsqrt_f(rsq32);
282 rinv33 = avx128fma_invsqrt_f(rsq33);
/* Pair 00 is used only by the Lennard-Jones term, which needs rinvsq00 and
 * never rinv00, so it goes through the inv helper instead of invsqrt. */
284 rinvsq00 = avx128fma_inv_f(rsq00);
285 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
286 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
287 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
288 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
289 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
290 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
291 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
292 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
293 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
295 fjx0 = _mm_setzero_ps();
296 fjy0 = _mm_setzero_ps();
297 fjz0 = _mm_setzero_ps();
298 fjx1 = _mm_setzero_ps();
299 fjy1 = _mm_setzero_ps();
300 fjz1 = _mm_setzero_ps();
301 fjx2 = _mm_setzero_ps();
302 fjy2 = _mm_setzero_ps();
303 fjz2 = _mm_setzero_ps();
304 fjx3 = _mm_setzero_ps();
305 fjy3 = _mm_setzero_ps();
306 fjz3 = _mm_setzero_ps();
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 /* LENNARD-JONES DISPERSION/REPULSION */
314 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
315 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
316 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
/* vvdw = vvdw12/12 - vvdw6/6 (fused multiply-subtract); fvdw = (vvdw12 - vvdw6)*rinvsq00 */
317 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
318 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
320 /* Update potential sum for this i atom from the interaction with this j atom. */
321 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
325 /* Update vectorial force */
/* NOTE(review): fscal is consumed here and in every pair section below, but
 * no assignment from fvdw / felec is visible in this listing -- confirm. */
326 fix0 = _mm_macc_ps(dx00,fscal,fix0);
327 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
328 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
330 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
331 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
332 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
334 /**************************
335 * CALCULATE INTERACTIONS *
336 **************************/
338 /* COULOMB ELECTROSTATICS */
339 velec = _mm_mul_ps(qq11,rinv11);
340 felec = _mm_mul_ps(velec,rinvsq11);
342 /* Update potential sum for this i atom from the interaction with this j atom. */
343 velecsum = _mm_add_ps(velecsum,velec);
347 /* Update vectorial force */
348 fix1 = _mm_macc_ps(dx11,fscal,fix1);
349 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
350 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
352 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
353 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
354 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
356 /**************************
357 * CALCULATE INTERACTIONS *
358 **************************/
360 /* COULOMB ELECTROSTATICS */
361 velec = _mm_mul_ps(qq12,rinv12);
362 felec = _mm_mul_ps(velec,rinvsq12);
364 /* Update potential sum for this i atom from the interaction with this j atom. */
365 velecsum = _mm_add_ps(velecsum,velec);
369 /* Update vectorial force */
370 fix1 = _mm_macc_ps(dx12,fscal,fix1);
371 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
372 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
374 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
375 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
376 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 /* COULOMB ELECTROSTATICS */
383 velec = _mm_mul_ps(qq13,rinv13);
384 felec = _mm_mul_ps(velec,rinvsq13);
386 /* Update potential sum for this i atom from the interaction with this j atom. */
387 velecsum = _mm_add_ps(velecsum,velec);
391 /* Update vectorial force */
392 fix1 = _mm_macc_ps(dx13,fscal,fix1);
393 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
394 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
396 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
397 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
398 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
400 /**************************
401 * CALCULATE INTERACTIONS *
402 **************************/
404 /* COULOMB ELECTROSTATICS */
405 velec = _mm_mul_ps(qq21,rinv21);
406 felec = _mm_mul_ps(velec,rinvsq21);
408 /* Update potential sum for this i atom from the interaction with this j atom. */
409 velecsum = _mm_add_ps(velecsum,velec);
413 /* Update vectorial force */
414 fix2 = _mm_macc_ps(dx21,fscal,fix2);
415 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
416 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
418 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
419 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
420 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
422 /**************************
423 * CALCULATE INTERACTIONS *
424 **************************/
426 /* COULOMB ELECTROSTATICS */
427 velec = _mm_mul_ps(qq22,rinv22);
428 felec = _mm_mul_ps(velec,rinvsq22);
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velecsum = _mm_add_ps(velecsum,velec);
435 /* Update vectorial force */
436 fix2 = _mm_macc_ps(dx22,fscal,fix2);
437 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
438 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
440 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
441 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
442 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
444 /**************************
445 * CALCULATE INTERACTIONS *
446 **************************/
448 /* COULOMB ELECTROSTATICS */
449 velec = _mm_mul_ps(qq23,rinv23);
450 felec = _mm_mul_ps(velec,rinvsq23);
452 /* Update potential sum for this i atom from the interaction with this j atom. */
453 velecsum = _mm_add_ps(velecsum,velec);
457 /* Update vectorial force */
458 fix2 = _mm_macc_ps(dx23,fscal,fix2);
459 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
460 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
462 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
463 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
464 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
466 /**************************
467 * CALCULATE INTERACTIONS *
468 **************************/
470 /* COULOMB ELECTROSTATICS */
471 velec = _mm_mul_ps(qq31,rinv31);
472 felec = _mm_mul_ps(velec,rinvsq31);
474 /* Update potential sum for this i atom from the interaction with this j atom. */
475 velecsum = _mm_add_ps(velecsum,velec);
479 /* Update vectorial force */
480 fix3 = _mm_macc_ps(dx31,fscal,fix3);
481 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
482 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
484 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
485 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
486 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 /* COULOMB ELECTROSTATICS */
493 velec = _mm_mul_ps(qq32,rinv32);
494 felec = _mm_mul_ps(velec,rinvsq32);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velecsum = _mm_add_ps(velecsum,velec);
501 /* Update vectorial force */
502 fix3 = _mm_macc_ps(dx32,fscal,fix3);
503 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
504 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
506 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
507 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
508 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
510 /**************************
511 * CALCULATE INTERACTIONS *
512 **************************/
514 /* COULOMB ELECTROSTATICS */
515 velec = _mm_mul_ps(qq33,rinv33);
516 felec = _mm_mul_ps(velec,rinvsq33);
518 /* Update potential sum for this i atom from the interaction with this j atom. */
519 velecsum = _mm_add_ps(velecsum,velec);
523 /* Update vectorial force */
524 fix3 = _mm_macc_ps(dx33,fscal,fix3);
525 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
526 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
528 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
529 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
530 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
/* All four j entries are valid in this loop, so the accumulated j forces are
 * passed straight to the force array at each entry's coordinate offset. */
532 fjptrA = f+j_coord_offsetA;
533 fjptrB = f+j_coord_offsetB;
534 fjptrC = f+j_coord_offsetC;
535 fjptrD = f+j_coord_offsetD;
537 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
538 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
539 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
541 /* Inner loop uses 317 flops */
/* Second pass over a quadruplet: the same arithmetic as above, but list
 * entries with a negative index are treated as dummies and masked out.
 * NOTE(review): the condition guarding this section is not visible in this
 * listing; presumably it runs only for a trailing, partially filled
 * quadruplet -- confirm. */
547 /* Get j neighbor index, and coordinate index */
548 jnrlistA = jjnr[jidx];
549 jnrlistB = jjnr[jidx+1];
550 jnrlistC = jjnr[jidx+2];
551 jnrlistD = jjnr[jidx+3];
552 /* Sign of each element will be negative for non-real atoms.
553 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
554 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
556 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
557 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
558 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
559 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
560 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
561 j_coord_offsetA = DIM*jnrA;
562 j_coord_offsetB = DIM*jnrB;
563 j_coord_offsetC = DIM*jnrC;
564 j_coord_offsetD = DIM*jnrD;
566 /* load j atom coordinates */
567 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
568 x+j_coord_offsetC,x+j_coord_offsetD,
569 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
570 &jy2,&jz2,&jx3,&jy3,&jz3);
572 /* Calculate displacement vector */
573 dx00 = _mm_sub_ps(ix0,jx0);
574 dy00 = _mm_sub_ps(iy0,jy0);
575 dz00 = _mm_sub_ps(iz0,jz0);
576 dx11 = _mm_sub_ps(ix1,jx1);
577 dy11 = _mm_sub_ps(iy1,jy1);
578 dz11 = _mm_sub_ps(iz1,jz1);
579 dx12 = _mm_sub_ps(ix1,jx2);
580 dy12 = _mm_sub_ps(iy1,jy2);
581 dz12 = _mm_sub_ps(iz1,jz2);
582 dx13 = _mm_sub_ps(ix1,jx3);
583 dy13 = _mm_sub_ps(iy1,jy3);
584 dz13 = _mm_sub_ps(iz1,jz3);
585 dx21 = _mm_sub_ps(ix2,jx1);
586 dy21 = _mm_sub_ps(iy2,jy1);
587 dz21 = _mm_sub_ps(iz2,jz1);
588 dx22 = _mm_sub_ps(ix2,jx2);
589 dy22 = _mm_sub_ps(iy2,jy2);
590 dz22 = _mm_sub_ps(iz2,jz2);
591 dx23 = _mm_sub_ps(ix2,jx3);
592 dy23 = _mm_sub_ps(iy2,jy3);
593 dz23 = _mm_sub_ps(iz2,jz3);
594 dx31 = _mm_sub_ps(ix3,jx1);
595 dy31 = _mm_sub_ps(iy3,jy1);
596 dz31 = _mm_sub_ps(iz3,jz1);
597 dx32 = _mm_sub_ps(ix3,jx2);
598 dy32 = _mm_sub_ps(iy3,jy2);
599 dz32 = _mm_sub_ps(iz3,jz2);
600 dx33 = _mm_sub_ps(ix3,jx3);
601 dy33 = _mm_sub_ps(iy3,jy3);
602 dz33 = _mm_sub_ps(iz3,jz3);
604 /* Calculate squared distance and things based on it */
605 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
606 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
607 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
608 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
609 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
610 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
611 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
612 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
613 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
614 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
616 rinv11 = avx128fma_invsqrt_f(rsq11);
617 rinv12 = avx128fma_invsqrt_f(rsq12);
618 rinv13 = avx128fma_invsqrt_f(rsq13);
619 rinv21 = avx128fma_invsqrt_f(rsq21);
620 rinv22 = avx128fma_invsqrt_f(rsq22);
621 rinv23 = avx128fma_invsqrt_f(rsq23);
622 rinv31 = avx128fma_invsqrt_f(rsq31);
623 rinv32 = avx128fma_invsqrt_f(rsq32);
624 rinv33 = avx128fma_invsqrt_f(rsq33);
626 rinvsq00 = avx128fma_inv_f(rsq00);
627 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
628 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
629 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
630 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
631 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
632 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
633 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
634 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
635 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
637 fjx0 = _mm_setzero_ps();
638 fjy0 = _mm_setzero_ps();
639 fjz0 = _mm_setzero_ps();
640 fjx1 = _mm_setzero_ps();
641 fjy1 = _mm_setzero_ps();
642 fjz1 = _mm_setzero_ps();
643 fjx2 = _mm_setzero_ps();
644 fjy2 = _mm_setzero_ps();
645 fjz2 = _mm_setzero_ps();
646 fjx3 = _mm_setzero_ps();
647 fjy3 = _mm_setzero_ps();
648 fjz3 = _mm_setzero_ps();
650 /**************************
651 * CALCULATE INTERACTIONS *
652 **************************/
654 /* LENNARD-JONES DISPERSION/REPULSION */
656 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
657 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
658 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
659 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
660 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
662 /* Update potential sum for this i atom from the interaction with this j atom. */
663 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
664 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
/* Clear the scalar force in dummy lanes so they add nothing to fi or fj;
 * the same masking is repeated in every pair section of this pass. */
668 fscal = _mm_andnot_ps(dummy_mask,fscal);
670 /* Update vectorial force */
671 fix0 = _mm_macc_ps(dx00,fscal,fix0);
672 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
673 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
675 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
676 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
677 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
679 /**************************
680 * CALCULATE INTERACTIONS *
681 **************************/
683 /* COULOMB ELECTROSTATICS */
684 velec = _mm_mul_ps(qq11,rinv11);
685 felec = _mm_mul_ps(velec,rinvsq11);
687 /* Update potential sum for this i atom from the interaction with this j atom. */
688 velec = _mm_andnot_ps(dummy_mask,velec);
689 velecsum = _mm_add_ps(velecsum,velec);
693 fscal = _mm_andnot_ps(dummy_mask,fscal);
695 /* Update vectorial force */
696 fix1 = _mm_macc_ps(dx11,fscal,fix1);
697 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
698 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
700 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
701 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
702 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
704 /**************************
705 * CALCULATE INTERACTIONS *
706 **************************/
708 /* COULOMB ELECTROSTATICS */
709 velec = _mm_mul_ps(qq12,rinv12);
710 felec = _mm_mul_ps(velec,rinvsq12);
712 /* Update potential sum for this i atom from the interaction with this j atom. */
713 velec = _mm_andnot_ps(dummy_mask,velec);
714 velecsum = _mm_add_ps(velecsum,velec);
718 fscal = _mm_andnot_ps(dummy_mask,fscal);
720 /* Update vectorial force */
721 fix1 = _mm_macc_ps(dx12,fscal,fix1);
722 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
723 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
725 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
726 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
727 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
729 /**************************
730 * CALCULATE INTERACTIONS *
731 **************************/
733 /* COULOMB ELECTROSTATICS */
734 velec = _mm_mul_ps(qq13,rinv13);
735 felec = _mm_mul_ps(velec,rinvsq13);
737 /* Update potential sum for this i atom from the interaction with this j atom. */
738 velec = _mm_andnot_ps(dummy_mask,velec);
739 velecsum = _mm_add_ps(velecsum,velec);
743 fscal = _mm_andnot_ps(dummy_mask,fscal);
745 /* Update vectorial force */
746 fix1 = _mm_macc_ps(dx13,fscal,fix1);
747 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
748 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
750 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
751 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
752 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
754 /**************************
755 * CALCULATE INTERACTIONS *
756 **************************/
758 /* COULOMB ELECTROSTATICS */
759 velec = _mm_mul_ps(qq21,rinv21);
760 felec = _mm_mul_ps(velec,rinvsq21);
762 /* Update potential sum for this i atom from the interaction with this j atom. */
763 velec = _mm_andnot_ps(dummy_mask,velec);
764 velecsum = _mm_add_ps(velecsum,velec);
768 fscal = _mm_andnot_ps(dummy_mask,fscal);
770 /* Update vectorial force */
771 fix2 = _mm_macc_ps(dx21,fscal,fix2);
772 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
773 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
775 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
776 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
777 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
779 /**************************
780 * CALCULATE INTERACTIONS *
781 **************************/
783 /* COULOMB ELECTROSTATICS */
784 velec = _mm_mul_ps(qq22,rinv22);
785 felec = _mm_mul_ps(velec,rinvsq22);
787 /* Update potential sum for this i atom from the interaction with this j atom. */
788 velec = _mm_andnot_ps(dummy_mask,velec);
789 velecsum = _mm_add_ps(velecsum,velec);
793 fscal = _mm_andnot_ps(dummy_mask,fscal);
795 /* Update vectorial force */
796 fix2 = _mm_macc_ps(dx22,fscal,fix2);
797 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
798 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
800 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
801 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
802 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
804 /**************************
805 * CALCULATE INTERACTIONS *
806 **************************/
808 /* COULOMB ELECTROSTATICS */
809 velec = _mm_mul_ps(qq23,rinv23);
810 felec = _mm_mul_ps(velec,rinvsq23);
812 /* Update potential sum for this i atom from the interaction with this j atom. */
813 velec = _mm_andnot_ps(dummy_mask,velec);
814 velecsum = _mm_add_ps(velecsum,velec);
818 fscal = _mm_andnot_ps(dummy_mask,fscal);
820 /* Update vectorial force */
821 fix2 = _mm_macc_ps(dx23,fscal,fix2);
822 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
823 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
825 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
826 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
827 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
829 /**************************
830 * CALCULATE INTERACTIONS *
831 **************************/
833 /* COULOMB ELECTROSTATICS */
834 velec = _mm_mul_ps(qq31,rinv31);
835 felec = _mm_mul_ps(velec,rinvsq31);
837 /* Update potential sum for this i atom from the interaction with this j atom. */
838 velec = _mm_andnot_ps(dummy_mask,velec);
839 velecsum = _mm_add_ps(velecsum,velec);
843 fscal = _mm_andnot_ps(dummy_mask,fscal);
845 /* Update vectorial force */
846 fix3 = _mm_macc_ps(dx31,fscal,fix3);
847 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
848 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
850 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
851 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
852 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
854 /**************************
855 * CALCULATE INTERACTIONS *
856 **************************/
858 /* COULOMB ELECTROSTATICS */
859 velec = _mm_mul_ps(qq32,rinv32);
860 felec = _mm_mul_ps(velec,rinvsq32);
862 /* Update potential sum for this i atom from the interaction with this j atom. */
863 velec = _mm_andnot_ps(dummy_mask,velec);
864 velecsum = _mm_add_ps(velecsum,velec);
868 fscal = _mm_andnot_ps(dummy_mask,fscal);
870 /* Update vectorial force */
871 fix3 = _mm_macc_ps(dx32,fscal,fix3);
872 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
873 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
875 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
876 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
877 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
879 /**************************
880 * CALCULATE INTERACTIONS *
881 **************************/
883 /* COULOMB ELECTROSTATICS */
884 velec = _mm_mul_ps(qq33,rinv33);
885 felec = _mm_mul_ps(velec,rinvsq33);
887 /* Update potential sum for this i atom from the interaction with this j atom. */
888 velec = _mm_andnot_ps(dummy_mask,velec);
889 velecsum = _mm_add_ps(velecsum,velec);
893 fscal = _mm_andnot_ps(dummy_mask,fscal);
895 /* Update vectorial force */
896 fix3 = _mm_macc_ps(dx33,fscal,fix3);
897 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
898 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
900 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
901 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
902 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
/* Dummy entries (negative list index) are pointed at the scratch buffer
 * rather than the force array, so the update below cannot touch real atoms
 * on their behalf. */
904 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
905 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
906 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
907 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
909 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
910 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
911 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
913 /* Inner loop uses 317 flops */
916 /* End of innermost loop */
918 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
919 f+i_coord_offset,fshift+i_shift_offset);
922 /* Update potential energies */
923 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
924 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
926 /* Increment number of inner iterations */
927 inneriter += j_index_end - j_index_start;
929 /* Outer loop uses 26 flops */
932 /* Increment number of outer iterations */
935 /* Update outer/inner flops */
/* Reported cost: 26 flops per outer iteration plus 317 per inner iteration */
937 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*317);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_single
 * Electrostatics interaction: Coulomb
 * VdW interaction:            LennardJones
 * Geometry:                   Water4-Water4
 * Calculate force/pot:        Force
 */
947 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_single
948 (t_nblist * gmx_restrict nlist,
949 rvec * gmx_restrict xx,
950 rvec * gmx_restrict ff,
951 struct t_forcerec * gmx_restrict fr,
952 t_mdatoms * gmx_restrict mdatoms,
953 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
954 t_nrnb * gmx_restrict nrnb)
956 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
957 * just 0 for non-waters.
958 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
959 * jnr indices corresponding to data put in the four positions in the SIMD register.
961 int i_shift_offset,i_coord_offset,outeriter,inneriter;
962 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
963 int jnrA,jnrB,jnrC,jnrD;
964 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
965 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
966 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
968 real *shiftvec,*fshift,*x,*f;
969 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
971 __m128 fscal,rcutoff,rcutoff2,jidxall;
973 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
975 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
977 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
979 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
980 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
981 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
982 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
983 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
984 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
985 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
986 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
987 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
988 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
989 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
990 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
991 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
992 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
993 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
994 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
995 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
996 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
997 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
998 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1001 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1004 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1005 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1006 __m128 dummy_mask,cutoff_mask;
1007 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1008 __m128 one = _mm_set1_ps(1.0);
1009 __m128 two = _mm_set1_ps(2.0);
1015 jindex = nlist->jindex;
1017 shiftidx = nlist->shift;
1019 shiftvec = fr->shift_vec[0];
1020 fshift = fr->fshift[0];
1021 facel = _mm_set1_ps(fr->ic->epsfac);
1022 charge = mdatoms->chargeA;
1023 nvdwtype = fr->ntype;
1024 vdwparam = fr->nbfp;
1025 vdwtype = mdatoms->typeA;
1027 /* Setup water-specific parameters */
1028 inr = nlist->iinr[0];
1029 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1030 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1031 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1032 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1034 jq1 = _mm_set1_ps(charge[inr+1]);
1035 jq2 = _mm_set1_ps(charge[inr+2]);
1036 jq3 = _mm_set1_ps(charge[inr+3]);
1037 vdwjidx0A = 2*vdwtype[inr+0];
1038 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1039 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1040 qq11 = _mm_mul_ps(iq1,jq1);
1041 qq12 = _mm_mul_ps(iq1,jq2);
1042 qq13 = _mm_mul_ps(iq1,jq3);
1043 qq21 = _mm_mul_ps(iq2,jq1);
1044 qq22 = _mm_mul_ps(iq2,jq2);
1045 qq23 = _mm_mul_ps(iq2,jq3);
1046 qq31 = _mm_mul_ps(iq3,jq1);
1047 qq32 = _mm_mul_ps(iq3,jq2);
1048 qq33 = _mm_mul_ps(iq3,jq3);
1050 /* Avoid stupid compiler warnings */
1051 jnrA = jnrB = jnrC = jnrD = 0;
1052 j_coord_offsetA = 0;
1053 j_coord_offsetB = 0;
1054 j_coord_offsetC = 0;
1055 j_coord_offsetD = 0;
1060 for(iidx=0;iidx<4*DIM;iidx++)
1062 scratch[iidx] = 0.0;
1065 /* Start outer loop over neighborlists */
1066 for(iidx=0; iidx<nri; iidx++)
1068 /* Load shift vector for this list */
1069 i_shift_offset = DIM*shiftidx[iidx];
1071 /* Load limits for loop over neighbors */
1072 j_index_start = jindex[iidx];
1073 j_index_end = jindex[iidx+1];
1075 /* Get outer coordinate index */
1077 i_coord_offset = DIM*inr;
1079 /* Load i particle coords and add shift vector */
1080 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1081 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1083 fix0 = _mm_setzero_ps();
1084 fiy0 = _mm_setzero_ps();
1085 fiz0 = _mm_setzero_ps();
1086 fix1 = _mm_setzero_ps();
1087 fiy1 = _mm_setzero_ps();
1088 fiz1 = _mm_setzero_ps();
1089 fix2 = _mm_setzero_ps();
1090 fiy2 = _mm_setzero_ps();
1091 fiz2 = _mm_setzero_ps();
1092 fix3 = _mm_setzero_ps();
1093 fiy3 = _mm_setzero_ps();
1094 fiz3 = _mm_setzero_ps();
1096 /* Start inner kernel loop */
1097 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1100 /* Get j neighbor index, and coordinate index */
1102 jnrB = jjnr[jidx+1];
1103 jnrC = jjnr[jidx+2];
1104 jnrD = jjnr[jidx+3];
1105 j_coord_offsetA = DIM*jnrA;
1106 j_coord_offsetB = DIM*jnrB;
1107 j_coord_offsetC = DIM*jnrC;
1108 j_coord_offsetD = DIM*jnrD;
1110 /* load j atom coordinates */
1111 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1112 x+j_coord_offsetC,x+j_coord_offsetD,
1113 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1114 &jy2,&jz2,&jx3,&jy3,&jz3);
1116 /* Calculate displacement vector */
1117 dx00 = _mm_sub_ps(ix0,jx0);
1118 dy00 = _mm_sub_ps(iy0,jy0);
1119 dz00 = _mm_sub_ps(iz0,jz0);
1120 dx11 = _mm_sub_ps(ix1,jx1);
1121 dy11 = _mm_sub_ps(iy1,jy1);
1122 dz11 = _mm_sub_ps(iz1,jz1);
1123 dx12 = _mm_sub_ps(ix1,jx2);
1124 dy12 = _mm_sub_ps(iy1,jy2);
1125 dz12 = _mm_sub_ps(iz1,jz2);
1126 dx13 = _mm_sub_ps(ix1,jx3);
1127 dy13 = _mm_sub_ps(iy1,jy3);
1128 dz13 = _mm_sub_ps(iz1,jz3);
1129 dx21 = _mm_sub_ps(ix2,jx1);
1130 dy21 = _mm_sub_ps(iy2,jy1);
1131 dz21 = _mm_sub_ps(iz2,jz1);
1132 dx22 = _mm_sub_ps(ix2,jx2);
1133 dy22 = _mm_sub_ps(iy2,jy2);
1134 dz22 = _mm_sub_ps(iz2,jz2);
1135 dx23 = _mm_sub_ps(ix2,jx3);
1136 dy23 = _mm_sub_ps(iy2,jy3);
1137 dz23 = _mm_sub_ps(iz2,jz3);
1138 dx31 = _mm_sub_ps(ix3,jx1);
1139 dy31 = _mm_sub_ps(iy3,jy1);
1140 dz31 = _mm_sub_ps(iz3,jz1);
1141 dx32 = _mm_sub_ps(ix3,jx2);
1142 dy32 = _mm_sub_ps(iy3,jy2);
1143 dz32 = _mm_sub_ps(iz3,jz2);
1144 dx33 = _mm_sub_ps(ix3,jx3);
1145 dy33 = _mm_sub_ps(iy3,jy3);
1146 dz33 = _mm_sub_ps(iz3,jz3);
1148 /* Calculate squared distance and things based on it */
1149 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1150 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1151 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1152 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1153 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1154 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1155 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1156 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1157 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1158 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1160 rinv11 = avx128fma_invsqrt_f(rsq11);
1161 rinv12 = avx128fma_invsqrt_f(rsq12);
1162 rinv13 = avx128fma_invsqrt_f(rsq13);
1163 rinv21 = avx128fma_invsqrt_f(rsq21);
1164 rinv22 = avx128fma_invsqrt_f(rsq22);
1165 rinv23 = avx128fma_invsqrt_f(rsq23);
1166 rinv31 = avx128fma_invsqrt_f(rsq31);
1167 rinv32 = avx128fma_invsqrt_f(rsq32);
1168 rinv33 = avx128fma_invsqrt_f(rsq33);
1170 rinvsq00 = avx128fma_inv_f(rsq00);
1171 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1172 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1173 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1174 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1175 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1176 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1177 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1178 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1179 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1181 fjx0 = _mm_setzero_ps();
1182 fjy0 = _mm_setzero_ps();
1183 fjz0 = _mm_setzero_ps();
1184 fjx1 = _mm_setzero_ps();
1185 fjy1 = _mm_setzero_ps();
1186 fjz1 = _mm_setzero_ps();
1187 fjx2 = _mm_setzero_ps();
1188 fjy2 = _mm_setzero_ps();
1189 fjz2 = _mm_setzero_ps();
1190 fjx3 = _mm_setzero_ps();
1191 fjy3 = _mm_setzero_ps();
1192 fjz3 = _mm_setzero_ps();
1194 /**************************
1195 * CALCULATE INTERACTIONS *
1196 **************************/
1198 /* LENNARD-JONES DISPERSION/REPULSION */
1200 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1201 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1205 /* Update vectorial force */
1206 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1207 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1208 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1210 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1211 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1212 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1214 /**************************
1215 * CALCULATE INTERACTIONS *
1216 **************************/
1218 /* COULOMB ELECTROSTATICS */
1219 velec = _mm_mul_ps(qq11,rinv11);
1220 felec = _mm_mul_ps(velec,rinvsq11);
1224 /* Update vectorial force */
1225 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1226 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1227 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1229 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1230 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1231 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1233 /**************************
1234 * CALCULATE INTERACTIONS *
1235 **************************/
1237 /* COULOMB ELECTROSTATICS */
1238 velec = _mm_mul_ps(qq12,rinv12);
1239 felec = _mm_mul_ps(velec,rinvsq12);
1243 /* Update vectorial force */
1244 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1245 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1246 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1248 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1249 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1250 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1252 /**************************
1253 * CALCULATE INTERACTIONS *
1254 **************************/
1256 /* COULOMB ELECTROSTATICS */
1257 velec = _mm_mul_ps(qq13,rinv13);
1258 felec = _mm_mul_ps(velec,rinvsq13);
1262 /* Update vectorial force */
1263 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1264 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1265 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1267 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1268 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1269 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1271 /**************************
1272 * CALCULATE INTERACTIONS *
1273 **************************/
1275 /* COULOMB ELECTROSTATICS */
1276 velec = _mm_mul_ps(qq21,rinv21);
1277 felec = _mm_mul_ps(velec,rinvsq21);
1281 /* Update vectorial force */
1282 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1283 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1284 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1286 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1287 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1288 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1290 /**************************
1291 * CALCULATE INTERACTIONS *
1292 **************************/
1294 /* COULOMB ELECTROSTATICS */
1295 velec = _mm_mul_ps(qq22,rinv22);
1296 felec = _mm_mul_ps(velec,rinvsq22);
1300 /* Update vectorial force */
1301 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1302 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1303 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1305 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1306 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1307 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1309 /**************************
1310 * CALCULATE INTERACTIONS *
1311 **************************/
1313 /* COULOMB ELECTROSTATICS */
1314 velec = _mm_mul_ps(qq23,rinv23);
1315 felec = _mm_mul_ps(velec,rinvsq23);
1319 /* Update vectorial force */
1320 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1321 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1322 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1324 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1325 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1326 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1328 /**************************
1329 * CALCULATE INTERACTIONS *
1330 **************************/
1332 /* COULOMB ELECTROSTATICS */
1333 velec = _mm_mul_ps(qq31,rinv31);
1334 felec = _mm_mul_ps(velec,rinvsq31);
1338 /* Update vectorial force */
1339 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1340 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1341 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1343 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1344 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1345 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1347 /**************************
1348 * CALCULATE INTERACTIONS *
1349 **************************/
1351 /* COULOMB ELECTROSTATICS */
1352 velec = _mm_mul_ps(qq32,rinv32);
1353 felec = _mm_mul_ps(velec,rinvsq32);
1357 /* Update vectorial force */
1358 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1359 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1360 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1362 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1363 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1364 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1366 /**************************
1367 * CALCULATE INTERACTIONS *
1368 **************************/
1370 /* COULOMB ELECTROSTATICS */
1371 velec = _mm_mul_ps(qq33,rinv33);
1372 felec = _mm_mul_ps(velec,rinvsq33);
1376 /* Update vectorial force */
1377 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1378 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1379 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1381 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1382 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1383 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1385 fjptrA = f+j_coord_offsetA;
1386 fjptrB = f+j_coord_offsetB;
1387 fjptrC = f+j_coord_offsetC;
1388 fjptrD = f+j_coord_offsetD;
1390 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1391 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1392 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1394 /* Inner loop uses 303 flops */
1397 if(jidx<j_index_end)
1400 /* Get j neighbor index, and coordinate index */
1401 jnrlistA = jjnr[jidx];
1402 jnrlistB = jjnr[jidx+1];
1403 jnrlistC = jjnr[jidx+2];
1404 jnrlistD = jjnr[jidx+3];
1405 /* Sign of each element will be negative for non-real atoms.
1406 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1407 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1409 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1410 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1411 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1412 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1413 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1414 j_coord_offsetA = DIM*jnrA;
1415 j_coord_offsetB = DIM*jnrB;
1416 j_coord_offsetC = DIM*jnrC;
1417 j_coord_offsetD = DIM*jnrD;
1419 /* load j atom coordinates */
1420 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1421 x+j_coord_offsetC,x+j_coord_offsetD,
1422 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1423 &jy2,&jz2,&jx3,&jy3,&jz3);
1425 /* Calculate displacement vector */
1426 dx00 = _mm_sub_ps(ix0,jx0);
1427 dy00 = _mm_sub_ps(iy0,jy0);
1428 dz00 = _mm_sub_ps(iz0,jz0);
1429 dx11 = _mm_sub_ps(ix1,jx1);
1430 dy11 = _mm_sub_ps(iy1,jy1);
1431 dz11 = _mm_sub_ps(iz1,jz1);
1432 dx12 = _mm_sub_ps(ix1,jx2);
1433 dy12 = _mm_sub_ps(iy1,jy2);
1434 dz12 = _mm_sub_ps(iz1,jz2);
1435 dx13 = _mm_sub_ps(ix1,jx3);
1436 dy13 = _mm_sub_ps(iy1,jy3);
1437 dz13 = _mm_sub_ps(iz1,jz3);
1438 dx21 = _mm_sub_ps(ix2,jx1);
1439 dy21 = _mm_sub_ps(iy2,jy1);
1440 dz21 = _mm_sub_ps(iz2,jz1);
1441 dx22 = _mm_sub_ps(ix2,jx2);
1442 dy22 = _mm_sub_ps(iy2,jy2);
1443 dz22 = _mm_sub_ps(iz2,jz2);
1444 dx23 = _mm_sub_ps(ix2,jx3);
1445 dy23 = _mm_sub_ps(iy2,jy3);
1446 dz23 = _mm_sub_ps(iz2,jz3);
1447 dx31 = _mm_sub_ps(ix3,jx1);
1448 dy31 = _mm_sub_ps(iy3,jy1);
1449 dz31 = _mm_sub_ps(iz3,jz1);
1450 dx32 = _mm_sub_ps(ix3,jx2);
1451 dy32 = _mm_sub_ps(iy3,jy2);
1452 dz32 = _mm_sub_ps(iz3,jz2);
1453 dx33 = _mm_sub_ps(ix3,jx3);
1454 dy33 = _mm_sub_ps(iy3,jy3);
1455 dz33 = _mm_sub_ps(iz3,jz3);
1457 /* Calculate squared distance and things based on it */
1458 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1459 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1460 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1461 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1462 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1463 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1464 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1465 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1466 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1467 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1469 rinv11 = avx128fma_invsqrt_f(rsq11);
1470 rinv12 = avx128fma_invsqrt_f(rsq12);
1471 rinv13 = avx128fma_invsqrt_f(rsq13);
1472 rinv21 = avx128fma_invsqrt_f(rsq21);
1473 rinv22 = avx128fma_invsqrt_f(rsq22);
1474 rinv23 = avx128fma_invsqrt_f(rsq23);
1475 rinv31 = avx128fma_invsqrt_f(rsq31);
1476 rinv32 = avx128fma_invsqrt_f(rsq32);
1477 rinv33 = avx128fma_invsqrt_f(rsq33);
1479 rinvsq00 = avx128fma_inv_f(rsq00);
1480 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1481 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1482 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1483 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1484 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1485 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1486 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1487 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1488 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1490 fjx0 = _mm_setzero_ps();
1491 fjy0 = _mm_setzero_ps();
1492 fjz0 = _mm_setzero_ps();
1493 fjx1 = _mm_setzero_ps();
1494 fjy1 = _mm_setzero_ps();
1495 fjz1 = _mm_setzero_ps();
1496 fjx2 = _mm_setzero_ps();
1497 fjy2 = _mm_setzero_ps();
1498 fjz2 = _mm_setzero_ps();
1499 fjx3 = _mm_setzero_ps();
1500 fjy3 = _mm_setzero_ps();
1501 fjz3 = _mm_setzero_ps();
1503 /**************************
1504 * CALCULATE INTERACTIONS *
1505 **************************/
1507 /* LENNARD-JONES DISPERSION/REPULSION */
1509 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1510 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1514 fscal = _mm_andnot_ps(dummy_mask,fscal);
1516 /* Update vectorial force */
1517 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1518 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1519 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1521 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1522 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1523 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1525 /**************************
1526 * CALCULATE INTERACTIONS *
1527 **************************/
1529 /* COULOMB ELECTROSTATICS */
1530 velec = _mm_mul_ps(qq11,rinv11);
1531 felec = _mm_mul_ps(velec,rinvsq11);
1535 fscal = _mm_andnot_ps(dummy_mask,fscal);
1537 /* Update vectorial force */
1538 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1539 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1540 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1542 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1543 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1544 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1546 /**************************
1547 * CALCULATE INTERACTIONS *
1548 **************************/
1550 /* COULOMB ELECTROSTATICS */
1551 velec = _mm_mul_ps(qq12,rinv12);
1552 felec = _mm_mul_ps(velec,rinvsq12);
1556 fscal = _mm_andnot_ps(dummy_mask,fscal);
1558 /* Update vectorial force */
1559 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1560 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1561 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1563 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1564 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1565 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1567 /**************************
1568 * CALCULATE INTERACTIONS *
1569 **************************/
1571 /* COULOMB ELECTROSTATICS */
1572 velec = _mm_mul_ps(qq13,rinv13);
1573 felec = _mm_mul_ps(velec,rinvsq13);
1577 fscal = _mm_andnot_ps(dummy_mask,fscal);
1579 /* Update vectorial force */
1580 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1581 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1582 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1584 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1585 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1586 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1588 /**************************
1589 * CALCULATE INTERACTIONS *
1590 **************************/
1592 /* COULOMB ELECTROSTATICS */
1593 velec = _mm_mul_ps(qq21,rinv21);
1594 felec = _mm_mul_ps(velec,rinvsq21);
1598 fscal = _mm_andnot_ps(dummy_mask,fscal);
1600 /* Update vectorial force */
1601 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1602 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1603 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1605 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1606 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1607 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1609 /**************************
1610 * CALCULATE INTERACTIONS *
1611 **************************/
1613 /* COULOMB ELECTROSTATICS */
1614 velec = _mm_mul_ps(qq22,rinv22);
1615 felec = _mm_mul_ps(velec,rinvsq22);
1619 fscal = _mm_andnot_ps(dummy_mask,fscal);
1621 /* Update vectorial force */
1622 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1623 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1624 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1626 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1627 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1628 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1630 /**************************
1631 * CALCULATE INTERACTIONS *
1632 **************************/
1634 /* COULOMB ELECTROSTATICS */
1635 velec = _mm_mul_ps(qq23,rinv23);
1636 felec = _mm_mul_ps(velec,rinvsq23);
1640 fscal = _mm_andnot_ps(dummy_mask,fscal);
1642 /* Update vectorial force */
1643 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1644 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1645 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1647 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1648 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1649 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1651 /**************************
1652 * CALCULATE INTERACTIONS *
1653 **************************/
1655 /* COULOMB ELECTROSTATICS */
1656 velec = _mm_mul_ps(qq31,rinv31);
1657 felec = _mm_mul_ps(velec,rinvsq31);
1661 fscal = _mm_andnot_ps(dummy_mask,fscal);
1663 /* Update vectorial force */
1664 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1665 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1666 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1668 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1669 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1670 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1672 /**************************
1673 * CALCULATE INTERACTIONS *
1674 **************************/
1676 /* COULOMB ELECTROSTATICS */
1677 velec = _mm_mul_ps(qq32,rinv32);
1678 felec = _mm_mul_ps(velec,rinvsq32);
1682 fscal = _mm_andnot_ps(dummy_mask,fscal);
1684 /* Update vectorial force */
1685 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1686 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1687 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1689 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1690 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1691 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1693 /**************************
1694 * CALCULATE INTERACTIONS *
1695 **************************/
1697 /* COULOMB ELECTROSTATICS */
1698 velec = _mm_mul_ps(qq33,rinv33);
1699 felec = _mm_mul_ps(velec,rinvsq33);
1703 fscal = _mm_andnot_ps(dummy_mask,fscal);
1705 /* Update vectorial force */
1706 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1707 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1708 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1710 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1711 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1712 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1714 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1715 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1716 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1717 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1719 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1720 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1721 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1723 /* Inner loop uses 303 flops */
1726 /* End of innermost loop */
1728 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1729 f+i_coord_offset,fshift+i_shift_offset);
1731 /* Increment number of inner iterations */
1732 inneriter += j_index_end - j_index_start;
1734 /* Outer loop uses 24 flops */
1737 /* Increment number of outer iterations */
1740 /* Update outer/inner flops */
1742 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);