2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_single
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
98 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
102 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
103 __m128 dummy_mask,cutoff_mask;
104 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
105 __m128 one = _mm_set1_ps(1.0);
106 __m128 two = _mm_set1_ps(2.0);
112 jindex = nlist->jindex;
114 shiftidx = nlist->shift;
116 shiftvec = fr->shift_vec[0];
117 fshift = fr->fshift[0];
118 facel = _mm_set1_ps(fr->epsfac);
119 charge = mdatoms->chargeA;
120 nvdwtype = fr->ntype;
122 vdwtype = mdatoms->typeA;
124 /* Setup water-specific parameters */
125 inr = nlist->iinr[0];
126 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
127 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
128 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
129 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
131 jq1 = _mm_set1_ps(charge[inr+1]);
132 jq2 = _mm_set1_ps(charge[inr+2]);
133 jq3 = _mm_set1_ps(charge[inr+3]);
134 vdwjidx0A = 2*vdwtype[inr+0];
135 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
136 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
137 qq11 = _mm_mul_ps(iq1,jq1);
138 qq12 = _mm_mul_ps(iq1,jq2);
139 qq13 = _mm_mul_ps(iq1,jq3);
140 qq21 = _mm_mul_ps(iq2,jq1);
141 qq22 = _mm_mul_ps(iq2,jq2);
142 qq23 = _mm_mul_ps(iq2,jq3);
143 qq31 = _mm_mul_ps(iq3,jq1);
144 qq32 = _mm_mul_ps(iq3,jq2);
145 qq33 = _mm_mul_ps(iq3,jq3);
147 /* Avoid stupid compiler warnings */
148 jnrA = jnrB = jnrC = jnrD = 0;
157 for(iidx=0;iidx<4*DIM;iidx++)
162 /* Start outer loop over neighborlists */
163 for(iidx=0; iidx<nri; iidx++)
165 /* Load shift vector for this list */
166 i_shift_offset = DIM*shiftidx[iidx];
168 /* Load limits for loop over neighbors */
169 j_index_start = jindex[iidx];
170 j_index_end = jindex[iidx+1];
172 /* Get outer coordinate index */
174 i_coord_offset = DIM*inr;
176 /* Load i particle coords and add shift vector */
177 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
178 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
180 fix0 = _mm_setzero_ps();
181 fiy0 = _mm_setzero_ps();
182 fiz0 = _mm_setzero_ps();
183 fix1 = _mm_setzero_ps();
184 fiy1 = _mm_setzero_ps();
185 fiz1 = _mm_setzero_ps();
186 fix2 = _mm_setzero_ps();
187 fiy2 = _mm_setzero_ps();
188 fiz2 = _mm_setzero_ps();
189 fix3 = _mm_setzero_ps();
190 fiy3 = _mm_setzero_ps();
191 fiz3 = _mm_setzero_ps();
193 /* Reset potential sums */
194 velecsum = _mm_setzero_ps();
195 vvdwsum = _mm_setzero_ps();
197 /* Start inner kernel loop */
198 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
201 /* Get j neighbor index, and coordinate index */
206 j_coord_offsetA = DIM*jnrA;
207 j_coord_offsetB = DIM*jnrB;
208 j_coord_offsetC = DIM*jnrC;
209 j_coord_offsetD = DIM*jnrD;
211 /* load j atom coordinates */
212 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
213 x+j_coord_offsetC,x+j_coord_offsetD,
214 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
215 &jy2,&jz2,&jx3,&jy3,&jz3);
217 /* Calculate displacement vector */
218 dx00 = _mm_sub_ps(ix0,jx0);
219 dy00 = _mm_sub_ps(iy0,jy0);
220 dz00 = _mm_sub_ps(iz0,jz0);
221 dx11 = _mm_sub_ps(ix1,jx1);
222 dy11 = _mm_sub_ps(iy1,jy1);
223 dz11 = _mm_sub_ps(iz1,jz1);
224 dx12 = _mm_sub_ps(ix1,jx2);
225 dy12 = _mm_sub_ps(iy1,jy2);
226 dz12 = _mm_sub_ps(iz1,jz2);
227 dx13 = _mm_sub_ps(ix1,jx3);
228 dy13 = _mm_sub_ps(iy1,jy3);
229 dz13 = _mm_sub_ps(iz1,jz3);
230 dx21 = _mm_sub_ps(ix2,jx1);
231 dy21 = _mm_sub_ps(iy2,jy1);
232 dz21 = _mm_sub_ps(iz2,jz1);
233 dx22 = _mm_sub_ps(ix2,jx2);
234 dy22 = _mm_sub_ps(iy2,jy2);
235 dz22 = _mm_sub_ps(iz2,jz2);
236 dx23 = _mm_sub_ps(ix2,jx3);
237 dy23 = _mm_sub_ps(iy2,jy3);
238 dz23 = _mm_sub_ps(iz2,jz3);
239 dx31 = _mm_sub_ps(ix3,jx1);
240 dy31 = _mm_sub_ps(iy3,jy1);
241 dz31 = _mm_sub_ps(iz3,jz1);
242 dx32 = _mm_sub_ps(ix3,jx2);
243 dy32 = _mm_sub_ps(iy3,jy2);
244 dz32 = _mm_sub_ps(iz3,jz2);
245 dx33 = _mm_sub_ps(ix3,jx3);
246 dy33 = _mm_sub_ps(iy3,jy3);
247 dz33 = _mm_sub_ps(iz3,jz3);
249 /* Calculate squared distance and things based on it */
250 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
251 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
252 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
253 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
254 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
255 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
256 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
257 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
258 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
259 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
261 rinv11 = gmx_mm_invsqrt_ps(rsq11);
262 rinv12 = gmx_mm_invsqrt_ps(rsq12);
263 rinv13 = gmx_mm_invsqrt_ps(rsq13);
264 rinv21 = gmx_mm_invsqrt_ps(rsq21);
265 rinv22 = gmx_mm_invsqrt_ps(rsq22);
266 rinv23 = gmx_mm_invsqrt_ps(rsq23);
267 rinv31 = gmx_mm_invsqrt_ps(rsq31);
268 rinv32 = gmx_mm_invsqrt_ps(rsq32);
269 rinv33 = gmx_mm_invsqrt_ps(rsq33);
271 rinvsq00 = gmx_mm_inv_ps(rsq00);
272 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
273 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
274 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
275 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
276 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
277 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
278 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
279 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
280 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
282 fjx0 = _mm_setzero_ps();
283 fjy0 = _mm_setzero_ps();
284 fjz0 = _mm_setzero_ps();
285 fjx1 = _mm_setzero_ps();
286 fjy1 = _mm_setzero_ps();
287 fjz1 = _mm_setzero_ps();
288 fjx2 = _mm_setzero_ps();
289 fjy2 = _mm_setzero_ps();
290 fjz2 = _mm_setzero_ps();
291 fjx3 = _mm_setzero_ps();
292 fjy3 = _mm_setzero_ps();
293 fjz3 = _mm_setzero_ps();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 /* LENNARD-JONES DISPERSION/REPULSION */
301 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
302 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
303 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
304 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
305 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
307 /* Update potential sum for this i atom from the interaction with this j atom. */
308 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
312 /* Update vectorial force */
313 fix0 = _mm_macc_ps(dx00,fscal,fix0);
314 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
315 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
317 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
318 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
319 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
321 /**************************
322 * CALCULATE INTERACTIONS *
323 **************************/
325 /* COULOMB ELECTROSTATICS */
326 velec = _mm_mul_ps(qq11,rinv11);
327 felec = _mm_mul_ps(velec,rinvsq11);
329 /* Update potential sum for this i atom from the interaction with this j atom. */
330 velecsum = _mm_add_ps(velecsum,velec);
334 /* Update vectorial force */
335 fix1 = _mm_macc_ps(dx11,fscal,fix1);
336 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
337 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
339 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
340 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
341 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
343 /**************************
344 * CALCULATE INTERACTIONS *
345 **************************/
347 /* COULOMB ELECTROSTATICS */
348 velec = _mm_mul_ps(qq12,rinv12);
349 felec = _mm_mul_ps(velec,rinvsq12);
351 /* Update potential sum for this i atom from the interaction with this j atom. */
352 velecsum = _mm_add_ps(velecsum,velec);
356 /* Update vectorial force */
357 fix1 = _mm_macc_ps(dx12,fscal,fix1);
358 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
359 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
361 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
362 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
363 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
365 /**************************
366 * CALCULATE INTERACTIONS *
367 **************************/
369 /* COULOMB ELECTROSTATICS */
370 velec = _mm_mul_ps(qq13,rinv13);
371 felec = _mm_mul_ps(velec,rinvsq13);
373 /* Update potential sum for this i atom from the interaction with this j atom. */
374 velecsum = _mm_add_ps(velecsum,velec);
378 /* Update vectorial force */
379 fix1 = _mm_macc_ps(dx13,fscal,fix1);
380 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
381 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
383 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
384 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
385 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
387 /**************************
388 * CALCULATE INTERACTIONS *
389 **************************/
391 /* COULOMB ELECTROSTATICS */
392 velec = _mm_mul_ps(qq21,rinv21);
393 felec = _mm_mul_ps(velec,rinvsq21);
395 /* Update potential sum for this i atom from the interaction with this j atom. */
396 velecsum = _mm_add_ps(velecsum,velec);
400 /* Update vectorial force */
401 fix2 = _mm_macc_ps(dx21,fscal,fix2);
402 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
403 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
405 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
406 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
407 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 /* COULOMB ELECTROSTATICS */
414 velec = _mm_mul_ps(qq22,rinv22);
415 felec = _mm_mul_ps(velec,rinvsq22);
417 /* Update potential sum for this i atom from the interaction with this j atom. */
418 velecsum = _mm_add_ps(velecsum,velec);
422 /* Update vectorial force */
423 fix2 = _mm_macc_ps(dx22,fscal,fix2);
424 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
425 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
427 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
428 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
429 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
431 /**************************
432 * CALCULATE INTERACTIONS *
433 **************************/
435 /* COULOMB ELECTROSTATICS */
436 velec = _mm_mul_ps(qq23,rinv23);
437 felec = _mm_mul_ps(velec,rinvsq23);
439 /* Update potential sum for this i atom from the interaction with this j atom. */
440 velecsum = _mm_add_ps(velecsum,velec);
444 /* Update vectorial force */
445 fix2 = _mm_macc_ps(dx23,fscal,fix2);
446 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
447 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
449 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
450 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
451 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
453 /**************************
454 * CALCULATE INTERACTIONS *
455 **************************/
457 /* COULOMB ELECTROSTATICS */
458 velec = _mm_mul_ps(qq31,rinv31);
459 felec = _mm_mul_ps(velec,rinvsq31);
461 /* Update potential sum for this i atom from the interaction with this j atom. */
462 velecsum = _mm_add_ps(velecsum,velec);
466 /* Update vectorial force */
467 fix3 = _mm_macc_ps(dx31,fscal,fix3);
468 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
469 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
471 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
472 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
473 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 /* COULOMB ELECTROSTATICS */
480 velec = _mm_mul_ps(qq32,rinv32);
481 felec = _mm_mul_ps(velec,rinvsq32);
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum = _mm_add_ps(velecsum,velec);
488 /* Update vectorial force */
489 fix3 = _mm_macc_ps(dx32,fscal,fix3);
490 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
491 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
493 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
494 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
495 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
497 /**************************
498 * CALCULATE INTERACTIONS *
499 **************************/
501 /* COULOMB ELECTROSTATICS */
502 velec = _mm_mul_ps(qq33,rinv33);
503 felec = _mm_mul_ps(velec,rinvsq33);
505 /* Update potential sum for this i atom from the interaction with this j atom. */
506 velecsum = _mm_add_ps(velecsum,velec);
510 /* Update vectorial force */
511 fix3 = _mm_macc_ps(dx33,fscal,fix3);
512 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
513 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
515 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
516 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
517 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
519 fjptrA = f+j_coord_offsetA;
520 fjptrB = f+j_coord_offsetB;
521 fjptrC = f+j_coord_offsetC;
522 fjptrD = f+j_coord_offsetD;
524 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
525 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
526 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
528 /* Inner loop uses 317 flops */
534 /* Get j neighbor index, and coordinate index */
535 jnrlistA = jjnr[jidx];
536 jnrlistB = jjnr[jidx+1];
537 jnrlistC = jjnr[jidx+2];
538 jnrlistD = jjnr[jidx+3];
539 /* Sign of each element will be negative for non-real atoms.
540 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
541 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
543 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
544 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
545 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
546 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
547 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
548 j_coord_offsetA = DIM*jnrA;
549 j_coord_offsetB = DIM*jnrB;
550 j_coord_offsetC = DIM*jnrC;
551 j_coord_offsetD = DIM*jnrD;
553 /* load j atom coordinates */
554 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
555 x+j_coord_offsetC,x+j_coord_offsetD,
556 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
557 &jy2,&jz2,&jx3,&jy3,&jz3);
559 /* Calculate displacement vector */
560 dx00 = _mm_sub_ps(ix0,jx0);
561 dy00 = _mm_sub_ps(iy0,jy0);
562 dz00 = _mm_sub_ps(iz0,jz0);
563 dx11 = _mm_sub_ps(ix1,jx1);
564 dy11 = _mm_sub_ps(iy1,jy1);
565 dz11 = _mm_sub_ps(iz1,jz1);
566 dx12 = _mm_sub_ps(ix1,jx2);
567 dy12 = _mm_sub_ps(iy1,jy2);
568 dz12 = _mm_sub_ps(iz1,jz2);
569 dx13 = _mm_sub_ps(ix1,jx3);
570 dy13 = _mm_sub_ps(iy1,jy3);
571 dz13 = _mm_sub_ps(iz1,jz3);
572 dx21 = _mm_sub_ps(ix2,jx1);
573 dy21 = _mm_sub_ps(iy2,jy1);
574 dz21 = _mm_sub_ps(iz2,jz1);
575 dx22 = _mm_sub_ps(ix2,jx2);
576 dy22 = _mm_sub_ps(iy2,jy2);
577 dz22 = _mm_sub_ps(iz2,jz2);
578 dx23 = _mm_sub_ps(ix2,jx3);
579 dy23 = _mm_sub_ps(iy2,jy3);
580 dz23 = _mm_sub_ps(iz2,jz3);
581 dx31 = _mm_sub_ps(ix3,jx1);
582 dy31 = _mm_sub_ps(iy3,jy1);
583 dz31 = _mm_sub_ps(iz3,jz1);
584 dx32 = _mm_sub_ps(ix3,jx2);
585 dy32 = _mm_sub_ps(iy3,jy2);
586 dz32 = _mm_sub_ps(iz3,jz2);
587 dx33 = _mm_sub_ps(ix3,jx3);
588 dy33 = _mm_sub_ps(iy3,jy3);
589 dz33 = _mm_sub_ps(iz3,jz3);
591 /* Calculate squared distance and things based on it */
592 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
593 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
594 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
595 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
596 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
597 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
598 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
599 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
600 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
601 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
603 rinv11 = gmx_mm_invsqrt_ps(rsq11);
604 rinv12 = gmx_mm_invsqrt_ps(rsq12);
605 rinv13 = gmx_mm_invsqrt_ps(rsq13);
606 rinv21 = gmx_mm_invsqrt_ps(rsq21);
607 rinv22 = gmx_mm_invsqrt_ps(rsq22);
608 rinv23 = gmx_mm_invsqrt_ps(rsq23);
609 rinv31 = gmx_mm_invsqrt_ps(rsq31);
610 rinv32 = gmx_mm_invsqrt_ps(rsq32);
611 rinv33 = gmx_mm_invsqrt_ps(rsq33);
613 rinvsq00 = gmx_mm_inv_ps(rsq00);
614 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
615 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
616 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
617 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
618 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
619 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
620 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
621 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
622 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
624 fjx0 = _mm_setzero_ps();
625 fjy0 = _mm_setzero_ps();
626 fjz0 = _mm_setzero_ps();
627 fjx1 = _mm_setzero_ps();
628 fjy1 = _mm_setzero_ps();
629 fjz1 = _mm_setzero_ps();
630 fjx2 = _mm_setzero_ps();
631 fjy2 = _mm_setzero_ps();
632 fjz2 = _mm_setzero_ps();
633 fjx3 = _mm_setzero_ps();
634 fjy3 = _mm_setzero_ps();
635 fjz3 = _mm_setzero_ps();
637 /**************************
638 * CALCULATE INTERACTIONS *
639 **************************/
641 /* LENNARD-JONES DISPERSION/REPULSION */
643 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
644 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
645 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
646 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
647 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
649 /* Update potential sum for this i atom from the interaction with this j atom. */
650 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
651 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
655 fscal = _mm_andnot_ps(dummy_mask,fscal);
657 /* Update vectorial force */
658 fix0 = _mm_macc_ps(dx00,fscal,fix0);
659 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
660 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
662 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
663 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
664 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
666 /**************************
667 * CALCULATE INTERACTIONS *
668 **************************/
670 /* COULOMB ELECTROSTATICS */
671 velec = _mm_mul_ps(qq11,rinv11);
672 felec = _mm_mul_ps(velec,rinvsq11);
674 /* Update potential sum for this i atom from the interaction with this j atom. */
675 velec = _mm_andnot_ps(dummy_mask,velec);
676 velecsum = _mm_add_ps(velecsum,velec);
680 fscal = _mm_andnot_ps(dummy_mask,fscal);
682 /* Update vectorial force */
683 fix1 = _mm_macc_ps(dx11,fscal,fix1);
684 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
685 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
687 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
688 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
689 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
691 /**************************
692 * CALCULATE INTERACTIONS *
693 **************************/
695 /* COULOMB ELECTROSTATICS */
696 velec = _mm_mul_ps(qq12,rinv12);
697 felec = _mm_mul_ps(velec,rinvsq12);
699 /* Update potential sum for this i atom from the interaction with this j atom. */
700 velec = _mm_andnot_ps(dummy_mask,velec);
701 velecsum = _mm_add_ps(velecsum,velec);
705 fscal = _mm_andnot_ps(dummy_mask,fscal);
707 /* Update vectorial force */
708 fix1 = _mm_macc_ps(dx12,fscal,fix1);
709 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
710 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
712 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
713 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
714 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
716 /**************************
717 * CALCULATE INTERACTIONS *
718 **************************/
720 /* COULOMB ELECTROSTATICS */
721 velec = _mm_mul_ps(qq13,rinv13);
722 felec = _mm_mul_ps(velec,rinvsq13);
724 /* Update potential sum for this i atom from the interaction with this j atom. */
725 velec = _mm_andnot_ps(dummy_mask,velec);
726 velecsum = _mm_add_ps(velecsum,velec);
730 fscal = _mm_andnot_ps(dummy_mask,fscal);
732 /* Update vectorial force */
733 fix1 = _mm_macc_ps(dx13,fscal,fix1);
734 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
735 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
737 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
738 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
739 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
741 /**************************
742 * CALCULATE INTERACTIONS *
743 **************************/
745 /* COULOMB ELECTROSTATICS */
746 velec = _mm_mul_ps(qq21,rinv21);
747 felec = _mm_mul_ps(velec,rinvsq21);
749 /* Update potential sum for this i atom from the interaction with this j atom. */
750 velec = _mm_andnot_ps(dummy_mask,velec);
751 velecsum = _mm_add_ps(velecsum,velec);
755 fscal = _mm_andnot_ps(dummy_mask,fscal);
757 /* Update vectorial force */
758 fix2 = _mm_macc_ps(dx21,fscal,fix2);
759 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
760 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
762 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
763 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
764 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
766 /**************************
767 * CALCULATE INTERACTIONS *
768 **************************/
770 /* COULOMB ELECTROSTATICS */
771 velec = _mm_mul_ps(qq22,rinv22);
772 felec = _mm_mul_ps(velec,rinvsq22);
774 /* Update potential sum for this i atom from the interaction with this j atom. */
775 velec = _mm_andnot_ps(dummy_mask,velec);
776 velecsum = _mm_add_ps(velecsum,velec);
780 fscal = _mm_andnot_ps(dummy_mask,fscal);
782 /* Update vectorial force */
783 fix2 = _mm_macc_ps(dx22,fscal,fix2);
784 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
785 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
787 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
788 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
789 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
791 /**************************
792 * CALCULATE INTERACTIONS *
793 **************************/
795 /* COULOMB ELECTROSTATICS */
796 velec = _mm_mul_ps(qq23,rinv23);
797 felec = _mm_mul_ps(velec,rinvsq23);
799 /* Update potential sum for this i atom from the interaction with this j atom. */
800 velec = _mm_andnot_ps(dummy_mask,velec);
801 velecsum = _mm_add_ps(velecsum,velec);
805 fscal = _mm_andnot_ps(dummy_mask,fscal);
807 /* Update vectorial force */
808 fix2 = _mm_macc_ps(dx23,fscal,fix2);
809 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
810 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
812 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
813 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
814 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
816 /**************************
817 * CALCULATE INTERACTIONS *
818 **************************/
820 /* COULOMB ELECTROSTATICS */
821 velec = _mm_mul_ps(qq31,rinv31);
822 felec = _mm_mul_ps(velec,rinvsq31);
824 /* Update potential sum for this i atom from the interaction with this j atom. */
825 velec = _mm_andnot_ps(dummy_mask,velec);
826 velecsum = _mm_add_ps(velecsum,velec);
830 fscal = _mm_andnot_ps(dummy_mask,fscal);
832 /* Update vectorial force */
833 fix3 = _mm_macc_ps(dx31,fscal,fix3);
834 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
835 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
837 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
838 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
839 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
841 /**************************
842 * CALCULATE INTERACTIONS *
843 **************************/
845 /* COULOMB ELECTROSTATICS */
846 velec = _mm_mul_ps(qq32,rinv32);
847 felec = _mm_mul_ps(velec,rinvsq32);
849 /* Update potential sum for this i atom from the interaction with this j atom. */
850 velec = _mm_andnot_ps(dummy_mask,velec);
851 velecsum = _mm_add_ps(velecsum,velec);
855 fscal = _mm_andnot_ps(dummy_mask,fscal);
857 /* Update vectorial force */
858 fix3 = _mm_macc_ps(dx32,fscal,fix3);
859 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
860 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
862 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
863 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
864 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
866 /**************************
867 * CALCULATE INTERACTIONS *
868 **************************/
870 /* COULOMB ELECTROSTATICS */
871 velec = _mm_mul_ps(qq33,rinv33);
872 felec = _mm_mul_ps(velec,rinvsq33);
874 /* Update potential sum for this i atom from the interaction with this j atom. */
875 velec = _mm_andnot_ps(dummy_mask,velec);
876 velecsum = _mm_add_ps(velecsum,velec);
880 fscal = _mm_andnot_ps(dummy_mask,fscal);
882 /* Update vectorial force */
883 fix3 = _mm_macc_ps(dx33,fscal,fix3);
884 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
885 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
887 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
888 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
889 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
891 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
892 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
893 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
894 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
896 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
897 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
898 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
900 /* Inner loop uses 317 flops */
903 /* End of innermost loop */
905 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
906 f+i_coord_offset,fshift+i_shift_offset);
909 /* Update potential energies */
910 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
911 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
913 /* Increment number of inner iterations */
914 inneriter += j_index_end - j_index_start;
916 /* Outer loop uses 26 flops */
919 /* Increment number of outer iterations */
922 /* Update outer/inner flops */
924 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*317);
927 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_single
928 * Electrostatics interaction: Coulomb
929 * VdW interaction: LennardJones
930 * Geometry: Water4-Water4
931 * Calculate force/pot: Force
934 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_128_fma_single
935 (t_nblist * gmx_restrict nlist,
936 rvec * gmx_restrict xx,
937 rvec * gmx_restrict ff,
938 t_forcerec * gmx_restrict fr,
939 t_mdatoms * gmx_restrict mdatoms,
940 nb_kernel_data_t * gmx_restrict kernel_data,
941 t_nrnb * gmx_restrict nrnb)
943 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
944 * just 0 for non-waters.
945 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
946 * jnr indices corresponding to data put in the four positions in the SIMD register.
948 int i_shift_offset,i_coord_offset,outeriter,inneriter;
949 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
950 int jnrA,jnrB,jnrC,jnrD;
951 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
952 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
953 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
955 real *shiftvec,*fshift,*x,*f;
956 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
958 __m128 fscal,rcutoff,rcutoff2,jidxall;
960 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
962 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
964 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
966 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
967 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
968 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
969 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
970 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
971 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
972 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
973 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
974 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
975 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
976 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
977 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
978 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
979 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
980 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
981 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
982 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
983 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
984 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
985 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
988 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
991 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
992 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
993 __m128 dummy_mask,cutoff_mask;
994 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
995 __m128 one = _mm_set1_ps(1.0);
996 __m128 two = _mm_set1_ps(2.0);
1002 jindex = nlist->jindex;
1004 shiftidx = nlist->shift;
1006 shiftvec = fr->shift_vec[0];
1007 fshift = fr->fshift[0];
1008 facel = _mm_set1_ps(fr->epsfac);
1009 charge = mdatoms->chargeA;
1010 nvdwtype = fr->ntype;
1011 vdwparam = fr->nbfp;
1012 vdwtype = mdatoms->typeA;
1014 /* Setup water-specific parameters */
1015 inr = nlist->iinr[0];
1016 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1017 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1018 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1019 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1021 jq1 = _mm_set1_ps(charge[inr+1]);
1022 jq2 = _mm_set1_ps(charge[inr+2]);
1023 jq3 = _mm_set1_ps(charge[inr+3]);
1024 vdwjidx0A = 2*vdwtype[inr+0];
1025 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1026 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1027 qq11 = _mm_mul_ps(iq1,jq1);
1028 qq12 = _mm_mul_ps(iq1,jq2);
1029 qq13 = _mm_mul_ps(iq1,jq3);
1030 qq21 = _mm_mul_ps(iq2,jq1);
1031 qq22 = _mm_mul_ps(iq2,jq2);
1032 qq23 = _mm_mul_ps(iq2,jq3);
1033 qq31 = _mm_mul_ps(iq3,jq1);
1034 qq32 = _mm_mul_ps(iq3,jq2);
1035 qq33 = _mm_mul_ps(iq3,jq3);
1037 /* Avoid stupid compiler warnings */
1038 jnrA = jnrB = jnrC = jnrD = 0;
1039 j_coord_offsetA = 0;
1040 j_coord_offsetB = 0;
1041 j_coord_offsetC = 0;
1042 j_coord_offsetD = 0;
1047 for(iidx=0;iidx<4*DIM;iidx++)
1049 scratch[iidx] = 0.0;
1052 /* Start outer loop over neighborlists */
1053 for(iidx=0; iidx<nri; iidx++)
1055 /* Load shift vector for this list */
1056 i_shift_offset = DIM*shiftidx[iidx];
1058 /* Load limits for loop over neighbors */
1059 j_index_start = jindex[iidx];
1060 j_index_end = jindex[iidx+1];
1062 /* Get outer coordinate index */
1064 i_coord_offset = DIM*inr;
1066 /* Load i particle coords and add shift vector */
1067 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1068 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1070 fix0 = _mm_setzero_ps();
1071 fiy0 = _mm_setzero_ps();
1072 fiz0 = _mm_setzero_ps();
1073 fix1 = _mm_setzero_ps();
1074 fiy1 = _mm_setzero_ps();
1075 fiz1 = _mm_setzero_ps();
1076 fix2 = _mm_setzero_ps();
1077 fiy2 = _mm_setzero_ps();
1078 fiz2 = _mm_setzero_ps();
1079 fix3 = _mm_setzero_ps();
1080 fiy3 = _mm_setzero_ps();
1081 fiz3 = _mm_setzero_ps();
1083 /* Start inner kernel loop */
1084 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1087 /* Get j neighbor index, and coordinate index */
1089 jnrB = jjnr[jidx+1];
1090 jnrC = jjnr[jidx+2];
1091 jnrD = jjnr[jidx+3];
1092 j_coord_offsetA = DIM*jnrA;
1093 j_coord_offsetB = DIM*jnrB;
1094 j_coord_offsetC = DIM*jnrC;
1095 j_coord_offsetD = DIM*jnrD;
1097 /* load j atom coordinates */
1098 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1099 x+j_coord_offsetC,x+j_coord_offsetD,
1100 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1101 &jy2,&jz2,&jx3,&jy3,&jz3);
1103 /* Calculate displacement vector */
1104 dx00 = _mm_sub_ps(ix0,jx0);
1105 dy00 = _mm_sub_ps(iy0,jy0);
1106 dz00 = _mm_sub_ps(iz0,jz0);
1107 dx11 = _mm_sub_ps(ix1,jx1);
1108 dy11 = _mm_sub_ps(iy1,jy1);
1109 dz11 = _mm_sub_ps(iz1,jz1);
1110 dx12 = _mm_sub_ps(ix1,jx2);
1111 dy12 = _mm_sub_ps(iy1,jy2);
1112 dz12 = _mm_sub_ps(iz1,jz2);
1113 dx13 = _mm_sub_ps(ix1,jx3);
1114 dy13 = _mm_sub_ps(iy1,jy3);
1115 dz13 = _mm_sub_ps(iz1,jz3);
1116 dx21 = _mm_sub_ps(ix2,jx1);
1117 dy21 = _mm_sub_ps(iy2,jy1);
1118 dz21 = _mm_sub_ps(iz2,jz1);
1119 dx22 = _mm_sub_ps(ix2,jx2);
1120 dy22 = _mm_sub_ps(iy2,jy2);
1121 dz22 = _mm_sub_ps(iz2,jz2);
1122 dx23 = _mm_sub_ps(ix2,jx3);
1123 dy23 = _mm_sub_ps(iy2,jy3);
1124 dz23 = _mm_sub_ps(iz2,jz3);
1125 dx31 = _mm_sub_ps(ix3,jx1);
1126 dy31 = _mm_sub_ps(iy3,jy1);
1127 dz31 = _mm_sub_ps(iz3,jz1);
1128 dx32 = _mm_sub_ps(ix3,jx2);
1129 dy32 = _mm_sub_ps(iy3,jy2);
1130 dz32 = _mm_sub_ps(iz3,jz2);
1131 dx33 = _mm_sub_ps(ix3,jx3);
1132 dy33 = _mm_sub_ps(iy3,jy3);
1133 dz33 = _mm_sub_ps(iz3,jz3);
1135 /* Calculate squared distance and things based on it */
1136 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1137 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1138 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1139 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1140 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1141 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1142 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1143 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1144 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1145 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1147 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1148 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1149 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1150 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1151 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1152 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1153 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1154 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1155 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1157 rinvsq00 = gmx_mm_inv_ps(rsq00);
1158 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1159 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1160 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1161 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1162 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1163 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1164 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1165 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1166 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1168 fjx0 = _mm_setzero_ps();
1169 fjy0 = _mm_setzero_ps();
1170 fjz0 = _mm_setzero_ps();
1171 fjx1 = _mm_setzero_ps();
1172 fjy1 = _mm_setzero_ps();
1173 fjz1 = _mm_setzero_ps();
1174 fjx2 = _mm_setzero_ps();
1175 fjy2 = _mm_setzero_ps();
1176 fjz2 = _mm_setzero_ps();
1177 fjx3 = _mm_setzero_ps();
1178 fjy3 = _mm_setzero_ps();
1179 fjz3 = _mm_setzero_ps();
1181 /**************************
1182 * CALCULATE INTERACTIONS *
1183 **************************/
1185 /* LENNARD-JONES DISPERSION/REPULSION */
1187 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1188 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1192 /* Update vectorial force */
1193 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1194 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1195 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1197 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1198 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1199 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1201 /**************************
1202 * CALCULATE INTERACTIONS *
1203 **************************/
1205 /* COULOMB ELECTROSTATICS */
1206 velec = _mm_mul_ps(qq11,rinv11);
1207 felec = _mm_mul_ps(velec,rinvsq11);
1211 /* Update vectorial force */
1212 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1213 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1214 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1216 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1217 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1218 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1220 /**************************
1221 * CALCULATE INTERACTIONS *
1222 **************************/
1224 /* COULOMB ELECTROSTATICS */
1225 velec = _mm_mul_ps(qq12,rinv12);
1226 felec = _mm_mul_ps(velec,rinvsq12);
1230 /* Update vectorial force */
1231 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1232 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1233 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1235 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1236 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1237 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1239 /**************************
1240 * CALCULATE INTERACTIONS *
1241 **************************/
1243 /* COULOMB ELECTROSTATICS */
1244 velec = _mm_mul_ps(qq13,rinv13);
1245 felec = _mm_mul_ps(velec,rinvsq13);
1249 /* Update vectorial force */
1250 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1251 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1252 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1254 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1255 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1256 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1258 /**************************
1259 * CALCULATE INTERACTIONS *
1260 **************************/
1262 /* COULOMB ELECTROSTATICS */
1263 velec = _mm_mul_ps(qq21,rinv21);
1264 felec = _mm_mul_ps(velec,rinvsq21);
1268 /* Update vectorial force */
1269 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1270 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1271 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1273 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1274 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1275 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1277 /**************************
1278 * CALCULATE INTERACTIONS *
1279 **************************/
1281 /* COULOMB ELECTROSTATICS */
1282 velec = _mm_mul_ps(qq22,rinv22);
1283 felec = _mm_mul_ps(velec,rinvsq22);
1287 /* Update vectorial force */
1288 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1289 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1290 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1292 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1293 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1294 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1296 /**************************
1297 * CALCULATE INTERACTIONS *
1298 **************************/
1300 /* COULOMB ELECTROSTATICS */
1301 velec = _mm_mul_ps(qq23,rinv23);
1302 felec = _mm_mul_ps(velec,rinvsq23);
1306 /* Update vectorial force */
1307 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1308 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1309 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1311 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1312 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1313 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1315 /**************************
1316 * CALCULATE INTERACTIONS *
1317 **************************/
1319 /* COULOMB ELECTROSTATICS */
1320 velec = _mm_mul_ps(qq31,rinv31);
1321 felec = _mm_mul_ps(velec,rinvsq31);
1325 /* Update vectorial force */
1326 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1327 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1328 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1330 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1331 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1332 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1334 /**************************
1335 * CALCULATE INTERACTIONS *
1336 **************************/
1338 /* COULOMB ELECTROSTATICS */
1339 velec = _mm_mul_ps(qq32,rinv32);
1340 felec = _mm_mul_ps(velec,rinvsq32);
1344 /* Update vectorial force */
1345 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1346 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1347 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1349 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1350 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1351 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1353 /**************************
1354 * CALCULATE INTERACTIONS *
1355 **************************/
1357 /* COULOMB ELECTROSTATICS */
1358 velec = _mm_mul_ps(qq33,rinv33);
1359 felec = _mm_mul_ps(velec,rinvsq33);
1363 /* Update vectorial force */
1364 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1365 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1366 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1368 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1369 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1370 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1372 fjptrA = f+j_coord_offsetA;
1373 fjptrB = f+j_coord_offsetB;
1374 fjptrC = f+j_coord_offsetC;
1375 fjptrD = f+j_coord_offsetD;
1377 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1378 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1379 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1381 /* Inner loop uses 303 flops */
1384 if(jidx<j_index_end)
1387 /* Get j neighbor index, and coordinate index */
1388 jnrlistA = jjnr[jidx];
1389 jnrlistB = jjnr[jidx+1];
1390 jnrlistC = jjnr[jidx+2];
1391 jnrlistD = jjnr[jidx+3];
1392 /* Sign of each element will be negative for non-real atoms.
1393 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1394 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1396 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1397 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1398 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1399 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1400 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1401 j_coord_offsetA = DIM*jnrA;
1402 j_coord_offsetB = DIM*jnrB;
1403 j_coord_offsetC = DIM*jnrC;
1404 j_coord_offsetD = DIM*jnrD;
1406 /* load j atom coordinates */
1407 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1408 x+j_coord_offsetC,x+j_coord_offsetD,
1409 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1410 &jy2,&jz2,&jx3,&jy3,&jz3);
1412 /* Calculate displacement vector */
1413 dx00 = _mm_sub_ps(ix0,jx0);
1414 dy00 = _mm_sub_ps(iy0,jy0);
1415 dz00 = _mm_sub_ps(iz0,jz0);
1416 dx11 = _mm_sub_ps(ix1,jx1);
1417 dy11 = _mm_sub_ps(iy1,jy1);
1418 dz11 = _mm_sub_ps(iz1,jz1);
1419 dx12 = _mm_sub_ps(ix1,jx2);
1420 dy12 = _mm_sub_ps(iy1,jy2);
1421 dz12 = _mm_sub_ps(iz1,jz2);
1422 dx13 = _mm_sub_ps(ix1,jx3);
1423 dy13 = _mm_sub_ps(iy1,jy3);
1424 dz13 = _mm_sub_ps(iz1,jz3);
1425 dx21 = _mm_sub_ps(ix2,jx1);
1426 dy21 = _mm_sub_ps(iy2,jy1);
1427 dz21 = _mm_sub_ps(iz2,jz1);
1428 dx22 = _mm_sub_ps(ix2,jx2);
1429 dy22 = _mm_sub_ps(iy2,jy2);
1430 dz22 = _mm_sub_ps(iz2,jz2);
1431 dx23 = _mm_sub_ps(ix2,jx3);
1432 dy23 = _mm_sub_ps(iy2,jy3);
1433 dz23 = _mm_sub_ps(iz2,jz3);
1434 dx31 = _mm_sub_ps(ix3,jx1);
1435 dy31 = _mm_sub_ps(iy3,jy1);
1436 dz31 = _mm_sub_ps(iz3,jz1);
1437 dx32 = _mm_sub_ps(ix3,jx2);
1438 dy32 = _mm_sub_ps(iy3,jy2);
1439 dz32 = _mm_sub_ps(iz3,jz2);
1440 dx33 = _mm_sub_ps(ix3,jx3);
1441 dy33 = _mm_sub_ps(iy3,jy3);
1442 dz33 = _mm_sub_ps(iz3,jz3);
1444 /* Calculate squared distance and things based on it */
1445 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1446 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1447 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1448 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1449 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1450 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1451 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1452 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1453 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1454 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1456 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1457 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1458 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1459 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1460 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1461 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1462 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1463 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1464 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1466 rinvsq00 = gmx_mm_inv_ps(rsq00);
1467 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1468 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1469 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1470 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1471 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1472 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1473 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1474 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1475 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1477 fjx0 = _mm_setzero_ps();
1478 fjy0 = _mm_setzero_ps();
1479 fjz0 = _mm_setzero_ps();
1480 fjx1 = _mm_setzero_ps();
1481 fjy1 = _mm_setzero_ps();
1482 fjz1 = _mm_setzero_ps();
1483 fjx2 = _mm_setzero_ps();
1484 fjy2 = _mm_setzero_ps();
1485 fjz2 = _mm_setzero_ps();
1486 fjx3 = _mm_setzero_ps();
1487 fjy3 = _mm_setzero_ps();
1488 fjz3 = _mm_setzero_ps();
1490 /**************************
1491 * CALCULATE INTERACTIONS *
1492 **************************/
1494 /* LENNARD-JONES DISPERSION/REPULSION */
1496 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1497 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1501 fscal = _mm_andnot_ps(dummy_mask,fscal);
1503 /* Update vectorial force */
1504 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1505 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1506 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1508 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1509 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1510 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1512 /**************************
1513 * CALCULATE INTERACTIONS *
1514 **************************/
1516 /* COULOMB ELECTROSTATICS */
1517 velec = _mm_mul_ps(qq11,rinv11);
1518 felec = _mm_mul_ps(velec,rinvsq11);
1522 fscal = _mm_andnot_ps(dummy_mask,fscal);
1524 /* Update vectorial force */
1525 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1526 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1527 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1529 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1530 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1531 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1533 /**************************
1534 * CALCULATE INTERACTIONS *
1535 **************************/
1537 /* COULOMB ELECTROSTATICS */
1538 velec = _mm_mul_ps(qq12,rinv12);
1539 felec = _mm_mul_ps(velec,rinvsq12);
1543 fscal = _mm_andnot_ps(dummy_mask,fscal);
1545 /* Update vectorial force */
1546 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1547 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1548 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1550 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1551 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1552 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1554 /**************************
1555 * CALCULATE INTERACTIONS *
1556 **************************/
1558 /* COULOMB ELECTROSTATICS */
1559 velec = _mm_mul_ps(qq13,rinv13);
1560 felec = _mm_mul_ps(velec,rinvsq13);
1564 fscal = _mm_andnot_ps(dummy_mask,fscal);
1566 /* Update vectorial force */
1567 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1568 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1569 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1571 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1572 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1573 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1575 /**************************
1576 * CALCULATE INTERACTIONS *
1577 **************************/
1579 /* COULOMB ELECTROSTATICS */
1580 velec = _mm_mul_ps(qq21,rinv21);
1581 felec = _mm_mul_ps(velec,rinvsq21);
1585 fscal = _mm_andnot_ps(dummy_mask,fscal);
1587 /* Update vectorial force */
1588 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1589 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1590 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1592 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1593 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1594 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1596 /**************************
1597 * CALCULATE INTERACTIONS *
1598 **************************/
1600 /* COULOMB ELECTROSTATICS */
1601 velec = _mm_mul_ps(qq22,rinv22);
1602 felec = _mm_mul_ps(velec,rinvsq22);
1606 fscal = _mm_andnot_ps(dummy_mask,fscal);
1608 /* Update vectorial force */
1609 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1610 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1611 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1613 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1614 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1615 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1617 /**************************
1618 * CALCULATE INTERACTIONS *
1619 **************************/
1621 /* COULOMB ELECTROSTATICS */
1622 velec = _mm_mul_ps(qq23,rinv23);
1623 felec = _mm_mul_ps(velec,rinvsq23);
1627 fscal = _mm_andnot_ps(dummy_mask,fscal);
1629 /* Update vectorial force */
1630 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1631 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1632 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1634 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1635 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1636 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1638 /**************************
1639 * CALCULATE INTERACTIONS *
1640 **************************/
1642 /* COULOMB ELECTROSTATICS */
1643 velec = _mm_mul_ps(qq31,rinv31);
1644 felec = _mm_mul_ps(velec,rinvsq31);
1648 fscal = _mm_andnot_ps(dummy_mask,fscal);
1650 /* Update vectorial force */
1651 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1652 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1653 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1655 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1656 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1657 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1659 /**************************
1660 * CALCULATE INTERACTIONS *
1661 **************************/
1663 /* COULOMB ELECTROSTATICS */
1664 velec = _mm_mul_ps(qq32,rinv32);
1665 felec = _mm_mul_ps(velec,rinvsq32);
1669 fscal = _mm_andnot_ps(dummy_mask,fscal);
1671 /* Update vectorial force */
1672 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1673 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1674 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1676 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1677 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1678 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1680 /**************************
1681 * CALCULATE INTERACTIONS *
1682 **************************/
1684 /* COULOMB ELECTROSTATICS */
1685 velec = _mm_mul_ps(qq33,rinv33);
1686 felec = _mm_mul_ps(velec,rinvsq33);
1690 fscal = _mm_andnot_ps(dummy_mask,fscal);
1692 /* Update vectorial force */
1693 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1694 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1695 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1697 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1698 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1699 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1701 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1702 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1703 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1704 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1706 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1707 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1708 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1710 /* Inner loop uses 303 flops */
1713 /* End of innermost loop */
1715 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1716 f+i_coord_offset,fshift+i_shift_offset);
1718 /* Increment number of inner iterations */
1719 inneriter += j_index_end - j_index_start;
1721 /* Outer loop uses 24 flops */
1724 /* Increment number of outer iterations */
1727 /* Update outer/inner flops */
1729 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);