2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
51 * GROMACS nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_128_fma_single
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_128_fma_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
117 __m128 dummy_mask,cutoff_mask;
118 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
119 __m128 one = _mm_set1_ps(1.0);
120 __m128 two = _mm_set1_ps(2.0);
126 jindex = nlist->jindex;
128 shiftidx = nlist->shift;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm_set1_ps(fr->epsfac);
133 charge = mdatoms->chargeA;
134 krf = _mm_set1_ps(fr->ic->k_rf);
135 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
136 crf = _mm_set1_ps(fr->ic->c_rf);
137 nvdwtype = fr->ntype;
139 vdwtype = mdatoms->typeA;
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
144 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
145 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
146 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
148 jq1 = _mm_set1_ps(charge[inr+1]);
149 jq2 = _mm_set1_ps(charge[inr+2]);
150 jq3 = _mm_set1_ps(charge[inr+3]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq11 = _mm_mul_ps(iq1,jq1);
155 qq12 = _mm_mul_ps(iq1,jq2);
156 qq13 = _mm_mul_ps(iq1,jq3);
157 qq21 = _mm_mul_ps(iq2,jq1);
158 qq22 = _mm_mul_ps(iq2,jq2);
159 qq23 = _mm_mul_ps(iq2,jq3);
160 qq31 = _mm_mul_ps(iq3,jq1);
161 qq32 = _mm_mul_ps(iq3,jq2);
162 qq33 = _mm_mul_ps(iq3,jq3);
164 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
165 rcutoff_scalar = fr->rcoulomb;
166 rcutoff = _mm_set1_ps(rcutoff_scalar);
167 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
169 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
170 rvdw = _mm_set1_ps(fr->rvdw);
172 /* Avoid stupid compiler warnings */
173 jnrA = jnrB = jnrC = jnrD = 0;
182 for(iidx=0;iidx<4*DIM;iidx++)
187 /* Start outer loop over neighborlists */
188 for(iidx=0; iidx<nri; iidx++)
190 /* Load shift vector for this list */
191 i_shift_offset = DIM*shiftidx[iidx];
193 /* Load limits for loop over neighbors */
194 j_index_start = jindex[iidx];
195 j_index_end = jindex[iidx+1];
197 /* Get outer coordinate index */
199 i_coord_offset = DIM*inr;
201 /* Load i particle coords and add shift vector */
202 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
203 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
205 fix0 = _mm_setzero_ps();
206 fiy0 = _mm_setzero_ps();
207 fiz0 = _mm_setzero_ps();
208 fix1 = _mm_setzero_ps();
209 fiy1 = _mm_setzero_ps();
210 fiz1 = _mm_setzero_ps();
211 fix2 = _mm_setzero_ps();
212 fiy2 = _mm_setzero_ps();
213 fiz2 = _mm_setzero_ps();
214 fix3 = _mm_setzero_ps();
215 fiy3 = _mm_setzero_ps();
216 fiz3 = _mm_setzero_ps();
218 /* Reset potential sums */
219 velecsum = _mm_setzero_ps();
220 vvdwsum = _mm_setzero_ps();
222 /* Start inner kernel loop */
223 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
226 /* Get j neighbor index, and coordinate index */
231 j_coord_offsetA = DIM*jnrA;
232 j_coord_offsetB = DIM*jnrB;
233 j_coord_offsetC = DIM*jnrC;
234 j_coord_offsetD = DIM*jnrD;
236 /* load j atom coordinates */
237 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
238 x+j_coord_offsetC,x+j_coord_offsetD,
239 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
240 &jy2,&jz2,&jx3,&jy3,&jz3);
242 /* Calculate displacement vector */
243 dx00 = _mm_sub_ps(ix0,jx0);
244 dy00 = _mm_sub_ps(iy0,jy0);
245 dz00 = _mm_sub_ps(iz0,jz0);
246 dx11 = _mm_sub_ps(ix1,jx1);
247 dy11 = _mm_sub_ps(iy1,jy1);
248 dz11 = _mm_sub_ps(iz1,jz1);
249 dx12 = _mm_sub_ps(ix1,jx2);
250 dy12 = _mm_sub_ps(iy1,jy2);
251 dz12 = _mm_sub_ps(iz1,jz2);
252 dx13 = _mm_sub_ps(ix1,jx3);
253 dy13 = _mm_sub_ps(iy1,jy3);
254 dz13 = _mm_sub_ps(iz1,jz3);
255 dx21 = _mm_sub_ps(ix2,jx1);
256 dy21 = _mm_sub_ps(iy2,jy1);
257 dz21 = _mm_sub_ps(iz2,jz1);
258 dx22 = _mm_sub_ps(ix2,jx2);
259 dy22 = _mm_sub_ps(iy2,jy2);
260 dz22 = _mm_sub_ps(iz2,jz2);
261 dx23 = _mm_sub_ps(ix2,jx3);
262 dy23 = _mm_sub_ps(iy2,jy3);
263 dz23 = _mm_sub_ps(iz2,jz3);
264 dx31 = _mm_sub_ps(ix3,jx1);
265 dy31 = _mm_sub_ps(iy3,jy1);
266 dz31 = _mm_sub_ps(iz3,jz1);
267 dx32 = _mm_sub_ps(ix3,jx2);
268 dy32 = _mm_sub_ps(iy3,jy2);
269 dz32 = _mm_sub_ps(iz3,jz2);
270 dx33 = _mm_sub_ps(ix3,jx3);
271 dy33 = _mm_sub_ps(iy3,jy3);
272 dz33 = _mm_sub_ps(iz3,jz3);
274 /* Calculate squared distance and things based on it */
275 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
276 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
277 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
278 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
279 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
280 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
281 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
282 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
283 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
284 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
286 rinv11 = gmx_mm_invsqrt_ps(rsq11);
287 rinv12 = gmx_mm_invsqrt_ps(rsq12);
288 rinv13 = gmx_mm_invsqrt_ps(rsq13);
289 rinv21 = gmx_mm_invsqrt_ps(rsq21);
290 rinv22 = gmx_mm_invsqrt_ps(rsq22);
291 rinv23 = gmx_mm_invsqrt_ps(rsq23);
292 rinv31 = gmx_mm_invsqrt_ps(rsq31);
293 rinv32 = gmx_mm_invsqrt_ps(rsq32);
294 rinv33 = gmx_mm_invsqrt_ps(rsq33);
296 rinvsq00 = gmx_mm_inv_ps(rsq00);
297 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
298 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
299 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
300 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
301 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
302 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
303 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
304 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
305 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
307 fjx0 = _mm_setzero_ps();
308 fjy0 = _mm_setzero_ps();
309 fjz0 = _mm_setzero_ps();
310 fjx1 = _mm_setzero_ps();
311 fjy1 = _mm_setzero_ps();
312 fjz1 = _mm_setzero_ps();
313 fjx2 = _mm_setzero_ps();
314 fjy2 = _mm_setzero_ps();
315 fjz2 = _mm_setzero_ps();
316 fjx3 = _mm_setzero_ps();
317 fjy3 = _mm_setzero_ps();
318 fjz3 = _mm_setzero_ps();
320 /**************************
321 * CALCULATE INTERACTIONS *
322 **************************/
324 if (gmx_mm_any_lt(rsq00,rcutoff2))
327 /* LENNARD-JONES DISPERSION/REPULSION */
329 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
330 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
331 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
332 vvdw = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
333 _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
334 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
336 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
338 /* Update potential sum for this i atom from the interaction with this j atom. */
339 vvdw = _mm_and_ps(vvdw,cutoff_mask);
340 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
344 fscal = _mm_and_ps(fscal,cutoff_mask);
346 /* Update vectorial force */
347 fix0 = _mm_macc_ps(dx00,fscal,fix0);
348 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
349 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
351 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
352 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
353 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
357 /**************************
358 * CALCULATE INTERACTIONS *
359 **************************/
361 if (gmx_mm_any_lt(rsq11,rcutoff2))
364 /* REACTION-FIELD ELECTROSTATICS */
365 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_macc_ps(krf,rsq11,rinv11),crf));
366 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
368 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
370 /* Update potential sum for this i atom from the interaction with this j atom. */
371 velec = _mm_and_ps(velec,cutoff_mask);
372 velecsum = _mm_add_ps(velecsum,velec);
376 fscal = _mm_and_ps(fscal,cutoff_mask);
378 /* Update vectorial force */
379 fix1 = _mm_macc_ps(dx11,fscal,fix1);
380 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
381 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
383 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
384 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
385 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
389 /**************************
390 * CALCULATE INTERACTIONS *
391 **************************/
393 if (gmx_mm_any_lt(rsq12,rcutoff2))
396 /* REACTION-FIELD ELECTROSTATICS */
397 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_macc_ps(krf,rsq12,rinv12),crf));
398 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
400 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
402 /* Update potential sum for this i atom from the interaction with this j atom. */
403 velec = _mm_and_ps(velec,cutoff_mask);
404 velecsum = _mm_add_ps(velecsum,velec);
408 fscal = _mm_and_ps(fscal,cutoff_mask);
410 /* Update vectorial force */
411 fix1 = _mm_macc_ps(dx12,fscal,fix1);
412 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
413 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
415 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
416 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
417 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 if (gmx_mm_any_lt(rsq13,rcutoff2))
428 /* REACTION-FIELD ELECTROSTATICS */
429 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_macc_ps(krf,rsq13,rinv13),crf));
430 felec = _mm_mul_ps(qq13,_mm_msub_ps(rinv13,rinvsq13,krf2));
432 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
434 /* Update potential sum for this i atom from the interaction with this j atom. */
435 velec = _mm_and_ps(velec,cutoff_mask);
436 velecsum = _mm_add_ps(velecsum,velec);
440 fscal = _mm_and_ps(fscal,cutoff_mask);
442 /* Update vectorial force */
443 fix1 = _mm_macc_ps(dx13,fscal,fix1);
444 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
445 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
447 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
448 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
449 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
453 /**************************
454 * CALCULATE INTERACTIONS *
455 **************************/
457 if (gmx_mm_any_lt(rsq21,rcutoff2))
460 /* REACTION-FIELD ELECTROSTATICS */
461 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_macc_ps(krf,rsq21,rinv21),crf));
462 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
464 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
466 /* Update potential sum for this i atom from the interaction with this j atom. */
467 velec = _mm_and_ps(velec,cutoff_mask);
468 velecsum = _mm_add_ps(velecsum,velec);
472 fscal = _mm_and_ps(fscal,cutoff_mask);
474 /* Update vectorial force */
475 fix2 = _mm_macc_ps(dx21,fscal,fix2);
476 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
477 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
479 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
480 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
481 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
485 /**************************
486 * CALCULATE INTERACTIONS *
487 **************************/
489 if (gmx_mm_any_lt(rsq22,rcutoff2))
492 /* REACTION-FIELD ELECTROSTATICS */
493 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_macc_ps(krf,rsq22,rinv22),crf));
494 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
496 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
498 /* Update potential sum for this i atom from the interaction with this j atom. */
499 velec = _mm_and_ps(velec,cutoff_mask);
500 velecsum = _mm_add_ps(velecsum,velec);
504 fscal = _mm_and_ps(fscal,cutoff_mask);
506 /* Update vectorial force */
507 fix2 = _mm_macc_ps(dx22,fscal,fix2);
508 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
509 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
511 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
512 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
513 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
517 /**************************
518 * CALCULATE INTERACTIONS *
519 **************************/
521 if (gmx_mm_any_lt(rsq23,rcutoff2))
524 /* REACTION-FIELD ELECTROSTATICS */
525 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_macc_ps(krf,rsq23,rinv23),crf));
526 felec = _mm_mul_ps(qq23,_mm_msub_ps(rinv23,rinvsq23,krf2));
528 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
530 /* Update potential sum for this i atom from the interaction with this j atom. */
531 velec = _mm_and_ps(velec,cutoff_mask);
532 velecsum = _mm_add_ps(velecsum,velec);
536 fscal = _mm_and_ps(fscal,cutoff_mask);
538 /* Update vectorial force */
539 fix2 = _mm_macc_ps(dx23,fscal,fix2);
540 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
541 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
543 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
544 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
545 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
549 /**************************
550 * CALCULATE INTERACTIONS *
551 **************************/
553 if (gmx_mm_any_lt(rsq31,rcutoff2))
556 /* REACTION-FIELD ELECTROSTATICS */
557 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_macc_ps(krf,rsq31,rinv31),crf));
558 felec = _mm_mul_ps(qq31,_mm_msub_ps(rinv31,rinvsq31,krf2));
560 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
562 /* Update potential sum for this i atom from the interaction with this j atom. */
563 velec = _mm_and_ps(velec,cutoff_mask);
564 velecsum = _mm_add_ps(velecsum,velec);
568 fscal = _mm_and_ps(fscal,cutoff_mask);
570 /* Update vectorial force */
571 fix3 = _mm_macc_ps(dx31,fscal,fix3);
572 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
573 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
575 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
576 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
577 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
581 /**************************
582 * CALCULATE INTERACTIONS *
583 **************************/
585 if (gmx_mm_any_lt(rsq32,rcutoff2))
588 /* REACTION-FIELD ELECTROSTATICS */
589 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_macc_ps(krf,rsq32,rinv32),crf));
590 felec = _mm_mul_ps(qq32,_mm_msub_ps(rinv32,rinvsq32,krf2));
592 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
594 /* Update potential sum for this i atom from the interaction with this j atom. */
595 velec = _mm_and_ps(velec,cutoff_mask);
596 velecsum = _mm_add_ps(velecsum,velec);
600 fscal = _mm_and_ps(fscal,cutoff_mask);
602 /* Update vectorial force */
603 fix3 = _mm_macc_ps(dx32,fscal,fix3);
604 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
605 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
607 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
608 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
609 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
613 /**************************
614 * CALCULATE INTERACTIONS *
615 **************************/
617 if (gmx_mm_any_lt(rsq33,rcutoff2))
620 /* REACTION-FIELD ELECTROSTATICS */
621 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_macc_ps(krf,rsq33,rinv33),crf));
622 felec = _mm_mul_ps(qq33,_mm_msub_ps(rinv33,rinvsq33,krf2));
624 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 velec = _mm_and_ps(velec,cutoff_mask);
628 velecsum = _mm_add_ps(velecsum,velec);
632 fscal = _mm_and_ps(fscal,cutoff_mask);
634 /* Update vectorial force */
635 fix3 = _mm_macc_ps(dx33,fscal,fix3);
636 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
637 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
639 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
640 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
641 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
645 fjptrA = f+j_coord_offsetA;
646 fjptrB = f+j_coord_offsetB;
647 fjptrC = f+j_coord_offsetC;
648 fjptrD = f+j_coord_offsetD;
650 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
651 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
652 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
654 /* Inner loop uses 398 flops */
660 /* Get j neighbor index, and coordinate index */
661 jnrlistA = jjnr[jidx];
662 jnrlistB = jjnr[jidx+1];
663 jnrlistC = jjnr[jidx+2];
664 jnrlistD = jjnr[jidx+3];
665 /* Sign of each element will be negative for non-real atoms.
666 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
667 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
669 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
670 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
671 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
672 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
673 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
674 j_coord_offsetA = DIM*jnrA;
675 j_coord_offsetB = DIM*jnrB;
676 j_coord_offsetC = DIM*jnrC;
677 j_coord_offsetD = DIM*jnrD;
679 /* load j atom coordinates */
680 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
681 x+j_coord_offsetC,x+j_coord_offsetD,
682 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
683 &jy2,&jz2,&jx3,&jy3,&jz3);
685 /* Calculate displacement vector */
686 dx00 = _mm_sub_ps(ix0,jx0);
687 dy00 = _mm_sub_ps(iy0,jy0);
688 dz00 = _mm_sub_ps(iz0,jz0);
689 dx11 = _mm_sub_ps(ix1,jx1);
690 dy11 = _mm_sub_ps(iy1,jy1);
691 dz11 = _mm_sub_ps(iz1,jz1);
692 dx12 = _mm_sub_ps(ix1,jx2);
693 dy12 = _mm_sub_ps(iy1,jy2);
694 dz12 = _mm_sub_ps(iz1,jz2);
695 dx13 = _mm_sub_ps(ix1,jx3);
696 dy13 = _mm_sub_ps(iy1,jy3);
697 dz13 = _mm_sub_ps(iz1,jz3);
698 dx21 = _mm_sub_ps(ix2,jx1);
699 dy21 = _mm_sub_ps(iy2,jy1);
700 dz21 = _mm_sub_ps(iz2,jz1);
701 dx22 = _mm_sub_ps(ix2,jx2);
702 dy22 = _mm_sub_ps(iy2,jy2);
703 dz22 = _mm_sub_ps(iz2,jz2);
704 dx23 = _mm_sub_ps(ix2,jx3);
705 dy23 = _mm_sub_ps(iy2,jy3);
706 dz23 = _mm_sub_ps(iz2,jz3);
707 dx31 = _mm_sub_ps(ix3,jx1);
708 dy31 = _mm_sub_ps(iy3,jy1);
709 dz31 = _mm_sub_ps(iz3,jz1);
710 dx32 = _mm_sub_ps(ix3,jx2);
711 dy32 = _mm_sub_ps(iy3,jy2);
712 dz32 = _mm_sub_ps(iz3,jz2);
713 dx33 = _mm_sub_ps(ix3,jx3);
714 dy33 = _mm_sub_ps(iy3,jy3);
715 dz33 = _mm_sub_ps(iz3,jz3);
717 /* Calculate squared distance and things based on it */
718 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
719 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
720 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
721 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
722 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
723 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
724 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
725 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
726 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
727 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
729 rinv11 = gmx_mm_invsqrt_ps(rsq11);
730 rinv12 = gmx_mm_invsqrt_ps(rsq12);
731 rinv13 = gmx_mm_invsqrt_ps(rsq13);
732 rinv21 = gmx_mm_invsqrt_ps(rsq21);
733 rinv22 = gmx_mm_invsqrt_ps(rsq22);
734 rinv23 = gmx_mm_invsqrt_ps(rsq23);
735 rinv31 = gmx_mm_invsqrt_ps(rsq31);
736 rinv32 = gmx_mm_invsqrt_ps(rsq32);
737 rinv33 = gmx_mm_invsqrt_ps(rsq33);
739 rinvsq00 = gmx_mm_inv_ps(rsq00);
740 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
741 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
742 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
743 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
744 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
745 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
746 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
747 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
748 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
750 fjx0 = _mm_setzero_ps();
751 fjy0 = _mm_setzero_ps();
752 fjz0 = _mm_setzero_ps();
753 fjx1 = _mm_setzero_ps();
754 fjy1 = _mm_setzero_ps();
755 fjz1 = _mm_setzero_ps();
756 fjx2 = _mm_setzero_ps();
757 fjy2 = _mm_setzero_ps();
758 fjz2 = _mm_setzero_ps();
759 fjx3 = _mm_setzero_ps();
760 fjy3 = _mm_setzero_ps();
761 fjz3 = _mm_setzero_ps();
763 /**************************
764 * CALCULATE INTERACTIONS *
765 **************************/
767 if (gmx_mm_any_lt(rsq00,rcutoff2))
770 /* LENNARD-JONES DISPERSION/REPULSION */
772 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
773 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
774 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
775 vvdw = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
776 _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
777 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
779 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
781 /* Update potential sum for this i atom from the interaction with this j atom. */
782 vvdw = _mm_and_ps(vvdw,cutoff_mask);
783 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
784 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
788 fscal = _mm_and_ps(fscal,cutoff_mask);
790 fscal = _mm_andnot_ps(dummy_mask,fscal);
792 /* Update vectorial force */
793 fix0 = _mm_macc_ps(dx00,fscal,fix0);
794 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
795 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
797 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
798 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
799 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
803 /**************************
804 * CALCULATE INTERACTIONS *
805 **************************/
807 if (gmx_mm_any_lt(rsq11,rcutoff2))
810 /* REACTION-FIELD ELECTROSTATICS */
811 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_macc_ps(krf,rsq11,rinv11),crf));
812 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
814 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
816 /* Update potential sum for this i atom from the interaction with this j atom. */
817 velec = _mm_and_ps(velec,cutoff_mask);
818 velec = _mm_andnot_ps(dummy_mask,velec);
819 velecsum = _mm_add_ps(velecsum,velec);
823 fscal = _mm_and_ps(fscal,cutoff_mask);
825 fscal = _mm_andnot_ps(dummy_mask,fscal);
827 /* Update vectorial force */
828 fix1 = _mm_macc_ps(dx11,fscal,fix1);
829 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
830 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
832 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
833 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
834 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
838 /**************************
839 * CALCULATE INTERACTIONS *
840 **************************/
842 if (gmx_mm_any_lt(rsq12,rcutoff2))
845 /* REACTION-FIELD ELECTROSTATICS */
846 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_macc_ps(krf,rsq12,rinv12),crf));
847 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
849 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
851 /* Update potential sum for this i atom from the interaction with this j atom. */
852 velec = _mm_and_ps(velec,cutoff_mask);
853 velec = _mm_andnot_ps(dummy_mask,velec);
854 velecsum = _mm_add_ps(velecsum,velec);
858 fscal = _mm_and_ps(fscal,cutoff_mask);
860 fscal = _mm_andnot_ps(dummy_mask,fscal);
862 /* Update vectorial force */
863 fix1 = _mm_macc_ps(dx12,fscal,fix1);
864 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
865 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
867 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
868 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
869 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
873 /**************************
874 * CALCULATE INTERACTIONS *
875 **************************/
877 if (gmx_mm_any_lt(rsq13,rcutoff2))
880 /* REACTION-FIELD ELECTROSTATICS */
881 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_macc_ps(krf,rsq13,rinv13),crf));
882 felec = _mm_mul_ps(qq13,_mm_msub_ps(rinv13,rinvsq13,krf2));
884 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
886 /* Update potential sum for this i atom from the interaction with this j atom. */
887 velec = _mm_and_ps(velec,cutoff_mask);
888 velec = _mm_andnot_ps(dummy_mask,velec);
889 velecsum = _mm_add_ps(velecsum,velec);
893 fscal = _mm_and_ps(fscal,cutoff_mask);
895 fscal = _mm_andnot_ps(dummy_mask,fscal);
897 /* Update vectorial force */
898 fix1 = _mm_macc_ps(dx13,fscal,fix1);
899 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
900 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
902 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
903 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
904 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
908 /**************************
909 * CALCULATE INTERACTIONS *
910 **************************/
912 if (gmx_mm_any_lt(rsq21,rcutoff2))
915 /* REACTION-FIELD ELECTROSTATICS */
916 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_macc_ps(krf,rsq21,rinv21),crf));
917 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
919 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
921 /* Update potential sum for this i atom from the interaction with this j atom. */
922 velec = _mm_and_ps(velec,cutoff_mask);
923 velec = _mm_andnot_ps(dummy_mask,velec);
924 velecsum = _mm_add_ps(velecsum,velec);
928 fscal = _mm_and_ps(fscal,cutoff_mask);
930 fscal = _mm_andnot_ps(dummy_mask,fscal);
932 /* Update vectorial force */
933 fix2 = _mm_macc_ps(dx21,fscal,fix2);
934 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
935 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
937 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
938 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
939 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 if (gmx_mm_any_lt(rsq22,rcutoff2))
950 /* REACTION-FIELD ELECTROSTATICS */
951 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_macc_ps(krf,rsq22,rinv22),crf));
952 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
954 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
956 /* Update potential sum for this i atom from the interaction with this j atom. */
957 velec = _mm_and_ps(velec,cutoff_mask);
958 velec = _mm_andnot_ps(dummy_mask,velec);
959 velecsum = _mm_add_ps(velecsum,velec);
963 fscal = _mm_and_ps(fscal,cutoff_mask);
965 fscal = _mm_andnot_ps(dummy_mask,fscal);
967 /* Update vectorial force */
968 fix2 = _mm_macc_ps(dx22,fscal,fix2);
969 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
970 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
972 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
973 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
974 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
978 /**************************
979 * CALCULATE INTERACTIONS *
980 **************************/
982 if (gmx_mm_any_lt(rsq23,rcutoff2))
985 /* REACTION-FIELD ELECTROSTATICS */
986 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_macc_ps(krf,rsq23,rinv23),crf));
987 felec = _mm_mul_ps(qq23,_mm_msub_ps(rinv23,rinvsq23,krf2));
989 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
991 /* Update potential sum for this i atom from the interaction with this j atom. */
992 velec = _mm_and_ps(velec,cutoff_mask);
993 velec = _mm_andnot_ps(dummy_mask,velec);
994 velecsum = _mm_add_ps(velecsum,velec);
998 fscal = _mm_and_ps(fscal,cutoff_mask);
1000 fscal = _mm_andnot_ps(dummy_mask,fscal);
1002 /* Update vectorial force */
1003 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1004 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1005 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1007 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1008 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1009 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1013 /**************************
1014 * CALCULATE INTERACTIONS *
1015 **************************/
1017 if (gmx_mm_any_lt(rsq31,rcutoff2))
1020 /* REACTION-FIELD ELECTROSTATICS */
1021 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_macc_ps(krf,rsq31,rinv31),crf));
1022 felec = _mm_mul_ps(qq31,_mm_msub_ps(rinv31,rinvsq31,krf2));
1024 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1026 /* Update potential sum for this i atom from the interaction with this j atom. */
1027 velec = _mm_and_ps(velec,cutoff_mask);
1028 velec = _mm_andnot_ps(dummy_mask,velec);
1029 velecsum = _mm_add_ps(velecsum,velec);
1033 fscal = _mm_and_ps(fscal,cutoff_mask);
1035 fscal = _mm_andnot_ps(dummy_mask,fscal);
1037 /* Update vectorial force */
1038 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1039 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1040 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1042 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1043 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1044 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1048 /**************************
1049 * CALCULATE INTERACTIONS *
1050 **************************/
1052 if (gmx_mm_any_lt(rsq32,rcutoff2))
1055 /* REACTION-FIELD ELECTROSTATICS */
1056 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_macc_ps(krf,rsq32,rinv32),crf));
1057 felec = _mm_mul_ps(qq32,_mm_msub_ps(rinv32,rinvsq32,krf2));
1059 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1061 /* Update potential sum for this i atom from the interaction with this j atom. */
1062 velec = _mm_and_ps(velec,cutoff_mask);
1063 velec = _mm_andnot_ps(dummy_mask,velec);
1064 velecsum = _mm_add_ps(velecsum,velec);
1068 fscal = _mm_and_ps(fscal,cutoff_mask);
1070 fscal = _mm_andnot_ps(dummy_mask,fscal);
1072 /* Update vectorial force */
1073 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1074 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1075 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1077 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1078 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1079 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1083 /**************************
1084 * CALCULATE INTERACTIONS *
1085 **************************/
1087 if (gmx_mm_any_lt(rsq33,rcutoff2))
1090 /* REACTION-FIELD ELECTROSTATICS */
1091 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_macc_ps(krf,rsq33,rinv33),crf));
1092 felec = _mm_mul_ps(qq33,_mm_msub_ps(rinv33,rinvsq33,krf2));
1094 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1096 /* Update potential sum for this i atom from the interaction with this j atom. */
1097 velec = _mm_and_ps(velec,cutoff_mask);
1098 velec = _mm_andnot_ps(dummy_mask,velec);
1099 velecsum = _mm_add_ps(velecsum,velec);
1103 fscal = _mm_and_ps(fscal,cutoff_mask);
1105 fscal = _mm_andnot_ps(dummy_mask,fscal);
1107 /* Update vectorial force */
1108 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1109 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1110 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1112 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1113 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1114 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1118 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1119 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1120 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1121 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1123 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1124 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1125 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1127 /* Inner loop uses 398 flops */
1130 /* End of innermost loop */
1132 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1133 f+i_coord_offset,fshift+i_shift_offset);
1136 /* Update potential energies */
1137 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1138 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1140 /* Increment number of inner iterations */
1141 inneriter += j_index_end - j_index_start;
1143 /* Outer loop uses 26 flops */
1146 /* Increment number of outer iterations */
1149 /* Update outer/inner flops */
1151 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*398);
1154 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_128_fma_single
1155 * Electrostatics interaction: ReactionField
1156 * VdW interaction: LennardJones
1157 * Geometry: Water4-Water4
1158 * Calculate force/pot: Force
1161 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_128_fma_single
1162 (t_nblist * gmx_restrict nlist,
1163 rvec * gmx_restrict xx,
1164 rvec * gmx_restrict ff,
1165 t_forcerec * gmx_restrict fr,
1166 t_mdatoms * gmx_restrict mdatoms,
1167 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1168 t_nrnb * gmx_restrict nrnb)
1170 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1171 * just 0 for non-waters.
1172 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1173 * jnr indices corresponding to data put in the four positions in the SIMD register.
1175 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1176 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1177 int jnrA,jnrB,jnrC,jnrD;
1178 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1179 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1180 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1181 real rcutoff_scalar;
1182 real *shiftvec,*fshift,*x,*f;
1183 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1184 real scratch[4*DIM];
1185 __m128 fscal,rcutoff,rcutoff2,jidxall;
1187 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1189 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1191 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1193 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1194 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1195 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1196 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1197 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1198 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1199 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1200 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1201 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1202 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1203 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1204 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1205 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1206 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1207 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1208 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1209 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1210 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1211 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1212 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1215 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1218 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1219 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1220 __m128 dummy_mask,cutoff_mask;
1221 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1222 __m128 one = _mm_set1_ps(1.0);
1223 __m128 two = _mm_set1_ps(2.0);
1229 jindex = nlist->jindex;
1231 shiftidx = nlist->shift;
1233 shiftvec = fr->shift_vec[0];
1234 fshift = fr->fshift[0];
1235 facel = _mm_set1_ps(fr->epsfac);
1236 charge = mdatoms->chargeA;
1237 krf = _mm_set1_ps(fr->ic->k_rf);
1238 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1239 crf = _mm_set1_ps(fr->ic->c_rf);
1240 nvdwtype = fr->ntype;
1241 vdwparam = fr->nbfp;
1242 vdwtype = mdatoms->typeA;
1244 /* Setup water-specific parameters */
1245 inr = nlist->iinr[0];
1246 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1247 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1248 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1249 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1251 jq1 = _mm_set1_ps(charge[inr+1]);
1252 jq2 = _mm_set1_ps(charge[inr+2]);
1253 jq3 = _mm_set1_ps(charge[inr+3]);
1254 vdwjidx0A = 2*vdwtype[inr+0];
1255 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1256 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1257 qq11 = _mm_mul_ps(iq1,jq1);
1258 qq12 = _mm_mul_ps(iq1,jq2);
1259 qq13 = _mm_mul_ps(iq1,jq3);
1260 qq21 = _mm_mul_ps(iq2,jq1);
1261 qq22 = _mm_mul_ps(iq2,jq2);
1262 qq23 = _mm_mul_ps(iq2,jq3);
1263 qq31 = _mm_mul_ps(iq3,jq1);
1264 qq32 = _mm_mul_ps(iq3,jq2);
1265 qq33 = _mm_mul_ps(iq3,jq3);
1267 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1268 rcutoff_scalar = fr->rcoulomb;
1269 rcutoff = _mm_set1_ps(rcutoff_scalar);
1270 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1272 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1273 rvdw = _mm_set1_ps(fr->rvdw);
1275 /* Avoid stupid compiler warnings */
1276 jnrA = jnrB = jnrC = jnrD = 0;
1277 j_coord_offsetA = 0;
1278 j_coord_offsetB = 0;
1279 j_coord_offsetC = 0;
1280 j_coord_offsetD = 0;
1285 for(iidx=0;iidx<4*DIM;iidx++)
1287 scratch[iidx] = 0.0;
1290 /* Start outer loop over neighborlists */
1291 for(iidx=0; iidx<nri; iidx++)
1293 /* Load shift vector for this list */
1294 i_shift_offset = DIM*shiftidx[iidx];
1296 /* Load limits for loop over neighbors */
1297 j_index_start = jindex[iidx];
1298 j_index_end = jindex[iidx+1];
1300 /* Get outer coordinate index */
1302 i_coord_offset = DIM*inr;
1304 /* Load i particle coords and add shift vector */
1305 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1306 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1308 fix0 = _mm_setzero_ps();
1309 fiy0 = _mm_setzero_ps();
1310 fiz0 = _mm_setzero_ps();
1311 fix1 = _mm_setzero_ps();
1312 fiy1 = _mm_setzero_ps();
1313 fiz1 = _mm_setzero_ps();
1314 fix2 = _mm_setzero_ps();
1315 fiy2 = _mm_setzero_ps();
1316 fiz2 = _mm_setzero_ps();
1317 fix3 = _mm_setzero_ps();
1318 fiy3 = _mm_setzero_ps();
1319 fiz3 = _mm_setzero_ps();
1321 /* Start inner kernel loop */
1322 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1325 /* Get j neighbor index, and coordinate index */
1327 jnrB = jjnr[jidx+1];
1328 jnrC = jjnr[jidx+2];
1329 jnrD = jjnr[jidx+3];
1330 j_coord_offsetA = DIM*jnrA;
1331 j_coord_offsetB = DIM*jnrB;
1332 j_coord_offsetC = DIM*jnrC;
1333 j_coord_offsetD = DIM*jnrD;
1335 /* load j atom coordinates */
1336 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1337 x+j_coord_offsetC,x+j_coord_offsetD,
1338 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1339 &jy2,&jz2,&jx3,&jy3,&jz3);
1341 /* Calculate displacement vector */
1342 dx00 = _mm_sub_ps(ix0,jx0);
1343 dy00 = _mm_sub_ps(iy0,jy0);
1344 dz00 = _mm_sub_ps(iz0,jz0);
1345 dx11 = _mm_sub_ps(ix1,jx1);
1346 dy11 = _mm_sub_ps(iy1,jy1);
1347 dz11 = _mm_sub_ps(iz1,jz1);
1348 dx12 = _mm_sub_ps(ix1,jx2);
1349 dy12 = _mm_sub_ps(iy1,jy2);
1350 dz12 = _mm_sub_ps(iz1,jz2);
1351 dx13 = _mm_sub_ps(ix1,jx3);
1352 dy13 = _mm_sub_ps(iy1,jy3);
1353 dz13 = _mm_sub_ps(iz1,jz3);
1354 dx21 = _mm_sub_ps(ix2,jx1);
1355 dy21 = _mm_sub_ps(iy2,jy1);
1356 dz21 = _mm_sub_ps(iz2,jz1);
1357 dx22 = _mm_sub_ps(ix2,jx2);
1358 dy22 = _mm_sub_ps(iy2,jy2);
1359 dz22 = _mm_sub_ps(iz2,jz2);
1360 dx23 = _mm_sub_ps(ix2,jx3);
1361 dy23 = _mm_sub_ps(iy2,jy3);
1362 dz23 = _mm_sub_ps(iz2,jz3);
1363 dx31 = _mm_sub_ps(ix3,jx1);
1364 dy31 = _mm_sub_ps(iy3,jy1);
1365 dz31 = _mm_sub_ps(iz3,jz1);
1366 dx32 = _mm_sub_ps(ix3,jx2);
1367 dy32 = _mm_sub_ps(iy3,jy2);
1368 dz32 = _mm_sub_ps(iz3,jz2);
1369 dx33 = _mm_sub_ps(ix3,jx3);
1370 dy33 = _mm_sub_ps(iy3,jy3);
1371 dz33 = _mm_sub_ps(iz3,jz3);
1373 /* Calculate squared distance and things based on it */
1374 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1375 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1376 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1377 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1378 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1379 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1380 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1381 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1382 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1383 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1385 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1386 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1387 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1388 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1389 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1390 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1391 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1392 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1393 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1395 rinvsq00 = gmx_mm_inv_ps(rsq00);
1396 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1397 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1398 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1399 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1400 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1401 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1402 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1403 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1404 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1406 fjx0 = _mm_setzero_ps();
1407 fjy0 = _mm_setzero_ps();
1408 fjz0 = _mm_setzero_ps();
1409 fjx1 = _mm_setzero_ps();
1410 fjy1 = _mm_setzero_ps();
1411 fjz1 = _mm_setzero_ps();
1412 fjx2 = _mm_setzero_ps();
1413 fjy2 = _mm_setzero_ps();
1414 fjz2 = _mm_setzero_ps();
1415 fjx3 = _mm_setzero_ps();
1416 fjy3 = _mm_setzero_ps();
1417 fjz3 = _mm_setzero_ps();
1419 /**************************
1420 * CALCULATE INTERACTIONS *
1421 **************************/
1423 if (gmx_mm_any_lt(rsq00,rcutoff2))
1426 /* LENNARD-JONES DISPERSION/REPULSION */
1428 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1429 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1431 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1435 fscal = _mm_and_ps(fscal,cutoff_mask);
1437 /* Update vectorial force */
1438 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1439 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1440 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1442 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1443 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1444 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1448 /**************************
1449 * CALCULATE INTERACTIONS *
1450 **************************/
1452 if (gmx_mm_any_lt(rsq11,rcutoff2))
1455 /* REACTION-FIELD ELECTROSTATICS */
1456 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
1458 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1462 fscal = _mm_and_ps(fscal,cutoff_mask);
1464 /* Update vectorial force */
1465 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1466 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1467 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1469 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1470 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1471 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1475 /**************************
1476 * CALCULATE INTERACTIONS *
1477 **************************/
1479 if (gmx_mm_any_lt(rsq12,rcutoff2))
1482 /* REACTION-FIELD ELECTROSTATICS */
1483 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
1485 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1489 fscal = _mm_and_ps(fscal,cutoff_mask);
1491 /* Update vectorial force */
1492 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1493 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1494 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1496 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1497 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1498 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1502 /**************************
1503 * CALCULATE INTERACTIONS *
1504 **************************/
1506 if (gmx_mm_any_lt(rsq13,rcutoff2))
1509 /* REACTION-FIELD ELECTROSTATICS */
1510 felec = _mm_mul_ps(qq13,_mm_msub_ps(rinv13,rinvsq13,krf2));
1512 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1516 fscal = _mm_and_ps(fscal,cutoff_mask);
1518 /* Update vectorial force */
1519 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1520 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1521 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1523 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1524 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1525 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1529 /**************************
1530 * CALCULATE INTERACTIONS *
1531 **************************/
1533 if (gmx_mm_any_lt(rsq21,rcutoff2))
1536 /* REACTION-FIELD ELECTROSTATICS */
1537 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
1539 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1543 fscal = _mm_and_ps(fscal,cutoff_mask);
1545 /* Update vectorial force */
1546 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1547 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1548 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1550 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1551 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1552 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1556 /**************************
1557 * CALCULATE INTERACTIONS *
1558 **************************/
1560 if (gmx_mm_any_lt(rsq22,rcutoff2))
1563 /* REACTION-FIELD ELECTROSTATICS */
1564 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
1566 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1570 fscal = _mm_and_ps(fscal,cutoff_mask);
1572 /* Update vectorial force */
1573 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1574 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1575 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1577 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1578 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1579 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1583 /**************************
1584 * CALCULATE INTERACTIONS *
1585 **************************/
1587 if (gmx_mm_any_lt(rsq23,rcutoff2))
1590 /* REACTION-FIELD ELECTROSTATICS */
1591 felec = _mm_mul_ps(qq23,_mm_msub_ps(rinv23,rinvsq23,krf2));
1593 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1597 fscal = _mm_and_ps(fscal,cutoff_mask);
1599 /* Update vectorial force */
1600 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1601 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1602 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1604 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1605 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1606 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1610 /**************************
1611 * CALCULATE INTERACTIONS *
1612 **************************/
1614 if (gmx_mm_any_lt(rsq31,rcutoff2))
1617 /* REACTION-FIELD ELECTROSTATICS */
1618 felec = _mm_mul_ps(qq31,_mm_msub_ps(rinv31,rinvsq31,krf2));
1620 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1624 fscal = _mm_and_ps(fscal,cutoff_mask);
1626 /* Update vectorial force */
1627 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1628 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1629 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1631 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1632 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1633 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1637 /**************************
1638 * CALCULATE INTERACTIONS *
1639 **************************/
1641 if (gmx_mm_any_lt(rsq32,rcutoff2))
1644 /* REACTION-FIELD ELECTROSTATICS */
1645 felec = _mm_mul_ps(qq32,_mm_msub_ps(rinv32,rinvsq32,krf2));
1647 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1651 fscal = _mm_and_ps(fscal,cutoff_mask);
1653 /* Update vectorial force */
1654 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1655 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1656 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1658 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1659 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1660 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1664 /**************************
1665 * CALCULATE INTERACTIONS *
1666 **************************/
1668 if (gmx_mm_any_lt(rsq33,rcutoff2))
1671 /* REACTION-FIELD ELECTROSTATICS */
1672 felec = _mm_mul_ps(qq33,_mm_msub_ps(rinv33,rinvsq33,krf2));
1674 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1678 fscal = _mm_and_ps(fscal,cutoff_mask);
1680 /* Update vectorial force */
1681 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1682 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1683 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1685 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1686 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1687 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1691 fjptrA = f+j_coord_offsetA;
1692 fjptrB = f+j_coord_offsetB;
1693 fjptrC = f+j_coord_offsetC;
1694 fjptrD = f+j_coord_offsetD;
1696 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1697 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1698 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1700 /* Inner loop uses 333 flops */
1703 if(jidx<j_index_end)
1706 /* Get j neighbor index, and coordinate index */
1707 jnrlistA = jjnr[jidx];
1708 jnrlistB = jjnr[jidx+1];
1709 jnrlistC = jjnr[jidx+2];
1710 jnrlistD = jjnr[jidx+3];
1711 /* Sign of each element will be negative for non-real atoms.
1712 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1713 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1715 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1716 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1717 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1718 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1719 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1720 j_coord_offsetA = DIM*jnrA;
1721 j_coord_offsetB = DIM*jnrB;
1722 j_coord_offsetC = DIM*jnrC;
1723 j_coord_offsetD = DIM*jnrD;
1725 /* load j atom coordinates */
1726 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1727 x+j_coord_offsetC,x+j_coord_offsetD,
1728 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1729 &jy2,&jz2,&jx3,&jy3,&jz3);
1731 /* Calculate displacement vector */
1732 dx00 = _mm_sub_ps(ix0,jx0);
1733 dy00 = _mm_sub_ps(iy0,jy0);
1734 dz00 = _mm_sub_ps(iz0,jz0);
1735 dx11 = _mm_sub_ps(ix1,jx1);
1736 dy11 = _mm_sub_ps(iy1,jy1);
1737 dz11 = _mm_sub_ps(iz1,jz1);
1738 dx12 = _mm_sub_ps(ix1,jx2);
1739 dy12 = _mm_sub_ps(iy1,jy2);
1740 dz12 = _mm_sub_ps(iz1,jz2);
1741 dx13 = _mm_sub_ps(ix1,jx3);
1742 dy13 = _mm_sub_ps(iy1,jy3);
1743 dz13 = _mm_sub_ps(iz1,jz3);
1744 dx21 = _mm_sub_ps(ix2,jx1);
1745 dy21 = _mm_sub_ps(iy2,jy1);
1746 dz21 = _mm_sub_ps(iz2,jz1);
1747 dx22 = _mm_sub_ps(ix2,jx2);
1748 dy22 = _mm_sub_ps(iy2,jy2);
1749 dz22 = _mm_sub_ps(iz2,jz2);
1750 dx23 = _mm_sub_ps(ix2,jx3);
1751 dy23 = _mm_sub_ps(iy2,jy3);
1752 dz23 = _mm_sub_ps(iz2,jz3);
1753 dx31 = _mm_sub_ps(ix3,jx1);
1754 dy31 = _mm_sub_ps(iy3,jy1);
1755 dz31 = _mm_sub_ps(iz3,jz1);
1756 dx32 = _mm_sub_ps(ix3,jx2);
1757 dy32 = _mm_sub_ps(iy3,jy2);
1758 dz32 = _mm_sub_ps(iz3,jz2);
1759 dx33 = _mm_sub_ps(ix3,jx3);
1760 dy33 = _mm_sub_ps(iy3,jy3);
1761 dz33 = _mm_sub_ps(iz3,jz3);
1763 /* Calculate squared distance and things based on it */
1764 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1765 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1766 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1767 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1768 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1769 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1770 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1771 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1772 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1773 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1775 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1776 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1777 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1778 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1779 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1780 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1781 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1782 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1783 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1785 rinvsq00 = gmx_mm_inv_ps(rsq00);
1786 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1787 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1788 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1789 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1790 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1791 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1792 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1793 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1794 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1796 fjx0 = _mm_setzero_ps();
1797 fjy0 = _mm_setzero_ps();
1798 fjz0 = _mm_setzero_ps();
1799 fjx1 = _mm_setzero_ps();
1800 fjy1 = _mm_setzero_ps();
1801 fjz1 = _mm_setzero_ps();
1802 fjx2 = _mm_setzero_ps();
1803 fjy2 = _mm_setzero_ps();
1804 fjz2 = _mm_setzero_ps();
1805 fjx3 = _mm_setzero_ps();
1806 fjy3 = _mm_setzero_ps();
1807 fjz3 = _mm_setzero_ps();
1809 /**************************
1810 * CALCULATE INTERACTIONS *
1811 **************************/
1813 if (gmx_mm_any_lt(rsq00,rcutoff2))
1816 /* LENNARD-JONES DISPERSION/REPULSION */
1818 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1819 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1821 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1825 fscal = _mm_and_ps(fscal,cutoff_mask);
1827 fscal = _mm_andnot_ps(dummy_mask,fscal);
1829 /* Update vectorial force */
1830 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1831 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1832 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1834 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1835 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1836 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1840 /**************************
1841 * CALCULATE INTERACTIONS *
1842 **************************/
1844 if (gmx_mm_any_lt(rsq11,rcutoff2))
1847 /* REACTION-FIELD ELECTROSTATICS */
1848 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
1850 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1854 fscal = _mm_and_ps(fscal,cutoff_mask);
1856 fscal = _mm_andnot_ps(dummy_mask,fscal);
1858 /* Update vectorial force */
1859 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1860 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1861 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1863 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1864 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1865 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1869 /**************************
1870 * CALCULATE INTERACTIONS *
1871 **************************/
1873 if (gmx_mm_any_lt(rsq12,rcutoff2))
1876 /* REACTION-FIELD ELECTROSTATICS */
1877 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
1879 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1883 fscal = _mm_and_ps(fscal,cutoff_mask);
1885 fscal = _mm_andnot_ps(dummy_mask,fscal);
1887 /* Update vectorial force */
1888 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1889 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1890 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1892 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1893 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1894 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1898 /**************************
1899 * CALCULATE INTERACTIONS *
1900 **************************/
1902 if (gmx_mm_any_lt(rsq13,rcutoff2))
1905 /* REACTION-FIELD ELECTROSTATICS */
1906 felec = _mm_mul_ps(qq13,_mm_msub_ps(rinv13,rinvsq13,krf2));
1908 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1912 fscal = _mm_and_ps(fscal,cutoff_mask);
1914 fscal = _mm_andnot_ps(dummy_mask,fscal);
1916 /* Update vectorial force */
1917 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1918 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1919 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1921 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1922 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1923 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1927 /**************************
1928 * CALCULATE INTERACTIONS *
1929 **************************/
1931 if (gmx_mm_any_lt(rsq21,rcutoff2))
1934 /* REACTION-FIELD ELECTROSTATICS */
1935 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
1937 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1941 fscal = _mm_and_ps(fscal,cutoff_mask);
1943 fscal = _mm_andnot_ps(dummy_mask,fscal);
1945 /* Update vectorial force */
1946 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1947 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1948 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1950 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1951 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1952 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1956 /**************************
1957 * CALCULATE INTERACTIONS *
1958 **************************/
1960 if (gmx_mm_any_lt(rsq22,rcutoff2))
1963 /* REACTION-FIELD ELECTROSTATICS */
1964 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
1966 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1970 fscal = _mm_and_ps(fscal,cutoff_mask);
1972 fscal = _mm_andnot_ps(dummy_mask,fscal);
1974 /* Update vectorial force */
1975 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1976 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1977 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1979 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1980 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1981 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1985 /**************************
1986 * CALCULATE INTERACTIONS *
1987 **************************/
1989 if (gmx_mm_any_lt(rsq23,rcutoff2))
1992 /* REACTION-FIELD ELECTROSTATICS */
1993 felec = _mm_mul_ps(qq23,_mm_msub_ps(rinv23,rinvsq23,krf2));
1995 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1999 fscal = _mm_and_ps(fscal,cutoff_mask);
2001 fscal = _mm_andnot_ps(dummy_mask,fscal);
2003 /* Update vectorial force */
2004 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2005 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2006 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2008 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2009 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2010 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2014 /**************************
2015 * CALCULATE INTERACTIONS *
2016 **************************/
2018 if (gmx_mm_any_lt(rsq31,rcutoff2))
2021 /* REACTION-FIELD ELECTROSTATICS */
2022 felec = _mm_mul_ps(qq31,_mm_msub_ps(rinv31,rinvsq31,krf2));
2024 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2028 fscal = _mm_and_ps(fscal,cutoff_mask);
2030 fscal = _mm_andnot_ps(dummy_mask,fscal);
2032 /* Update vectorial force */
2033 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2034 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2035 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2037 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2038 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2039 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2043 /**************************
2044 * CALCULATE INTERACTIONS *
2045 **************************/
2047 if (gmx_mm_any_lt(rsq32,rcutoff2))
2050 /* REACTION-FIELD ELECTROSTATICS */
2051 felec = _mm_mul_ps(qq32,_mm_msub_ps(rinv32,rinvsq32,krf2));
2053 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2057 fscal = _mm_and_ps(fscal,cutoff_mask);
2059 fscal = _mm_andnot_ps(dummy_mask,fscal);
2061 /* Update vectorial force */
2062 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2063 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2064 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2066 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2067 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2068 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2072 /**************************
2073 * CALCULATE INTERACTIONS *
2074 **************************/
2076 if (gmx_mm_any_lt(rsq33,rcutoff2))
2079 /* REACTION-FIELD ELECTROSTATICS */
2080 felec = _mm_mul_ps(qq33,_mm_msub_ps(rinv33,rinvsq33,krf2));
2082 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2086 fscal = _mm_and_ps(fscal,cutoff_mask);
2088 fscal = _mm_andnot_ps(dummy_mask,fscal);
2090 /* Update vectorial force */
2091 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2092 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2093 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2095 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2096 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2097 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2101 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2102 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2103 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2104 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2106 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2107 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2108 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2110 /* Inner loop uses 333 flops */
2113 /* End of innermost loop */
2115 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2116 f+i_coord_offset,fshift+i_shift_offset);
2118 /* Increment number of inner iterations */
2119 inneriter += j_index_end - j_index_start;
2121 /* Outer loop uses 24 flops */
2124 /* Increment number of outer iterations */
2127 /* Update outer/inner flops */
2129 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*333);