2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
99 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
101 __m128i ifour = _mm_set1_epi32(4);
102 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
104 __m128 dummy_mask,cutoff_mask;
105 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
106 __m128 one = _mm_set1_ps(1.0);
107 __m128 two = _mm_set1_ps(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm_set1_ps(fr->epsfac);
120 charge = mdatoms->chargeA;
121 krf = _mm_set1_ps(fr->ic->k_rf);
122 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
123 crf = _mm_set1_ps(fr->ic->c_rf);
124 nvdwtype = fr->ntype;
126 vdwtype = mdatoms->typeA;
128 vftab = kernel_data->table_vdw->data;
129 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
134 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
135 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
136 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
138 jq1 = _mm_set1_ps(charge[inr+1]);
139 jq2 = _mm_set1_ps(charge[inr+2]);
140 jq3 = _mm_set1_ps(charge[inr+3]);
141 vdwjidx0A = 2*vdwtype[inr+0];
142 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
143 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
144 qq11 = _mm_mul_ps(iq1,jq1);
145 qq12 = _mm_mul_ps(iq1,jq2);
146 qq13 = _mm_mul_ps(iq1,jq3);
147 qq21 = _mm_mul_ps(iq2,jq1);
148 qq22 = _mm_mul_ps(iq2,jq2);
149 qq23 = _mm_mul_ps(iq2,jq3);
150 qq31 = _mm_mul_ps(iq3,jq1);
151 qq32 = _mm_mul_ps(iq3,jq2);
152 qq33 = _mm_mul_ps(iq3,jq3);
154 /* Avoid stupid compiler warnings */
155 jnrA = jnrB = jnrC = jnrD = 0;
164 /* Start outer loop over neighborlists */
165 for(iidx=0; iidx<nri; iidx++)
167 /* Load shift vector for this list */
168 i_shift_offset = DIM*shiftidx[iidx];
169 shX = shiftvec[i_shift_offset+XX];
170 shY = shiftvec[i_shift_offset+YY];
171 shZ = shiftvec[i_shift_offset+ZZ];
173 /* Load limits for loop over neighbors */
174 j_index_start = jindex[iidx];
175 j_index_end = jindex[iidx+1];
177 /* Get outer coordinate index */
179 i_coord_offset = DIM*inr;
181 /* Load i particle coords and add shift vector */
182 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
183 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
184 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
185 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
186 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
187 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
188 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
189 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
190 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
191 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
192 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
193 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
195 fix0 = _mm_setzero_ps();
196 fiy0 = _mm_setzero_ps();
197 fiz0 = _mm_setzero_ps();
198 fix1 = _mm_setzero_ps();
199 fiy1 = _mm_setzero_ps();
200 fiz1 = _mm_setzero_ps();
201 fix2 = _mm_setzero_ps();
202 fiy2 = _mm_setzero_ps();
203 fiz2 = _mm_setzero_ps();
204 fix3 = _mm_setzero_ps();
205 fiy3 = _mm_setzero_ps();
206 fiz3 = _mm_setzero_ps();
208 /* Reset potential sums */
209 velecsum = _mm_setzero_ps();
210 vvdwsum = _mm_setzero_ps();
212 /* Start inner kernel loop */
213 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
216 /* Get j neighbor index, and coordinate index */
222 j_coord_offsetA = DIM*jnrA;
223 j_coord_offsetB = DIM*jnrB;
224 j_coord_offsetC = DIM*jnrC;
225 j_coord_offsetD = DIM*jnrD;
227 /* load j atom coordinates */
228 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
229 x+j_coord_offsetC,x+j_coord_offsetD,
230 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
231 &jy2,&jz2,&jx3,&jy3,&jz3);
233 /* Calculate displacement vector */
234 dx00 = _mm_sub_ps(ix0,jx0);
235 dy00 = _mm_sub_ps(iy0,jy0);
236 dz00 = _mm_sub_ps(iz0,jz0);
237 dx11 = _mm_sub_ps(ix1,jx1);
238 dy11 = _mm_sub_ps(iy1,jy1);
239 dz11 = _mm_sub_ps(iz1,jz1);
240 dx12 = _mm_sub_ps(ix1,jx2);
241 dy12 = _mm_sub_ps(iy1,jy2);
242 dz12 = _mm_sub_ps(iz1,jz2);
243 dx13 = _mm_sub_ps(ix1,jx3);
244 dy13 = _mm_sub_ps(iy1,jy3);
245 dz13 = _mm_sub_ps(iz1,jz3);
246 dx21 = _mm_sub_ps(ix2,jx1);
247 dy21 = _mm_sub_ps(iy2,jy1);
248 dz21 = _mm_sub_ps(iz2,jz1);
249 dx22 = _mm_sub_ps(ix2,jx2);
250 dy22 = _mm_sub_ps(iy2,jy2);
251 dz22 = _mm_sub_ps(iz2,jz2);
252 dx23 = _mm_sub_ps(ix2,jx3);
253 dy23 = _mm_sub_ps(iy2,jy3);
254 dz23 = _mm_sub_ps(iz2,jz3);
255 dx31 = _mm_sub_ps(ix3,jx1);
256 dy31 = _mm_sub_ps(iy3,jy1);
257 dz31 = _mm_sub_ps(iz3,jz1);
258 dx32 = _mm_sub_ps(ix3,jx2);
259 dy32 = _mm_sub_ps(iy3,jy2);
260 dz32 = _mm_sub_ps(iz3,jz2);
261 dx33 = _mm_sub_ps(ix3,jx3);
262 dy33 = _mm_sub_ps(iy3,jy3);
263 dz33 = _mm_sub_ps(iz3,jz3);
265 /* Calculate squared distance and things based on it */
266 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
267 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
268 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
269 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
270 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
271 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
272 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
273 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
274 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
275 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
277 rinv00 = gmx_mm_invsqrt_ps(rsq00);
278 rinv11 = gmx_mm_invsqrt_ps(rsq11);
279 rinv12 = gmx_mm_invsqrt_ps(rsq12);
280 rinv13 = gmx_mm_invsqrt_ps(rsq13);
281 rinv21 = gmx_mm_invsqrt_ps(rsq21);
282 rinv22 = gmx_mm_invsqrt_ps(rsq22);
283 rinv23 = gmx_mm_invsqrt_ps(rsq23);
284 rinv31 = gmx_mm_invsqrt_ps(rsq31);
285 rinv32 = gmx_mm_invsqrt_ps(rsq32);
286 rinv33 = gmx_mm_invsqrt_ps(rsq33);
288 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
289 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
290 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
291 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
292 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
293 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
294 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
295 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
296 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
298 fjx0 = _mm_setzero_ps();
299 fjy0 = _mm_setzero_ps();
300 fjz0 = _mm_setzero_ps();
301 fjx1 = _mm_setzero_ps();
302 fjy1 = _mm_setzero_ps();
303 fjz1 = _mm_setzero_ps();
304 fjx2 = _mm_setzero_ps();
305 fjy2 = _mm_setzero_ps();
306 fjz2 = _mm_setzero_ps();
307 fjx3 = _mm_setzero_ps();
308 fjy3 = _mm_setzero_ps();
309 fjz3 = _mm_setzero_ps();
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 r00 = _mm_mul_ps(rsq00,rinv00);
317 /* Calculate table index by multiplying r with table scale and truncate to integer */
318 rt = _mm_mul_ps(r00,vftabscale);
319 vfitab = _mm_cvttps_epi32(rt);
320 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
321 vfitab = _mm_slli_epi32(vfitab,3);
323 /* CUBIC SPLINE TABLE DISPERSION */
324 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
325 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
326 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
327 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
328 _MM_TRANSPOSE4_PS(Y,F,G,H);
329 Heps = _mm_mul_ps(vfeps,H);
330 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
331 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
332 vvdw6 = _mm_mul_ps(c6_00,VV);
333 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
334 fvdw6 = _mm_mul_ps(c6_00,FF);
336 /* CUBIC SPLINE TABLE REPULSION */
337 vfitab = _mm_add_epi32(vfitab,ifour);
338 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
339 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
340 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
341 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
342 _MM_TRANSPOSE4_PS(Y,F,G,H);
343 Heps = _mm_mul_ps(vfeps,H);
344 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
345 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
346 vvdw12 = _mm_mul_ps(c12_00,VV);
347 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
348 fvdw12 = _mm_mul_ps(c12_00,FF);
349 vvdw = _mm_add_ps(vvdw12,vvdw6);
350 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
357 /* Calculate temporary vectorial force */
358 tx = _mm_mul_ps(fscal,dx00);
359 ty = _mm_mul_ps(fscal,dy00);
360 tz = _mm_mul_ps(fscal,dz00);
362 /* Update vectorial force */
363 fix0 = _mm_add_ps(fix0,tx);
364 fiy0 = _mm_add_ps(fiy0,ty);
365 fiz0 = _mm_add_ps(fiz0,tz);
367 fjx0 = _mm_add_ps(fjx0,tx);
368 fjy0 = _mm_add_ps(fjy0,ty);
369 fjz0 = _mm_add_ps(fjz0,tz);
371 /**************************
372 * CALCULATE INTERACTIONS *
373 **************************/
375 /* REACTION-FIELD ELECTROSTATICS */
376 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
377 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
379 /* Update potential sum for this i atom from the interaction with this j atom. */
380 velecsum = _mm_add_ps(velecsum,velec);
384 /* Calculate temporary vectorial force */
385 tx = _mm_mul_ps(fscal,dx11);
386 ty = _mm_mul_ps(fscal,dy11);
387 tz = _mm_mul_ps(fscal,dz11);
389 /* Update vectorial force */
390 fix1 = _mm_add_ps(fix1,tx);
391 fiy1 = _mm_add_ps(fiy1,ty);
392 fiz1 = _mm_add_ps(fiz1,tz);
394 fjx1 = _mm_add_ps(fjx1,tx);
395 fjy1 = _mm_add_ps(fjy1,ty);
396 fjz1 = _mm_add_ps(fjz1,tz);
398 /**************************
399 * CALCULATE INTERACTIONS *
400 **************************/
402 /* REACTION-FIELD ELECTROSTATICS */
403 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
404 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
406 /* Update potential sum for this i atom from the interaction with this j atom. */
407 velecsum = _mm_add_ps(velecsum,velec);
411 /* Calculate temporary vectorial force */
412 tx = _mm_mul_ps(fscal,dx12);
413 ty = _mm_mul_ps(fscal,dy12);
414 tz = _mm_mul_ps(fscal,dz12);
416 /* Update vectorial force */
417 fix1 = _mm_add_ps(fix1,tx);
418 fiy1 = _mm_add_ps(fiy1,ty);
419 fiz1 = _mm_add_ps(fiz1,tz);
421 fjx2 = _mm_add_ps(fjx2,tx);
422 fjy2 = _mm_add_ps(fjy2,ty);
423 fjz2 = _mm_add_ps(fjz2,tz);
425 /**************************
426 * CALCULATE INTERACTIONS *
427 **************************/
429 /* REACTION-FIELD ELECTROSTATICS */
430 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
431 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velecsum = _mm_add_ps(velecsum,velec);
438 /* Calculate temporary vectorial force */
439 tx = _mm_mul_ps(fscal,dx13);
440 ty = _mm_mul_ps(fscal,dy13);
441 tz = _mm_mul_ps(fscal,dz13);
443 /* Update vectorial force */
444 fix1 = _mm_add_ps(fix1,tx);
445 fiy1 = _mm_add_ps(fiy1,ty);
446 fiz1 = _mm_add_ps(fiz1,tz);
448 fjx3 = _mm_add_ps(fjx3,tx);
449 fjy3 = _mm_add_ps(fjy3,ty);
450 fjz3 = _mm_add_ps(fjz3,tz);
452 /**************************
453 * CALCULATE INTERACTIONS *
454 **************************/
456 /* REACTION-FIELD ELECTROSTATICS */
457 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
458 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
460 /* Update potential sum for this i atom from the interaction with this j atom. */
461 velecsum = _mm_add_ps(velecsum,velec);
465 /* Calculate temporary vectorial force */
466 tx = _mm_mul_ps(fscal,dx21);
467 ty = _mm_mul_ps(fscal,dy21);
468 tz = _mm_mul_ps(fscal,dz21);
470 /* Update vectorial force */
471 fix2 = _mm_add_ps(fix2,tx);
472 fiy2 = _mm_add_ps(fiy2,ty);
473 fiz2 = _mm_add_ps(fiz2,tz);
475 fjx1 = _mm_add_ps(fjx1,tx);
476 fjy1 = _mm_add_ps(fjy1,ty);
477 fjz1 = _mm_add_ps(fjz1,tz);
479 /**************************
480 * CALCULATE INTERACTIONS *
481 **************************/
483 /* REACTION-FIELD ELECTROSTATICS */
484 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
485 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velecsum = _mm_add_ps(velecsum,velec);
492 /* Calculate temporary vectorial force */
493 tx = _mm_mul_ps(fscal,dx22);
494 ty = _mm_mul_ps(fscal,dy22);
495 tz = _mm_mul_ps(fscal,dz22);
497 /* Update vectorial force */
498 fix2 = _mm_add_ps(fix2,tx);
499 fiy2 = _mm_add_ps(fiy2,ty);
500 fiz2 = _mm_add_ps(fiz2,tz);
502 fjx2 = _mm_add_ps(fjx2,tx);
503 fjy2 = _mm_add_ps(fjy2,ty);
504 fjz2 = _mm_add_ps(fjz2,tz);
506 /**************************
507 * CALCULATE INTERACTIONS *
508 **************************/
510 /* REACTION-FIELD ELECTROSTATICS */
511 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
512 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
514 /* Update potential sum for this i atom from the interaction with this j atom. */
515 velecsum = _mm_add_ps(velecsum,velec);
519 /* Calculate temporary vectorial force */
520 tx = _mm_mul_ps(fscal,dx23);
521 ty = _mm_mul_ps(fscal,dy23);
522 tz = _mm_mul_ps(fscal,dz23);
524 /* Update vectorial force */
525 fix2 = _mm_add_ps(fix2,tx);
526 fiy2 = _mm_add_ps(fiy2,ty);
527 fiz2 = _mm_add_ps(fiz2,tz);
529 fjx3 = _mm_add_ps(fjx3,tx);
530 fjy3 = _mm_add_ps(fjy3,ty);
531 fjz3 = _mm_add_ps(fjz3,tz);
533 /**************************
534 * CALCULATE INTERACTIONS *
535 **************************/
537 /* REACTION-FIELD ELECTROSTATICS */
538 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
539 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
541 /* Update potential sum for this i atom from the interaction with this j atom. */
542 velecsum = _mm_add_ps(velecsum,velec);
546 /* Calculate temporary vectorial force */
547 tx = _mm_mul_ps(fscal,dx31);
548 ty = _mm_mul_ps(fscal,dy31);
549 tz = _mm_mul_ps(fscal,dz31);
551 /* Update vectorial force */
552 fix3 = _mm_add_ps(fix3,tx);
553 fiy3 = _mm_add_ps(fiy3,ty);
554 fiz3 = _mm_add_ps(fiz3,tz);
556 fjx1 = _mm_add_ps(fjx1,tx);
557 fjy1 = _mm_add_ps(fjy1,ty);
558 fjz1 = _mm_add_ps(fjz1,tz);
560 /**************************
561 * CALCULATE INTERACTIONS *
562 **************************/
564 /* REACTION-FIELD ELECTROSTATICS */
565 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
566 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velecsum = _mm_add_ps(velecsum,velec);
573 /* Calculate temporary vectorial force */
574 tx = _mm_mul_ps(fscal,dx32);
575 ty = _mm_mul_ps(fscal,dy32);
576 tz = _mm_mul_ps(fscal,dz32);
578 /* Update vectorial force */
579 fix3 = _mm_add_ps(fix3,tx);
580 fiy3 = _mm_add_ps(fiy3,ty);
581 fiz3 = _mm_add_ps(fiz3,tz);
583 fjx2 = _mm_add_ps(fjx2,tx);
584 fjy2 = _mm_add_ps(fjy2,ty);
585 fjz2 = _mm_add_ps(fjz2,tz);
587 /**************************
588 * CALCULATE INTERACTIONS *
589 **************************/
591 /* REACTION-FIELD ELECTROSTATICS */
592 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
593 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
595 /* Update potential sum for this i atom from the interaction with this j atom. */
596 velecsum = _mm_add_ps(velecsum,velec);
600 /* Calculate temporary vectorial force */
601 tx = _mm_mul_ps(fscal,dx33);
602 ty = _mm_mul_ps(fscal,dy33);
603 tz = _mm_mul_ps(fscal,dz33);
605 /* Update vectorial force */
606 fix3 = _mm_add_ps(fix3,tx);
607 fiy3 = _mm_add_ps(fiy3,ty);
608 fiz3 = _mm_add_ps(fiz3,tz);
610 fjx3 = _mm_add_ps(fjx3,tx);
611 fjy3 = _mm_add_ps(fjy3,ty);
612 fjz3 = _mm_add_ps(fjz3,tz);
614 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
615 f+j_coord_offsetC,f+j_coord_offsetD,
616 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
617 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
619 /* Inner loop uses 347 flops */
625 /* Get j neighbor index, and coordinate index */
631 /* Sign of each element will be negative for non-real atoms.
632 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
633 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
635 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
636 jnrA = (jnrA>=0) ? jnrA : 0;
637 jnrB = (jnrB>=0) ? jnrB : 0;
638 jnrC = (jnrC>=0) ? jnrC : 0;
639 jnrD = (jnrD>=0) ? jnrD : 0;
641 j_coord_offsetA = DIM*jnrA;
642 j_coord_offsetB = DIM*jnrB;
643 j_coord_offsetC = DIM*jnrC;
644 j_coord_offsetD = DIM*jnrD;
646 /* load j atom coordinates */
647 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
648 x+j_coord_offsetC,x+j_coord_offsetD,
649 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
650 &jy2,&jz2,&jx3,&jy3,&jz3);
652 /* Calculate displacement vector */
653 dx00 = _mm_sub_ps(ix0,jx0);
654 dy00 = _mm_sub_ps(iy0,jy0);
655 dz00 = _mm_sub_ps(iz0,jz0);
656 dx11 = _mm_sub_ps(ix1,jx1);
657 dy11 = _mm_sub_ps(iy1,jy1);
658 dz11 = _mm_sub_ps(iz1,jz1);
659 dx12 = _mm_sub_ps(ix1,jx2);
660 dy12 = _mm_sub_ps(iy1,jy2);
661 dz12 = _mm_sub_ps(iz1,jz2);
662 dx13 = _mm_sub_ps(ix1,jx3);
663 dy13 = _mm_sub_ps(iy1,jy3);
664 dz13 = _mm_sub_ps(iz1,jz3);
665 dx21 = _mm_sub_ps(ix2,jx1);
666 dy21 = _mm_sub_ps(iy2,jy1);
667 dz21 = _mm_sub_ps(iz2,jz1);
668 dx22 = _mm_sub_ps(ix2,jx2);
669 dy22 = _mm_sub_ps(iy2,jy2);
670 dz22 = _mm_sub_ps(iz2,jz2);
671 dx23 = _mm_sub_ps(ix2,jx3);
672 dy23 = _mm_sub_ps(iy2,jy3);
673 dz23 = _mm_sub_ps(iz2,jz3);
674 dx31 = _mm_sub_ps(ix3,jx1);
675 dy31 = _mm_sub_ps(iy3,jy1);
676 dz31 = _mm_sub_ps(iz3,jz1);
677 dx32 = _mm_sub_ps(ix3,jx2);
678 dy32 = _mm_sub_ps(iy3,jy2);
679 dz32 = _mm_sub_ps(iz3,jz2);
680 dx33 = _mm_sub_ps(ix3,jx3);
681 dy33 = _mm_sub_ps(iy3,jy3);
682 dz33 = _mm_sub_ps(iz3,jz3);
684 /* Calculate squared distance and things based on it */
685 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
686 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
687 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
688 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
689 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
690 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
691 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
692 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
693 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
694 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
696 rinv00 = gmx_mm_invsqrt_ps(rsq00);
697 rinv11 = gmx_mm_invsqrt_ps(rsq11);
698 rinv12 = gmx_mm_invsqrt_ps(rsq12);
699 rinv13 = gmx_mm_invsqrt_ps(rsq13);
700 rinv21 = gmx_mm_invsqrt_ps(rsq21);
701 rinv22 = gmx_mm_invsqrt_ps(rsq22);
702 rinv23 = gmx_mm_invsqrt_ps(rsq23);
703 rinv31 = gmx_mm_invsqrt_ps(rsq31);
704 rinv32 = gmx_mm_invsqrt_ps(rsq32);
705 rinv33 = gmx_mm_invsqrt_ps(rsq33);
707 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
708 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
709 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
710 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
711 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
712 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
713 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
714 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
715 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
717 fjx0 = _mm_setzero_ps();
718 fjy0 = _mm_setzero_ps();
719 fjz0 = _mm_setzero_ps();
720 fjx1 = _mm_setzero_ps();
721 fjy1 = _mm_setzero_ps();
722 fjz1 = _mm_setzero_ps();
723 fjx2 = _mm_setzero_ps();
724 fjy2 = _mm_setzero_ps();
725 fjz2 = _mm_setzero_ps();
726 fjx3 = _mm_setzero_ps();
727 fjy3 = _mm_setzero_ps();
728 fjz3 = _mm_setzero_ps();
730 /**************************
731 * CALCULATE INTERACTIONS *
732 **************************/
734 r00 = _mm_mul_ps(rsq00,rinv00);
735 r00 = _mm_andnot_ps(dummy_mask,r00);
737 /* Calculate table index by multiplying r with table scale and truncate to integer */
738 rt = _mm_mul_ps(r00,vftabscale);
739 vfitab = _mm_cvttps_epi32(rt);
740 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
741 vfitab = _mm_slli_epi32(vfitab,3);
743 /* CUBIC SPLINE TABLE DISPERSION */
744 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
745 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
746 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
747 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
748 _MM_TRANSPOSE4_PS(Y,F,G,H);
749 Heps = _mm_mul_ps(vfeps,H);
750 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
751 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
752 vvdw6 = _mm_mul_ps(c6_00,VV);
753 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
754 fvdw6 = _mm_mul_ps(c6_00,FF);
756 /* CUBIC SPLINE TABLE REPULSION */
757 vfitab = _mm_add_epi32(vfitab,ifour);
758 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
759 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
760 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
761 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
762 _MM_TRANSPOSE4_PS(Y,F,G,H);
763 Heps = _mm_mul_ps(vfeps,H);
764 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
765 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
766 vvdw12 = _mm_mul_ps(c12_00,VV);
767 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
768 fvdw12 = _mm_mul_ps(c12_00,FF);
769 vvdw = _mm_add_ps(vvdw12,vvdw6);
770 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
772 /* Update potential sum for this i atom from the interaction with this j atom. */
773 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
774 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
778 fscal = _mm_andnot_ps(dummy_mask,fscal);
780 /* Calculate temporary vectorial force */
781 tx = _mm_mul_ps(fscal,dx00);
782 ty = _mm_mul_ps(fscal,dy00);
783 tz = _mm_mul_ps(fscal,dz00);
785 /* Update vectorial force */
786 fix0 = _mm_add_ps(fix0,tx);
787 fiy0 = _mm_add_ps(fiy0,ty);
788 fiz0 = _mm_add_ps(fiz0,tz);
790 fjx0 = _mm_add_ps(fjx0,tx);
791 fjy0 = _mm_add_ps(fjy0,ty);
792 fjz0 = _mm_add_ps(fjz0,tz);
794 /**************************
795 * CALCULATE INTERACTIONS *
796 **************************/
798 /* REACTION-FIELD ELECTROSTATICS */
799 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_add_ps(rinv11,_mm_mul_ps(krf,rsq11)),crf));
800 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
802 /* Update potential sum for this i atom from the interaction with this j atom. */
803 velec = _mm_andnot_ps(dummy_mask,velec);
804 velecsum = _mm_add_ps(velecsum,velec);
808 fscal = _mm_andnot_ps(dummy_mask,fscal);
810 /* Calculate temporary vectorial force */
811 tx = _mm_mul_ps(fscal,dx11);
812 ty = _mm_mul_ps(fscal,dy11);
813 tz = _mm_mul_ps(fscal,dz11);
815 /* Update vectorial force */
816 fix1 = _mm_add_ps(fix1,tx);
817 fiy1 = _mm_add_ps(fiy1,ty);
818 fiz1 = _mm_add_ps(fiz1,tz);
820 fjx1 = _mm_add_ps(fjx1,tx);
821 fjy1 = _mm_add_ps(fjy1,ty);
822 fjz1 = _mm_add_ps(fjz1,tz);
824 /**************************
825 * CALCULATE INTERACTIONS *
826 **************************/
828 /* REACTION-FIELD ELECTROSTATICS */
829 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_add_ps(rinv12,_mm_mul_ps(krf,rsq12)),crf));
830 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
832 /* Update potential sum for this i atom from the interaction with this j atom. */
833 velec = _mm_andnot_ps(dummy_mask,velec);
834 velecsum = _mm_add_ps(velecsum,velec);
838 fscal = _mm_andnot_ps(dummy_mask,fscal);
840 /* Calculate temporary vectorial force */
841 tx = _mm_mul_ps(fscal,dx12);
842 ty = _mm_mul_ps(fscal,dy12);
843 tz = _mm_mul_ps(fscal,dz12);
845 /* Update vectorial force */
846 fix1 = _mm_add_ps(fix1,tx);
847 fiy1 = _mm_add_ps(fiy1,ty);
848 fiz1 = _mm_add_ps(fiz1,tz);
850 fjx2 = _mm_add_ps(fjx2,tx);
851 fjy2 = _mm_add_ps(fjy2,ty);
852 fjz2 = _mm_add_ps(fjz2,tz);
854 /**************************
855 * CALCULATE INTERACTIONS *
856 **************************/
858 /* REACTION-FIELD ELECTROSTATICS */
859 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_add_ps(rinv13,_mm_mul_ps(krf,rsq13)),crf));
860 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
862 /* Update potential sum for this i atom from the interaction with this j atom. */
863 velec = _mm_andnot_ps(dummy_mask,velec);
864 velecsum = _mm_add_ps(velecsum,velec);
868 fscal = _mm_andnot_ps(dummy_mask,fscal);
870 /* Calculate temporary vectorial force */
871 tx = _mm_mul_ps(fscal,dx13);
872 ty = _mm_mul_ps(fscal,dy13);
873 tz = _mm_mul_ps(fscal,dz13);
875 /* Update vectorial force */
876 fix1 = _mm_add_ps(fix1,tx);
877 fiy1 = _mm_add_ps(fiy1,ty);
878 fiz1 = _mm_add_ps(fiz1,tz);
880 fjx3 = _mm_add_ps(fjx3,tx);
881 fjy3 = _mm_add_ps(fjy3,ty);
882 fjz3 = _mm_add_ps(fjz3,tz);
884 /**************************
885 * CALCULATE INTERACTIONS *
886 **************************/
888 /* REACTION-FIELD ELECTROSTATICS */
889 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_add_ps(rinv21,_mm_mul_ps(krf,rsq21)),crf));
890 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
892 /* Update potential sum for this i atom from the interaction with this j atom. */
893 velec = _mm_andnot_ps(dummy_mask,velec);
894 velecsum = _mm_add_ps(velecsum,velec);
898 fscal = _mm_andnot_ps(dummy_mask,fscal);
900 /* Calculate temporary vectorial force */
901 tx = _mm_mul_ps(fscal,dx21);
902 ty = _mm_mul_ps(fscal,dy21);
903 tz = _mm_mul_ps(fscal,dz21);
905 /* Update vectorial force */
906 fix2 = _mm_add_ps(fix2,tx);
907 fiy2 = _mm_add_ps(fiy2,ty);
908 fiz2 = _mm_add_ps(fiz2,tz);
910 fjx1 = _mm_add_ps(fjx1,tx);
911 fjy1 = _mm_add_ps(fjy1,ty);
912 fjz1 = _mm_add_ps(fjz1,tz);
914 /**************************
915 * CALCULATE INTERACTIONS *
916 **************************/
918 /* REACTION-FIELD ELECTROSTATICS */
919 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_add_ps(rinv22,_mm_mul_ps(krf,rsq22)),crf));
920 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
922 /* Update potential sum for this i atom from the interaction with this j atom. */
923 velec = _mm_andnot_ps(dummy_mask,velec);
924 velecsum = _mm_add_ps(velecsum,velec);
928 fscal = _mm_andnot_ps(dummy_mask,fscal);
930 /* Calculate temporary vectorial force */
931 tx = _mm_mul_ps(fscal,dx22);
932 ty = _mm_mul_ps(fscal,dy22);
933 tz = _mm_mul_ps(fscal,dz22);
935 /* Update vectorial force */
936 fix2 = _mm_add_ps(fix2,tx);
937 fiy2 = _mm_add_ps(fiy2,ty);
938 fiz2 = _mm_add_ps(fiz2,tz);
940 fjx2 = _mm_add_ps(fjx2,tx);
941 fjy2 = _mm_add_ps(fjy2,ty);
942 fjz2 = _mm_add_ps(fjz2,tz);
944 /**************************
945 * CALCULATE INTERACTIONS *
946 **************************/
948 /* REACTION-FIELD ELECTROSTATICS */
949 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_add_ps(rinv23,_mm_mul_ps(krf,rsq23)),crf));
950 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
952 /* Update potential sum for this i atom from the interaction with this j atom. */
953 velec = _mm_andnot_ps(dummy_mask,velec);
954 velecsum = _mm_add_ps(velecsum,velec);
958 fscal = _mm_andnot_ps(dummy_mask,fscal);
960 /* Calculate temporary vectorial force */
961 tx = _mm_mul_ps(fscal,dx23);
962 ty = _mm_mul_ps(fscal,dy23);
963 tz = _mm_mul_ps(fscal,dz23);
965 /* Update vectorial force */
966 fix2 = _mm_add_ps(fix2,tx);
967 fiy2 = _mm_add_ps(fiy2,ty);
968 fiz2 = _mm_add_ps(fiz2,tz);
970 fjx3 = _mm_add_ps(fjx3,tx);
971 fjy3 = _mm_add_ps(fjy3,ty);
972 fjz3 = _mm_add_ps(fjz3,tz);
974 /**************************
975 * CALCULATE INTERACTIONS *
976 **************************/
978 /* REACTION-FIELD ELECTROSTATICS */
979 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_add_ps(rinv31,_mm_mul_ps(krf,rsq31)),crf));
980 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
982 /* Update potential sum for this i atom from the interaction with this j atom. */
983 velec = _mm_andnot_ps(dummy_mask,velec);
984 velecsum = _mm_add_ps(velecsum,velec);
988 fscal = _mm_andnot_ps(dummy_mask,fscal);
990 /* Calculate temporary vectorial force */
991 tx = _mm_mul_ps(fscal,dx31);
992 ty = _mm_mul_ps(fscal,dy31);
993 tz = _mm_mul_ps(fscal,dz31);
995 /* Update vectorial force */
996 fix3 = _mm_add_ps(fix3,tx);
997 fiy3 = _mm_add_ps(fiy3,ty);
998 fiz3 = _mm_add_ps(fiz3,tz);
1000 fjx1 = _mm_add_ps(fjx1,tx);
1001 fjy1 = _mm_add_ps(fjy1,ty);
1002 fjz1 = _mm_add_ps(fjz1,tz);
1004 /**************************
1005 * CALCULATE INTERACTIONS *
1006 **************************/
1008 /* REACTION-FIELD ELECTROSTATICS */
1009 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_add_ps(rinv32,_mm_mul_ps(krf,rsq32)),crf));
1010 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1012 /* Update potential sum for this i atom from the interaction with this j atom. */
1013 velec = _mm_andnot_ps(dummy_mask,velec);
1014 velecsum = _mm_add_ps(velecsum,velec);
1018 fscal = _mm_andnot_ps(dummy_mask,fscal);
1020 /* Calculate temporary vectorial force */
1021 tx = _mm_mul_ps(fscal,dx32);
1022 ty = _mm_mul_ps(fscal,dy32);
1023 tz = _mm_mul_ps(fscal,dz32);
1025 /* Update vectorial force */
1026 fix3 = _mm_add_ps(fix3,tx);
1027 fiy3 = _mm_add_ps(fiy3,ty);
1028 fiz3 = _mm_add_ps(fiz3,tz);
1030 fjx2 = _mm_add_ps(fjx2,tx);
1031 fjy2 = _mm_add_ps(fjy2,ty);
1032 fjz2 = _mm_add_ps(fjz2,tz);
1034 /**************************
1035 * CALCULATE INTERACTIONS *
1036 **************************/
1038 /* REACTION-FIELD ELECTROSTATICS */
1039 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_add_ps(rinv33,_mm_mul_ps(krf,rsq33)),crf));
1040 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1042 /* Update potential sum for this i atom from the interaction with this j atom. */
1043 velec = _mm_andnot_ps(dummy_mask,velec);
1044 velecsum = _mm_add_ps(velecsum,velec);
1048 fscal = _mm_andnot_ps(dummy_mask,fscal);
1050 /* Calculate temporary vectorial force */
1051 tx = _mm_mul_ps(fscal,dx33);
1052 ty = _mm_mul_ps(fscal,dy33);
1053 tz = _mm_mul_ps(fscal,dz33);
1055 /* Update vectorial force */
1056 fix3 = _mm_add_ps(fix3,tx);
1057 fiy3 = _mm_add_ps(fiy3,ty);
1058 fiz3 = _mm_add_ps(fiz3,tz);
1060 fjx3 = _mm_add_ps(fjx3,tx);
1061 fjy3 = _mm_add_ps(fjy3,ty);
1062 fjz3 = _mm_add_ps(fjz3,tz);
1064 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1065 f+j_coord_offsetC,f+j_coord_offsetD,
1066 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1067 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1069 /* Inner loop uses 348 flops */
1072 /* End of innermost loop */
1074 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1075 f+i_coord_offset,fshift+i_shift_offset);
1078 /* Update potential energies */
1079 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1080 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1082 /* Increment number of inner iterations */
1083 inneriter += j_index_end - j_index_start;
1085 /* Outer loop uses 38 flops */
1088 /* Increment number of outer iterations */
1091 /* Update outer/inner flops */
1093 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*348);
1096 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single
1097 * Electrostatics interaction: ReactionField
1098 * VdW interaction: CubicSplineTable
1099 * Geometry: Water4-Water4
1100 * Calculate force/pot: Force
1103 nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single
1104 (t_nblist * gmx_restrict nlist,
1105 rvec * gmx_restrict xx,
1106 rvec * gmx_restrict ff,
1107 t_forcerec * gmx_restrict fr,
1108 t_mdatoms * gmx_restrict mdatoms,
1109 nb_kernel_data_t * gmx_restrict kernel_data,
1110 t_nrnb * gmx_restrict nrnb)
1112 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1113 * just 0 for non-waters.
1114 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1115 * jnr indices corresponding to data put in the four positions in the SIMD register.
1117 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1118 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1119 int jnrA,jnrB,jnrC,jnrD;
1120 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1121 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1122 real shX,shY,shZ,rcutoff_scalar;
1123 real *shiftvec,*fshift,*x,*f;
1124 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1126 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1128 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1130 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1132 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1133 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1134 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1135 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1136 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1137 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1138 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1139 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1140 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1141 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1142 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1143 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1144 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1145 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1146 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1147 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1148 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1149 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1150 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1151 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1154 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1157 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1158 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1160 __m128i ifour = _mm_set1_epi32(4);
1161 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1163 __m128 dummy_mask,cutoff_mask;
1164 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1165 __m128 one = _mm_set1_ps(1.0);
1166 __m128 two = _mm_set1_ps(2.0);
1172 jindex = nlist->jindex;
1174 shiftidx = nlist->shift;
1176 shiftvec = fr->shift_vec[0];
1177 fshift = fr->fshift[0];
1178 facel = _mm_set1_ps(fr->epsfac);
1179 charge = mdatoms->chargeA;
1180 krf = _mm_set1_ps(fr->ic->k_rf);
1181 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1182 crf = _mm_set1_ps(fr->ic->c_rf);
1183 nvdwtype = fr->ntype;
1184 vdwparam = fr->nbfp;
1185 vdwtype = mdatoms->typeA;
1187 vftab = kernel_data->table_vdw->data;
1188 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
1190 /* Setup water-specific parameters */
1191 inr = nlist->iinr[0];
1192 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1193 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1194 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1195 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1197 jq1 = _mm_set1_ps(charge[inr+1]);
1198 jq2 = _mm_set1_ps(charge[inr+2]);
1199 jq3 = _mm_set1_ps(charge[inr+3]);
1200 vdwjidx0A = 2*vdwtype[inr+0];
1201 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1202 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1203 qq11 = _mm_mul_ps(iq1,jq1);
1204 qq12 = _mm_mul_ps(iq1,jq2);
1205 qq13 = _mm_mul_ps(iq1,jq3);
1206 qq21 = _mm_mul_ps(iq2,jq1);
1207 qq22 = _mm_mul_ps(iq2,jq2);
1208 qq23 = _mm_mul_ps(iq2,jq3);
1209 qq31 = _mm_mul_ps(iq3,jq1);
1210 qq32 = _mm_mul_ps(iq3,jq2);
1211 qq33 = _mm_mul_ps(iq3,jq3);
1213 /* Avoid stupid compiler warnings */
1214 jnrA = jnrB = jnrC = jnrD = 0;
1215 j_coord_offsetA = 0;
1216 j_coord_offsetB = 0;
1217 j_coord_offsetC = 0;
1218 j_coord_offsetD = 0;
1223 /* Start outer loop over neighborlists */
1224 for(iidx=0; iidx<nri; iidx++)
1226 /* Load shift vector for this list */
1227 i_shift_offset = DIM*shiftidx[iidx];
1228 shX = shiftvec[i_shift_offset+XX];
1229 shY = shiftvec[i_shift_offset+YY];
1230 shZ = shiftvec[i_shift_offset+ZZ];
1232 /* Load limits for loop over neighbors */
1233 j_index_start = jindex[iidx];
1234 j_index_end = jindex[iidx+1];
1236 /* Get outer coordinate index */
1238 i_coord_offset = DIM*inr;
1240 /* Load i particle coords and add shift vector */
1241 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1242 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1243 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1244 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1245 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1246 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1247 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1248 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1249 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1250 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1251 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1252 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1254 fix0 = _mm_setzero_ps();
1255 fiy0 = _mm_setzero_ps();
1256 fiz0 = _mm_setzero_ps();
1257 fix1 = _mm_setzero_ps();
1258 fiy1 = _mm_setzero_ps();
1259 fiz1 = _mm_setzero_ps();
1260 fix2 = _mm_setzero_ps();
1261 fiy2 = _mm_setzero_ps();
1262 fiz2 = _mm_setzero_ps();
1263 fix3 = _mm_setzero_ps();
1264 fiy3 = _mm_setzero_ps();
1265 fiz3 = _mm_setzero_ps();
1267 /* Start inner kernel loop */
1268 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1271 /* Get j neighbor index, and coordinate index */
1273 jnrB = jjnr[jidx+1];
1274 jnrC = jjnr[jidx+2];
1275 jnrD = jjnr[jidx+3];
1277 j_coord_offsetA = DIM*jnrA;
1278 j_coord_offsetB = DIM*jnrB;
1279 j_coord_offsetC = DIM*jnrC;
1280 j_coord_offsetD = DIM*jnrD;
1282 /* load j atom coordinates */
1283 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1284 x+j_coord_offsetC,x+j_coord_offsetD,
1285 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1286 &jy2,&jz2,&jx3,&jy3,&jz3);
1288 /* Calculate displacement vector */
1289 dx00 = _mm_sub_ps(ix0,jx0);
1290 dy00 = _mm_sub_ps(iy0,jy0);
1291 dz00 = _mm_sub_ps(iz0,jz0);
1292 dx11 = _mm_sub_ps(ix1,jx1);
1293 dy11 = _mm_sub_ps(iy1,jy1);
1294 dz11 = _mm_sub_ps(iz1,jz1);
1295 dx12 = _mm_sub_ps(ix1,jx2);
1296 dy12 = _mm_sub_ps(iy1,jy2);
1297 dz12 = _mm_sub_ps(iz1,jz2);
1298 dx13 = _mm_sub_ps(ix1,jx3);
1299 dy13 = _mm_sub_ps(iy1,jy3);
1300 dz13 = _mm_sub_ps(iz1,jz3);
1301 dx21 = _mm_sub_ps(ix2,jx1);
1302 dy21 = _mm_sub_ps(iy2,jy1);
1303 dz21 = _mm_sub_ps(iz2,jz1);
1304 dx22 = _mm_sub_ps(ix2,jx2);
1305 dy22 = _mm_sub_ps(iy2,jy2);
1306 dz22 = _mm_sub_ps(iz2,jz2);
1307 dx23 = _mm_sub_ps(ix2,jx3);
1308 dy23 = _mm_sub_ps(iy2,jy3);
1309 dz23 = _mm_sub_ps(iz2,jz3);
1310 dx31 = _mm_sub_ps(ix3,jx1);
1311 dy31 = _mm_sub_ps(iy3,jy1);
1312 dz31 = _mm_sub_ps(iz3,jz1);
1313 dx32 = _mm_sub_ps(ix3,jx2);
1314 dy32 = _mm_sub_ps(iy3,jy2);
1315 dz32 = _mm_sub_ps(iz3,jz2);
1316 dx33 = _mm_sub_ps(ix3,jx3);
1317 dy33 = _mm_sub_ps(iy3,jy3);
1318 dz33 = _mm_sub_ps(iz3,jz3);
1320 /* Calculate squared distance and things based on it */
1321 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1322 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1323 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1324 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1325 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1326 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1327 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1328 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1329 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1330 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1332 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1333 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1334 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1335 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1336 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1337 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1338 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1339 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1340 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1341 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1343 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1344 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1345 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1346 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1347 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1348 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1349 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1350 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1351 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1353 fjx0 = _mm_setzero_ps();
1354 fjy0 = _mm_setzero_ps();
1355 fjz0 = _mm_setzero_ps();
1356 fjx1 = _mm_setzero_ps();
1357 fjy1 = _mm_setzero_ps();
1358 fjz1 = _mm_setzero_ps();
1359 fjx2 = _mm_setzero_ps();
1360 fjy2 = _mm_setzero_ps();
1361 fjz2 = _mm_setzero_ps();
1362 fjx3 = _mm_setzero_ps();
1363 fjy3 = _mm_setzero_ps();
1364 fjz3 = _mm_setzero_ps();
1366 /**************************
1367 * CALCULATE INTERACTIONS *
1368 **************************/
1370 r00 = _mm_mul_ps(rsq00,rinv00);
1372 /* Calculate table index by multiplying r with table scale and truncate to integer */
1373 rt = _mm_mul_ps(r00,vftabscale);
1374 vfitab = _mm_cvttps_epi32(rt);
1375 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1376 vfitab = _mm_slli_epi32(vfitab,3);
1378 /* CUBIC SPLINE TABLE DISPERSION */
1379 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1380 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1381 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1382 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1383 _MM_TRANSPOSE4_PS(Y,F,G,H);
1384 Heps = _mm_mul_ps(vfeps,H);
1385 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1386 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1387 fvdw6 = _mm_mul_ps(c6_00,FF);
1389 /* CUBIC SPLINE TABLE REPULSION */
1390 vfitab = _mm_add_epi32(vfitab,ifour);
1391 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1392 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1393 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1394 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1395 _MM_TRANSPOSE4_PS(Y,F,G,H);
1396 Heps = _mm_mul_ps(vfeps,H);
1397 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1398 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1399 fvdw12 = _mm_mul_ps(c12_00,FF);
1400 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1404 /* Calculate temporary vectorial force */
1405 tx = _mm_mul_ps(fscal,dx00);
1406 ty = _mm_mul_ps(fscal,dy00);
1407 tz = _mm_mul_ps(fscal,dz00);
1409 /* Update vectorial force */
1410 fix0 = _mm_add_ps(fix0,tx);
1411 fiy0 = _mm_add_ps(fiy0,ty);
1412 fiz0 = _mm_add_ps(fiz0,tz);
1414 fjx0 = _mm_add_ps(fjx0,tx);
1415 fjy0 = _mm_add_ps(fjy0,ty);
1416 fjz0 = _mm_add_ps(fjz0,tz);
1418 /**************************
1419 * CALCULATE INTERACTIONS *
1420 **************************/
1422 /* REACTION-FIELD ELECTROSTATICS */
1423 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1427 /* Calculate temporary vectorial force */
1428 tx = _mm_mul_ps(fscal,dx11);
1429 ty = _mm_mul_ps(fscal,dy11);
1430 tz = _mm_mul_ps(fscal,dz11);
1432 /* Update vectorial force */
1433 fix1 = _mm_add_ps(fix1,tx);
1434 fiy1 = _mm_add_ps(fiy1,ty);
1435 fiz1 = _mm_add_ps(fiz1,tz);
1437 fjx1 = _mm_add_ps(fjx1,tx);
1438 fjy1 = _mm_add_ps(fjy1,ty);
1439 fjz1 = _mm_add_ps(fjz1,tz);
1441 /**************************
1442 * CALCULATE INTERACTIONS *
1443 **************************/
1445 /* REACTION-FIELD ELECTROSTATICS */
1446 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1450 /* Calculate temporary vectorial force */
1451 tx = _mm_mul_ps(fscal,dx12);
1452 ty = _mm_mul_ps(fscal,dy12);
1453 tz = _mm_mul_ps(fscal,dz12);
1455 /* Update vectorial force */
1456 fix1 = _mm_add_ps(fix1,tx);
1457 fiy1 = _mm_add_ps(fiy1,ty);
1458 fiz1 = _mm_add_ps(fiz1,tz);
1460 fjx2 = _mm_add_ps(fjx2,tx);
1461 fjy2 = _mm_add_ps(fjy2,ty);
1462 fjz2 = _mm_add_ps(fjz2,tz);
1464 /**************************
1465 * CALCULATE INTERACTIONS *
1466 **************************/
1468 /* REACTION-FIELD ELECTROSTATICS */
1469 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1473 /* Calculate temporary vectorial force */
1474 tx = _mm_mul_ps(fscal,dx13);
1475 ty = _mm_mul_ps(fscal,dy13);
1476 tz = _mm_mul_ps(fscal,dz13);
1478 /* Update vectorial force */
1479 fix1 = _mm_add_ps(fix1,tx);
1480 fiy1 = _mm_add_ps(fiy1,ty);
1481 fiz1 = _mm_add_ps(fiz1,tz);
1483 fjx3 = _mm_add_ps(fjx3,tx);
1484 fjy3 = _mm_add_ps(fjy3,ty);
1485 fjz3 = _mm_add_ps(fjz3,tz);
1487 /**************************
1488 * CALCULATE INTERACTIONS *
1489 **************************/
1491 /* REACTION-FIELD ELECTROSTATICS */
1492 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1496 /* Calculate temporary vectorial force */
1497 tx = _mm_mul_ps(fscal,dx21);
1498 ty = _mm_mul_ps(fscal,dy21);
1499 tz = _mm_mul_ps(fscal,dz21);
1501 /* Update vectorial force */
1502 fix2 = _mm_add_ps(fix2,tx);
1503 fiy2 = _mm_add_ps(fiy2,ty);
1504 fiz2 = _mm_add_ps(fiz2,tz);
1506 fjx1 = _mm_add_ps(fjx1,tx);
1507 fjy1 = _mm_add_ps(fjy1,ty);
1508 fjz1 = _mm_add_ps(fjz1,tz);
1510 /**************************
1511 * CALCULATE INTERACTIONS *
1512 **************************/
1514 /* REACTION-FIELD ELECTROSTATICS */
1515 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1519 /* Calculate temporary vectorial force */
1520 tx = _mm_mul_ps(fscal,dx22);
1521 ty = _mm_mul_ps(fscal,dy22);
1522 tz = _mm_mul_ps(fscal,dz22);
1524 /* Update vectorial force */
1525 fix2 = _mm_add_ps(fix2,tx);
1526 fiy2 = _mm_add_ps(fiy2,ty);
1527 fiz2 = _mm_add_ps(fiz2,tz);
1529 fjx2 = _mm_add_ps(fjx2,tx);
1530 fjy2 = _mm_add_ps(fjy2,ty);
1531 fjz2 = _mm_add_ps(fjz2,tz);
1533 /**************************
1534 * CALCULATE INTERACTIONS *
1535 **************************/
1537 /* REACTION-FIELD ELECTROSTATICS */
1538 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1542 /* Calculate temporary vectorial force */
1543 tx = _mm_mul_ps(fscal,dx23);
1544 ty = _mm_mul_ps(fscal,dy23);
1545 tz = _mm_mul_ps(fscal,dz23);
1547 /* Update vectorial force */
1548 fix2 = _mm_add_ps(fix2,tx);
1549 fiy2 = _mm_add_ps(fiy2,ty);
1550 fiz2 = _mm_add_ps(fiz2,tz);
1552 fjx3 = _mm_add_ps(fjx3,tx);
1553 fjy3 = _mm_add_ps(fjy3,ty);
1554 fjz3 = _mm_add_ps(fjz3,tz);
1556 /**************************
1557 * CALCULATE INTERACTIONS *
1558 **************************/
1560 /* REACTION-FIELD ELECTROSTATICS */
1561 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1565 /* Calculate temporary vectorial force */
1566 tx = _mm_mul_ps(fscal,dx31);
1567 ty = _mm_mul_ps(fscal,dy31);
1568 tz = _mm_mul_ps(fscal,dz31);
1570 /* Update vectorial force */
1571 fix3 = _mm_add_ps(fix3,tx);
1572 fiy3 = _mm_add_ps(fiy3,ty);
1573 fiz3 = _mm_add_ps(fiz3,tz);
1575 fjx1 = _mm_add_ps(fjx1,tx);
1576 fjy1 = _mm_add_ps(fjy1,ty);
1577 fjz1 = _mm_add_ps(fjz1,tz);
1579 /**************************
1580 * CALCULATE INTERACTIONS *
1581 **************************/
1583 /* REACTION-FIELD ELECTROSTATICS */
1584 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1588 /* Calculate temporary vectorial force */
1589 tx = _mm_mul_ps(fscal,dx32);
1590 ty = _mm_mul_ps(fscal,dy32);
1591 tz = _mm_mul_ps(fscal,dz32);
1593 /* Update vectorial force */
1594 fix3 = _mm_add_ps(fix3,tx);
1595 fiy3 = _mm_add_ps(fiy3,ty);
1596 fiz3 = _mm_add_ps(fiz3,tz);
1598 fjx2 = _mm_add_ps(fjx2,tx);
1599 fjy2 = _mm_add_ps(fjy2,ty);
1600 fjz2 = _mm_add_ps(fjz2,tz);
1602 /**************************
1603 * CALCULATE INTERACTIONS *
1604 **************************/
1606 /* REACTION-FIELD ELECTROSTATICS */
1607 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
1611 /* Calculate temporary vectorial force */
1612 tx = _mm_mul_ps(fscal,dx33);
1613 ty = _mm_mul_ps(fscal,dy33);
1614 tz = _mm_mul_ps(fscal,dz33);
1616 /* Update vectorial force */
1617 fix3 = _mm_add_ps(fix3,tx);
1618 fiy3 = _mm_add_ps(fiy3,ty);
1619 fiz3 = _mm_add_ps(fiz3,tz);
1621 fjx3 = _mm_add_ps(fjx3,tx);
1622 fjy3 = _mm_add_ps(fjy3,ty);
1623 fjz3 = _mm_add_ps(fjz3,tz);
1625 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1626 f+j_coord_offsetC,f+j_coord_offsetD,
1627 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1628 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1630 /* Inner loop uses 294 flops */
1633 if(jidx<j_index_end)
1636 /* Get j neighbor index, and coordinate index */
1638 jnrB = jjnr[jidx+1];
1639 jnrC = jjnr[jidx+2];
1640 jnrD = jjnr[jidx+3];
1642 /* Sign of each element will be negative for non-real atoms.
1643 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1644 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1646 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1647 jnrA = (jnrA>=0) ? jnrA : 0;
1648 jnrB = (jnrB>=0) ? jnrB : 0;
1649 jnrC = (jnrC>=0) ? jnrC : 0;
1650 jnrD = (jnrD>=0) ? jnrD : 0;
1652 j_coord_offsetA = DIM*jnrA;
1653 j_coord_offsetB = DIM*jnrB;
1654 j_coord_offsetC = DIM*jnrC;
1655 j_coord_offsetD = DIM*jnrD;
1657 /* load j atom coordinates */
1658 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1659 x+j_coord_offsetC,x+j_coord_offsetD,
1660 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1661 &jy2,&jz2,&jx3,&jy3,&jz3);
1663 /* Calculate displacement vector */
1664 dx00 = _mm_sub_ps(ix0,jx0);
1665 dy00 = _mm_sub_ps(iy0,jy0);
1666 dz00 = _mm_sub_ps(iz0,jz0);
1667 dx11 = _mm_sub_ps(ix1,jx1);
1668 dy11 = _mm_sub_ps(iy1,jy1);
1669 dz11 = _mm_sub_ps(iz1,jz1);
1670 dx12 = _mm_sub_ps(ix1,jx2);
1671 dy12 = _mm_sub_ps(iy1,jy2);
1672 dz12 = _mm_sub_ps(iz1,jz2);
1673 dx13 = _mm_sub_ps(ix1,jx3);
1674 dy13 = _mm_sub_ps(iy1,jy3);
1675 dz13 = _mm_sub_ps(iz1,jz3);
1676 dx21 = _mm_sub_ps(ix2,jx1);
1677 dy21 = _mm_sub_ps(iy2,jy1);
1678 dz21 = _mm_sub_ps(iz2,jz1);
1679 dx22 = _mm_sub_ps(ix2,jx2);
1680 dy22 = _mm_sub_ps(iy2,jy2);
1681 dz22 = _mm_sub_ps(iz2,jz2);
1682 dx23 = _mm_sub_ps(ix2,jx3);
1683 dy23 = _mm_sub_ps(iy2,jy3);
1684 dz23 = _mm_sub_ps(iz2,jz3);
1685 dx31 = _mm_sub_ps(ix3,jx1);
1686 dy31 = _mm_sub_ps(iy3,jy1);
1687 dz31 = _mm_sub_ps(iz3,jz1);
1688 dx32 = _mm_sub_ps(ix3,jx2);
1689 dy32 = _mm_sub_ps(iy3,jy2);
1690 dz32 = _mm_sub_ps(iz3,jz2);
1691 dx33 = _mm_sub_ps(ix3,jx3);
1692 dy33 = _mm_sub_ps(iy3,jy3);
1693 dz33 = _mm_sub_ps(iz3,jz3);
1695 /* Calculate squared distance and things based on it */
1696 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1697 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1698 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1699 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1700 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1701 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1702 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1703 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1704 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1705 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1707 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1708 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1709 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1710 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1711 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1712 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1713 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1714 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1715 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1716 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1718 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1719 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1720 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1721 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1722 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1723 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1724 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1725 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1726 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1728 fjx0 = _mm_setzero_ps();
1729 fjy0 = _mm_setzero_ps();
1730 fjz0 = _mm_setzero_ps();
1731 fjx1 = _mm_setzero_ps();
1732 fjy1 = _mm_setzero_ps();
1733 fjz1 = _mm_setzero_ps();
1734 fjx2 = _mm_setzero_ps();
1735 fjy2 = _mm_setzero_ps();
1736 fjz2 = _mm_setzero_ps();
1737 fjx3 = _mm_setzero_ps();
1738 fjy3 = _mm_setzero_ps();
1739 fjz3 = _mm_setzero_ps();
1741 /**************************
1742 * CALCULATE INTERACTIONS *
1743 **************************/
1745 r00 = _mm_mul_ps(rsq00,rinv00);
1746 r00 = _mm_andnot_ps(dummy_mask,r00);
1748 /* Calculate table index by multiplying r with table scale and truncate to integer */
1749 rt = _mm_mul_ps(r00,vftabscale);
1750 vfitab = _mm_cvttps_epi32(rt);
1751 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1752 vfitab = _mm_slli_epi32(vfitab,3);
1754 /* CUBIC SPLINE TABLE DISPERSION */
1755 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1756 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1757 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1758 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1759 _MM_TRANSPOSE4_PS(Y,F,G,H);
1760 Heps = _mm_mul_ps(vfeps,H);
1761 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1762 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1763 fvdw6 = _mm_mul_ps(c6_00,FF);
1765 /* CUBIC SPLINE TABLE REPULSION */
1766 vfitab = _mm_add_epi32(vfitab,ifour);
1767 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1768 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1769 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1770 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1771 _MM_TRANSPOSE4_PS(Y,F,G,H);
1772 Heps = _mm_mul_ps(vfeps,H);
1773 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1774 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1775 fvdw12 = _mm_mul_ps(c12_00,FF);
1776 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1780 fscal = _mm_andnot_ps(dummy_mask,fscal);
1782 /* Calculate temporary vectorial force */
1783 tx = _mm_mul_ps(fscal,dx00);
1784 ty = _mm_mul_ps(fscal,dy00);
1785 tz = _mm_mul_ps(fscal,dz00);
1787 /* Update vectorial force */
1788 fix0 = _mm_add_ps(fix0,tx);
1789 fiy0 = _mm_add_ps(fiy0,ty);
1790 fiz0 = _mm_add_ps(fiz0,tz);
1792 fjx0 = _mm_add_ps(fjx0,tx);
1793 fjy0 = _mm_add_ps(fjy0,ty);
1794 fjz0 = _mm_add_ps(fjz0,tz);
1796 /**************************
1797 * CALCULATE INTERACTIONS *
1798 **************************/
1800 /* REACTION-FIELD ELECTROSTATICS */
1801 felec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_mul_ps(rinv11,rinvsq11),krf2));
1805 fscal = _mm_andnot_ps(dummy_mask,fscal);
1807 /* Calculate temporary vectorial force */
1808 tx = _mm_mul_ps(fscal,dx11);
1809 ty = _mm_mul_ps(fscal,dy11);
1810 tz = _mm_mul_ps(fscal,dz11);
1812 /* Update vectorial force */
1813 fix1 = _mm_add_ps(fix1,tx);
1814 fiy1 = _mm_add_ps(fiy1,ty);
1815 fiz1 = _mm_add_ps(fiz1,tz);
1817 fjx1 = _mm_add_ps(fjx1,tx);
1818 fjy1 = _mm_add_ps(fjy1,ty);
1819 fjz1 = _mm_add_ps(fjz1,tz);
1821 /**************************
1822 * CALCULATE INTERACTIONS *
1823 **************************/
1825 /* REACTION-FIELD ELECTROSTATICS */
1826 felec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_mul_ps(rinv12,rinvsq12),krf2));
1830 fscal = _mm_andnot_ps(dummy_mask,fscal);
1832 /* Calculate temporary vectorial force */
1833 tx = _mm_mul_ps(fscal,dx12);
1834 ty = _mm_mul_ps(fscal,dy12);
1835 tz = _mm_mul_ps(fscal,dz12);
1837 /* Update vectorial force */
1838 fix1 = _mm_add_ps(fix1,tx);
1839 fiy1 = _mm_add_ps(fiy1,ty);
1840 fiz1 = _mm_add_ps(fiz1,tz);
1842 fjx2 = _mm_add_ps(fjx2,tx);
1843 fjy2 = _mm_add_ps(fjy2,ty);
1844 fjz2 = _mm_add_ps(fjz2,tz);
1846 /**************************
1847 * CALCULATE INTERACTIONS *
1848 **************************/
1850 /* REACTION-FIELD ELECTROSTATICS */
1851 felec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_mul_ps(rinv13,rinvsq13),krf2));
1855 fscal = _mm_andnot_ps(dummy_mask,fscal);
1857 /* Calculate temporary vectorial force */
1858 tx = _mm_mul_ps(fscal,dx13);
1859 ty = _mm_mul_ps(fscal,dy13);
1860 tz = _mm_mul_ps(fscal,dz13);
1862 /* Update vectorial force */
1863 fix1 = _mm_add_ps(fix1,tx);
1864 fiy1 = _mm_add_ps(fiy1,ty);
1865 fiz1 = _mm_add_ps(fiz1,tz);
1867 fjx3 = _mm_add_ps(fjx3,tx);
1868 fjy3 = _mm_add_ps(fjy3,ty);
1869 fjz3 = _mm_add_ps(fjz3,tz);
1871 /**************************
1872 * CALCULATE INTERACTIONS *
1873 **************************/
1875 /* REACTION-FIELD ELECTROSTATICS */
1876 felec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_mul_ps(rinv21,rinvsq21),krf2));
1880 fscal = _mm_andnot_ps(dummy_mask,fscal);
1882 /* Calculate temporary vectorial force */
1883 tx = _mm_mul_ps(fscal,dx21);
1884 ty = _mm_mul_ps(fscal,dy21);
1885 tz = _mm_mul_ps(fscal,dz21);
1887 /* Update vectorial force */
1888 fix2 = _mm_add_ps(fix2,tx);
1889 fiy2 = _mm_add_ps(fiy2,ty);
1890 fiz2 = _mm_add_ps(fiz2,tz);
1892 fjx1 = _mm_add_ps(fjx1,tx);
1893 fjy1 = _mm_add_ps(fjy1,ty);
1894 fjz1 = _mm_add_ps(fjz1,tz);
1896 /**************************
1897 * CALCULATE INTERACTIONS *
1898 **************************/
1900 /* REACTION-FIELD ELECTROSTATICS */
1901 felec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_mul_ps(rinv22,rinvsq22),krf2));
1905 fscal = _mm_andnot_ps(dummy_mask,fscal);
1907 /* Calculate temporary vectorial force */
1908 tx = _mm_mul_ps(fscal,dx22);
1909 ty = _mm_mul_ps(fscal,dy22);
1910 tz = _mm_mul_ps(fscal,dz22);
1912 /* Update vectorial force */
1913 fix2 = _mm_add_ps(fix2,tx);
1914 fiy2 = _mm_add_ps(fiy2,ty);
1915 fiz2 = _mm_add_ps(fiz2,tz);
1917 fjx2 = _mm_add_ps(fjx2,tx);
1918 fjy2 = _mm_add_ps(fjy2,ty);
1919 fjz2 = _mm_add_ps(fjz2,tz);
1921 /**************************
1922 * CALCULATE INTERACTIONS *
1923 **************************/
1925 /* REACTION-FIELD ELECTROSTATICS */
1926 felec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_mul_ps(rinv23,rinvsq23),krf2));
1930 fscal = _mm_andnot_ps(dummy_mask,fscal);
1932 /* Calculate temporary vectorial force */
1933 tx = _mm_mul_ps(fscal,dx23);
1934 ty = _mm_mul_ps(fscal,dy23);
1935 tz = _mm_mul_ps(fscal,dz23);
1937 /* Update vectorial force */
1938 fix2 = _mm_add_ps(fix2,tx);
1939 fiy2 = _mm_add_ps(fiy2,ty);
1940 fiz2 = _mm_add_ps(fiz2,tz);
1942 fjx3 = _mm_add_ps(fjx3,tx);
1943 fjy3 = _mm_add_ps(fjy3,ty);
1944 fjz3 = _mm_add_ps(fjz3,tz);
1946 /**************************
1947 * CALCULATE INTERACTIONS *
1948 **************************/
1950 /* REACTION-FIELD ELECTROSTATICS */
1951 felec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_mul_ps(rinv31,rinvsq31),krf2));
1955 fscal = _mm_andnot_ps(dummy_mask,fscal);
1957 /* Calculate temporary vectorial force */
1958 tx = _mm_mul_ps(fscal,dx31);
1959 ty = _mm_mul_ps(fscal,dy31);
1960 tz = _mm_mul_ps(fscal,dz31);
1962 /* Update vectorial force */
1963 fix3 = _mm_add_ps(fix3,tx);
1964 fiy3 = _mm_add_ps(fiy3,ty);
1965 fiz3 = _mm_add_ps(fiz3,tz);
1967 fjx1 = _mm_add_ps(fjx1,tx);
1968 fjy1 = _mm_add_ps(fjy1,ty);
1969 fjz1 = _mm_add_ps(fjz1,tz);
1971 /**************************
1972 * CALCULATE INTERACTIONS *
1973 **************************/
1975 /* REACTION-FIELD ELECTROSTATICS */
1976 felec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_mul_ps(rinv32,rinvsq32),krf2));
1980 fscal = _mm_andnot_ps(dummy_mask,fscal);
1982 /* Calculate temporary vectorial force */
1983 tx = _mm_mul_ps(fscal,dx32);
1984 ty = _mm_mul_ps(fscal,dy32);
1985 tz = _mm_mul_ps(fscal,dz32);
1987 /* Update vectorial force */
1988 fix3 = _mm_add_ps(fix3,tx);
1989 fiy3 = _mm_add_ps(fiy3,ty);
1990 fiz3 = _mm_add_ps(fiz3,tz);
1992 fjx2 = _mm_add_ps(fjx2,tx);
1993 fjy2 = _mm_add_ps(fjy2,ty);
1994 fjz2 = _mm_add_ps(fjz2,tz);
1996 /**************************
1997 * CALCULATE INTERACTIONS *
1998 **************************/
2000 /* REACTION-FIELD ELECTROSTATICS */
2001 felec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_mul_ps(rinv33,rinvsq33),krf2));
2005 fscal = _mm_andnot_ps(dummy_mask,fscal);
2007 /* Calculate temporary vectorial force */
2008 tx = _mm_mul_ps(fscal,dx33);
2009 ty = _mm_mul_ps(fscal,dy33);
2010 tz = _mm_mul_ps(fscal,dz33);
2012 /* Update vectorial force */
2013 fix3 = _mm_add_ps(fix3,tx);
2014 fiy3 = _mm_add_ps(fiy3,ty);
2015 fiz3 = _mm_add_ps(fiz3,tz);
2017 fjx3 = _mm_add_ps(fjx3,tx);
2018 fjy3 = _mm_add_ps(fjy3,ty);
2019 fjz3 = _mm_add_ps(fjz3,tz);
2021 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2022 f+j_coord_offsetC,f+j_coord_offsetD,
2023 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2024 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2026 /* Inner loop uses 295 flops */
2029 /* End of innermost loop */
2031 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2032 f+i_coord_offset,fshift+i_shift_offset);
2034 /* Increment number of inner iterations */
2035 inneriter += j_index_end - j_index_start;
2037 /* Outer loop uses 36 flops */
2040 /* Increment number of outer iterations */
2043 /* Update outer/inner flops */
2045 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*295);