2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
118 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
120 __m128 dummy_mask,cutoff_mask;
121 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
122 __m128 one = _mm_set1_ps(1.0);
123 __m128 two = _mm_set1_ps(2.0);
129 jindex = nlist->jindex;
131 shiftidx = nlist->shift;
133 shiftvec = fr->shift_vec[0];
134 fshift = fr->fshift[0];
135 facel = _mm_set1_ps(fr->epsfac);
136 charge = mdatoms->chargeA;
137 nvdwtype = fr->ntype;
139 vdwtype = mdatoms->typeA;
141 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
142 ewtab = fr->ic->tabq_coul_FDV0;
143 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
144 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
146 /* Setup water-specific parameters */
147 inr = nlist->iinr[0];
148 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
149 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
150 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
151 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
153 jq1 = _mm_set1_ps(charge[inr+1]);
154 jq2 = _mm_set1_ps(charge[inr+2]);
155 jq3 = _mm_set1_ps(charge[inr+3]);
156 vdwjidx0A = 2*vdwtype[inr+0];
157 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
158 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
159 qq11 = _mm_mul_ps(iq1,jq1);
160 qq12 = _mm_mul_ps(iq1,jq2);
161 qq13 = _mm_mul_ps(iq1,jq3);
162 qq21 = _mm_mul_ps(iq2,jq1);
163 qq22 = _mm_mul_ps(iq2,jq2);
164 qq23 = _mm_mul_ps(iq2,jq3);
165 qq31 = _mm_mul_ps(iq3,jq1);
166 qq32 = _mm_mul_ps(iq3,jq2);
167 qq33 = _mm_mul_ps(iq3,jq3);
169 /* Avoid stupid compiler warnings */
170 jnrA = jnrB = jnrC = jnrD = 0;
179 for(iidx=0;iidx<4*DIM;iidx++)
184 /* Start outer loop over neighborlists */
185 for(iidx=0; iidx<nri; iidx++)
187 /* Load shift vector for this list */
188 i_shift_offset = DIM*shiftidx[iidx];
190 /* Load limits for loop over neighbors */
191 j_index_start = jindex[iidx];
192 j_index_end = jindex[iidx+1];
194 /* Get outer coordinate index */
196 i_coord_offset = DIM*inr;
198 /* Load i particle coords and add shift vector */
199 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
200 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
202 fix0 = _mm_setzero_ps();
203 fiy0 = _mm_setzero_ps();
204 fiz0 = _mm_setzero_ps();
205 fix1 = _mm_setzero_ps();
206 fiy1 = _mm_setzero_ps();
207 fiz1 = _mm_setzero_ps();
208 fix2 = _mm_setzero_ps();
209 fiy2 = _mm_setzero_ps();
210 fiz2 = _mm_setzero_ps();
211 fix3 = _mm_setzero_ps();
212 fiy3 = _mm_setzero_ps();
213 fiz3 = _mm_setzero_ps();
215 /* Reset potential sums */
216 velecsum = _mm_setzero_ps();
217 vvdwsum = _mm_setzero_ps();
219 /* Start inner kernel loop */
220 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
223 /* Get j neighbor index, and coordinate index */
228 j_coord_offsetA = DIM*jnrA;
229 j_coord_offsetB = DIM*jnrB;
230 j_coord_offsetC = DIM*jnrC;
231 j_coord_offsetD = DIM*jnrD;
233 /* load j atom coordinates */
234 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
235 x+j_coord_offsetC,x+j_coord_offsetD,
236 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
237 &jy2,&jz2,&jx3,&jy3,&jz3);
239 /* Calculate displacement vector */
240 dx00 = _mm_sub_ps(ix0,jx0);
241 dy00 = _mm_sub_ps(iy0,jy0);
242 dz00 = _mm_sub_ps(iz0,jz0);
243 dx11 = _mm_sub_ps(ix1,jx1);
244 dy11 = _mm_sub_ps(iy1,jy1);
245 dz11 = _mm_sub_ps(iz1,jz1);
246 dx12 = _mm_sub_ps(ix1,jx2);
247 dy12 = _mm_sub_ps(iy1,jy2);
248 dz12 = _mm_sub_ps(iz1,jz2);
249 dx13 = _mm_sub_ps(ix1,jx3);
250 dy13 = _mm_sub_ps(iy1,jy3);
251 dz13 = _mm_sub_ps(iz1,jz3);
252 dx21 = _mm_sub_ps(ix2,jx1);
253 dy21 = _mm_sub_ps(iy2,jy1);
254 dz21 = _mm_sub_ps(iz2,jz1);
255 dx22 = _mm_sub_ps(ix2,jx2);
256 dy22 = _mm_sub_ps(iy2,jy2);
257 dz22 = _mm_sub_ps(iz2,jz2);
258 dx23 = _mm_sub_ps(ix2,jx3);
259 dy23 = _mm_sub_ps(iy2,jy3);
260 dz23 = _mm_sub_ps(iz2,jz3);
261 dx31 = _mm_sub_ps(ix3,jx1);
262 dy31 = _mm_sub_ps(iy3,jy1);
263 dz31 = _mm_sub_ps(iz3,jz1);
264 dx32 = _mm_sub_ps(ix3,jx2);
265 dy32 = _mm_sub_ps(iy3,jy2);
266 dz32 = _mm_sub_ps(iz3,jz2);
267 dx33 = _mm_sub_ps(ix3,jx3);
268 dy33 = _mm_sub_ps(iy3,jy3);
269 dz33 = _mm_sub_ps(iz3,jz3);
271 /* Calculate squared distance and things based on it */
272 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
273 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
274 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
275 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
276 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
277 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
278 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
279 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
280 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
281 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
283 rinv11 = gmx_mm_invsqrt_ps(rsq11);
284 rinv12 = gmx_mm_invsqrt_ps(rsq12);
285 rinv13 = gmx_mm_invsqrt_ps(rsq13);
286 rinv21 = gmx_mm_invsqrt_ps(rsq21);
287 rinv22 = gmx_mm_invsqrt_ps(rsq22);
288 rinv23 = gmx_mm_invsqrt_ps(rsq23);
289 rinv31 = gmx_mm_invsqrt_ps(rsq31);
290 rinv32 = gmx_mm_invsqrt_ps(rsq32);
291 rinv33 = gmx_mm_invsqrt_ps(rsq33);
293 rinvsq00 = gmx_mm_inv_ps(rsq00);
294 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
295 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
296 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
297 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
298 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
299 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
300 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
301 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
302 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
304 fjx0 = _mm_setzero_ps();
305 fjy0 = _mm_setzero_ps();
306 fjz0 = _mm_setzero_ps();
307 fjx1 = _mm_setzero_ps();
308 fjy1 = _mm_setzero_ps();
309 fjz1 = _mm_setzero_ps();
310 fjx2 = _mm_setzero_ps();
311 fjy2 = _mm_setzero_ps();
312 fjz2 = _mm_setzero_ps();
313 fjx3 = _mm_setzero_ps();
314 fjy3 = _mm_setzero_ps();
315 fjz3 = _mm_setzero_ps();
317 /**************************
318 * CALCULATE INTERACTIONS *
319 **************************/
321 /* LENNARD-JONES DISPERSION/REPULSION */
323 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
324 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
325 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
326 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
327 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
329 /* Update potential sum for this i atom from the interaction with this j atom. */
330 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
334 /* Calculate temporary vectorial force */
335 tx = _mm_mul_ps(fscal,dx00);
336 ty = _mm_mul_ps(fscal,dy00);
337 tz = _mm_mul_ps(fscal,dz00);
339 /* Update vectorial force */
340 fix0 = _mm_add_ps(fix0,tx);
341 fiy0 = _mm_add_ps(fiy0,ty);
342 fiz0 = _mm_add_ps(fiz0,tz);
344 fjx0 = _mm_add_ps(fjx0,tx);
345 fjy0 = _mm_add_ps(fjy0,ty);
346 fjz0 = _mm_add_ps(fjz0,tz);
348 /**************************
349 * CALCULATE INTERACTIONS *
350 **************************/
352 r11 = _mm_mul_ps(rsq11,rinv11);
354 /* EWALD ELECTROSTATICS */
356 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
357 ewrt = _mm_mul_ps(r11,ewtabscale);
358 ewitab = _mm_cvttps_epi32(ewrt);
359 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
360 ewitab = _mm_slli_epi32(ewitab,2);
361 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
362 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
363 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
364 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
365 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
366 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
367 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
368 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
369 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
371 /* Update potential sum for this i atom from the interaction with this j atom. */
372 velecsum = _mm_add_ps(velecsum,velec);
376 /* Calculate temporary vectorial force */
377 tx = _mm_mul_ps(fscal,dx11);
378 ty = _mm_mul_ps(fscal,dy11);
379 tz = _mm_mul_ps(fscal,dz11);
381 /* Update vectorial force */
382 fix1 = _mm_add_ps(fix1,tx);
383 fiy1 = _mm_add_ps(fiy1,ty);
384 fiz1 = _mm_add_ps(fiz1,tz);
386 fjx1 = _mm_add_ps(fjx1,tx);
387 fjy1 = _mm_add_ps(fjy1,ty);
388 fjz1 = _mm_add_ps(fjz1,tz);
390 /**************************
391 * CALCULATE INTERACTIONS *
392 **************************/
394 r12 = _mm_mul_ps(rsq12,rinv12);
396 /* EWALD ELECTROSTATICS */
398 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
399 ewrt = _mm_mul_ps(r12,ewtabscale);
400 ewitab = _mm_cvttps_epi32(ewrt);
401 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
402 ewitab = _mm_slli_epi32(ewitab,2);
403 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
404 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
405 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
406 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
407 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
408 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
409 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
410 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
411 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velecsum = _mm_add_ps(velecsum,velec);
418 /* Calculate temporary vectorial force */
419 tx = _mm_mul_ps(fscal,dx12);
420 ty = _mm_mul_ps(fscal,dy12);
421 tz = _mm_mul_ps(fscal,dz12);
423 /* Update vectorial force */
424 fix1 = _mm_add_ps(fix1,tx);
425 fiy1 = _mm_add_ps(fiy1,ty);
426 fiz1 = _mm_add_ps(fiz1,tz);
428 fjx2 = _mm_add_ps(fjx2,tx);
429 fjy2 = _mm_add_ps(fjy2,ty);
430 fjz2 = _mm_add_ps(fjz2,tz);
432 /**************************
433 * CALCULATE INTERACTIONS *
434 **************************/
436 r13 = _mm_mul_ps(rsq13,rinv13);
438 /* EWALD ELECTROSTATICS */
440 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
441 ewrt = _mm_mul_ps(r13,ewtabscale);
442 ewitab = _mm_cvttps_epi32(ewrt);
443 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
444 ewitab = _mm_slli_epi32(ewitab,2);
445 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
446 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
447 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
448 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
449 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
450 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
451 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
452 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
453 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
455 /* Update potential sum for this i atom from the interaction with this j atom. */
456 velecsum = _mm_add_ps(velecsum,velec);
460 /* Calculate temporary vectorial force */
461 tx = _mm_mul_ps(fscal,dx13);
462 ty = _mm_mul_ps(fscal,dy13);
463 tz = _mm_mul_ps(fscal,dz13);
465 /* Update vectorial force */
466 fix1 = _mm_add_ps(fix1,tx);
467 fiy1 = _mm_add_ps(fiy1,ty);
468 fiz1 = _mm_add_ps(fiz1,tz);
470 fjx3 = _mm_add_ps(fjx3,tx);
471 fjy3 = _mm_add_ps(fjy3,ty);
472 fjz3 = _mm_add_ps(fjz3,tz);
474 /**************************
475 * CALCULATE INTERACTIONS *
476 **************************/
478 r21 = _mm_mul_ps(rsq21,rinv21);
480 /* EWALD ELECTROSTATICS */
482 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
483 ewrt = _mm_mul_ps(r21,ewtabscale);
484 ewitab = _mm_cvttps_epi32(ewrt);
485 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
486 ewitab = _mm_slli_epi32(ewitab,2);
487 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
488 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
489 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
490 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
491 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
492 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
493 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
494 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
495 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
497 /* Update potential sum for this i atom from the interaction with this j atom. */
498 velecsum = _mm_add_ps(velecsum,velec);
502 /* Calculate temporary vectorial force */
503 tx = _mm_mul_ps(fscal,dx21);
504 ty = _mm_mul_ps(fscal,dy21);
505 tz = _mm_mul_ps(fscal,dz21);
507 /* Update vectorial force */
508 fix2 = _mm_add_ps(fix2,tx);
509 fiy2 = _mm_add_ps(fiy2,ty);
510 fiz2 = _mm_add_ps(fiz2,tz);
512 fjx1 = _mm_add_ps(fjx1,tx);
513 fjy1 = _mm_add_ps(fjy1,ty);
514 fjz1 = _mm_add_ps(fjz1,tz);
516 /**************************
517 * CALCULATE INTERACTIONS *
518 **************************/
520 r22 = _mm_mul_ps(rsq22,rinv22);
522 /* EWALD ELECTROSTATICS */
524 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
525 ewrt = _mm_mul_ps(r22,ewtabscale);
526 ewitab = _mm_cvttps_epi32(ewrt);
527 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
528 ewitab = _mm_slli_epi32(ewitab,2);
529 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
530 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
531 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
532 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
533 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
534 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
535 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
536 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
537 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
539 /* Update potential sum for this i atom from the interaction with this j atom. */
540 velecsum = _mm_add_ps(velecsum,velec);
544 /* Calculate temporary vectorial force */
545 tx = _mm_mul_ps(fscal,dx22);
546 ty = _mm_mul_ps(fscal,dy22);
547 tz = _mm_mul_ps(fscal,dz22);
549 /* Update vectorial force */
550 fix2 = _mm_add_ps(fix2,tx);
551 fiy2 = _mm_add_ps(fiy2,ty);
552 fiz2 = _mm_add_ps(fiz2,tz);
554 fjx2 = _mm_add_ps(fjx2,tx);
555 fjy2 = _mm_add_ps(fjy2,ty);
556 fjz2 = _mm_add_ps(fjz2,tz);
558 /**************************
559 * CALCULATE INTERACTIONS *
560 **************************/
562 r23 = _mm_mul_ps(rsq23,rinv23);
564 /* EWALD ELECTROSTATICS */
566 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
567 ewrt = _mm_mul_ps(r23,ewtabscale);
568 ewitab = _mm_cvttps_epi32(ewrt);
569 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
570 ewitab = _mm_slli_epi32(ewitab,2);
571 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
572 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
573 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
574 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
575 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
576 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
577 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
578 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
579 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
581 /* Update potential sum for this i atom from the interaction with this j atom. */
582 velecsum = _mm_add_ps(velecsum,velec);
586 /* Calculate temporary vectorial force */
587 tx = _mm_mul_ps(fscal,dx23);
588 ty = _mm_mul_ps(fscal,dy23);
589 tz = _mm_mul_ps(fscal,dz23);
591 /* Update vectorial force */
592 fix2 = _mm_add_ps(fix2,tx);
593 fiy2 = _mm_add_ps(fiy2,ty);
594 fiz2 = _mm_add_ps(fiz2,tz);
596 fjx3 = _mm_add_ps(fjx3,tx);
597 fjy3 = _mm_add_ps(fjy3,ty);
598 fjz3 = _mm_add_ps(fjz3,tz);
600 /**************************
601 * CALCULATE INTERACTIONS *
602 **************************/
604 r31 = _mm_mul_ps(rsq31,rinv31);
606 /* EWALD ELECTROSTATICS */
608 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
609 ewrt = _mm_mul_ps(r31,ewtabscale);
610 ewitab = _mm_cvttps_epi32(ewrt);
611 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
612 ewitab = _mm_slli_epi32(ewitab,2);
613 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
614 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
615 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
616 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
617 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
618 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
619 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
620 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
621 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
623 /* Update potential sum for this i atom from the interaction with this j atom. */
624 velecsum = _mm_add_ps(velecsum,velec);
628 /* Calculate temporary vectorial force */
629 tx = _mm_mul_ps(fscal,dx31);
630 ty = _mm_mul_ps(fscal,dy31);
631 tz = _mm_mul_ps(fscal,dz31);
633 /* Update vectorial force */
634 fix3 = _mm_add_ps(fix3,tx);
635 fiy3 = _mm_add_ps(fiy3,ty);
636 fiz3 = _mm_add_ps(fiz3,tz);
638 fjx1 = _mm_add_ps(fjx1,tx);
639 fjy1 = _mm_add_ps(fjy1,ty);
640 fjz1 = _mm_add_ps(fjz1,tz);
642 /**************************
643 * CALCULATE INTERACTIONS *
644 **************************/
646 r32 = _mm_mul_ps(rsq32,rinv32);
648 /* EWALD ELECTROSTATICS */
650 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
651 ewrt = _mm_mul_ps(r32,ewtabscale);
652 ewitab = _mm_cvttps_epi32(ewrt);
653 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
654 ewitab = _mm_slli_epi32(ewitab,2);
655 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
656 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
657 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
658 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
659 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
660 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
661 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
662 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
663 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
665 /* Update potential sum for this i atom from the interaction with this j atom. */
666 velecsum = _mm_add_ps(velecsum,velec);
670 /* Calculate temporary vectorial force */
671 tx = _mm_mul_ps(fscal,dx32);
672 ty = _mm_mul_ps(fscal,dy32);
673 tz = _mm_mul_ps(fscal,dz32);
675 /* Update vectorial force */
676 fix3 = _mm_add_ps(fix3,tx);
677 fiy3 = _mm_add_ps(fiy3,ty);
678 fiz3 = _mm_add_ps(fiz3,tz);
680 fjx2 = _mm_add_ps(fjx2,tx);
681 fjy2 = _mm_add_ps(fjy2,ty);
682 fjz2 = _mm_add_ps(fjz2,tz);
684 /**************************
685 * CALCULATE INTERACTIONS *
686 **************************/
688 r33 = _mm_mul_ps(rsq33,rinv33);
690 /* EWALD ELECTROSTATICS */
692 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
693 ewrt = _mm_mul_ps(r33,ewtabscale);
694 ewitab = _mm_cvttps_epi32(ewrt);
695 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
696 ewitab = _mm_slli_epi32(ewitab,2);
697 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
698 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
699 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
700 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
701 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
702 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
703 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
704 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
705 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
707 /* Update potential sum for this i atom from the interaction with this j atom. */
708 velecsum = _mm_add_ps(velecsum,velec);
712 /* Calculate temporary vectorial force */
713 tx = _mm_mul_ps(fscal,dx33);
714 ty = _mm_mul_ps(fscal,dy33);
715 tz = _mm_mul_ps(fscal,dz33);
717 /* Update vectorial force */
718 fix3 = _mm_add_ps(fix3,tx);
719 fiy3 = _mm_add_ps(fiy3,ty);
720 fiz3 = _mm_add_ps(fiz3,tz);
722 fjx3 = _mm_add_ps(fjx3,tx);
723 fjy3 = _mm_add_ps(fjy3,ty);
724 fjz3 = _mm_add_ps(fjz3,tz);
726 fjptrA = f+j_coord_offsetA;
727 fjptrB = f+j_coord_offsetB;
728 fjptrC = f+j_coord_offsetC;
729 fjptrD = f+j_coord_offsetD;
731 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
732 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
733 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
735 /* Inner loop uses 404 flops */
741 /* Get j neighbor index, and coordinate index */
742 jnrlistA = jjnr[jidx];
743 jnrlistB = jjnr[jidx+1];
744 jnrlistC = jjnr[jidx+2];
745 jnrlistD = jjnr[jidx+3];
746 /* Sign of each element will be negative for non-real atoms.
747 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
748 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
750 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
751 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
752 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
753 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
754 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
755 j_coord_offsetA = DIM*jnrA;
756 j_coord_offsetB = DIM*jnrB;
757 j_coord_offsetC = DIM*jnrC;
758 j_coord_offsetD = DIM*jnrD;
760 /* load j atom coordinates */
761 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
762 x+j_coord_offsetC,x+j_coord_offsetD,
763 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
764 &jy2,&jz2,&jx3,&jy3,&jz3);
766 /* Calculate displacement vector */
767 dx00 = _mm_sub_ps(ix0,jx0);
768 dy00 = _mm_sub_ps(iy0,jy0);
769 dz00 = _mm_sub_ps(iz0,jz0);
770 dx11 = _mm_sub_ps(ix1,jx1);
771 dy11 = _mm_sub_ps(iy1,jy1);
772 dz11 = _mm_sub_ps(iz1,jz1);
773 dx12 = _mm_sub_ps(ix1,jx2);
774 dy12 = _mm_sub_ps(iy1,jy2);
775 dz12 = _mm_sub_ps(iz1,jz2);
776 dx13 = _mm_sub_ps(ix1,jx3);
777 dy13 = _mm_sub_ps(iy1,jy3);
778 dz13 = _mm_sub_ps(iz1,jz3);
779 dx21 = _mm_sub_ps(ix2,jx1);
780 dy21 = _mm_sub_ps(iy2,jy1);
781 dz21 = _mm_sub_ps(iz2,jz1);
782 dx22 = _mm_sub_ps(ix2,jx2);
783 dy22 = _mm_sub_ps(iy2,jy2);
784 dz22 = _mm_sub_ps(iz2,jz2);
785 dx23 = _mm_sub_ps(ix2,jx3);
786 dy23 = _mm_sub_ps(iy2,jy3);
787 dz23 = _mm_sub_ps(iz2,jz3);
788 dx31 = _mm_sub_ps(ix3,jx1);
789 dy31 = _mm_sub_ps(iy3,jy1);
790 dz31 = _mm_sub_ps(iz3,jz1);
791 dx32 = _mm_sub_ps(ix3,jx2);
792 dy32 = _mm_sub_ps(iy3,jy2);
793 dz32 = _mm_sub_ps(iz3,jz2);
794 dx33 = _mm_sub_ps(ix3,jx3);
795 dy33 = _mm_sub_ps(iy3,jy3);
796 dz33 = _mm_sub_ps(iz3,jz3);
798 /* Calculate squared distance and things based on it */
799 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
800 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
801 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
802 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
803 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
804 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
805 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
806 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
807 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
808 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
810 rinv11 = gmx_mm_invsqrt_ps(rsq11);
811 rinv12 = gmx_mm_invsqrt_ps(rsq12);
812 rinv13 = gmx_mm_invsqrt_ps(rsq13);
813 rinv21 = gmx_mm_invsqrt_ps(rsq21);
814 rinv22 = gmx_mm_invsqrt_ps(rsq22);
815 rinv23 = gmx_mm_invsqrt_ps(rsq23);
816 rinv31 = gmx_mm_invsqrt_ps(rsq31);
817 rinv32 = gmx_mm_invsqrt_ps(rsq32);
818 rinv33 = gmx_mm_invsqrt_ps(rsq33);
820 rinvsq00 = gmx_mm_inv_ps(rsq00);
821 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
822 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
823 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
824 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
825 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
826 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
827 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
828 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
829 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
831 fjx0 = _mm_setzero_ps();
832 fjy0 = _mm_setzero_ps();
833 fjz0 = _mm_setzero_ps();
834 fjx1 = _mm_setzero_ps();
835 fjy1 = _mm_setzero_ps();
836 fjz1 = _mm_setzero_ps();
837 fjx2 = _mm_setzero_ps();
838 fjy2 = _mm_setzero_ps();
839 fjz2 = _mm_setzero_ps();
840 fjx3 = _mm_setzero_ps();
841 fjy3 = _mm_setzero_ps();
842 fjz3 = _mm_setzero_ps();
844 /**************************
845 * CALCULATE INTERACTIONS *
846 **************************/
848 /* LENNARD-JONES DISPERSION/REPULSION */
850 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
851 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
852 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
853 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
854 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
856 /* Update potential sum for this i atom from the interaction with this j atom. */
857 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
858 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
862 fscal = _mm_andnot_ps(dummy_mask,fscal);
864 /* Calculate temporary vectorial force */
865 tx = _mm_mul_ps(fscal,dx00);
866 ty = _mm_mul_ps(fscal,dy00);
867 tz = _mm_mul_ps(fscal,dz00);
869 /* Update vectorial force */
870 fix0 = _mm_add_ps(fix0,tx);
871 fiy0 = _mm_add_ps(fiy0,ty);
872 fiz0 = _mm_add_ps(fiz0,tz);
874 fjx0 = _mm_add_ps(fjx0,tx);
875 fjy0 = _mm_add_ps(fjy0,ty);
876 fjz0 = _mm_add_ps(fjz0,tz);
878 /**************************
879 * CALCULATE INTERACTIONS *
880 **************************/
882 r11 = _mm_mul_ps(rsq11,rinv11);
883 r11 = _mm_andnot_ps(dummy_mask,r11);
885 /* EWALD ELECTROSTATICS */
887 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
888 ewrt = _mm_mul_ps(r11,ewtabscale);
889 ewitab = _mm_cvttps_epi32(ewrt);
890 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
891 ewitab = _mm_slli_epi32(ewitab,2);
892 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
893 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
894 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
895 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
896 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
897 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
898 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
899 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
900 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
902 /* Update potential sum for this i atom from the interaction with this j atom. */
903 velec = _mm_andnot_ps(dummy_mask,velec);
904 velecsum = _mm_add_ps(velecsum,velec);
908 fscal = _mm_andnot_ps(dummy_mask,fscal);
910 /* Calculate temporary vectorial force */
911 tx = _mm_mul_ps(fscal,dx11);
912 ty = _mm_mul_ps(fscal,dy11);
913 tz = _mm_mul_ps(fscal,dz11);
915 /* Update vectorial force */
916 fix1 = _mm_add_ps(fix1,tx);
917 fiy1 = _mm_add_ps(fiy1,ty);
918 fiz1 = _mm_add_ps(fiz1,tz);
920 fjx1 = _mm_add_ps(fjx1,tx);
921 fjy1 = _mm_add_ps(fjy1,ty);
922 fjz1 = _mm_add_ps(fjz1,tz);
924 /**************************
925 * CALCULATE INTERACTIONS *
926 **************************/
928 r12 = _mm_mul_ps(rsq12,rinv12);
929 r12 = _mm_andnot_ps(dummy_mask,r12);
931 /* EWALD ELECTROSTATICS */
933 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
934 ewrt = _mm_mul_ps(r12,ewtabscale);
935 ewitab = _mm_cvttps_epi32(ewrt);
936 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
937 ewitab = _mm_slli_epi32(ewitab,2);
938 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
939 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
940 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
941 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
942 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
943 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
944 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
945 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
946 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
948 /* Update potential sum for this i atom from the interaction with this j atom. */
949 velec = _mm_andnot_ps(dummy_mask,velec);
950 velecsum = _mm_add_ps(velecsum,velec);
954 fscal = _mm_andnot_ps(dummy_mask,fscal);
956 /* Calculate temporary vectorial force */
957 tx = _mm_mul_ps(fscal,dx12);
958 ty = _mm_mul_ps(fscal,dy12);
959 tz = _mm_mul_ps(fscal,dz12);
961 /* Update vectorial force */
962 fix1 = _mm_add_ps(fix1,tx);
963 fiy1 = _mm_add_ps(fiy1,ty);
964 fiz1 = _mm_add_ps(fiz1,tz);
966 fjx2 = _mm_add_ps(fjx2,tx);
967 fjy2 = _mm_add_ps(fjy2,ty);
968 fjz2 = _mm_add_ps(fjz2,tz);
970 /**************************
971 * CALCULATE INTERACTIONS *
972 **************************/
974 r13 = _mm_mul_ps(rsq13,rinv13);
975 r13 = _mm_andnot_ps(dummy_mask,r13);
977 /* EWALD ELECTROSTATICS */
979 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
980 ewrt = _mm_mul_ps(r13,ewtabscale);
981 ewitab = _mm_cvttps_epi32(ewrt);
982 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
983 ewitab = _mm_slli_epi32(ewitab,2);
984 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
985 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
986 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
987 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
988 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
989 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
990 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
991 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
992 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
994 /* Update potential sum for this i atom from the interaction with this j atom. */
995 velec = _mm_andnot_ps(dummy_mask,velec);
996 velecsum = _mm_add_ps(velecsum,velec);
1000 fscal = _mm_andnot_ps(dummy_mask,fscal);
1002 /* Calculate temporary vectorial force */
1003 tx = _mm_mul_ps(fscal,dx13);
1004 ty = _mm_mul_ps(fscal,dy13);
1005 tz = _mm_mul_ps(fscal,dz13);
1007 /* Update vectorial force */
1008 fix1 = _mm_add_ps(fix1,tx);
1009 fiy1 = _mm_add_ps(fiy1,ty);
1010 fiz1 = _mm_add_ps(fiz1,tz);
1012 fjx3 = _mm_add_ps(fjx3,tx);
1013 fjy3 = _mm_add_ps(fjy3,ty);
1014 fjz3 = _mm_add_ps(fjz3,tz);
1016 /**************************
1017 * CALCULATE INTERACTIONS *
1018 **************************/
1020 r21 = _mm_mul_ps(rsq21,rinv21);
1021 r21 = _mm_andnot_ps(dummy_mask,r21);
1023 /* EWALD ELECTROSTATICS */
1025 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1026 ewrt = _mm_mul_ps(r21,ewtabscale);
1027 ewitab = _mm_cvttps_epi32(ewrt);
1028 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1029 ewitab = _mm_slli_epi32(ewitab,2);
1030 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1031 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1032 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1033 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1034 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1035 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1036 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1037 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1038 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1040 /* Update potential sum for this i atom from the interaction with this j atom. */
1041 velec = _mm_andnot_ps(dummy_mask,velec);
1042 velecsum = _mm_add_ps(velecsum,velec);
1046 fscal = _mm_andnot_ps(dummy_mask,fscal);
1048 /* Calculate temporary vectorial force */
1049 tx = _mm_mul_ps(fscal,dx21);
1050 ty = _mm_mul_ps(fscal,dy21);
1051 tz = _mm_mul_ps(fscal,dz21);
1053 /* Update vectorial force */
1054 fix2 = _mm_add_ps(fix2,tx);
1055 fiy2 = _mm_add_ps(fiy2,ty);
1056 fiz2 = _mm_add_ps(fiz2,tz);
1058 fjx1 = _mm_add_ps(fjx1,tx);
1059 fjy1 = _mm_add_ps(fjy1,ty);
1060 fjz1 = _mm_add_ps(fjz1,tz);
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 r22 = _mm_mul_ps(rsq22,rinv22);
1067 r22 = _mm_andnot_ps(dummy_mask,r22);
1069 /* EWALD ELECTROSTATICS */
1071 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1072 ewrt = _mm_mul_ps(r22,ewtabscale);
1073 ewitab = _mm_cvttps_epi32(ewrt);
1074 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1075 ewitab = _mm_slli_epi32(ewitab,2);
1076 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1077 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1078 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1079 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1080 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1081 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1082 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1083 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1084 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1086 /* Update potential sum for this i atom from the interaction with this j atom. */
1087 velec = _mm_andnot_ps(dummy_mask,velec);
1088 velecsum = _mm_add_ps(velecsum,velec);
1092 fscal = _mm_andnot_ps(dummy_mask,fscal);
1094 /* Calculate temporary vectorial force */
1095 tx = _mm_mul_ps(fscal,dx22);
1096 ty = _mm_mul_ps(fscal,dy22);
1097 tz = _mm_mul_ps(fscal,dz22);
1099 /* Update vectorial force */
1100 fix2 = _mm_add_ps(fix2,tx);
1101 fiy2 = _mm_add_ps(fiy2,ty);
1102 fiz2 = _mm_add_ps(fiz2,tz);
1104 fjx2 = _mm_add_ps(fjx2,tx);
1105 fjy2 = _mm_add_ps(fjy2,ty);
1106 fjz2 = _mm_add_ps(fjz2,tz);
1108 /**************************
1109 * CALCULATE INTERACTIONS *
1110 **************************/
1112 r23 = _mm_mul_ps(rsq23,rinv23);
1113 r23 = _mm_andnot_ps(dummy_mask,r23);
1115 /* EWALD ELECTROSTATICS */
1117 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1118 ewrt = _mm_mul_ps(r23,ewtabscale);
1119 ewitab = _mm_cvttps_epi32(ewrt);
1120 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1121 ewitab = _mm_slli_epi32(ewitab,2);
1122 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1123 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1124 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1125 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1126 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1127 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1128 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1129 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1130 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1132 /* Update potential sum for this i atom from the interaction with this j atom. */
1133 velec = _mm_andnot_ps(dummy_mask,velec);
1134 velecsum = _mm_add_ps(velecsum,velec);
1138 fscal = _mm_andnot_ps(dummy_mask,fscal);
1140 /* Calculate temporary vectorial force */
1141 tx = _mm_mul_ps(fscal,dx23);
1142 ty = _mm_mul_ps(fscal,dy23);
1143 tz = _mm_mul_ps(fscal,dz23);
1145 /* Update vectorial force */
1146 fix2 = _mm_add_ps(fix2,tx);
1147 fiy2 = _mm_add_ps(fiy2,ty);
1148 fiz2 = _mm_add_ps(fiz2,tz);
1150 fjx3 = _mm_add_ps(fjx3,tx);
1151 fjy3 = _mm_add_ps(fjy3,ty);
1152 fjz3 = _mm_add_ps(fjz3,tz);
1154 /**************************
1155 * CALCULATE INTERACTIONS *
1156 **************************/
1158 r31 = _mm_mul_ps(rsq31,rinv31);
1159 r31 = _mm_andnot_ps(dummy_mask,r31);
1161 /* EWALD ELECTROSTATICS */
1163 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1164 ewrt = _mm_mul_ps(r31,ewtabscale);
1165 ewitab = _mm_cvttps_epi32(ewrt);
1166 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1167 ewitab = _mm_slli_epi32(ewitab,2);
1168 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1169 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1170 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1171 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1172 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1173 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1174 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1175 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1176 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1178 /* Update potential sum for this i atom from the interaction with this j atom. */
1179 velec = _mm_andnot_ps(dummy_mask,velec);
1180 velecsum = _mm_add_ps(velecsum,velec);
1184 fscal = _mm_andnot_ps(dummy_mask,fscal);
1186 /* Calculate temporary vectorial force */
1187 tx = _mm_mul_ps(fscal,dx31);
1188 ty = _mm_mul_ps(fscal,dy31);
1189 tz = _mm_mul_ps(fscal,dz31);
1191 /* Update vectorial force */
1192 fix3 = _mm_add_ps(fix3,tx);
1193 fiy3 = _mm_add_ps(fiy3,ty);
1194 fiz3 = _mm_add_ps(fiz3,tz);
1196 fjx1 = _mm_add_ps(fjx1,tx);
1197 fjy1 = _mm_add_ps(fjy1,ty);
1198 fjz1 = _mm_add_ps(fjz1,tz);
1200 /**************************
1201 * CALCULATE INTERACTIONS *
1202 **************************/
1204 r32 = _mm_mul_ps(rsq32,rinv32);
1205 r32 = _mm_andnot_ps(dummy_mask,r32);
1207 /* EWALD ELECTROSTATICS */
1209 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1210 ewrt = _mm_mul_ps(r32,ewtabscale);
1211 ewitab = _mm_cvttps_epi32(ewrt);
1212 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1213 ewitab = _mm_slli_epi32(ewitab,2);
1214 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1215 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1216 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1217 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1218 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1219 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1220 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1221 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1222 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1224 /* Update potential sum for this i atom from the interaction with this j atom. */
1225 velec = _mm_andnot_ps(dummy_mask,velec);
1226 velecsum = _mm_add_ps(velecsum,velec);
1230 fscal = _mm_andnot_ps(dummy_mask,fscal);
1232 /* Calculate temporary vectorial force */
1233 tx = _mm_mul_ps(fscal,dx32);
1234 ty = _mm_mul_ps(fscal,dy32);
1235 tz = _mm_mul_ps(fscal,dz32);
1237 /* Update vectorial force */
1238 fix3 = _mm_add_ps(fix3,tx);
1239 fiy3 = _mm_add_ps(fiy3,ty);
1240 fiz3 = _mm_add_ps(fiz3,tz);
1242 fjx2 = _mm_add_ps(fjx2,tx);
1243 fjy2 = _mm_add_ps(fjy2,ty);
1244 fjz2 = _mm_add_ps(fjz2,tz);
1246 /**************************
1247 * CALCULATE INTERACTIONS *
1248 **************************/
1250 r33 = _mm_mul_ps(rsq33,rinv33);
1251 r33 = _mm_andnot_ps(dummy_mask,r33);
1253 /* EWALD ELECTROSTATICS */
1255 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1256 ewrt = _mm_mul_ps(r33,ewtabscale);
1257 ewitab = _mm_cvttps_epi32(ewrt);
1258 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1259 ewitab = _mm_slli_epi32(ewitab,2);
1260 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1261 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1262 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1263 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1264 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1265 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1266 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1267 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1268 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1270 /* Update potential sum for this i atom from the interaction with this j atom. */
1271 velec = _mm_andnot_ps(dummy_mask,velec);
1272 velecsum = _mm_add_ps(velecsum,velec);
1276 fscal = _mm_andnot_ps(dummy_mask,fscal);
1278 /* Calculate temporary vectorial force */
1279 tx = _mm_mul_ps(fscal,dx33);
1280 ty = _mm_mul_ps(fscal,dy33);
1281 tz = _mm_mul_ps(fscal,dz33);
1283 /* Update vectorial force */
1284 fix3 = _mm_add_ps(fix3,tx);
1285 fiy3 = _mm_add_ps(fiy3,ty);
1286 fiz3 = _mm_add_ps(fiz3,tz);
1288 fjx3 = _mm_add_ps(fjx3,tx);
1289 fjy3 = _mm_add_ps(fjy3,ty);
1290 fjz3 = _mm_add_ps(fjz3,tz);
1292 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1293 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1294 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1295 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1297 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1298 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1299 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1301 /* Inner loop uses 413 flops */
1304 /* End of innermost loop */
1306 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1307 f+i_coord_offset,fshift+i_shift_offset);
1310 /* Update potential energies */
1311 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1312 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1314 /* Increment number of inner iterations */
1315 inneriter += j_index_end - j_index_start;
1317 /* Outer loop uses 26 flops */
1320 /* Increment number of outer iterations */
1323 /* Update outer/inner flops */
1325 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*413);
1328 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_single
1329 * Electrostatics interaction: Ewald
1330 * VdW interaction: LennardJones
1331 * Geometry: Water4-Water4
1332 * Calculate force/pot: Force
1335 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_single
1336 (t_nblist * gmx_restrict nlist,
1337 rvec * gmx_restrict xx,
1338 rvec * gmx_restrict ff,
1339 t_forcerec * gmx_restrict fr,
1340 t_mdatoms * gmx_restrict mdatoms,
1341 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1342 t_nrnb * gmx_restrict nrnb)
1344 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1345 * just 0 for non-waters.
1346 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1347 * jnr indices corresponding to data put in the four positions in the SIMD register.
1349 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1350 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1351 int jnrA,jnrB,jnrC,jnrD;
1352 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1353 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1354 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1355 real rcutoff_scalar;
1356 real *shiftvec,*fshift,*x,*f;
1357 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1358 real scratch[4*DIM];
1359 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1361 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1363 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1365 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1367 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1368 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1369 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1370 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1371 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1372 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1373 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1374 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1375 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1376 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1377 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1378 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1379 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1380 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1381 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1382 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1383 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1384 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1385 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1386 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1389 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1392 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1393 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1395 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1397 __m128 dummy_mask,cutoff_mask;
1398 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1399 __m128 one = _mm_set1_ps(1.0);
1400 __m128 two = _mm_set1_ps(2.0);
1406 jindex = nlist->jindex;
1408 shiftidx = nlist->shift;
1410 shiftvec = fr->shift_vec[0];
1411 fshift = fr->fshift[0];
1412 facel = _mm_set1_ps(fr->epsfac);
1413 charge = mdatoms->chargeA;
1414 nvdwtype = fr->ntype;
1415 vdwparam = fr->nbfp;
1416 vdwtype = mdatoms->typeA;
1418 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1419 ewtab = fr->ic->tabq_coul_F;
1420 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1421 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1423 /* Setup water-specific parameters */
1424 inr = nlist->iinr[0];
1425 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1426 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1427 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1428 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1430 jq1 = _mm_set1_ps(charge[inr+1]);
1431 jq2 = _mm_set1_ps(charge[inr+2]);
1432 jq3 = _mm_set1_ps(charge[inr+3]);
1433 vdwjidx0A = 2*vdwtype[inr+0];
1434 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1435 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1436 qq11 = _mm_mul_ps(iq1,jq1);
1437 qq12 = _mm_mul_ps(iq1,jq2);
1438 qq13 = _mm_mul_ps(iq1,jq3);
1439 qq21 = _mm_mul_ps(iq2,jq1);
1440 qq22 = _mm_mul_ps(iq2,jq2);
1441 qq23 = _mm_mul_ps(iq2,jq3);
1442 qq31 = _mm_mul_ps(iq3,jq1);
1443 qq32 = _mm_mul_ps(iq3,jq2);
1444 qq33 = _mm_mul_ps(iq3,jq3);
1446 /* Avoid stupid compiler warnings */
1447 jnrA = jnrB = jnrC = jnrD = 0;
1448 j_coord_offsetA = 0;
1449 j_coord_offsetB = 0;
1450 j_coord_offsetC = 0;
1451 j_coord_offsetD = 0;
1456 for(iidx=0;iidx<4*DIM;iidx++)
1458 scratch[iidx] = 0.0;
1461 /* Start outer loop over neighborlists */
1462 for(iidx=0; iidx<nri; iidx++)
1464 /* Load shift vector for this list */
1465 i_shift_offset = DIM*shiftidx[iidx];
1467 /* Load limits for loop over neighbors */
1468 j_index_start = jindex[iidx];
1469 j_index_end = jindex[iidx+1];
1471 /* Get outer coordinate index */
1473 i_coord_offset = DIM*inr;
1475 /* Load i particle coords and add shift vector */
1476 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1477 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1479 fix0 = _mm_setzero_ps();
1480 fiy0 = _mm_setzero_ps();
1481 fiz0 = _mm_setzero_ps();
1482 fix1 = _mm_setzero_ps();
1483 fiy1 = _mm_setzero_ps();
1484 fiz1 = _mm_setzero_ps();
1485 fix2 = _mm_setzero_ps();
1486 fiy2 = _mm_setzero_ps();
1487 fiz2 = _mm_setzero_ps();
1488 fix3 = _mm_setzero_ps();
1489 fiy3 = _mm_setzero_ps();
1490 fiz3 = _mm_setzero_ps();
1492 /* Start inner kernel loop */
1493 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1496 /* Get j neighbor index, and coordinate index */
1498 jnrB = jjnr[jidx+1];
1499 jnrC = jjnr[jidx+2];
1500 jnrD = jjnr[jidx+3];
1501 j_coord_offsetA = DIM*jnrA;
1502 j_coord_offsetB = DIM*jnrB;
1503 j_coord_offsetC = DIM*jnrC;
1504 j_coord_offsetD = DIM*jnrD;
1506 /* load j atom coordinates */
1507 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1508 x+j_coord_offsetC,x+j_coord_offsetD,
1509 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1510 &jy2,&jz2,&jx3,&jy3,&jz3);
1512 /* Calculate displacement vector */
1513 dx00 = _mm_sub_ps(ix0,jx0);
1514 dy00 = _mm_sub_ps(iy0,jy0);
1515 dz00 = _mm_sub_ps(iz0,jz0);
1516 dx11 = _mm_sub_ps(ix1,jx1);
1517 dy11 = _mm_sub_ps(iy1,jy1);
1518 dz11 = _mm_sub_ps(iz1,jz1);
1519 dx12 = _mm_sub_ps(ix1,jx2);
1520 dy12 = _mm_sub_ps(iy1,jy2);
1521 dz12 = _mm_sub_ps(iz1,jz2);
1522 dx13 = _mm_sub_ps(ix1,jx3);
1523 dy13 = _mm_sub_ps(iy1,jy3);
1524 dz13 = _mm_sub_ps(iz1,jz3);
1525 dx21 = _mm_sub_ps(ix2,jx1);
1526 dy21 = _mm_sub_ps(iy2,jy1);
1527 dz21 = _mm_sub_ps(iz2,jz1);
1528 dx22 = _mm_sub_ps(ix2,jx2);
1529 dy22 = _mm_sub_ps(iy2,jy2);
1530 dz22 = _mm_sub_ps(iz2,jz2);
1531 dx23 = _mm_sub_ps(ix2,jx3);
1532 dy23 = _mm_sub_ps(iy2,jy3);
1533 dz23 = _mm_sub_ps(iz2,jz3);
1534 dx31 = _mm_sub_ps(ix3,jx1);
1535 dy31 = _mm_sub_ps(iy3,jy1);
1536 dz31 = _mm_sub_ps(iz3,jz1);
1537 dx32 = _mm_sub_ps(ix3,jx2);
1538 dy32 = _mm_sub_ps(iy3,jy2);
1539 dz32 = _mm_sub_ps(iz3,jz2);
1540 dx33 = _mm_sub_ps(ix3,jx3);
1541 dy33 = _mm_sub_ps(iy3,jy3);
1542 dz33 = _mm_sub_ps(iz3,jz3);
1544 /* Calculate squared distance and things based on it */
1545 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1546 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1547 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1548 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1549 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1550 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1551 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1552 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1553 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1554 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1556 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1557 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1558 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1559 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1560 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1561 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1562 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1563 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1564 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1566 rinvsq00 = gmx_mm_inv_ps(rsq00);
1567 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1568 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1569 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1570 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1571 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1572 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1573 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1574 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1575 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1577 fjx0 = _mm_setzero_ps();
1578 fjy0 = _mm_setzero_ps();
1579 fjz0 = _mm_setzero_ps();
1580 fjx1 = _mm_setzero_ps();
1581 fjy1 = _mm_setzero_ps();
1582 fjz1 = _mm_setzero_ps();
1583 fjx2 = _mm_setzero_ps();
1584 fjy2 = _mm_setzero_ps();
1585 fjz2 = _mm_setzero_ps();
1586 fjx3 = _mm_setzero_ps();
1587 fjy3 = _mm_setzero_ps();
1588 fjz3 = _mm_setzero_ps();
1590 /**************************
1591 * CALCULATE INTERACTIONS *
1592 **************************/
1594 /* LENNARD-JONES DISPERSION/REPULSION */
1596 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1597 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1601 /* Calculate temporary vectorial force */
1602 tx = _mm_mul_ps(fscal,dx00);
1603 ty = _mm_mul_ps(fscal,dy00);
1604 tz = _mm_mul_ps(fscal,dz00);
1606 /* Update vectorial force */
1607 fix0 = _mm_add_ps(fix0,tx);
1608 fiy0 = _mm_add_ps(fiy0,ty);
1609 fiz0 = _mm_add_ps(fiz0,tz);
1611 fjx0 = _mm_add_ps(fjx0,tx);
1612 fjy0 = _mm_add_ps(fjy0,ty);
1613 fjz0 = _mm_add_ps(fjz0,tz);
1615 /**************************
1616 * CALCULATE INTERACTIONS *
1617 **************************/
1619 r11 = _mm_mul_ps(rsq11,rinv11);
1621 /* EWALD ELECTROSTATICS */
1623 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1624 ewrt = _mm_mul_ps(r11,ewtabscale);
1625 ewitab = _mm_cvttps_epi32(ewrt);
1626 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1627 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1628 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1630 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1631 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1635 /* Calculate temporary vectorial force */
1636 tx = _mm_mul_ps(fscal,dx11);
1637 ty = _mm_mul_ps(fscal,dy11);
1638 tz = _mm_mul_ps(fscal,dz11);
1640 /* Update vectorial force */
1641 fix1 = _mm_add_ps(fix1,tx);
1642 fiy1 = _mm_add_ps(fiy1,ty);
1643 fiz1 = _mm_add_ps(fiz1,tz);
1645 fjx1 = _mm_add_ps(fjx1,tx);
1646 fjy1 = _mm_add_ps(fjy1,ty);
1647 fjz1 = _mm_add_ps(fjz1,tz);
1649 /**************************
1650 * CALCULATE INTERACTIONS *
1651 **************************/
1653 r12 = _mm_mul_ps(rsq12,rinv12);
1655 /* EWALD ELECTROSTATICS */
1657 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1658 ewrt = _mm_mul_ps(r12,ewtabscale);
1659 ewitab = _mm_cvttps_epi32(ewrt);
1660 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1661 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1662 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1664 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1665 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1669 /* Calculate temporary vectorial force */
1670 tx = _mm_mul_ps(fscal,dx12);
1671 ty = _mm_mul_ps(fscal,dy12);
1672 tz = _mm_mul_ps(fscal,dz12);
1674 /* Update vectorial force */
1675 fix1 = _mm_add_ps(fix1,tx);
1676 fiy1 = _mm_add_ps(fiy1,ty);
1677 fiz1 = _mm_add_ps(fiz1,tz);
1679 fjx2 = _mm_add_ps(fjx2,tx);
1680 fjy2 = _mm_add_ps(fjy2,ty);
1681 fjz2 = _mm_add_ps(fjz2,tz);
1683 /**************************
1684 * CALCULATE INTERACTIONS *
1685 **************************/
1687 r13 = _mm_mul_ps(rsq13,rinv13);
1689 /* EWALD ELECTROSTATICS */
1691 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1692 ewrt = _mm_mul_ps(r13,ewtabscale);
1693 ewitab = _mm_cvttps_epi32(ewrt);
1694 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1695 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1696 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1698 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1699 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1703 /* Calculate temporary vectorial force */
1704 tx = _mm_mul_ps(fscal,dx13);
1705 ty = _mm_mul_ps(fscal,dy13);
1706 tz = _mm_mul_ps(fscal,dz13);
1708 /* Update vectorial force */
1709 fix1 = _mm_add_ps(fix1,tx);
1710 fiy1 = _mm_add_ps(fiy1,ty);
1711 fiz1 = _mm_add_ps(fiz1,tz);
1713 fjx3 = _mm_add_ps(fjx3,tx);
1714 fjy3 = _mm_add_ps(fjy3,ty);
1715 fjz3 = _mm_add_ps(fjz3,tz);
1717 /**************************
1718 * CALCULATE INTERACTIONS *
1719 **************************/
1721 r21 = _mm_mul_ps(rsq21,rinv21);
1723 /* EWALD ELECTROSTATICS */
1725 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1726 ewrt = _mm_mul_ps(r21,ewtabscale);
1727 ewitab = _mm_cvttps_epi32(ewrt);
1728 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1729 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1730 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1732 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1733 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1737 /* Calculate temporary vectorial force */
1738 tx = _mm_mul_ps(fscal,dx21);
1739 ty = _mm_mul_ps(fscal,dy21);
1740 tz = _mm_mul_ps(fscal,dz21);
1742 /* Update vectorial force */
1743 fix2 = _mm_add_ps(fix2,tx);
1744 fiy2 = _mm_add_ps(fiy2,ty);
1745 fiz2 = _mm_add_ps(fiz2,tz);
1747 fjx1 = _mm_add_ps(fjx1,tx);
1748 fjy1 = _mm_add_ps(fjy1,ty);
1749 fjz1 = _mm_add_ps(fjz1,tz);
1751 /**************************
1752 * CALCULATE INTERACTIONS *
1753 **************************/
1755 r22 = _mm_mul_ps(rsq22,rinv22);
1757 /* EWALD ELECTROSTATICS */
1759 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1760 ewrt = _mm_mul_ps(r22,ewtabscale);
1761 ewitab = _mm_cvttps_epi32(ewrt);
1762 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1763 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1764 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1766 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1767 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1771 /* Calculate temporary vectorial force */
1772 tx = _mm_mul_ps(fscal,dx22);
1773 ty = _mm_mul_ps(fscal,dy22);
1774 tz = _mm_mul_ps(fscal,dz22);
1776 /* Update vectorial force */
1777 fix2 = _mm_add_ps(fix2,tx);
1778 fiy2 = _mm_add_ps(fiy2,ty);
1779 fiz2 = _mm_add_ps(fiz2,tz);
1781 fjx2 = _mm_add_ps(fjx2,tx);
1782 fjy2 = _mm_add_ps(fjy2,ty);
1783 fjz2 = _mm_add_ps(fjz2,tz);
1785 /**************************
1786 * CALCULATE INTERACTIONS *
1787 **************************/
1789 r23 = _mm_mul_ps(rsq23,rinv23);
1791 /* EWALD ELECTROSTATICS */
1793 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1794 ewrt = _mm_mul_ps(r23,ewtabscale);
1795 ewitab = _mm_cvttps_epi32(ewrt);
1796 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1797 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1798 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1800 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1801 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1805 /* Calculate temporary vectorial force */
1806 tx = _mm_mul_ps(fscal,dx23);
1807 ty = _mm_mul_ps(fscal,dy23);
1808 tz = _mm_mul_ps(fscal,dz23);
1810 /* Update vectorial force */
1811 fix2 = _mm_add_ps(fix2,tx);
1812 fiy2 = _mm_add_ps(fiy2,ty);
1813 fiz2 = _mm_add_ps(fiz2,tz);
1815 fjx3 = _mm_add_ps(fjx3,tx);
1816 fjy3 = _mm_add_ps(fjy3,ty);
1817 fjz3 = _mm_add_ps(fjz3,tz);
1819 /**************************
1820 * CALCULATE INTERACTIONS *
1821 **************************/
1823 r31 = _mm_mul_ps(rsq31,rinv31);
1825 /* EWALD ELECTROSTATICS */
1827 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1828 ewrt = _mm_mul_ps(r31,ewtabscale);
1829 ewitab = _mm_cvttps_epi32(ewrt);
1830 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1831 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1832 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1834 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1835 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1839 /* Calculate temporary vectorial force */
1840 tx = _mm_mul_ps(fscal,dx31);
1841 ty = _mm_mul_ps(fscal,dy31);
1842 tz = _mm_mul_ps(fscal,dz31);
1844 /* Update vectorial force */
1845 fix3 = _mm_add_ps(fix3,tx);
1846 fiy3 = _mm_add_ps(fiy3,ty);
1847 fiz3 = _mm_add_ps(fiz3,tz);
1849 fjx1 = _mm_add_ps(fjx1,tx);
1850 fjy1 = _mm_add_ps(fjy1,ty);
1851 fjz1 = _mm_add_ps(fjz1,tz);
1853 /**************************
1854 * CALCULATE INTERACTIONS *
1855 **************************/
1857 r32 = _mm_mul_ps(rsq32,rinv32);
1859 /* EWALD ELECTROSTATICS */
1861 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1862 ewrt = _mm_mul_ps(r32,ewtabscale);
1863 ewitab = _mm_cvttps_epi32(ewrt);
1864 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1865 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1866 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1868 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1869 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1873 /* Calculate temporary vectorial force */
1874 tx = _mm_mul_ps(fscal,dx32);
1875 ty = _mm_mul_ps(fscal,dy32);
1876 tz = _mm_mul_ps(fscal,dz32);
1878 /* Update vectorial force */
1879 fix3 = _mm_add_ps(fix3,tx);
1880 fiy3 = _mm_add_ps(fiy3,ty);
1881 fiz3 = _mm_add_ps(fiz3,tz);
1883 fjx2 = _mm_add_ps(fjx2,tx);
1884 fjy2 = _mm_add_ps(fjy2,ty);
1885 fjz2 = _mm_add_ps(fjz2,tz);
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 r33 = _mm_mul_ps(rsq33,rinv33);
1893 /* EWALD ELECTROSTATICS */
1895 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1896 ewrt = _mm_mul_ps(r33,ewtabscale);
1897 ewitab = _mm_cvttps_epi32(ewrt);
1898 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1899 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1900 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1902 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1903 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1907 /* Calculate temporary vectorial force */
1908 tx = _mm_mul_ps(fscal,dx33);
1909 ty = _mm_mul_ps(fscal,dy33);
1910 tz = _mm_mul_ps(fscal,dz33);
1912 /* Update vectorial force */
1913 fix3 = _mm_add_ps(fix3,tx);
1914 fiy3 = _mm_add_ps(fiy3,ty);
1915 fiz3 = _mm_add_ps(fiz3,tz);
1917 fjx3 = _mm_add_ps(fjx3,tx);
1918 fjy3 = _mm_add_ps(fjy3,ty);
1919 fjz3 = _mm_add_ps(fjz3,tz);
1921 fjptrA = f+j_coord_offsetA;
1922 fjptrB = f+j_coord_offsetB;
1923 fjptrC = f+j_coord_offsetC;
1924 fjptrD = f+j_coord_offsetD;
1926 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1927 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1928 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1930 /* Inner loop uses 354 flops */
1933 if(jidx<j_index_end)
1936 /* Get j neighbor index, and coordinate index */
1937 jnrlistA = jjnr[jidx];
1938 jnrlistB = jjnr[jidx+1];
1939 jnrlistC = jjnr[jidx+2];
1940 jnrlistD = jjnr[jidx+3];
1941 /* Sign of each element will be negative for non-real atoms.
1942 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1943 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1945 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1946 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1947 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1948 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1949 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1950 j_coord_offsetA = DIM*jnrA;
1951 j_coord_offsetB = DIM*jnrB;
1952 j_coord_offsetC = DIM*jnrC;
1953 j_coord_offsetD = DIM*jnrD;
1955 /* load j atom coordinates */
1956 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1957 x+j_coord_offsetC,x+j_coord_offsetD,
1958 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1959 &jy2,&jz2,&jx3,&jy3,&jz3);
1961 /* Calculate displacement vector */
1962 dx00 = _mm_sub_ps(ix0,jx0);
1963 dy00 = _mm_sub_ps(iy0,jy0);
1964 dz00 = _mm_sub_ps(iz0,jz0);
1965 dx11 = _mm_sub_ps(ix1,jx1);
1966 dy11 = _mm_sub_ps(iy1,jy1);
1967 dz11 = _mm_sub_ps(iz1,jz1);
1968 dx12 = _mm_sub_ps(ix1,jx2);
1969 dy12 = _mm_sub_ps(iy1,jy2);
1970 dz12 = _mm_sub_ps(iz1,jz2);
1971 dx13 = _mm_sub_ps(ix1,jx3);
1972 dy13 = _mm_sub_ps(iy1,jy3);
1973 dz13 = _mm_sub_ps(iz1,jz3);
1974 dx21 = _mm_sub_ps(ix2,jx1);
1975 dy21 = _mm_sub_ps(iy2,jy1);
1976 dz21 = _mm_sub_ps(iz2,jz1);
1977 dx22 = _mm_sub_ps(ix2,jx2);
1978 dy22 = _mm_sub_ps(iy2,jy2);
1979 dz22 = _mm_sub_ps(iz2,jz2);
1980 dx23 = _mm_sub_ps(ix2,jx3);
1981 dy23 = _mm_sub_ps(iy2,jy3);
1982 dz23 = _mm_sub_ps(iz2,jz3);
1983 dx31 = _mm_sub_ps(ix3,jx1);
1984 dy31 = _mm_sub_ps(iy3,jy1);
1985 dz31 = _mm_sub_ps(iz3,jz1);
1986 dx32 = _mm_sub_ps(ix3,jx2);
1987 dy32 = _mm_sub_ps(iy3,jy2);
1988 dz32 = _mm_sub_ps(iz3,jz2);
1989 dx33 = _mm_sub_ps(ix3,jx3);
1990 dy33 = _mm_sub_ps(iy3,jy3);
1991 dz33 = _mm_sub_ps(iz3,jz3);
1993 /* Calculate squared distance and things based on it */
1994 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1995 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1996 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1997 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1998 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1999 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2000 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2001 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2002 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2003 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2005 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2006 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2007 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2008 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2009 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2010 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2011 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2012 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2013 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2015 rinvsq00 = gmx_mm_inv_ps(rsq00);
2016 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2017 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2018 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2019 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2020 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2021 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2022 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2023 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2024 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2026 fjx0 = _mm_setzero_ps();
2027 fjy0 = _mm_setzero_ps();
2028 fjz0 = _mm_setzero_ps();
2029 fjx1 = _mm_setzero_ps();
2030 fjy1 = _mm_setzero_ps();
2031 fjz1 = _mm_setzero_ps();
2032 fjx2 = _mm_setzero_ps();
2033 fjy2 = _mm_setzero_ps();
2034 fjz2 = _mm_setzero_ps();
2035 fjx3 = _mm_setzero_ps();
2036 fjy3 = _mm_setzero_ps();
2037 fjz3 = _mm_setzero_ps();
2039 /**************************
2040 * CALCULATE INTERACTIONS *
2041 **************************/
2043 /* LENNARD-JONES DISPERSION/REPULSION */
2045 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2046 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2050 fscal = _mm_andnot_ps(dummy_mask,fscal);
2052 /* Calculate temporary vectorial force */
2053 tx = _mm_mul_ps(fscal,dx00);
2054 ty = _mm_mul_ps(fscal,dy00);
2055 tz = _mm_mul_ps(fscal,dz00);
2057 /* Update vectorial force */
2058 fix0 = _mm_add_ps(fix0,tx);
2059 fiy0 = _mm_add_ps(fiy0,ty);
2060 fiz0 = _mm_add_ps(fiz0,tz);
2062 fjx0 = _mm_add_ps(fjx0,tx);
2063 fjy0 = _mm_add_ps(fjy0,ty);
2064 fjz0 = _mm_add_ps(fjz0,tz);
2066 /**************************
2067 * CALCULATE INTERACTIONS *
2068 **************************/
2070 r11 = _mm_mul_ps(rsq11,rinv11);
2071 r11 = _mm_andnot_ps(dummy_mask,r11);
2073 /* EWALD ELECTROSTATICS */
2075 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2076 ewrt = _mm_mul_ps(r11,ewtabscale);
2077 ewitab = _mm_cvttps_epi32(ewrt);
2078 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2079 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2080 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2082 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2083 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2087 fscal = _mm_andnot_ps(dummy_mask,fscal);
2089 /* Calculate temporary vectorial force */
2090 tx = _mm_mul_ps(fscal,dx11);
2091 ty = _mm_mul_ps(fscal,dy11);
2092 tz = _mm_mul_ps(fscal,dz11);
2094 /* Update vectorial force */
2095 fix1 = _mm_add_ps(fix1,tx);
2096 fiy1 = _mm_add_ps(fiy1,ty);
2097 fiz1 = _mm_add_ps(fiz1,tz);
2099 fjx1 = _mm_add_ps(fjx1,tx);
2100 fjy1 = _mm_add_ps(fjy1,ty);
2101 fjz1 = _mm_add_ps(fjz1,tz);
2103 /**************************
2104 * CALCULATE INTERACTIONS *
2105 **************************/
2107 r12 = _mm_mul_ps(rsq12,rinv12);
2108 r12 = _mm_andnot_ps(dummy_mask,r12);
2110 /* EWALD ELECTROSTATICS */
2112 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2113 ewrt = _mm_mul_ps(r12,ewtabscale);
2114 ewitab = _mm_cvttps_epi32(ewrt);
2115 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2116 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2117 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2119 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2120 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2124 fscal = _mm_andnot_ps(dummy_mask,fscal);
2126 /* Calculate temporary vectorial force */
2127 tx = _mm_mul_ps(fscal,dx12);
2128 ty = _mm_mul_ps(fscal,dy12);
2129 tz = _mm_mul_ps(fscal,dz12);
2131 /* Update vectorial force */
2132 fix1 = _mm_add_ps(fix1,tx);
2133 fiy1 = _mm_add_ps(fiy1,ty);
2134 fiz1 = _mm_add_ps(fiz1,tz);
2136 fjx2 = _mm_add_ps(fjx2,tx);
2137 fjy2 = _mm_add_ps(fjy2,ty);
2138 fjz2 = _mm_add_ps(fjz2,tz);
2140 /**************************
2141 * CALCULATE INTERACTIONS *
2142 **************************/
2144 r13 = _mm_mul_ps(rsq13,rinv13);
2145 r13 = _mm_andnot_ps(dummy_mask,r13);
2147 /* EWALD ELECTROSTATICS */
2149 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2150 ewrt = _mm_mul_ps(r13,ewtabscale);
2151 ewitab = _mm_cvttps_epi32(ewrt);
2152 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2153 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2154 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2156 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2157 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2161 fscal = _mm_andnot_ps(dummy_mask,fscal);
2163 /* Calculate temporary vectorial force */
2164 tx = _mm_mul_ps(fscal,dx13);
2165 ty = _mm_mul_ps(fscal,dy13);
2166 tz = _mm_mul_ps(fscal,dz13);
2168 /* Update vectorial force */
2169 fix1 = _mm_add_ps(fix1,tx);
2170 fiy1 = _mm_add_ps(fiy1,ty);
2171 fiz1 = _mm_add_ps(fiz1,tz);
2173 fjx3 = _mm_add_ps(fjx3,tx);
2174 fjy3 = _mm_add_ps(fjy3,ty);
2175 fjz3 = _mm_add_ps(fjz3,tz);
2177 /**************************
2178 * CALCULATE INTERACTIONS *
2179 **************************/
2181 r21 = _mm_mul_ps(rsq21,rinv21);
2182 r21 = _mm_andnot_ps(dummy_mask,r21);
2184 /* EWALD ELECTROSTATICS */
2186 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2187 ewrt = _mm_mul_ps(r21,ewtabscale);
2188 ewitab = _mm_cvttps_epi32(ewrt);
2189 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2190 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2191 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2193 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2194 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2198 fscal = _mm_andnot_ps(dummy_mask,fscal);
2200 /* Calculate temporary vectorial force */
2201 tx = _mm_mul_ps(fscal,dx21);
2202 ty = _mm_mul_ps(fscal,dy21);
2203 tz = _mm_mul_ps(fscal,dz21);
2205 /* Update vectorial force */
2206 fix2 = _mm_add_ps(fix2,tx);
2207 fiy2 = _mm_add_ps(fiy2,ty);
2208 fiz2 = _mm_add_ps(fiz2,tz);
2210 fjx1 = _mm_add_ps(fjx1,tx);
2211 fjy1 = _mm_add_ps(fjy1,ty);
2212 fjz1 = _mm_add_ps(fjz1,tz);
2214 /**************************
2215 * CALCULATE INTERACTIONS *
2216 **************************/
2218 r22 = _mm_mul_ps(rsq22,rinv22);
2219 r22 = _mm_andnot_ps(dummy_mask,r22);
2221 /* EWALD ELECTROSTATICS */
2223 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2224 ewrt = _mm_mul_ps(r22,ewtabscale);
2225 ewitab = _mm_cvttps_epi32(ewrt);
2226 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2227 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2228 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2230 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2231 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2235 fscal = _mm_andnot_ps(dummy_mask,fscal);
2237 /* Calculate temporary vectorial force */
2238 tx = _mm_mul_ps(fscal,dx22);
2239 ty = _mm_mul_ps(fscal,dy22);
2240 tz = _mm_mul_ps(fscal,dz22);
2242 /* Update vectorial force */
2243 fix2 = _mm_add_ps(fix2,tx);
2244 fiy2 = _mm_add_ps(fiy2,ty);
2245 fiz2 = _mm_add_ps(fiz2,tz);
2247 fjx2 = _mm_add_ps(fjx2,tx);
2248 fjy2 = _mm_add_ps(fjy2,ty);
2249 fjz2 = _mm_add_ps(fjz2,tz);
2251 /**************************
2252 * CALCULATE INTERACTIONS *
2253 **************************/
2255 r23 = _mm_mul_ps(rsq23,rinv23);
2256 r23 = _mm_andnot_ps(dummy_mask,r23);
2258 /* EWALD ELECTROSTATICS */
2260 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2261 ewrt = _mm_mul_ps(r23,ewtabscale);
2262 ewitab = _mm_cvttps_epi32(ewrt);
2263 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2264 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2265 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2267 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2268 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2272 fscal = _mm_andnot_ps(dummy_mask,fscal);
2274 /* Calculate temporary vectorial force */
2275 tx = _mm_mul_ps(fscal,dx23);
2276 ty = _mm_mul_ps(fscal,dy23);
2277 tz = _mm_mul_ps(fscal,dz23);
2279 /* Update vectorial force */
2280 fix2 = _mm_add_ps(fix2,tx);
2281 fiy2 = _mm_add_ps(fiy2,ty);
2282 fiz2 = _mm_add_ps(fiz2,tz);
2284 fjx3 = _mm_add_ps(fjx3,tx);
2285 fjy3 = _mm_add_ps(fjy3,ty);
2286 fjz3 = _mm_add_ps(fjz3,tz);
2288 /**************************
2289 * CALCULATE INTERACTIONS *
2290 **************************/
2292 r31 = _mm_mul_ps(rsq31,rinv31);
2293 r31 = _mm_andnot_ps(dummy_mask,r31);
2295 /* EWALD ELECTROSTATICS */
2297 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2298 ewrt = _mm_mul_ps(r31,ewtabscale);
2299 ewitab = _mm_cvttps_epi32(ewrt);
2300 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2301 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2302 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2304 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2305 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2309 fscal = _mm_andnot_ps(dummy_mask,fscal);
2311 /* Calculate temporary vectorial force */
2312 tx = _mm_mul_ps(fscal,dx31);
2313 ty = _mm_mul_ps(fscal,dy31);
2314 tz = _mm_mul_ps(fscal,dz31);
2316 /* Update vectorial force */
2317 fix3 = _mm_add_ps(fix3,tx);
2318 fiy3 = _mm_add_ps(fiy3,ty);
2319 fiz3 = _mm_add_ps(fiz3,tz);
2321 fjx1 = _mm_add_ps(fjx1,tx);
2322 fjy1 = _mm_add_ps(fjy1,ty);
2323 fjz1 = _mm_add_ps(fjz1,tz);
2325 /**************************
2326 * CALCULATE INTERACTIONS *
2327 **************************/
2329 r32 = _mm_mul_ps(rsq32,rinv32);
2330 r32 = _mm_andnot_ps(dummy_mask,r32);
2332 /* EWALD ELECTROSTATICS */
2334 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2335 ewrt = _mm_mul_ps(r32,ewtabscale);
2336 ewitab = _mm_cvttps_epi32(ewrt);
2337 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2338 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2339 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2341 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2342 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2346 fscal = _mm_andnot_ps(dummy_mask,fscal);
2348 /* Calculate temporary vectorial force */
2349 tx = _mm_mul_ps(fscal,dx32);
2350 ty = _mm_mul_ps(fscal,dy32);
2351 tz = _mm_mul_ps(fscal,dz32);
2353 /* Update vectorial force */
2354 fix3 = _mm_add_ps(fix3,tx);
2355 fiy3 = _mm_add_ps(fiy3,ty);
2356 fiz3 = _mm_add_ps(fiz3,tz);
2358 fjx2 = _mm_add_ps(fjx2,tx);
2359 fjy2 = _mm_add_ps(fjy2,ty);
2360 fjz2 = _mm_add_ps(fjz2,tz);
2362 /**************************
2363 * CALCULATE INTERACTIONS *
2364 **************************/
2366 r33 = _mm_mul_ps(rsq33,rinv33);
2367 r33 = _mm_andnot_ps(dummy_mask,r33);
2369 /* EWALD ELECTROSTATICS */
2371 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2372 ewrt = _mm_mul_ps(r33,ewtabscale);
2373 ewitab = _mm_cvttps_epi32(ewrt);
2374 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2375 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2376 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2378 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2379 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2383 fscal = _mm_andnot_ps(dummy_mask,fscal);
2385 /* Calculate temporary vectorial force */
2386 tx = _mm_mul_ps(fscal,dx33);
2387 ty = _mm_mul_ps(fscal,dy33);
2388 tz = _mm_mul_ps(fscal,dz33);
2390 /* Update vectorial force */
2391 fix3 = _mm_add_ps(fix3,tx);
2392 fiy3 = _mm_add_ps(fiy3,ty);
2393 fiz3 = _mm_add_ps(fiz3,tz);
2395 fjx3 = _mm_add_ps(fjx3,tx);
2396 fjy3 = _mm_add_ps(fjy3,ty);
2397 fjz3 = _mm_add_ps(fjz3,tz);
2399 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2400 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2401 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2402 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2404 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2405 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2406 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2408 /* Inner loop uses 363 flops */
2411 /* End of innermost loop */
2413 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2414 f+i_coord_offset,fshift+i_shift_offset);
2416 /* Increment number of inner iterations */
2417 inneriter += j_index_end - j_index_start;
2419 /* Outer loop uses 24 flops */
2422 /* Increment number of outer iterations */
2425 /* Update outer/inner flops */
2427 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*363);