2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse2_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
111 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
113 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
115 __m128 dummy_mask,cutoff_mask;
116 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117 __m128 one = _mm_set1_ps(1.0);
118 __m128 two = _mm_set1_ps(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm_set1_ps(fr->epsfac);
131 charge = mdatoms->chargeA;
132 nvdwtype = fr->ntype;
134 vdwtype = mdatoms->typeA;
136 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
137 ewtab = fr->ic->tabq_coul_FDV0;
138 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
139 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
144 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
145 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
146 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
148 jq0 = _mm_set1_ps(charge[inr+0]);
149 jq1 = _mm_set1_ps(charge[inr+1]);
150 jq2 = _mm_set1_ps(charge[inr+2]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 qq00 = _mm_mul_ps(iq0,jq0);
153 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
154 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
155 qq01 = _mm_mul_ps(iq0,jq1);
156 qq02 = _mm_mul_ps(iq0,jq2);
157 qq10 = _mm_mul_ps(iq1,jq0);
158 qq11 = _mm_mul_ps(iq1,jq1);
159 qq12 = _mm_mul_ps(iq1,jq2);
160 qq20 = _mm_mul_ps(iq2,jq0);
161 qq21 = _mm_mul_ps(iq2,jq1);
162 qq22 = _mm_mul_ps(iq2,jq2);
164 /* Avoid stupid compiler warnings */
165 jnrA = jnrB = jnrC = jnrD = 0;
174 for(iidx=0;iidx<4*DIM;iidx++)
179 /* Start outer loop over neighborlists */
180 for(iidx=0; iidx<nri; iidx++)
182 /* Load shift vector for this list */
183 i_shift_offset = DIM*shiftidx[iidx];
185 /* Load limits for loop over neighbors */
186 j_index_start = jindex[iidx];
187 j_index_end = jindex[iidx+1];
189 /* Get outer coordinate index */
191 i_coord_offset = DIM*inr;
193 /* Load i particle coords and add shift vector */
194 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
195 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
197 fix0 = _mm_setzero_ps();
198 fiy0 = _mm_setzero_ps();
199 fiz0 = _mm_setzero_ps();
200 fix1 = _mm_setzero_ps();
201 fiy1 = _mm_setzero_ps();
202 fiz1 = _mm_setzero_ps();
203 fix2 = _mm_setzero_ps();
204 fiy2 = _mm_setzero_ps();
205 fiz2 = _mm_setzero_ps();
207 /* Reset potential sums */
208 velecsum = _mm_setzero_ps();
209 vvdwsum = _mm_setzero_ps();
211 /* Start inner kernel loop */
212 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215 /* Get j neighbor index, and coordinate index */
220 j_coord_offsetA = DIM*jnrA;
221 j_coord_offsetB = DIM*jnrB;
222 j_coord_offsetC = DIM*jnrC;
223 j_coord_offsetD = DIM*jnrD;
225 /* load j atom coordinates */
226 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
227 x+j_coord_offsetC,x+j_coord_offsetD,
228 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
230 /* Calculate displacement vector */
231 dx00 = _mm_sub_ps(ix0,jx0);
232 dy00 = _mm_sub_ps(iy0,jy0);
233 dz00 = _mm_sub_ps(iz0,jz0);
234 dx01 = _mm_sub_ps(ix0,jx1);
235 dy01 = _mm_sub_ps(iy0,jy1);
236 dz01 = _mm_sub_ps(iz0,jz1);
237 dx02 = _mm_sub_ps(ix0,jx2);
238 dy02 = _mm_sub_ps(iy0,jy2);
239 dz02 = _mm_sub_ps(iz0,jz2);
240 dx10 = _mm_sub_ps(ix1,jx0);
241 dy10 = _mm_sub_ps(iy1,jy0);
242 dz10 = _mm_sub_ps(iz1,jz0);
243 dx11 = _mm_sub_ps(ix1,jx1);
244 dy11 = _mm_sub_ps(iy1,jy1);
245 dz11 = _mm_sub_ps(iz1,jz1);
246 dx12 = _mm_sub_ps(ix1,jx2);
247 dy12 = _mm_sub_ps(iy1,jy2);
248 dz12 = _mm_sub_ps(iz1,jz2);
249 dx20 = _mm_sub_ps(ix2,jx0);
250 dy20 = _mm_sub_ps(iy2,jy0);
251 dz20 = _mm_sub_ps(iz2,jz0);
252 dx21 = _mm_sub_ps(ix2,jx1);
253 dy21 = _mm_sub_ps(iy2,jy1);
254 dz21 = _mm_sub_ps(iz2,jz1);
255 dx22 = _mm_sub_ps(ix2,jx2);
256 dy22 = _mm_sub_ps(iy2,jy2);
257 dz22 = _mm_sub_ps(iz2,jz2);
259 /* Calculate squared distance and things based on it */
260 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
261 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
262 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
263 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
264 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
265 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
266 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
267 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
268 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
270 rinv00 = gmx_mm_invsqrt_ps(rsq00);
271 rinv01 = gmx_mm_invsqrt_ps(rsq01);
272 rinv02 = gmx_mm_invsqrt_ps(rsq02);
273 rinv10 = gmx_mm_invsqrt_ps(rsq10);
274 rinv11 = gmx_mm_invsqrt_ps(rsq11);
275 rinv12 = gmx_mm_invsqrt_ps(rsq12);
276 rinv20 = gmx_mm_invsqrt_ps(rsq20);
277 rinv21 = gmx_mm_invsqrt_ps(rsq21);
278 rinv22 = gmx_mm_invsqrt_ps(rsq22);
280 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
281 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
282 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
283 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
284 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
285 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
286 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
287 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
288 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
290 fjx0 = _mm_setzero_ps();
291 fjy0 = _mm_setzero_ps();
292 fjz0 = _mm_setzero_ps();
293 fjx1 = _mm_setzero_ps();
294 fjy1 = _mm_setzero_ps();
295 fjz1 = _mm_setzero_ps();
296 fjx2 = _mm_setzero_ps();
297 fjy2 = _mm_setzero_ps();
298 fjz2 = _mm_setzero_ps();
300 /**************************
301 * CALCULATE INTERACTIONS *
302 **************************/
304 r00 = _mm_mul_ps(rsq00,rinv00);
306 /* EWALD ELECTROSTATICS */
308 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
309 ewrt = _mm_mul_ps(r00,ewtabscale);
310 ewitab = _mm_cvttps_epi32(ewrt);
311 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
312 ewitab = _mm_slli_epi32(ewitab,2);
313 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
314 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
315 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
316 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
317 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
318 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
319 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
320 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
321 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
323 /* LENNARD-JONES DISPERSION/REPULSION */
325 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
326 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
327 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
328 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
329 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
331 /* Update potential sum for this i atom from the interaction with this j atom. */
332 velecsum = _mm_add_ps(velecsum,velec);
333 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
335 fscal = _mm_add_ps(felec,fvdw);
337 /* Calculate temporary vectorial force */
338 tx = _mm_mul_ps(fscal,dx00);
339 ty = _mm_mul_ps(fscal,dy00);
340 tz = _mm_mul_ps(fscal,dz00);
342 /* Update vectorial force */
343 fix0 = _mm_add_ps(fix0,tx);
344 fiy0 = _mm_add_ps(fiy0,ty);
345 fiz0 = _mm_add_ps(fiz0,tz);
347 fjx0 = _mm_add_ps(fjx0,tx);
348 fjy0 = _mm_add_ps(fjy0,ty);
349 fjz0 = _mm_add_ps(fjz0,tz);
351 /**************************
352 * CALCULATE INTERACTIONS *
353 **************************/
355 r01 = _mm_mul_ps(rsq01,rinv01);
357 /* EWALD ELECTROSTATICS */
359 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
360 ewrt = _mm_mul_ps(r01,ewtabscale);
361 ewitab = _mm_cvttps_epi32(ewrt);
362 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
363 ewitab = _mm_slli_epi32(ewitab,2);
364 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
365 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
366 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
367 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
368 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
369 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
370 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
371 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
372 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
374 /* Update potential sum for this i atom from the interaction with this j atom. */
375 velecsum = _mm_add_ps(velecsum,velec);
379 /* Calculate temporary vectorial force */
380 tx = _mm_mul_ps(fscal,dx01);
381 ty = _mm_mul_ps(fscal,dy01);
382 tz = _mm_mul_ps(fscal,dz01);
384 /* Update vectorial force */
385 fix0 = _mm_add_ps(fix0,tx);
386 fiy0 = _mm_add_ps(fiy0,ty);
387 fiz0 = _mm_add_ps(fiz0,tz);
389 fjx1 = _mm_add_ps(fjx1,tx);
390 fjy1 = _mm_add_ps(fjy1,ty);
391 fjz1 = _mm_add_ps(fjz1,tz);
393 /**************************
394 * CALCULATE INTERACTIONS *
395 **************************/
397 r02 = _mm_mul_ps(rsq02,rinv02);
399 /* EWALD ELECTROSTATICS */
401 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
402 ewrt = _mm_mul_ps(r02,ewtabscale);
403 ewitab = _mm_cvttps_epi32(ewrt);
404 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
405 ewitab = _mm_slli_epi32(ewitab,2);
406 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
407 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
408 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
409 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
410 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
411 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
412 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
413 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
414 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
416 /* Update potential sum for this i atom from the interaction with this j atom. */
417 velecsum = _mm_add_ps(velecsum,velec);
421 /* Calculate temporary vectorial force */
422 tx = _mm_mul_ps(fscal,dx02);
423 ty = _mm_mul_ps(fscal,dy02);
424 tz = _mm_mul_ps(fscal,dz02);
426 /* Update vectorial force */
427 fix0 = _mm_add_ps(fix0,tx);
428 fiy0 = _mm_add_ps(fiy0,ty);
429 fiz0 = _mm_add_ps(fiz0,tz);
431 fjx2 = _mm_add_ps(fjx2,tx);
432 fjy2 = _mm_add_ps(fjy2,ty);
433 fjz2 = _mm_add_ps(fjz2,tz);
435 /**************************
436 * CALCULATE INTERACTIONS *
437 **************************/
439 r10 = _mm_mul_ps(rsq10,rinv10);
441 /* EWALD ELECTROSTATICS */
443 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
444 ewrt = _mm_mul_ps(r10,ewtabscale);
445 ewitab = _mm_cvttps_epi32(ewrt);
446 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
447 ewitab = _mm_slli_epi32(ewitab,2);
448 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
449 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
450 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
451 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
452 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
453 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
454 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
455 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
456 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velecsum = _mm_add_ps(velecsum,velec);
463 /* Calculate temporary vectorial force */
464 tx = _mm_mul_ps(fscal,dx10);
465 ty = _mm_mul_ps(fscal,dy10);
466 tz = _mm_mul_ps(fscal,dz10);
468 /* Update vectorial force */
469 fix1 = _mm_add_ps(fix1,tx);
470 fiy1 = _mm_add_ps(fiy1,ty);
471 fiz1 = _mm_add_ps(fiz1,tz);
473 fjx0 = _mm_add_ps(fjx0,tx);
474 fjy0 = _mm_add_ps(fjy0,ty);
475 fjz0 = _mm_add_ps(fjz0,tz);
477 /**************************
478 * CALCULATE INTERACTIONS *
479 **************************/
481 r11 = _mm_mul_ps(rsq11,rinv11);
483 /* EWALD ELECTROSTATICS */
485 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
486 ewrt = _mm_mul_ps(r11,ewtabscale);
487 ewitab = _mm_cvttps_epi32(ewrt);
488 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
489 ewitab = _mm_slli_epi32(ewitab,2);
490 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
491 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
492 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
493 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
494 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
495 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
496 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
497 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
498 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
500 /* Update potential sum for this i atom from the interaction with this j atom. */
501 velecsum = _mm_add_ps(velecsum,velec);
505 /* Calculate temporary vectorial force */
506 tx = _mm_mul_ps(fscal,dx11);
507 ty = _mm_mul_ps(fscal,dy11);
508 tz = _mm_mul_ps(fscal,dz11);
510 /* Update vectorial force */
511 fix1 = _mm_add_ps(fix1,tx);
512 fiy1 = _mm_add_ps(fiy1,ty);
513 fiz1 = _mm_add_ps(fiz1,tz);
515 fjx1 = _mm_add_ps(fjx1,tx);
516 fjy1 = _mm_add_ps(fjy1,ty);
517 fjz1 = _mm_add_ps(fjz1,tz);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 r12 = _mm_mul_ps(rsq12,rinv12);
525 /* EWALD ELECTROSTATICS */
527 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
528 ewrt = _mm_mul_ps(r12,ewtabscale);
529 ewitab = _mm_cvttps_epi32(ewrt);
530 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
531 ewitab = _mm_slli_epi32(ewitab,2);
532 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
533 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
534 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
535 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
536 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
537 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
538 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
539 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
540 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velecsum = _mm_add_ps(velecsum,velec);
547 /* Calculate temporary vectorial force */
548 tx = _mm_mul_ps(fscal,dx12);
549 ty = _mm_mul_ps(fscal,dy12);
550 tz = _mm_mul_ps(fscal,dz12);
552 /* Update vectorial force */
553 fix1 = _mm_add_ps(fix1,tx);
554 fiy1 = _mm_add_ps(fiy1,ty);
555 fiz1 = _mm_add_ps(fiz1,tz);
557 fjx2 = _mm_add_ps(fjx2,tx);
558 fjy2 = _mm_add_ps(fjy2,ty);
559 fjz2 = _mm_add_ps(fjz2,tz);
561 /**************************
562 * CALCULATE INTERACTIONS *
563 **************************/
565 r20 = _mm_mul_ps(rsq20,rinv20);
567 /* EWALD ELECTROSTATICS */
569 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
570 ewrt = _mm_mul_ps(r20,ewtabscale);
571 ewitab = _mm_cvttps_epi32(ewrt);
572 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
573 ewitab = _mm_slli_epi32(ewitab,2);
574 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
575 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
576 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
577 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
578 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
579 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
580 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
581 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
582 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
584 /* Update potential sum for this i atom from the interaction with this j atom. */
585 velecsum = _mm_add_ps(velecsum,velec);
589 /* Calculate temporary vectorial force */
590 tx = _mm_mul_ps(fscal,dx20);
591 ty = _mm_mul_ps(fscal,dy20);
592 tz = _mm_mul_ps(fscal,dz20);
594 /* Update vectorial force */
595 fix2 = _mm_add_ps(fix2,tx);
596 fiy2 = _mm_add_ps(fiy2,ty);
597 fiz2 = _mm_add_ps(fiz2,tz);
599 fjx0 = _mm_add_ps(fjx0,tx);
600 fjy0 = _mm_add_ps(fjy0,ty);
601 fjz0 = _mm_add_ps(fjz0,tz);
603 /**************************
604 * CALCULATE INTERACTIONS *
605 **************************/
607 r21 = _mm_mul_ps(rsq21,rinv21);
609 /* EWALD ELECTROSTATICS */
611 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
612 ewrt = _mm_mul_ps(r21,ewtabscale);
613 ewitab = _mm_cvttps_epi32(ewrt);
614 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
615 ewitab = _mm_slli_epi32(ewitab,2);
616 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
617 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
618 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
619 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
620 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
621 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
622 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
623 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
624 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 velecsum = _mm_add_ps(velecsum,velec);
631 /* Calculate temporary vectorial force */
632 tx = _mm_mul_ps(fscal,dx21);
633 ty = _mm_mul_ps(fscal,dy21);
634 tz = _mm_mul_ps(fscal,dz21);
636 /* Update vectorial force */
637 fix2 = _mm_add_ps(fix2,tx);
638 fiy2 = _mm_add_ps(fiy2,ty);
639 fiz2 = _mm_add_ps(fiz2,tz);
641 fjx1 = _mm_add_ps(fjx1,tx);
642 fjy1 = _mm_add_ps(fjy1,ty);
643 fjz1 = _mm_add_ps(fjz1,tz);
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 r22 = _mm_mul_ps(rsq22,rinv22);
651 /* EWALD ELECTROSTATICS */
653 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
654 ewrt = _mm_mul_ps(r22,ewtabscale);
655 ewitab = _mm_cvttps_epi32(ewrt);
656 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
657 ewitab = _mm_slli_epi32(ewitab,2);
658 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
659 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
660 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
661 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
662 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
663 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
664 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
665 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
666 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
668 /* Update potential sum for this i atom from the interaction with this j atom. */
669 velecsum = _mm_add_ps(velecsum,velec);
673 /* Calculate temporary vectorial force */
674 tx = _mm_mul_ps(fscal,dx22);
675 ty = _mm_mul_ps(fscal,dy22);
676 tz = _mm_mul_ps(fscal,dz22);
678 /* Update vectorial force */
679 fix2 = _mm_add_ps(fix2,tx);
680 fiy2 = _mm_add_ps(fiy2,ty);
681 fiz2 = _mm_add_ps(fiz2,tz);
683 fjx2 = _mm_add_ps(fjx2,tx);
684 fjy2 = _mm_add_ps(fjy2,ty);
685 fjz2 = _mm_add_ps(fjz2,tz);
687 fjptrA = f+j_coord_offsetA;
688 fjptrB = f+j_coord_offsetB;
689 fjptrC = f+j_coord_offsetC;
690 fjptrD = f+j_coord_offsetD;
692 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
693 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
695 /* Inner loop uses 381 flops */
701 /* Get j neighbor index, and coordinate index */
702 jnrlistA = jjnr[jidx];
703 jnrlistB = jjnr[jidx+1];
704 jnrlistC = jjnr[jidx+2];
705 jnrlistD = jjnr[jidx+3];
706 /* Sign of each element will be negative for non-real atoms.
707 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
708 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
710 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
711 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
712 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
713 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
714 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
715 j_coord_offsetA = DIM*jnrA;
716 j_coord_offsetB = DIM*jnrB;
717 j_coord_offsetC = DIM*jnrC;
718 j_coord_offsetD = DIM*jnrD;
720 /* load j atom coordinates */
721 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
722 x+j_coord_offsetC,x+j_coord_offsetD,
723 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
725 /* Calculate displacement vector */
726 dx00 = _mm_sub_ps(ix0,jx0);
727 dy00 = _mm_sub_ps(iy0,jy0);
728 dz00 = _mm_sub_ps(iz0,jz0);
729 dx01 = _mm_sub_ps(ix0,jx1);
730 dy01 = _mm_sub_ps(iy0,jy1);
731 dz01 = _mm_sub_ps(iz0,jz1);
732 dx02 = _mm_sub_ps(ix0,jx2);
733 dy02 = _mm_sub_ps(iy0,jy2);
734 dz02 = _mm_sub_ps(iz0,jz2);
735 dx10 = _mm_sub_ps(ix1,jx0);
736 dy10 = _mm_sub_ps(iy1,jy0);
737 dz10 = _mm_sub_ps(iz1,jz0);
738 dx11 = _mm_sub_ps(ix1,jx1);
739 dy11 = _mm_sub_ps(iy1,jy1);
740 dz11 = _mm_sub_ps(iz1,jz1);
741 dx12 = _mm_sub_ps(ix1,jx2);
742 dy12 = _mm_sub_ps(iy1,jy2);
743 dz12 = _mm_sub_ps(iz1,jz2);
744 dx20 = _mm_sub_ps(ix2,jx0);
745 dy20 = _mm_sub_ps(iy2,jy0);
746 dz20 = _mm_sub_ps(iz2,jz0);
747 dx21 = _mm_sub_ps(ix2,jx1);
748 dy21 = _mm_sub_ps(iy2,jy1);
749 dz21 = _mm_sub_ps(iz2,jz1);
750 dx22 = _mm_sub_ps(ix2,jx2);
751 dy22 = _mm_sub_ps(iy2,jy2);
752 dz22 = _mm_sub_ps(iz2,jz2);
754 /* Calculate squared distance and things based on it */
755 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
756 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
757 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
758 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
759 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
760 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
761 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
762 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
763 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
765 rinv00 = gmx_mm_invsqrt_ps(rsq00);
766 rinv01 = gmx_mm_invsqrt_ps(rsq01);
767 rinv02 = gmx_mm_invsqrt_ps(rsq02);
768 rinv10 = gmx_mm_invsqrt_ps(rsq10);
769 rinv11 = gmx_mm_invsqrt_ps(rsq11);
770 rinv12 = gmx_mm_invsqrt_ps(rsq12);
771 rinv20 = gmx_mm_invsqrt_ps(rsq20);
772 rinv21 = gmx_mm_invsqrt_ps(rsq21);
773 rinv22 = gmx_mm_invsqrt_ps(rsq22);
775 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
776 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
777 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
778 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
779 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
780 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
781 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
782 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
783 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
785 fjx0 = _mm_setzero_ps();
786 fjy0 = _mm_setzero_ps();
787 fjz0 = _mm_setzero_ps();
788 fjx1 = _mm_setzero_ps();
789 fjy1 = _mm_setzero_ps();
790 fjz1 = _mm_setzero_ps();
791 fjx2 = _mm_setzero_ps();
792 fjy2 = _mm_setzero_ps();
793 fjz2 = _mm_setzero_ps();
795 /**************************
796 * CALCULATE INTERACTIONS *
797 **************************/
799 r00 = _mm_mul_ps(rsq00,rinv00);
800 r00 = _mm_andnot_ps(dummy_mask,r00);
802 /* EWALD ELECTROSTATICS */
804 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
805 ewrt = _mm_mul_ps(r00,ewtabscale);
806 ewitab = _mm_cvttps_epi32(ewrt);
807 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
808 ewitab = _mm_slli_epi32(ewitab,2);
809 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
810 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
811 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
812 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
813 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
814 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
815 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
816 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
817 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
819 /* LENNARD-JONES DISPERSION/REPULSION */
821 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
822 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
823 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
824 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
825 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
827 /* Update potential sum for this i atom from the interaction with this j atom. */
828 velec = _mm_andnot_ps(dummy_mask,velec);
829 velecsum = _mm_add_ps(velecsum,velec);
830 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
831 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
833 fscal = _mm_add_ps(felec,fvdw);
835 fscal = _mm_andnot_ps(dummy_mask,fscal);
837 /* Calculate temporary vectorial force */
838 tx = _mm_mul_ps(fscal,dx00);
839 ty = _mm_mul_ps(fscal,dy00);
840 tz = _mm_mul_ps(fscal,dz00);
842 /* Update vectorial force */
843 fix0 = _mm_add_ps(fix0,tx);
844 fiy0 = _mm_add_ps(fiy0,ty);
845 fiz0 = _mm_add_ps(fiz0,tz);
847 fjx0 = _mm_add_ps(fjx0,tx);
848 fjy0 = _mm_add_ps(fjy0,ty);
849 fjz0 = _mm_add_ps(fjz0,tz);
851 /**************************
852 * CALCULATE INTERACTIONS *
853 **************************/
855 r01 = _mm_mul_ps(rsq01,rinv01);
856 r01 = _mm_andnot_ps(dummy_mask,r01);
858 /* EWALD ELECTROSTATICS */
860 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
861 ewrt = _mm_mul_ps(r01,ewtabscale);
862 ewitab = _mm_cvttps_epi32(ewrt);
863 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
864 ewitab = _mm_slli_epi32(ewitab,2);
865 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
866 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
867 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
868 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
869 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
870 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
871 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
872 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
873 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
875 /* Update potential sum for this i atom from the interaction with this j atom. */
876 velec = _mm_andnot_ps(dummy_mask,velec);
877 velecsum = _mm_add_ps(velecsum,velec);
881 fscal = _mm_andnot_ps(dummy_mask,fscal);
883 /* Calculate temporary vectorial force */
884 tx = _mm_mul_ps(fscal,dx01);
885 ty = _mm_mul_ps(fscal,dy01);
886 tz = _mm_mul_ps(fscal,dz01);
888 /* Update vectorial force */
889 fix0 = _mm_add_ps(fix0,tx);
890 fiy0 = _mm_add_ps(fiy0,ty);
891 fiz0 = _mm_add_ps(fiz0,tz);
893 fjx1 = _mm_add_ps(fjx1,tx);
894 fjy1 = _mm_add_ps(fjy1,ty);
895 fjz1 = _mm_add_ps(fjz1,tz);
897 /**************************
898 * CALCULATE INTERACTIONS *
899 **************************/
901 r02 = _mm_mul_ps(rsq02,rinv02);
902 r02 = _mm_andnot_ps(dummy_mask,r02);
904 /* EWALD ELECTROSTATICS */
906 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
907 ewrt = _mm_mul_ps(r02,ewtabscale);
908 ewitab = _mm_cvttps_epi32(ewrt);
909 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
910 ewitab = _mm_slli_epi32(ewitab,2);
911 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
912 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
913 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
914 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
915 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
916 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
917 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
918 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
919 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
921 /* Update potential sum for this i atom from the interaction with this j atom. */
922 velec = _mm_andnot_ps(dummy_mask,velec);
923 velecsum = _mm_add_ps(velecsum,velec);
927 fscal = _mm_andnot_ps(dummy_mask,fscal);
929 /* Calculate temporary vectorial force */
930 tx = _mm_mul_ps(fscal,dx02);
931 ty = _mm_mul_ps(fscal,dy02);
932 tz = _mm_mul_ps(fscal,dz02);
934 /* Update vectorial force */
935 fix0 = _mm_add_ps(fix0,tx);
936 fiy0 = _mm_add_ps(fiy0,ty);
937 fiz0 = _mm_add_ps(fiz0,tz);
939 fjx2 = _mm_add_ps(fjx2,tx);
940 fjy2 = _mm_add_ps(fjy2,ty);
941 fjz2 = _mm_add_ps(fjz2,tz);
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 r10 = _mm_mul_ps(rsq10,rinv10);
948 r10 = _mm_andnot_ps(dummy_mask,r10);
950 /* EWALD ELECTROSTATICS */
952 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
953 ewrt = _mm_mul_ps(r10,ewtabscale);
954 ewitab = _mm_cvttps_epi32(ewrt);
955 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
956 ewitab = _mm_slli_epi32(ewitab,2);
957 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
958 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
959 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
960 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
961 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
962 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
963 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
964 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
965 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
967 /* Update potential sum for this i atom from the interaction with this j atom. */
968 velec = _mm_andnot_ps(dummy_mask,velec);
969 velecsum = _mm_add_ps(velecsum,velec);
973 fscal = _mm_andnot_ps(dummy_mask,fscal);
975 /* Calculate temporary vectorial force */
976 tx = _mm_mul_ps(fscal,dx10);
977 ty = _mm_mul_ps(fscal,dy10);
978 tz = _mm_mul_ps(fscal,dz10);
980 /* Update vectorial force */
981 fix1 = _mm_add_ps(fix1,tx);
982 fiy1 = _mm_add_ps(fiy1,ty);
983 fiz1 = _mm_add_ps(fiz1,tz);
985 fjx0 = _mm_add_ps(fjx0,tx);
986 fjy0 = _mm_add_ps(fjy0,ty);
987 fjz0 = _mm_add_ps(fjz0,tz);
989 /**************************
990 * CALCULATE INTERACTIONS *
991 **************************/
993 r11 = _mm_mul_ps(rsq11,rinv11);
994 r11 = _mm_andnot_ps(dummy_mask,r11);
996 /* EWALD ELECTROSTATICS */
998 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
999 ewrt = _mm_mul_ps(r11,ewtabscale);
1000 ewitab = _mm_cvttps_epi32(ewrt);
1001 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1002 ewitab = _mm_slli_epi32(ewitab,2);
1003 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1004 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1005 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1006 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1007 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1008 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1009 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1010 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1011 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1013 /* Update potential sum for this i atom from the interaction with this j atom. */
1014 velec = _mm_andnot_ps(dummy_mask,velec);
1015 velecsum = _mm_add_ps(velecsum,velec);
1019 fscal = _mm_andnot_ps(dummy_mask,fscal);
1021 /* Calculate temporary vectorial force */
1022 tx = _mm_mul_ps(fscal,dx11);
1023 ty = _mm_mul_ps(fscal,dy11);
1024 tz = _mm_mul_ps(fscal,dz11);
1026 /* Update vectorial force */
1027 fix1 = _mm_add_ps(fix1,tx);
1028 fiy1 = _mm_add_ps(fiy1,ty);
1029 fiz1 = _mm_add_ps(fiz1,tz);
1031 fjx1 = _mm_add_ps(fjx1,tx);
1032 fjy1 = _mm_add_ps(fjy1,ty);
1033 fjz1 = _mm_add_ps(fjz1,tz);
1035 /**************************
1036 * CALCULATE INTERACTIONS *
1037 **************************/
1039 r12 = _mm_mul_ps(rsq12,rinv12);
1040 r12 = _mm_andnot_ps(dummy_mask,r12);
1042 /* EWALD ELECTROSTATICS */
1044 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1045 ewrt = _mm_mul_ps(r12,ewtabscale);
1046 ewitab = _mm_cvttps_epi32(ewrt);
1047 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1048 ewitab = _mm_slli_epi32(ewitab,2);
1049 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1050 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1051 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1052 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1053 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1054 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1055 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1056 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1057 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1059 /* Update potential sum for this i atom from the interaction with this j atom. */
1060 velec = _mm_andnot_ps(dummy_mask,velec);
1061 velecsum = _mm_add_ps(velecsum,velec);
1065 fscal = _mm_andnot_ps(dummy_mask,fscal);
1067 /* Calculate temporary vectorial force */
1068 tx = _mm_mul_ps(fscal,dx12);
1069 ty = _mm_mul_ps(fscal,dy12);
1070 tz = _mm_mul_ps(fscal,dz12);
1072 /* Update vectorial force */
1073 fix1 = _mm_add_ps(fix1,tx);
1074 fiy1 = _mm_add_ps(fiy1,ty);
1075 fiz1 = _mm_add_ps(fiz1,tz);
1077 fjx2 = _mm_add_ps(fjx2,tx);
1078 fjy2 = _mm_add_ps(fjy2,ty);
1079 fjz2 = _mm_add_ps(fjz2,tz);
1081 /**************************
1082 * CALCULATE INTERACTIONS *
1083 **************************/
1085 r20 = _mm_mul_ps(rsq20,rinv20);
1086 r20 = _mm_andnot_ps(dummy_mask,r20);
1088 /* EWALD ELECTROSTATICS */
1090 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1091 ewrt = _mm_mul_ps(r20,ewtabscale);
1092 ewitab = _mm_cvttps_epi32(ewrt);
1093 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1094 ewitab = _mm_slli_epi32(ewitab,2);
1095 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1096 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1097 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1098 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1099 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1100 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1101 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1102 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1103 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1105 /* Update potential sum for this i atom from the interaction with this j atom. */
1106 velec = _mm_andnot_ps(dummy_mask,velec);
1107 velecsum = _mm_add_ps(velecsum,velec);
1111 fscal = _mm_andnot_ps(dummy_mask,fscal);
1113 /* Calculate temporary vectorial force */
1114 tx = _mm_mul_ps(fscal,dx20);
1115 ty = _mm_mul_ps(fscal,dy20);
1116 tz = _mm_mul_ps(fscal,dz20);
1118 /* Update vectorial force */
1119 fix2 = _mm_add_ps(fix2,tx);
1120 fiy2 = _mm_add_ps(fiy2,ty);
1121 fiz2 = _mm_add_ps(fiz2,tz);
1123 fjx0 = _mm_add_ps(fjx0,tx);
1124 fjy0 = _mm_add_ps(fjy0,ty);
1125 fjz0 = _mm_add_ps(fjz0,tz);
1127 /**************************
1128 * CALCULATE INTERACTIONS *
1129 **************************/
1131 r21 = _mm_mul_ps(rsq21,rinv21);
1132 r21 = _mm_andnot_ps(dummy_mask,r21);
1134 /* EWALD ELECTROSTATICS */
1136 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1137 ewrt = _mm_mul_ps(r21,ewtabscale);
1138 ewitab = _mm_cvttps_epi32(ewrt);
1139 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1140 ewitab = _mm_slli_epi32(ewitab,2);
1141 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1142 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1143 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1144 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1145 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1146 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1147 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1148 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1149 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1151 /* Update potential sum for this i atom from the interaction with this j atom. */
1152 velec = _mm_andnot_ps(dummy_mask,velec);
1153 velecsum = _mm_add_ps(velecsum,velec);
1157 fscal = _mm_andnot_ps(dummy_mask,fscal);
1159 /* Calculate temporary vectorial force */
1160 tx = _mm_mul_ps(fscal,dx21);
1161 ty = _mm_mul_ps(fscal,dy21);
1162 tz = _mm_mul_ps(fscal,dz21);
1164 /* Update vectorial force */
1165 fix2 = _mm_add_ps(fix2,tx);
1166 fiy2 = _mm_add_ps(fiy2,ty);
1167 fiz2 = _mm_add_ps(fiz2,tz);
1169 fjx1 = _mm_add_ps(fjx1,tx);
1170 fjy1 = _mm_add_ps(fjy1,ty);
1171 fjz1 = _mm_add_ps(fjz1,tz);
1173 /**************************
1174 * CALCULATE INTERACTIONS *
1175 **************************/
1177 r22 = _mm_mul_ps(rsq22,rinv22);
1178 r22 = _mm_andnot_ps(dummy_mask,r22);
1180 /* EWALD ELECTROSTATICS */
1182 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1183 ewrt = _mm_mul_ps(r22,ewtabscale);
1184 ewitab = _mm_cvttps_epi32(ewrt);
1185 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1186 ewitab = _mm_slli_epi32(ewitab,2);
1187 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1188 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1189 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1190 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1191 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1192 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1193 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1194 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1195 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1197 /* Update potential sum for this i atom from the interaction with this j atom. */
1198 velec = _mm_andnot_ps(dummy_mask,velec);
1199 velecsum = _mm_add_ps(velecsum,velec);
1203 fscal = _mm_andnot_ps(dummy_mask,fscal);
1205 /* Calculate temporary vectorial force */
1206 tx = _mm_mul_ps(fscal,dx22);
1207 ty = _mm_mul_ps(fscal,dy22);
1208 tz = _mm_mul_ps(fscal,dz22);
1210 /* Update vectorial force */
1211 fix2 = _mm_add_ps(fix2,tx);
1212 fiy2 = _mm_add_ps(fiy2,ty);
1213 fiz2 = _mm_add_ps(fiz2,tz);
1215 fjx2 = _mm_add_ps(fjx2,tx);
1216 fjy2 = _mm_add_ps(fjy2,ty);
1217 fjz2 = _mm_add_ps(fjz2,tz);
1219 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1220 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1221 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1222 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1224 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1225 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1227 /* Inner loop uses 390 flops */
1230 /* End of innermost loop */
1232 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1233 f+i_coord_offset,fshift+i_shift_offset);
1236 /* Update potential energies */
1237 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1238 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1240 /* Increment number of inner iterations */
1241 inneriter += j_index_end - j_index_start;
1243 /* Outer loop uses 20 flops */
1246 /* Increment number of outer iterations */
1249 /* Update outer/inner flops */
1251 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*390);
1254 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single
1255 * Electrostatics interaction: Ewald
1256 * VdW interaction: LennardJones
1257 * Geometry: Water3-Water3
1258 * Calculate force/pot: Force
1261 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single
1262 (t_nblist * gmx_restrict nlist,
1263 rvec * gmx_restrict xx,
1264 rvec * gmx_restrict ff,
1265 t_forcerec * gmx_restrict fr,
1266 t_mdatoms * gmx_restrict mdatoms,
1267 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1268 t_nrnb * gmx_restrict nrnb)
1270 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1271 * just 0 for non-waters.
1272 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1273 * jnr indices corresponding to data put in the four positions in the SIMD register.
1275 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1276 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1277 int jnrA,jnrB,jnrC,jnrD;
1278 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1279 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1280 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1281 real rcutoff_scalar;
1282 real *shiftvec,*fshift,*x,*f;
1283 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1284 real scratch[4*DIM];
1285 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1287 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1289 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1291 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1292 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1293 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1294 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1295 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1296 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1297 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1298 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1299 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1300 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1301 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1302 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1303 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1304 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1305 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1306 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1307 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1310 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1313 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1314 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1316 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1318 __m128 dummy_mask,cutoff_mask;
1319 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1320 __m128 one = _mm_set1_ps(1.0);
1321 __m128 two = _mm_set1_ps(2.0);
1327 jindex = nlist->jindex;
1329 shiftidx = nlist->shift;
1331 shiftvec = fr->shift_vec[0];
1332 fshift = fr->fshift[0];
1333 facel = _mm_set1_ps(fr->epsfac);
1334 charge = mdatoms->chargeA;
1335 nvdwtype = fr->ntype;
1336 vdwparam = fr->nbfp;
1337 vdwtype = mdatoms->typeA;
1339 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1340 ewtab = fr->ic->tabq_coul_F;
1341 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1342 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1344 /* Setup water-specific parameters */
1345 inr = nlist->iinr[0];
1346 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1347 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1348 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1349 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1351 jq0 = _mm_set1_ps(charge[inr+0]);
1352 jq1 = _mm_set1_ps(charge[inr+1]);
1353 jq2 = _mm_set1_ps(charge[inr+2]);
1354 vdwjidx0A = 2*vdwtype[inr+0];
1355 qq00 = _mm_mul_ps(iq0,jq0);
1356 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1357 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1358 qq01 = _mm_mul_ps(iq0,jq1);
1359 qq02 = _mm_mul_ps(iq0,jq2);
1360 qq10 = _mm_mul_ps(iq1,jq0);
1361 qq11 = _mm_mul_ps(iq1,jq1);
1362 qq12 = _mm_mul_ps(iq1,jq2);
1363 qq20 = _mm_mul_ps(iq2,jq0);
1364 qq21 = _mm_mul_ps(iq2,jq1);
1365 qq22 = _mm_mul_ps(iq2,jq2);
1367 /* Avoid stupid compiler warnings */
1368 jnrA = jnrB = jnrC = jnrD = 0;
1369 j_coord_offsetA = 0;
1370 j_coord_offsetB = 0;
1371 j_coord_offsetC = 0;
1372 j_coord_offsetD = 0;
1377 for(iidx=0;iidx<4*DIM;iidx++)
1379 scratch[iidx] = 0.0;
1382 /* Start outer loop over neighborlists */
1383 for(iidx=0; iidx<nri; iidx++)
1385 /* Load shift vector for this list */
1386 i_shift_offset = DIM*shiftidx[iidx];
1388 /* Load limits for loop over neighbors */
1389 j_index_start = jindex[iidx];
1390 j_index_end = jindex[iidx+1];
1392 /* Get outer coordinate index */
1394 i_coord_offset = DIM*inr;
1396 /* Load i particle coords and add shift vector */
1397 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1398 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1400 fix0 = _mm_setzero_ps();
1401 fiy0 = _mm_setzero_ps();
1402 fiz0 = _mm_setzero_ps();
1403 fix1 = _mm_setzero_ps();
1404 fiy1 = _mm_setzero_ps();
1405 fiz1 = _mm_setzero_ps();
1406 fix2 = _mm_setzero_ps();
1407 fiy2 = _mm_setzero_ps();
1408 fiz2 = _mm_setzero_ps();
1410 /* Start inner kernel loop */
1411 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1414 /* Get j neighbor index, and coordinate index */
1416 jnrB = jjnr[jidx+1];
1417 jnrC = jjnr[jidx+2];
1418 jnrD = jjnr[jidx+3];
1419 j_coord_offsetA = DIM*jnrA;
1420 j_coord_offsetB = DIM*jnrB;
1421 j_coord_offsetC = DIM*jnrC;
1422 j_coord_offsetD = DIM*jnrD;
1424 /* load j atom coordinates */
1425 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1426 x+j_coord_offsetC,x+j_coord_offsetD,
1427 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1429 /* Calculate displacement vector */
1430 dx00 = _mm_sub_ps(ix0,jx0);
1431 dy00 = _mm_sub_ps(iy0,jy0);
1432 dz00 = _mm_sub_ps(iz0,jz0);
1433 dx01 = _mm_sub_ps(ix0,jx1);
1434 dy01 = _mm_sub_ps(iy0,jy1);
1435 dz01 = _mm_sub_ps(iz0,jz1);
1436 dx02 = _mm_sub_ps(ix0,jx2);
1437 dy02 = _mm_sub_ps(iy0,jy2);
1438 dz02 = _mm_sub_ps(iz0,jz2);
1439 dx10 = _mm_sub_ps(ix1,jx0);
1440 dy10 = _mm_sub_ps(iy1,jy0);
1441 dz10 = _mm_sub_ps(iz1,jz0);
1442 dx11 = _mm_sub_ps(ix1,jx1);
1443 dy11 = _mm_sub_ps(iy1,jy1);
1444 dz11 = _mm_sub_ps(iz1,jz1);
1445 dx12 = _mm_sub_ps(ix1,jx2);
1446 dy12 = _mm_sub_ps(iy1,jy2);
1447 dz12 = _mm_sub_ps(iz1,jz2);
1448 dx20 = _mm_sub_ps(ix2,jx0);
1449 dy20 = _mm_sub_ps(iy2,jy0);
1450 dz20 = _mm_sub_ps(iz2,jz0);
1451 dx21 = _mm_sub_ps(ix2,jx1);
1452 dy21 = _mm_sub_ps(iy2,jy1);
1453 dz21 = _mm_sub_ps(iz2,jz1);
1454 dx22 = _mm_sub_ps(ix2,jx2);
1455 dy22 = _mm_sub_ps(iy2,jy2);
1456 dz22 = _mm_sub_ps(iz2,jz2);
1458 /* Calculate squared distance and things based on it */
1459 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1460 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1461 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1462 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1463 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1464 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1465 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1466 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1467 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1469 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1470 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1471 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1472 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1473 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1474 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1475 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1476 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1477 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1479 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1480 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1481 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1482 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1483 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1484 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1485 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1486 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1487 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1489 fjx0 = _mm_setzero_ps();
1490 fjy0 = _mm_setzero_ps();
1491 fjz0 = _mm_setzero_ps();
1492 fjx1 = _mm_setzero_ps();
1493 fjy1 = _mm_setzero_ps();
1494 fjz1 = _mm_setzero_ps();
1495 fjx2 = _mm_setzero_ps();
1496 fjy2 = _mm_setzero_ps();
1497 fjz2 = _mm_setzero_ps();
1499 /**************************
1500 * CALCULATE INTERACTIONS *
1501 **************************/
1503 r00 = _mm_mul_ps(rsq00,rinv00);
1505 /* EWALD ELECTROSTATICS */
1507 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1508 ewrt = _mm_mul_ps(r00,ewtabscale);
1509 ewitab = _mm_cvttps_epi32(ewrt);
1510 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1511 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1512 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1514 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1515 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1517 /* LENNARD-JONES DISPERSION/REPULSION */
1519 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1520 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1522 fscal = _mm_add_ps(felec,fvdw);
1524 /* Calculate temporary vectorial force */
1525 tx = _mm_mul_ps(fscal,dx00);
1526 ty = _mm_mul_ps(fscal,dy00);
1527 tz = _mm_mul_ps(fscal,dz00);
1529 /* Update vectorial force */
1530 fix0 = _mm_add_ps(fix0,tx);
1531 fiy0 = _mm_add_ps(fiy0,ty);
1532 fiz0 = _mm_add_ps(fiz0,tz);
1534 fjx0 = _mm_add_ps(fjx0,tx);
1535 fjy0 = _mm_add_ps(fjy0,ty);
1536 fjz0 = _mm_add_ps(fjz0,tz);
1538 /**************************
1539 * CALCULATE INTERACTIONS *
1540 **************************/
1542 r01 = _mm_mul_ps(rsq01,rinv01);
1544 /* EWALD ELECTROSTATICS */
1546 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1547 ewrt = _mm_mul_ps(r01,ewtabscale);
1548 ewitab = _mm_cvttps_epi32(ewrt);
1549 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1550 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1551 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1553 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1554 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1558 /* Calculate temporary vectorial force */
1559 tx = _mm_mul_ps(fscal,dx01);
1560 ty = _mm_mul_ps(fscal,dy01);
1561 tz = _mm_mul_ps(fscal,dz01);
1563 /* Update vectorial force */
1564 fix0 = _mm_add_ps(fix0,tx);
1565 fiy0 = _mm_add_ps(fiy0,ty);
1566 fiz0 = _mm_add_ps(fiz0,tz);
1568 fjx1 = _mm_add_ps(fjx1,tx);
1569 fjy1 = _mm_add_ps(fjy1,ty);
1570 fjz1 = _mm_add_ps(fjz1,tz);
1572 /**************************
1573 * CALCULATE INTERACTIONS *
1574 **************************/
1576 r02 = _mm_mul_ps(rsq02,rinv02);
1578 /* EWALD ELECTROSTATICS */
1580 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1581 ewrt = _mm_mul_ps(r02,ewtabscale);
1582 ewitab = _mm_cvttps_epi32(ewrt);
1583 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1584 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1585 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1587 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1588 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1592 /* Calculate temporary vectorial force */
1593 tx = _mm_mul_ps(fscal,dx02);
1594 ty = _mm_mul_ps(fscal,dy02);
1595 tz = _mm_mul_ps(fscal,dz02);
1597 /* Update vectorial force */
1598 fix0 = _mm_add_ps(fix0,tx);
1599 fiy0 = _mm_add_ps(fiy0,ty);
1600 fiz0 = _mm_add_ps(fiz0,tz);
1602 fjx2 = _mm_add_ps(fjx2,tx);
1603 fjy2 = _mm_add_ps(fjy2,ty);
1604 fjz2 = _mm_add_ps(fjz2,tz);
1606 /**************************
1607 * CALCULATE INTERACTIONS *
1608 **************************/
1610 r10 = _mm_mul_ps(rsq10,rinv10);
1612 /* EWALD ELECTROSTATICS */
1614 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1615 ewrt = _mm_mul_ps(r10,ewtabscale);
1616 ewitab = _mm_cvttps_epi32(ewrt);
1617 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1618 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1619 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1621 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1622 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1626 /* Calculate temporary vectorial force */
1627 tx = _mm_mul_ps(fscal,dx10);
1628 ty = _mm_mul_ps(fscal,dy10);
1629 tz = _mm_mul_ps(fscal,dz10);
1631 /* Update vectorial force */
1632 fix1 = _mm_add_ps(fix1,tx);
1633 fiy1 = _mm_add_ps(fiy1,ty);
1634 fiz1 = _mm_add_ps(fiz1,tz);
1636 fjx0 = _mm_add_ps(fjx0,tx);
1637 fjy0 = _mm_add_ps(fjy0,ty);
1638 fjz0 = _mm_add_ps(fjz0,tz);
1640 /**************************
1641 * CALCULATE INTERACTIONS *
1642 **************************/
1644 r11 = _mm_mul_ps(rsq11,rinv11);
1646 /* EWALD ELECTROSTATICS */
1648 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1649 ewrt = _mm_mul_ps(r11,ewtabscale);
1650 ewitab = _mm_cvttps_epi32(ewrt);
1651 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1652 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1653 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1655 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1656 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1660 /* Calculate temporary vectorial force */
1661 tx = _mm_mul_ps(fscal,dx11);
1662 ty = _mm_mul_ps(fscal,dy11);
1663 tz = _mm_mul_ps(fscal,dz11);
1665 /* Update vectorial force */
1666 fix1 = _mm_add_ps(fix1,tx);
1667 fiy1 = _mm_add_ps(fiy1,ty);
1668 fiz1 = _mm_add_ps(fiz1,tz);
1670 fjx1 = _mm_add_ps(fjx1,tx);
1671 fjy1 = _mm_add_ps(fjy1,ty);
1672 fjz1 = _mm_add_ps(fjz1,tz);
1674 /**************************
1675 * CALCULATE INTERACTIONS *
1676 **************************/
1678 r12 = _mm_mul_ps(rsq12,rinv12);
1680 /* EWALD ELECTROSTATICS */
1682 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1683 ewrt = _mm_mul_ps(r12,ewtabscale);
1684 ewitab = _mm_cvttps_epi32(ewrt);
1685 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1686 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1687 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1689 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1690 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1694 /* Calculate temporary vectorial force */
1695 tx = _mm_mul_ps(fscal,dx12);
1696 ty = _mm_mul_ps(fscal,dy12);
1697 tz = _mm_mul_ps(fscal,dz12);
1699 /* Update vectorial force */
1700 fix1 = _mm_add_ps(fix1,tx);
1701 fiy1 = _mm_add_ps(fiy1,ty);
1702 fiz1 = _mm_add_ps(fiz1,tz);
1704 fjx2 = _mm_add_ps(fjx2,tx);
1705 fjy2 = _mm_add_ps(fjy2,ty);
1706 fjz2 = _mm_add_ps(fjz2,tz);
1708 /**************************
1709 * CALCULATE INTERACTIONS *
1710 **************************/
1712 r20 = _mm_mul_ps(rsq20,rinv20);
1714 /* EWALD ELECTROSTATICS */
1716 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1717 ewrt = _mm_mul_ps(r20,ewtabscale);
1718 ewitab = _mm_cvttps_epi32(ewrt);
1719 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1720 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1721 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1723 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1724 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1728 /* Calculate temporary vectorial force */
1729 tx = _mm_mul_ps(fscal,dx20);
1730 ty = _mm_mul_ps(fscal,dy20);
1731 tz = _mm_mul_ps(fscal,dz20);
1733 /* Update vectorial force */
1734 fix2 = _mm_add_ps(fix2,tx);
1735 fiy2 = _mm_add_ps(fiy2,ty);
1736 fiz2 = _mm_add_ps(fiz2,tz);
1738 fjx0 = _mm_add_ps(fjx0,tx);
1739 fjy0 = _mm_add_ps(fjy0,ty);
1740 fjz0 = _mm_add_ps(fjz0,tz);
1742 /**************************
1743 * CALCULATE INTERACTIONS *
1744 **************************/
1746 r21 = _mm_mul_ps(rsq21,rinv21);
1748 /* EWALD ELECTROSTATICS */
1750 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1751 ewrt = _mm_mul_ps(r21,ewtabscale);
1752 ewitab = _mm_cvttps_epi32(ewrt);
1753 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1754 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1755 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1757 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1758 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1762 /* Calculate temporary vectorial force */
1763 tx = _mm_mul_ps(fscal,dx21);
1764 ty = _mm_mul_ps(fscal,dy21);
1765 tz = _mm_mul_ps(fscal,dz21);
1767 /* Update vectorial force */
1768 fix2 = _mm_add_ps(fix2,tx);
1769 fiy2 = _mm_add_ps(fiy2,ty);
1770 fiz2 = _mm_add_ps(fiz2,tz);
1772 fjx1 = _mm_add_ps(fjx1,tx);
1773 fjy1 = _mm_add_ps(fjy1,ty);
1774 fjz1 = _mm_add_ps(fjz1,tz);
1776 /**************************
1777 * CALCULATE INTERACTIONS *
1778 **************************/
1780 r22 = _mm_mul_ps(rsq22,rinv22);
1782 /* EWALD ELECTROSTATICS */
1784 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1785 ewrt = _mm_mul_ps(r22,ewtabscale);
1786 ewitab = _mm_cvttps_epi32(ewrt);
1787 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1788 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1789 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1791 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1792 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1796 /* Calculate temporary vectorial force */
1797 tx = _mm_mul_ps(fscal,dx22);
1798 ty = _mm_mul_ps(fscal,dy22);
1799 tz = _mm_mul_ps(fscal,dz22);
1801 /* Update vectorial force */
1802 fix2 = _mm_add_ps(fix2,tx);
1803 fiy2 = _mm_add_ps(fiy2,ty);
1804 fiz2 = _mm_add_ps(fiz2,tz);
1806 fjx2 = _mm_add_ps(fjx2,tx);
1807 fjy2 = _mm_add_ps(fjy2,ty);
1808 fjz2 = _mm_add_ps(fjz2,tz);
1810 fjptrA = f+j_coord_offsetA;
1811 fjptrB = f+j_coord_offsetB;
1812 fjptrC = f+j_coord_offsetC;
1813 fjptrD = f+j_coord_offsetD;
1815 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1816 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1818 /* Inner loop uses 331 flops */
1821 if(jidx<j_index_end)
1824 /* Get j neighbor index, and coordinate index */
1825 jnrlistA = jjnr[jidx];
1826 jnrlistB = jjnr[jidx+1];
1827 jnrlistC = jjnr[jidx+2];
1828 jnrlistD = jjnr[jidx+3];
1829 /* Sign of each element will be negative for non-real atoms.
1830 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1831 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1833 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1834 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1835 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1836 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1837 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1838 j_coord_offsetA = DIM*jnrA;
1839 j_coord_offsetB = DIM*jnrB;
1840 j_coord_offsetC = DIM*jnrC;
1841 j_coord_offsetD = DIM*jnrD;
1843 /* load j atom coordinates */
1844 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1845 x+j_coord_offsetC,x+j_coord_offsetD,
1846 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1848 /* Calculate displacement vector */
1849 dx00 = _mm_sub_ps(ix0,jx0);
1850 dy00 = _mm_sub_ps(iy0,jy0);
1851 dz00 = _mm_sub_ps(iz0,jz0);
1852 dx01 = _mm_sub_ps(ix0,jx1);
1853 dy01 = _mm_sub_ps(iy0,jy1);
1854 dz01 = _mm_sub_ps(iz0,jz1);
1855 dx02 = _mm_sub_ps(ix0,jx2);
1856 dy02 = _mm_sub_ps(iy0,jy2);
1857 dz02 = _mm_sub_ps(iz0,jz2);
1858 dx10 = _mm_sub_ps(ix1,jx0);
1859 dy10 = _mm_sub_ps(iy1,jy0);
1860 dz10 = _mm_sub_ps(iz1,jz0);
1861 dx11 = _mm_sub_ps(ix1,jx1);
1862 dy11 = _mm_sub_ps(iy1,jy1);
1863 dz11 = _mm_sub_ps(iz1,jz1);
1864 dx12 = _mm_sub_ps(ix1,jx2);
1865 dy12 = _mm_sub_ps(iy1,jy2);
1866 dz12 = _mm_sub_ps(iz1,jz2);
1867 dx20 = _mm_sub_ps(ix2,jx0);
1868 dy20 = _mm_sub_ps(iy2,jy0);
1869 dz20 = _mm_sub_ps(iz2,jz0);
1870 dx21 = _mm_sub_ps(ix2,jx1);
1871 dy21 = _mm_sub_ps(iy2,jy1);
1872 dz21 = _mm_sub_ps(iz2,jz1);
1873 dx22 = _mm_sub_ps(ix2,jx2);
1874 dy22 = _mm_sub_ps(iy2,jy2);
1875 dz22 = _mm_sub_ps(iz2,jz2);
1877 /* Calculate squared distance and things based on it */
1878 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1879 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1880 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1881 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1882 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1883 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1884 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1885 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1886 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1888 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1889 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1890 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1891 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1892 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1893 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1894 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1895 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1896 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1898 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1899 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1900 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1901 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1902 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1903 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1904 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1905 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1906 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1908 fjx0 = _mm_setzero_ps();
1909 fjy0 = _mm_setzero_ps();
1910 fjz0 = _mm_setzero_ps();
1911 fjx1 = _mm_setzero_ps();
1912 fjy1 = _mm_setzero_ps();
1913 fjz1 = _mm_setzero_ps();
1914 fjx2 = _mm_setzero_ps();
1915 fjy2 = _mm_setzero_ps();
1916 fjz2 = _mm_setzero_ps();
1918 /**************************
1919 * CALCULATE INTERACTIONS *
1920 **************************/
1922 r00 = _mm_mul_ps(rsq00,rinv00);
1923 r00 = _mm_andnot_ps(dummy_mask,r00);
1925 /* EWALD ELECTROSTATICS */
1927 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1928 ewrt = _mm_mul_ps(r00,ewtabscale);
1929 ewitab = _mm_cvttps_epi32(ewrt);
1930 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1931 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1932 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1934 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1935 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1937 /* LENNARD-JONES DISPERSION/REPULSION */
1939 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1940 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1942 fscal = _mm_add_ps(felec,fvdw);
1944 fscal = _mm_andnot_ps(dummy_mask,fscal);
1946 /* Calculate temporary vectorial force */
1947 tx = _mm_mul_ps(fscal,dx00);
1948 ty = _mm_mul_ps(fscal,dy00);
1949 tz = _mm_mul_ps(fscal,dz00);
1951 /* Update vectorial force */
1952 fix0 = _mm_add_ps(fix0,tx);
1953 fiy0 = _mm_add_ps(fiy0,ty);
1954 fiz0 = _mm_add_ps(fiz0,tz);
1956 fjx0 = _mm_add_ps(fjx0,tx);
1957 fjy0 = _mm_add_ps(fjy0,ty);
1958 fjz0 = _mm_add_ps(fjz0,tz);
1960 /**************************
1961 * CALCULATE INTERACTIONS *
1962 **************************/
1964 r01 = _mm_mul_ps(rsq01,rinv01);
1965 r01 = _mm_andnot_ps(dummy_mask,r01);
1967 /* EWALD ELECTROSTATICS */
1969 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1970 ewrt = _mm_mul_ps(r01,ewtabscale);
1971 ewitab = _mm_cvttps_epi32(ewrt);
1972 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1973 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1974 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1976 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1977 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1981 fscal = _mm_andnot_ps(dummy_mask,fscal);
1983 /* Calculate temporary vectorial force */
1984 tx = _mm_mul_ps(fscal,dx01);
1985 ty = _mm_mul_ps(fscal,dy01);
1986 tz = _mm_mul_ps(fscal,dz01);
1988 /* Update vectorial force */
1989 fix0 = _mm_add_ps(fix0,tx);
1990 fiy0 = _mm_add_ps(fiy0,ty);
1991 fiz0 = _mm_add_ps(fiz0,tz);
1993 fjx1 = _mm_add_ps(fjx1,tx);
1994 fjy1 = _mm_add_ps(fjy1,ty);
1995 fjz1 = _mm_add_ps(fjz1,tz);
1997 /**************************
1998 * CALCULATE INTERACTIONS *
1999 **************************/
2001 r02 = _mm_mul_ps(rsq02,rinv02);
2002 r02 = _mm_andnot_ps(dummy_mask,r02);
2004 /* EWALD ELECTROSTATICS */
2006 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2007 ewrt = _mm_mul_ps(r02,ewtabscale);
2008 ewitab = _mm_cvttps_epi32(ewrt);
2009 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2010 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2011 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2013 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2014 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2018 fscal = _mm_andnot_ps(dummy_mask,fscal);
2020 /* Calculate temporary vectorial force */
2021 tx = _mm_mul_ps(fscal,dx02);
2022 ty = _mm_mul_ps(fscal,dy02);
2023 tz = _mm_mul_ps(fscal,dz02);
2025 /* Update vectorial force */
2026 fix0 = _mm_add_ps(fix0,tx);
2027 fiy0 = _mm_add_ps(fiy0,ty);
2028 fiz0 = _mm_add_ps(fiz0,tz);
2030 fjx2 = _mm_add_ps(fjx2,tx);
2031 fjy2 = _mm_add_ps(fjy2,ty);
2032 fjz2 = _mm_add_ps(fjz2,tz);
2034 /**************************
2035 * CALCULATE INTERACTIONS *
2036 **************************/
2038 r10 = _mm_mul_ps(rsq10,rinv10);
2039 r10 = _mm_andnot_ps(dummy_mask,r10);
2041 /* EWALD ELECTROSTATICS */
2043 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2044 ewrt = _mm_mul_ps(r10,ewtabscale);
2045 ewitab = _mm_cvttps_epi32(ewrt);
2046 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2047 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2048 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2050 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2051 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2055 fscal = _mm_andnot_ps(dummy_mask,fscal);
2057 /* Calculate temporary vectorial force */
2058 tx = _mm_mul_ps(fscal,dx10);
2059 ty = _mm_mul_ps(fscal,dy10);
2060 tz = _mm_mul_ps(fscal,dz10);
2062 /* Update vectorial force */
2063 fix1 = _mm_add_ps(fix1,tx);
2064 fiy1 = _mm_add_ps(fiy1,ty);
2065 fiz1 = _mm_add_ps(fiz1,tz);
2067 fjx0 = _mm_add_ps(fjx0,tx);
2068 fjy0 = _mm_add_ps(fjy0,ty);
2069 fjz0 = _mm_add_ps(fjz0,tz);
2071 /**************************
2072 * CALCULATE INTERACTIONS *
2073 **************************/
2075 r11 = _mm_mul_ps(rsq11,rinv11);
2076 r11 = _mm_andnot_ps(dummy_mask,r11);
2078 /* EWALD ELECTROSTATICS */
2080 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2081 ewrt = _mm_mul_ps(r11,ewtabscale);
2082 ewitab = _mm_cvttps_epi32(ewrt);
2083 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2084 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2085 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2087 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2088 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2092 fscal = _mm_andnot_ps(dummy_mask,fscal);
2094 /* Calculate temporary vectorial force */
2095 tx = _mm_mul_ps(fscal,dx11);
2096 ty = _mm_mul_ps(fscal,dy11);
2097 tz = _mm_mul_ps(fscal,dz11);
2099 /* Update vectorial force */
2100 fix1 = _mm_add_ps(fix1,tx);
2101 fiy1 = _mm_add_ps(fiy1,ty);
2102 fiz1 = _mm_add_ps(fiz1,tz);
2104 fjx1 = _mm_add_ps(fjx1,tx);
2105 fjy1 = _mm_add_ps(fjy1,ty);
2106 fjz1 = _mm_add_ps(fjz1,tz);
2108 /**************************
2109 * CALCULATE INTERACTIONS *
2110 **************************/
2112 r12 = _mm_mul_ps(rsq12,rinv12);
2113 r12 = _mm_andnot_ps(dummy_mask,r12);
2115 /* EWALD ELECTROSTATICS */
2117 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2118 ewrt = _mm_mul_ps(r12,ewtabscale);
2119 ewitab = _mm_cvttps_epi32(ewrt);
2120 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2121 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2122 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2124 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2125 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2129 fscal = _mm_andnot_ps(dummy_mask,fscal);
2131 /* Calculate temporary vectorial force */
2132 tx = _mm_mul_ps(fscal,dx12);
2133 ty = _mm_mul_ps(fscal,dy12);
2134 tz = _mm_mul_ps(fscal,dz12);
2136 /* Update vectorial force */
2137 fix1 = _mm_add_ps(fix1,tx);
2138 fiy1 = _mm_add_ps(fiy1,ty);
2139 fiz1 = _mm_add_ps(fiz1,tz);
2141 fjx2 = _mm_add_ps(fjx2,tx);
2142 fjy2 = _mm_add_ps(fjy2,ty);
2143 fjz2 = _mm_add_ps(fjz2,tz);
2145 /**************************
2146 * CALCULATE INTERACTIONS *
2147 **************************/
2149 r20 = _mm_mul_ps(rsq20,rinv20);
2150 r20 = _mm_andnot_ps(dummy_mask,r20);
2152 /* EWALD ELECTROSTATICS */
2154 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2155 ewrt = _mm_mul_ps(r20,ewtabscale);
2156 ewitab = _mm_cvttps_epi32(ewrt);
2157 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2158 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2159 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2161 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2162 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2166 fscal = _mm_andnot_ps(dummy_mask,fscal);
2168 /* Calculate temporary vectorial force */
2169 tx = _mm_mul_ps(fscal,dx20);
2170 ty = _mm_mul_ps(fscal,dy20);
2171 tz = _mm_mul_ps(fscal,dz20);
2173 /* Update vectorial force */
2174 fix2 = _mm_add_ps(fix2,tx);
2175 fiy2 = _mm_add_ps(fiy2,ty);
2176 fiz2 = _mm_add_ps(fiz2,tz);
2178 fjx0 = _mm_add_ps(fjx0,tx);
2179 fjy0 = _mm_add_ps(fjy0,ty);
2180 fjz0 = _mm_add_ps(fjz0,tz);
2182 /**************************
2183 * CALCULATE INTERACTIONS *
2184 **************************/
2186 r21 = _mm_mul_ps(rsq21,rinv21);
2187 r21 = _mm_andnot_ps(dummy_mask,r21);
2189 /* EWALD ELECTROSTATICS */
2191 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2192 ewrt = _mm_mul_ps(r21,ewtabscale);
2193 ewitab = _mm_cvttps_epi32(ewrt);
2194 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2195 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2196 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2198 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2199 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2203 fscal = _mm_andnot_ps(dummy_mask,fscal);
2205 /* Calculate temporary vectorial force */
2206 tx = _mm_mul_ps(fscal,dx21);
2207 ty = _mm_mul_ps(fscal,dy21);
2208 tz = _mm_mul_ps(fscal,dz21);
2210 /* Update vectorial force */
2211 fix2 = _mm_add_ps(fix2,tx);
2212 fiy2 = _mm_add_ps(fiy2,ty);
2213 fiz2 = _mm_add_ps(fiz2,tz);
2215 fjx1 = _mm_add_ps(fjx1,tx);
2216 fjy1 = _mm_add_ps(fjy1,ty);
2217 fjz1 = _mm_add_ps(fjz1,tz);
2219 /**************************
2220 * CALCULATE INTERACTIONS *
2221 **************************/
2223 r22 = _mm_mul_ps(rsq22,rinv22);
2224 r22 = _mm_andnot_ps(dummy_mask,r22);
2226 /* EWALD ELECTROSTATICS */
2228 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2229 ewrt = _mm_mul_ps(r22,ewtabscale);
2230 ewitab = _mm_cvttps_epi32(ewrt);
2231 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2232 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2233 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2235 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2236 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2240 fscal = _mm_andnot_ps(dummy_mask,fscal);
2242 /* Calculate temporary vectorial force */
2243 tx = _mm_mul_ps(fscal,dx22);
2244 ty = _mm_mul_ps(fscal,dy22);
2245 tz = _mm_mul_ps(fscal,dz22);
2247 /* Update vectorial force */
2248 fix2 = _mm_add_ps(fix2,tx);
2249 fiy2 = _mm_add_ps(fiy2,ty);
2250 fiz2 = _mm_add_ps(fiz2,tz);
2252 fjx2 = _mm_add_ps(fjx2,tx);
2253 fjy2 = _mm_add_ps(fjy2,ty);
2254 fjz2 = _mm_add_ps(fjz2,tz);
2256 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2257 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2258 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2259 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2261 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2262 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2264 /* Inner loop uses 340 flops */
2267 /* End of innermost loop */
2269 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2270 f+i_coord_offset,fshift+i_shift_offset);
2272 /* Increment number of inner iterations */
2273 inneriter += j_index_end - j_index_start;
2275 /* Outer loop uses 18 flops */
2278 /* Increment number of outer iterations */
2281 /* Update outer/inner flops */
2283 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*340);