2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_sse2_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
89 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
91 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
93 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
94 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
95 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
96 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
97 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
98 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
99 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
100 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
101 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
102 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
103 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108 __m128 dummy_mask,cutoff_mask;
109 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
110 __m128 one = _mm_set1_ps(1.0);
111 __m128 two = _mm_set1_ps(2.0);
117 jindex = nlist->jindex;
119 shiftidx = nlist->shift;
121 shiftvec = fr->shift_vec[0];
122 fshift = fr->fshift[0];
123 facel = _mm_set1_ps(fr->ic->epsfac);
124 charge = mdatoms->chargeA;
126 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
127 ewtab = fr->ic->tabq_coul_FDV0;
128 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
129 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
134 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
135 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
137 jq1 = _mm_set1_ps(charge[inr+1]);
138 jq2 = _mm_set1_ps(charge[inr+2]);
139 jq3 = _mm_set1_ps(charge[inr+3]);
140 qq11 = _mm_mul_ps(iq1,jq1);
141 qq12 = _mm_mul_ps(iq1,jq2);
142 qq13 = _mm_mul_ps(iq1,jq3);
143 qq21 = _mm_mul_ps(iq2,jq1);
144 qq22 = _mm_mul_ps(iq2,jq2);
145 qq23 = _mm_mul_ps(iq2,jq3);
146 qq31 = _mm_mul_ps(iq3,jq1);
147 qq32 = _mm_mul_ps(iq3,jq2);
148 qq33 = _mm_mul_ps(iq3,jq3);
150 /* Avoid stupid compiler warnings */
151 jnrA = jnrB = jnrC = jnrD = 0;
160 for(iidx=0;iidx<4*DIM;iidx++)
165 /* Start outer loop over neighborlists */
166 for(iidx=0; iidx<nri; iidx++)
168 /* Load shift vector for this list */
169 i_shift_offset = DIM*shiftidx[iidx];
171 /* Load limits for loop over neighbors */
172 j_index_start = jindex[iidx];
173 j_index_end = jindex[iidx+1];
175 /* Get outer coordinate index */
177 i_coord_offset = DIM*inr;
179 /* Load i particle coords and add shift vector */
180 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
181 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
183 fix1 = _mm_setzero_ps();
184 fiy1 = _mm_setzero_ps();
185 fiz1 = _mm_setzero_ps();
186 fix2 = _mm_setzero_ps();
187 fiy2 = _mm_setzero_ps();
188 fiz2 = _mm_setzero_ps();
189 fix3 = _mm_setzero_ps();
190 fiy3 = _mm_setzero_ps();
191 fiz3 = _mm_setzero_ps();
193 /* Reset potential sums */
194 velecsum = _mm_setzero_ps();
196 /* Start inner kernel loop */
197 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200 /* Get j neighbor index, and coordinate index */
205 j_coord_offsetA = DIM*jnrA;
206 j_coord_offsetB = DIM*jnrB;
207 j_coord_offsetC = DIM*jnrC;
208 j_coord_offsetD = DIM*jnrD;
210 /* load j atom coordinates */
211 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
212 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
213 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
215 /* Calculate displacement vector */
216 dx11 = _mm_sub_ps(ix1,jx1);
217 dy11 = _mm_sub_ps(iy1,jy1);
218 dz11 = _mm_sub_ps(iz1,jz1);
219 dx12 = _mm_sub_ps(ix1,jx2);
220 dy12 = _mm_sub_ps(iy1,jy2);
221 dz12 = _mm_sub_ps(iz1,jz2);
222 dx13 = _mm_sub_ps(ix1,jx3);
223 dy13 = _mm_sub_ps(iy1,jy3);
224 dz13 = _mm_sub_ps(iz1,jz3);
225 dx21 = _mm_sub_ps(ix2,jx1);
226 dy21 = _mm_sub_ps(iy2,jy1);
227 dz21 = _mm_sub_ps(iz2,jz1);
228 dx22 = _mm_sub_ps(ix2,jx2);
229 dy22 = _mm_sub_ps(iy2,jy2);
230 dz22 = _mm_sub_ps(iz2,jz2);
231 dx23 = _mm_sub_ps(ix2,jx3);
232 dy23 = _mm_sub_ps(iy2,jy3);
233 dz23 = _mm_sub_ps(iz2,jz3);
234 dx31 = _mm_sub_ps(ix3,jx1);
235 dy31 = _mm_sub_ps(iy3,jy1);
236 dz31 = _mm_sub_ps(iz3,jz1);
237 dx32 = _mm_sub_ps(ix3,jx2);
238 dy32 = _mm_sub_ps(iy3,jy2);
239 dz32 = _mm_sub_ps(iz3,jz2);
240 dx33 = _mm_sub_ps(ix3,jx3);
241 dy33 = _mm_sub_ps(iy3,jy3);
242 dz33 = _mm_sub_ps(iz3,jz3);
244 /* Calculate squared distance and things based on it */
245 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
246 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
247 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
248 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
249 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
250 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
251 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
252 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
253 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
255 rinv11 = sse2_invsqrt_f(rsq11);
256 rinv12 = sse2_invsqrt_f(rsq12);
257 rinv13 = sse2_invsqrt_f(rsq13);
258 rinv21 = sse2_invsqrt_f(rsq21);
259 rinv22 = sse2_invsqrt_f(rsq22);
260 rinv23 = sse2_invsqrt_f(rsq23);
261 rinv31 = sse2_invsqrt_f(rsq31);
262 rinv32 = sse2_invsqrt_f(rsq32);
263 rinv33 = sse2_invsqrt_f(rsq33);
265 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
266 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
267 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
268 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
269 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
270 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
271 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
272 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
273 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
275 fjx1 = _mm_setzero_ps();
276 fjy1 = _mm_setzero_ps();
277 fjz1 = _mm_setzero_ps();
278 fjx2 = _mm_setzero_ps();
279 fjy2 = _mm_setzero_ps();
280 fjz2 = _mm_setzero_ps();
281 fjx3 = _mm_setzero_ps();
282 fjy3 = _mm_setzero_ps();
283 fjz3 = _mm_setzero_ps();
285 /**************************
286 * CALCULATE INTERACTIONS *
287 **************************/
289 r11 = _mm_mul_ps(rsq11,rinv11);
291 /* EWALD ELECTROSTATICS */
293 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
294 ewrt = _mm_mul_ps(r11,ewtabscale);
295 ewitab = _mm_cvttps_epi32(ewrt);
296 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
297 ewitab = _mm_slli_epi32(ewitab,2);
298 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
299 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
300 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
301 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
302 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
303 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
304 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
305 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
306 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
308 /* Update potential sum for this i atom from the interaction with this j atom. */
309 velecsum = _mm_add_ps(velecsum,velec);
313 /* Calculate temporary vectorial force */
314 tx = _mm_mul_ps(fscal,dx11);
315 ty = _mm_mul_ps(fscal,dy11);
316 tz = _mm_mul_ps(fscal,dz11);
318 /* Update vectorial force */
319 fix1 = _mm_add_ps(fix1,tx);
320 fiy1 = _mm_add_ps(fiy1,ty);
321 fiz1 = _mm_add_ps(fiz1,tz);
323 fjx1 = _mm_add_ps(fjx1,tx);
324 fjy1 = _mm_add_ps(fjy1,ty);
325 fjz1 = _mm_add_ps(fjz1,tz);
327 /**************************
328 * CALCULATE INTERACTIONS *
329 **************************/
331 r12 = _mm_mul_ps(rsq12,rinv12);
333 /* EWALD ELECTROSTATICS */
335 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
336 ewrt = _mm_mul_ps(r12,ewtabscale);
337 ewitab = _mm_cvttps_epi32(ewrt);
338 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
339 ewitab = _mm_slli_epi32(ewitab,2);
340 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
341 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
342 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
343 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
344 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
345 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
346 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
347 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
348 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
350 /* Update potential sum for this i atom from the interaction with this j atom. */
351 velecsum = _mm_add_ps(velecsum,velec);
355 /* Calculate temporary vectorial force */
356 tx = _mm_mul_ps(fscal,dx12);
357 ty = _mm_mul_ps(fscal,dy12);
358 tz = _mm_mul_ps(fscal,dz12);
360 /* Update vectorial force */
361 fix1 = _mm_add_ps(fix1,tx);
362 fiy1 = _mm_add_ps(fiy1,ty);
363 fiz1 = _mm_add_ps(fiz1,tz);
365 fjx2 = _mm_add_ps(fjx2,tx);
366 fjy2 = _mm_add_ps(fjy2,ty);
367 fjz2 = _mm_add_ps(fjz2,tz);
369 /**************************
370 * CALCULATE INTERACTIONS *
371 **************************/
373 r13 = _mm_mul_ps(rsq13,rinv13);
375 /* EWALD ELECTROSTATICS */
377 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
378 ewrt = _mm_mul_ps(r13,ewtabscale);
379 ewitab = _mm_cvttps_epi32(ewrt);
380 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
381 ewitab = _mm_slli_epi32(ewitab,2);
382 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
383 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
384 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
385 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
386 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
387 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
388 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
389 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
390 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
392 /* Update potential sum for this i atom from the interaction with this j atom. */
393 velecsum = _mm_add_ps(velecsum,velec);
397 /* Calculate temporary vectorial force */
398 tx = _mm_mul_ps(fscal,dx13);
399 ty = _mm_mul_ps(fscal,dy13);
400 tz = _mm_mul_ps(fscal,dz13);
402 /* Update vectorial force */
403 fix1 = _mm_add_ps(fix1,tx);
404 fiy1 = _mm_add_ps(fiy1,ty);
405 fiz1 = _mm_add_ps(fiz1,tz);
407 fjx3 = _mm_add_ps(fjx3,tx);
408 fjy3 = _mm_add_ps(fjy3,ty);
409 fjz3 = _mm_add_ps(fjz3,tz);
411 /**************************
412 * CALCULATE INTERACTIONS *
413 **************************/
415 r21 = _mm_mul_ps(rsq21,rinv21);
417 /* EWALD ELECTROSTATICS */
419 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
420 ewrt = _mm_mul_ps(r21,ewtabscale);
421 ewitab = _mm_cvttps_epi32(ewrt);
422 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
423 ewitab = _mm_slli_epi32(ewitab,2);
424 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
425 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
426 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
427 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
428 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
429 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
430 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
431 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
432 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
434 /* Update potential sum for this i atom from the interaction with this j atom. */
435 velecsum = _mm_add_ps(velecsum,velec);
439 /* Calculate temporary vectorial force */
440 tx = _mm_mul_ps(fscal,dx21);
441 ty = _mm_mul_ps(fscal,dy21);
442 tz = _mm_mul_ps(fscal,dz21);
444 /* Update vectorial force */
445 fix2 = _mm_add_ps(fix2,tx);
446 fiy2 = _mm_add_ps(fiy2,ty);
447 fiz2 = _mm_add_ps(fiz2,tz);
449 fjx1 = _mm_add_ps(fjx1,tx);
450 fjy1 = _mm_add_ps(fjy1,ty);
451 fjz1 = _mm_add_ps(fjz1,tz);
453 /**************************
454 * CALCULATE INTERACTIONS *
455 **************************/
457 r22 = _mm_mul_ps(rsq22,rinv22);
459 /* EWALD ELECTROSTATICS */
461 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
462 ewrt = _mm_mul_ps(r22,ewtabscale);
463 ewitab = _mm_cvttps_epi32(ewrt);
464 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
465 ewitab = _mm_slli_epi32(ewitab,2);
466 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
467 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
468 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
469 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
470 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
471 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
472 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
473 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
474 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
476 /* Update potential sum for this i atom from the interaction with this j atom. */
477 velecsum = _mm_add_ps(velecsum,velec);
481 /* Calculate temporary vectorial force */
482 tx = _mm_mul_ps(fscal,dx22);
483 ty = _mm_mul_ps(fscal,dy22);
484 tz = _mm_mul_ps(fscal,dz22);
486 /* Update vectorial force */
487 fix2 = _mm_add_ps(fix2,tx);
488 fiy2 = _mm_add_ps(fiy2,ty);
489 fiz2 = _mm_add_ps(fiz2,tz);
491 fjx2 = _mm_add_ps(fjx2,tx);
492 fjy2 = _mm_add_ps(fjy2,ty);
493 fjz2 = _mm_add_ps(fjz2,tz);
495 /**************************
496 * CALCULATE INTERACTIONS *
497 **************************/
499 r23 = _mm_mul_ps(rsq23,rinv23);
501 /* EWALD ELECTROSTATICS */
503 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
504 ewrt = _mm_mul_ps(r23,ewtabscale);
505 ewitab = _mm_cvttps_epi32(ewrt);
506 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
507 ewitab = _mm_slli_epi32(ewitab,2);
508 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
509 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
510 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
511 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
512 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
513 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
514 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
515 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
516 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
518 /* Update potential sum for this i atom from the interaction with this j atom. */
519 velecsum = _mm_add_ps(velecsum,velec);
523 /* Calculate temporary vectorial force */
524 tx = _mm_mul_ps(fscal,dx23);
525 ty = _mm_mul_ps(fscal,dy23);
526 tz = _mm_mul_ps(fscal,dz23);
528 /* Update vectorial force */
529 fix2 = _mm_add_ps(fix2,tx);
530 fiy2 = _mm_add_ps(fiy2,ty);
531 fiz2 = _mm_add_ps(fiz2,tz);
533 fjx3 = _mm_add_ps(fjx3,tx);
534 fjy3 = _mm_add_ps(fjy3,ty);
535 fjz3 = _mm_add_ps(fjz3,tz);
537 /**************************
538 * CALCULATE INTERACTIONS *
539 **************************/
541 r31 = _mm_mul_ps(rsq31,rinv31);
543 /* EWALD ELECTROSTATICS */
545 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
546 ewrt = _mm_mul_ps(r31,ewtabscale);
547 ewitab = _mm_cvttps_epi32(ewrt);
548 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
549 ewitab = _mm_slli_epi32(ewitab,2);
550 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
551 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
552 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
553 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
554 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
555 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
556 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
557 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
558 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
560 /* Update potential sum for this i atom from the interaction with this j atom. */
561 velecsum = _mm_add_ps(velecsum,velec);
565 /* Calculate temporary vectorial force */
566 tx = _mm_mul_ps(fscal,dx31);
567 ty = _mm_mul_ps(fscal,dy31);
568 tz = _mm_mul_ps(fscal,dz31);
570 /* Update vectorial force */
571 fix3 = _mm_add_ps(fix3,tx);
572 fiy3 = _mm_add_ps(fiy3,ty);
573 fiz3 = _mm_add_ps(fiz3,tz);
575 fjx1 = _mm_add_ps(fjx1,tx);
576 fjy1 = _mm_add_ps(fjy1,ty);
577 fjz1 = _mm_add_ps(fjz1,tz);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 r32 = _mm_mul_ps(rsq32,rinv32);
585 /* EWALD ELECTROSTATICS */
587 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
588 ewrt = _mm_mul_ps(r32,ewtabscale);
589 ewitab = _mm_cvttps_epi32(ewrt);
590 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
591 ewitab = _mm_slli_epi32(ewitab,2);
592 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
593 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
594 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
595 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
596 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
597 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
598 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
599 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
600 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
602 /* Update potential sum for this i atom from the interaction with this j atom. */
603 velecsum = _mm_add_ps(velecsum,velec);
607 /* Calculate temporary vectorial force */
608 tx = _mm_mul_ps(fscal,dx32);
609 ty = _mm_mul_ps(fscal,dy32);
610 tz = _mm_mul_ps(fscal,dz32);
612 /* Update vectorial force */
613 fix3 = _mm_add_ps(fix3,tx);
614 fiy3 = _mm_add_ps(fiy3,ty);
615 fiz3 = _mm_add_ps(fiz3,tz);
617 fjx2 = _mm_add_ps(fjx2,tx);
618 fjy2 = _mm_add_ps(fjy2,ty);
619 fjz2 = _mm_add_ps(fjz2,tz);
621 /**************************
622 * CALCULATE INTERACTIONS *
623 **************************/
625 r33 = _mm_mul_ps(rsq33,rinv33);
627 /* EWALD ELECTROSTATICS */
629 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
630 ewrt = _mm_mul_ps(r33,ewtabscale);
631 ewitab = _mm_cvttps_epi32(ewrt);
632 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
633 ewitab = _mm_slli_epi32(ewitab,2);
634 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
635 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
636 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
637 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
638 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
639 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
640 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
641 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
642 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
644 /* Update potential sum for this i atom from the interaction with this j atom. */
645 velecsum = _mm_add_ps(velecsum,velec);
649 /* Calculate temporary vectorial force */
650 tx = _mm_mul_ps(fscal,dx33);
651 ty = _mm_mul_ps(fscal,dy33);
652 tz = _mm_mul_ps(fscal,dz33);
654 /* Update vectorial force */
655 fix3 = _mm_add_ps(fix3,tx);
656 fiy3 = _mm_add_ps(fiy3,ty);
657 fiz3 = _mm_add_ps(fiz3,tz);
659 fjx3 = _mm_add_ps(fjx3,tx);
660 fjy3 = _mm_add_ps(fjy3,ty);
661 fjz3 = _mm_add_ps(fjz3,tz);
663 fjptrA = f+j_coord_offsetA;
664 fjptrB = f+j_coord_offsetB;
665 fjptrC = f+j_coord_offsetC;
666 fjptrD = f+j_coord_offsetD;
668 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
669 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
671 /* Inner loop uses 369 flops */
677 /* Get j neighbor index, and coordinate index */
678 jnrlistA = jjnr[jidx];
679 jnrlistB = jjnr[jidx+1];
680 jnrlistC = jjnr[jidx+2];
681 jnrlistD = jjnr[jidx+3];
682 /* Sign of each element will be negative for non-real atoms.
683 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
684 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
686 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
687 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
688 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
689 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
690 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
691 j_coord_offsetA = DIM*jnrA;
692 j_coord_offsetB = DIM*jnrB;
693 j_coord_offsetC = DIM*jnrC;
694 j_coord_offsetD = DIM*jnrD;
696 /* load j atom coordinates */
697 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
698 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
699 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
701 /* Calculate displacement vector */
702 dx11 = _mm_sub_ps(ix1,jx1);
703 dy11 = _mm_sub_ps(iy1,jy1);
704 dz11 = _mm_sub_ps(iz1,jz1);
705 dx12 = _mm_sub_ps(ix1,jx2);
706 dy12 = _mm_sub_ps(iy1,jy2);
707 dz12 = _mm_sub_ps(iz1,jz2);
708 dx13 = _mm_sub_ps(ix1,jx3);
709 dy13 = _mm_sub_ps(iy1,jy3);
710 dz13 = _mm_sub_ps(iz1,jz3);
711 dx21 = _mm_sub_ps(ix2,jx1);
712 dy21 = _mm_sub_ps(iy2,jy1);
713 dz21 = _mm_sub_ps(iz2,jz1);
714 dx22 = _mm_sub_ps(ix2,jx2);
715 dy22 = _mm_sub_ps(iy2,jy2);
716 dz22 = _mm_sub_ps(iz2,jz2);
717 dx23 = _mm_sub_ps(ix2,jx3);
718 dy23 = _mm_sub_ps(iy2,jy3);
719 dz23 = _mm_sub_ps(iz2,jz3);
720 dx31 = _mm_sub_ps(ix3,jx1);
721 dy31 = _mm_sub_ps(iy3,jy1);
722 dz31 = _mm_sub_ps(iz3,jz1);
723 dx32 = _mm_sub_ps(ix3,jx2);
724 dy32 = _mm_sub_ps(iy3,jy2);
725 dz32 = _mm_sub_ps(iz3,jz2);
726 dx33 = _mm_sub_ps(ix3,jx3);
727 dy33 = _mm_sub_ps(iy3,jy3);
728 dz33 = _mm_sub_ps(iz3,jz3);
730 /* Calculate squared distance and things based on it */
731 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
732 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
733 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
734 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
735 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
736 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
737 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
738 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
739 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
741 rinv11 = sse2_invsqrt_f(rsq11);
742 rinv12 = sse2_invsqrt_f(rsq12);
743 rinv13 = sse2_invsqrt_f(rsq13);
744 rinv21 = sse2_invsqrt_f(rsq21);
745 rinv22 = sse2_invsqrt_f(rsq22);
746 rinv23 = sse2_invsqrt_f(rsq23);
747 rinv31 = sse2_invsqrt_f(rsq31);
748 rinv32 = sse2_invsqrt_f(rsq32);
749 rinv33 = sse2_invsqrt_f(rsq33);
751 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
752 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
753 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
754 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
755 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
756 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
757 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
758 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
759 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
761 fjx1 = _mm_setzero_ps();
762 fjy1 = _mm_setzero_ps();
763 fjz1 = _mm_setzero_ps();
764 fjx2 = _mm_setzero_ps();
765 fjy2 = _mm_setzero_ps();
766 fjz2 = _mm_setzero_ps();
767 fjx3 = _mm_setzero_ps();
768 fjy3 = _mm_setzero_ps();
769 fjz3 = _mm_setzero_ps();
771 /**************************
772 * CALCULATE INTERACTIONS *
773 **************************/
775 r11 = _mm_mul_ps(rsq11,rinv11);
776 r11 = _mm_andnot_ps(dummy_mask,r11);
778 /* EWALD ELECTROSTATICS */
780 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
781 ewrt = _mm_mul_ps(r11,ewtabscale);
782 ewitab = _mm_cvttps_epi32(ewrt);
783 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
784 ewitab = _mm_slli_epi32(ewitab,2);
785 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
786 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
787 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
788 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
789 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
790 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
791 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
792 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
793 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
795 /* Update potential sum for this i atom from the interaction with this j atom. */
796 velec = _mm_andnot_ps(dummy_mask,velec);
797 velecsum = _mm_add_ps(velecsum,velec);
801 fscal = _mm_andnot_ps(dummy_mask,fscal);
803 /* Calculate temporary vectorial force */
804 tx = _mm_mul_ps(fscal,dx11);
805 ty = _mm_mul_ps(fscal,dy11);
806 tz = _mm_mul_ps(fscal,dz11);
808 /* Update vectorial force */
809 fix1 = _mm_add_ps(fix1,tx);
810 fiy1 = _mm_add_ps(fiy1,ty);
811 fiz1 = _mm_add_ps(fiz1,tz);
813 fjx1 = _mm_add_ps(fjx1,tx);
814 fjy1 = _mm_add_ps(fjy1,ty);
815 fjz1 = _mm_add_ps(fjz1,tz);
817 /**************************
818 * CALCULATE INTERACTIONS *
819 **************************/
821 r12 = _mm_mul_ps(rsq12,rinv12);
822 r12 = _mm_andnot_ps(dummy_mask,r12);
824 /* EWALD ELECTROSTATICS */
826 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
827 ewrt = _mm_mul_ps(r12,ewtabscale);
828 ewitab = _mm_cvttps_epi32(ewrt);
829 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
830 ewitab = _mm_slli_epi32(ewitab,2);
831 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
832 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
833 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
834 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
835 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
836 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
837 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
838 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
839 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
841 /* Update potential sum for this i atom from the interaction with this j atom. */
842 velec = _mm_andnot_ps(dummy_mask,velec);
843 velecsum = _mm_add_ps(velecsum,velec);
847 fscal = _mm_andnot_ps(dummy_mask,fscal);
849 /* Calculate temporary vectorial force */
850 tx = _mm_mul_ps(fscal,dx12);
851 ty = _mm_mul_ps(fscal,dy12);
852 tz = _mm_mul_ps(fscal,dz12);
854 /* Update vectorial force */
855 fix1 = _mm_add_ps(fix1,tx);
856 fiy1 = _mm_add_ps(fiy1,ty);
857 fiz1 = _mm_add_ps(fiz1,tz);
859 fjx2 = _mm_add_ps(fjx2,tx);
860 fjy2 = _mm_add_ps(fjy2,ty);
861 fjz2 = _mm_add_ps(fjz2,tz);
863 /**************************
864 * CALCULATE INTERACTIONS *
865 **************************/
867 r13 = _mm_mul_ps(rsq13,rinv13);
868 r13 = _mm_andnot_ps(dummy_mask,r13);
870 /* EWALD ELECTROSTATICS */
872 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
873 ewrt = _mm_mul_ps(r13,ewtabscale);
874 ewitab = _mm_cvttps_epi32(ewrt);
875 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
876 ewitab = _mm_slli_epi32(ewitab,2);
877 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
878 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
879 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
880 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
881 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
882 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
883 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
884 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
885 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
887 /* Update potential sum for this i atom from the interaction with this j atom. */
888 velec = _mm_andnot_ps(dummy_mask,velec);
889 velecsum = _mm_add_ps(velecsum,velec);
893 fscal = _mm_andnot_ps(dummy_mask,fscal);
895 /* Calculate temporary vectorial force */
896 tx = _mm_mul_ps(fscal,dx13);
897 ty = _mm_mul_ps(fscal,dy13);
898 tz = _mm_mul_ps(fscal,dz13);
900 /* Update vectorial force */
901 fix1 = _mm_add_ps(fix1,tx);
902 fiy1 = _mm_add_ps(fiy1,ty);
903 fiz1 = _mm_add_ps(fiz1,tz);
905 fjx3 = _mm_add_ps(fjx3,tx);
906 fjy3 = _mm_add_ps(fjy3,ty);
907 fjz3 = _mm_add_ps(fjz3,tz);
909 /**************************
910 * CALCULATE INTERACTIONS *
911 **************************/
913 r21 = _mm_mul_ps(rsq21,rinv21);
914 r21 = _mm_andnot_ps(dummy_mask,r21);
916 /* EWALD ELECTROSTATICS */
918 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
919 ewrt = _mm_mul_ps(r21,ewtabscale);
920 ewitab = _mm_cvttps_epi32(ewrt);
921 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
922 ewitab = _mm_slli_epi32(ewitab,2);
923 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
924 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
925 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
926 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
927 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
928 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
929 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
930 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
931 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
933 /* Update potential sum for this i atom from the interaction with this j atom. */
934 velec = _mm_andnot_ps(dummy_mask,velec);
935 velecsum = _mm_add_ps(velecsum,velec);
939 fscal = _mm_andnot_ps(dummy_mask,fscal);
941 /* Calculate temporary vectorial force */
942 tx = _mm_mul_ps(fscal,dx21);
943 ty = _mm_mul_ps(fscal,dy21);
944 tz = _mm_mul_ps(fscal,dz21);
946 /* Update vectorial force */
947 fix2 = _mm_add_ps(fix2,tx);
948 fiy2 = _mm_add_ps(fiy2,ty);
949 fiz2 = _mm_add_ps(fiz2,tz);
951 fjx1 = _mm_add_ps(fjx1,tx);
952 fjy1 = _mm_add_ps(fjy1,ty);
953 fjz1 = _mm_add_ps(fjz1,tz);
955 /**************************
956 * CALCULATE INTERACTIONS *
957 **************************/
959 r22 = _mm_mul_ps(rsq22,rinv22);
960 r22 = _mm_andnot_ps(dummy_mask,r22);
962 /* EWALD ELECTROSTATICS */
964 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
965 ewrt = _mm_mul_ps(r22,ewtabscale);
966 ewitab = _mm_cvttps_epi32(ewrt);
967 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
968 ewitab = _mm_slli_epi32(ewitab,2);
969 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
970 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
971 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
972 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
973 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
974 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
975 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
976 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
977 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
979 /* Update potential sum for this i atom from the interaction with this j atom. */
980 velec = _mm_andnot_ps(dummy_mask,velec);
981 velecsum = _mm_add_ps(velecsum,velec);
985 fscal = _mm_andnot_ps(dummy_mask,fscal);
987 /* Calculate temporary vectorial force */
988 tx = _mm_mul_ps(fscal,dx22);
989 ty = _mm_mul_ps(fscal,dy22);
990 tz = _mm_mul_ps(fscal,dz22);
992 /* Update vectorial force */
993 fix2 = _mm_add_ps(fix2,tx);
994 fiy2 = _mm_add_ps(fiy2,ty);
995 fiz2 = _mm_add_ps(fiz2,tz);
997 fjx2 = _mm_add_ps(fjx2,tx);
998 fjy2 = _mm_add_ps(fjy2,ty);
999 fjz2 = _mm_add_ps(fjz2,tz);
1001 /**************************
1002 * CALCULATE INTERACTIONS *
1003 **************************/
1005 r23 = _mm_mul_ps(rsq23,rinv23);
1006 r23 = _mm_andnot_ps(dummy_mask,r23);
1008 /* EWALD ELECTROSTATICS */
1010 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1011 ewrt = _mm_mul_ps(r23,ewtabscale);
1012 ewitab = _mm_cvttps_epi32(ewrt);
1013 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1014 ewitab = _mm_slli_epi32(ewitab,2);
1015 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1016 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1017 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1018 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1019 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1020 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1021 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1022 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1023 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1025 /* Update potential sum for this i atom from the interaction with this j atom. */
1026 velec = _mm_andnot_ps(dummy_mask,velec);
1027 velecsum = _mm_add_ps(velecsum,velec);
1031 fscal = _mm_andnot_ps(dummy_mask,fscal);
1033 /* Calculate temporary vectorial force */
1034 tx = _mm_mul_ps(fscal,dx23);
1035 ty = _mm_mul_ps(fscal,dy23);
1036 tz = _mm_mul_ps(fscal,dz23);
1038 /* Update vectorial force */
1039 fix2 = _mm_add_ps(fix2,tx);
1040 fiy2 = _mm_add_ps(fiy2,ty);
1041 fiz2 = _mm_add_ps(fiz2,tz);
1043 fjx3 = _mm_add_ps(fjx3,tx);
1044 fjy3 = _mm_add_ps(fjy3,ty);
1045 fjz3 = _mm_add_ps(fjz3,tz);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 r31 = _mm_mul_ps(rsq31,rinv31);
1052 r31 = _mm_andnot_ps(dummy_mask,r31);
1054 /* EWALD ELECTROSTATICS */
1056 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1057 ewrt = _mm_mul_ps(r31,ewtabscale);
1058 ewitab = _mm_cvttps_epi32(ewrt);
1059 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1060 ewitab = _mm_slli_epi32(ewitab,2);
1061 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1062 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1063 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1064 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1065 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1066 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1067 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1068 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1069 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1071 /* Update potential sum for this i atom from the interaction with this j atom. */
1072 velec = _mm_andnot_ps(dummy_mask,velec);
1073 velecsum = _mm_add_ps(velecsum,velec);
1077 fscal = _mm_andnot_ps(dummy_mask,fscal);
1079 /* Calculate temporary vectorial force */
1080 tx = _mm_mul_ps(fscal,dx31);
1081 ty = _mm_mul_ps(fscal,dy31);
1082 tz = _mm_mul_ps(fscal,dz31);
1084 /* Update vectorial force */
1085 fix3 = _mm_add_ps(fix3,tx);
1086 fiy3 = _mm_add_ps(fiy3,ty);
1087 fiz3 = _mm_add_ps(fiz3,tz);
1089 fjx1 = _mm_add_ps(fjx1,tx);
1090 fjy1 = _mm_add_ps(fjy1,ty);
1091 fjz1 = _mm_add_ps(fjz1,tz);
1093 /**************************
1094 * CALCULATE INTERACTIONS *
1095 **************************/
1097 r32 = _mm_mul_ps(rsq32,rinv32);
1098 r32 = _mm_andnot_ps(dummy_mask,r32);
1100 /* EWALD ELECTROSTATICS */
1102 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1103 ewrt = _mm_mul_ps(r32,ewtabscale);
1104 ewitab = _mm_cvttps_epi32(ewrt);
1105 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1106 ewitab = _mm_slli_epi32(ewitab,2);
1107 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1108 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1109 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1110 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1111 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1112 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1113 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1114 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1115 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1117 /* Update potential sum for this i atom from the interaction with this j atom. */
1118 velec = _mm_andnot_ps(dummy_mask,velec);
1119 velecsum = _mm_add_ps(velecsum,velec);
1123 fscal = _mm_andnot_ps(dummy_mask,fscal);
1125 /* Calculate temporary vectorial force */
1126 tx = _mm_mul_ps(fscal,dx32);
1127 ty = _mm_mul_ps(fscal,dy32);
1128 tz = _mm_mul_ps(fscal,dz32);
1130 /* Update vectorial force */
1131 fix3 = _mm_add_ps(fix3,tx);
1132 fiy3 = _mm_add_ps(fiy3,ty);
1133 fiz3 = _mm_add_ps(fiz3,tz);
1135 fjx2 = _mm_add_ps(fjx2,tx);
1136 fjy2 = _mm_add_ps(fjy2,ty);
1137 fjz2 = _mm_add_ps(fjz2,tz);
1139 /**************************
1140 * CALCULATE INTERACTIONS *
1141 **************************/
1143 r33 = _mm_mul_ps(rsq33,rinv33);
1144 r33 = _mm_andnot_ps(dummy_mask,r33);
1146 /* EWALD ELECTROSTATICS */
1148 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1149 ewrt = _mm_mul_ps(r33,ewtabscale);
1150 ewitab = _mm_cvttps_epi32(ewrt);
1151 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1152 ewitab = _mm_slli_epi32(ewitab,2);
1153 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1154 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1155 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1156 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1157 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1158 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1159 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1160 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1161 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1163 /* Update potential sum for this i atom from the interaction with this j atom. */
1164 velec = _mm_andnot_ps(dummy_mask,velec);
1165 velecsum = _mm_add_ps(velecsum,velec);
1169 fscal = _mm_andnot_ps(dummy_mask,fscal);
1171 /* Calculate temporary vectorial force */
1172 tx = _mm_mul_ps(fscal,dx33);
1173 ty = _mm_mul_ps(fscal,dy33);
1174 tz = _mm_mul_ps(fscal,dz33);
1176 /* Update vectorial force */
1177 fix3 = _mm_add_ps(fix3,tx);
1178 fiy3 = _mm_add_ps(fiy3,ty);
1179 fiz3 = _mm_add_ps(fiz3,tz);
1181 fjx3 = _mm_add_ps(fjx3,tx);
1182 fjy3 = _mm_add_ps(fjy3,ty);
1183 fjz3 = _mm_add_ps(fjz3,tz);
1185 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1186 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1187 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1188 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1190 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1191 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1193 /* Inner loop uses 378 flops */
1196 /* End of innermost loop */
1198 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1199 f+i_coord_offset+DIM,fshift+i_shift_offset);
1202 /* Update potential energies */
1203 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1205 /* Increment number of inner iterations */
1206 inneriter += j_index_end - j_index_start;
1208 /* Outer loop uses 19 flops */
1211 /* Increment number of outer iterations */
1214 /* Update outer/inner flops */
1216 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*378);
1219 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single
1220 * Electrostatics interaction: Ewald
1221 * VdW interaction: None
1222 * Geometry: Water4-Water4
1223 * Calculate force/pot: Force
1226 nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single
1227 (t_nblist * gmx_restrict nlist,
1228 rvec * gmx_restrict xx,
1229 rvec * gmx_restrict ff,
1230 struct t_forcerec * gmx_restrict fr,
1231 t_mdatoms * gmx_restrict mdatoms,
1232 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1233 t_nrnb * gmx_restrict nrnb)
1235 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1236 * just 0 for non-waters.
1237 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1238 * jnr indices corresponding to data put in the four positions in the SIMD register.
1240 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1241 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1242 int jnrA,jnrB,jnrC,jnrD;
1243 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1244 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1245 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1246 real rcutoff_scalar;
1247 real *shiftvec,*fshift,*x,*f;
1248 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1249 real scratch[4*DIM];
1250 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1252 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1254 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1256 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1257 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1258 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1259 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1260 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1261 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1262 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1263 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1264 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1265 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1266 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1267 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1268 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1269 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1270 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1271 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1272 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1275 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1277 __m128 dummy_mask,cutoff_mask;
1278 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1279 __m128 one = _mm_set1_ps(1.0);
1280 __m128 two = _mm_set1_ps(2.0);
1286 jindex = nlist->jindex;
1288 shiftidx = nlist->shift;
1290 shiftvec = fr->shift_vec[0];
1291 fshift = fr->fshift[0];
1292 facel = _mm_set1_ps(fr->ic->epsfac);
1293 charge = mdatoms->chargeA;
1295 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1296 ewtab = fr->ic->tabq_coul_F;
1297 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1298 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1300 /* Setup water-specific parameters */
1301 inr = nlist->iinr[0];
1302 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1303 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1304 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1306 jq1 = _mm_set1_ps(charge[inr+1]);
1307 jq2 = _mm_set1_ps(charge[inr+2]);
1308 jq3 = _mm_set1_ps(charge[inr+3]);
1309 qq11 = _mm_mul_ps(iq1,jq1);
1310 qq12 = _mm_mul_ps(iq1,jq2);
1311 qq13 = _mm_mul_ps(iq1,jq3);
1312 qq21 = _mm_mul_ps(iq2,jq1);
1313 qq22 = _mm_mul_ps(iq2,jq2);
1314 qq23 = _mm_mul_ps(iq2,jq3);
1315 qq31 = _mm_mul_ps(iq3,jq1);
1316 qq32 = _mm_mul_ps(iq3,jq2);
1317 qq33 = _mm_mul_ps(iq3,jq3);
1319 /* Avoid stupid compiler warnings */
1320 jnrA = jnrB = jnrC = jnrD = 0;
1321 j_coord_offsetA = 0;
1322 j_coord_offsetB = 0;
1323 j_coord_offsetC = 0;
1324 j_coord_offsetD = 0;
1329 for(iidx=0;iidx<4*DIM;iidx++)
1331 scratch[iidx] = 0.0;
1334 /* Start outer loop over neighborlists */
1335 for(iidx=0; iidx<nri; iidx++)
1337 /* Load shift vector for this list */
1338 i_shift_offset = DIM*shiftidx[iidx];
1340 /* Load limits for loop over neighbors */
1341 j_index_start = jindex[iidx];
1342 j_index_end = jindex[iidx+1];
1344 /* Get outer coordinate index */
1346 i_coord_offset = DIM*inr;
1348 /* Load i particle coords and add shift vector */
1349 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1350 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1352 fix1 = _mm_setzero_ps();
1353 fiy1 = _mm_setzero_ps();
1354 fiz1 = _mm_setzero_ps();
1355 fix2 = _mm_setzero_ps();
1356 fiy2 = _mm_setzero_ps();
1357 fiz2 = _mm_setzero_ps();
1358 fix3 = _mm_setzero_ps();
1359 fiy3 = _mm_setzero_ps();
1360 fiz3 = _mm_setzero_ps();
1362 /* Start inner kernel loop */
1363 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1366 /* Get j neighbor index, and coordinate index */
1368 jnrB = jjnr[jidx+1];
1369 jnrC = jjnr[jidx+2];
1370 jnrD = jjnr[jidx+3];
1371 j_coord_offsetA = DIM*jnrA;
1372 j_coord_offsetB = DIM*jnrB;
1373 j_coord_offsetC = DIM*jnrC;
1374 j_coord_offsetD = DIM*jnrD;
1376 /* load j atom coordinates */
1377 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1378 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1379 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1381 /* Calculate displacement vector */
1382 dx11 = _mm_sub_ps(ix1,jx1);
1383 dy11 = _mm_sub_ps(iy1,jy1);
1384 dz11 = _mm_sub_ps(iz1,jz1);
1385 dx12 = _mm_sub_ps(ix1,jx2);
1386 dy12 = _mm_sub_ps(iy1,jy2);
1387 dz12 = _mm_sub_ps(iz1,jz2);
1388 dx13 = _mm_sub_ps(ix1,jx3);
1389 dy13 = _mm_sub_ps(iy1,jy3);
1390 dz13 = _mm_sub_ps(iz1,jz3);
1391 dx21 = _mm_sub_ps(ix2,jx1);
1392 dy21 = _mm_sub_ps(iy2,jy1);
1393 dz21 = _mm_sub_ps(iz2,jz1);
1394 dx22 = _mm_sub_ps(ix2,jx2);
1395 dy22 = _mm_sub_ps(iy2,jy2);
1396 dz22 = _mm_sub_ps(iz2,jz2);
1397 dx23 = _mm_sub_ps(ix2,jx3);
1398 dy23 = _mm_sub_ps(iy2,jy3);
1399 dz23 = _mm_sub_ps(iz2,jz3);
1400 dx31 = _mm_sub_ps(ix3,jx1);
1401 dy31 = _mm_sub_ps(iy3,jy1);
1402 dz31 = _mm_sub_ps(iz3,jz1);
1403 dx32 = _mm_sub_ps(ix3,jx2);
1404 dy32 = _mm_sub_ps(iy3,jy2);
1405 dz32 = _mm_sub_ps(iz3,jz2);
1406 dx33 = _mm_sub_ps(ix3,jx3);
1407 dy33 = _mm_sub_ps(iy3,jy3);
1408 dz33 = _mm_sub_ps(iz3,jz3);
1410 /* Calculate squared distance and things based on it */
1411 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1412 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1413 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1414 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1415 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1416 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1417 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1418 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1419 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1421 rinv11 = sse2_invsqrt_f(rsq11);
1422 rinv12 = sse2_invsqrt_f(rsq12);
1423 rinv13 = sse2_invsqrt_f(rsq13);
1424 rinv21 = sse2_invsqrt_f(rsq21);
1425 rinv22 = sse2_invsqrt_f(rsq22);
1426 rinv23 = sse2_invsqrt_f(rsq23);
1427 rinv31 = sse2_invsqrt_f(rsq31);
1428 rinv32 = sse2_invsqrt_f(rsq32);
1429 rinv33 = sse2_invsqrt_f(rsq33);
1431 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1432 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1433 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1434 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1435 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1436 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1437 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1438 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1439 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1441 fjx1 = _mm_setzero_ps();
1442 fjy1 = _mm_setzero_ps();
1443 fjz1 = _mm_setzero_ps();
1444 fjx2 = _mm_setzero_ps();
1445 fjy2 = _mm_setzero_ps();
1446 fjz2 = _mm_setzero_ps();
1447 fjx3 = _mm_setzero_ps();
1448 fjy3 = _mm_setzero_ps();
1449 fjz3 = _mm_setzero_ps();
1451 /**************************
1452 * CALCULATE INTERACTIONS *
1453 **************************/
1455 r11 = _mm_mul_ps(rsq11,rinv11);
1457 /* EWALD ELECTROSTATICS */
1459 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1460 ewrt = _mm_mul_ps(r11,ewtabscale);
1461 ewitab = _mm_cvttps_epi32(ewrt);
1462 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1463 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1464 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1466 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1467 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1471 /* Calculate temporary vectorial force */
1472 tx = _mm_mul_ps(fscal,dx11);
1473 ty = _mm_mul_ps(fscal,dy11);
1474 tz = _mm_mul_ps(fscal,dz11);
1476 /* Update vectorial force */
1477 fix1 = _mm_add_ps(fix1,tx);
1478 fiy1 = _mm_add_ps(fiy1,ty);
1479 fiz1 = _mm_add_ps(fiz1,tz);
1481 fjx1 = _mm_add_ps(fjx1,tx);
1482 fjy1 = _mm_add_ps(fjy1,ty);
1483 fjz1 = _mm_add_ps(fjz1,tz);
1485 /**************************
1486 * CALCULATE INTERACTIONS *
1487 **************************/
1489 r12 = _mm_mul_ps(rsq12,rinv12);
1491 /* EWALD ELECTROSTATICS */
1493 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1494 ewrt = _mm_mul_ps(r12,ewtabscale);
1495 ewitab = _mm_cvttps_epi32(ewrt);
1496 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1497 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1498 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1500 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1501 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1505 /* Calculate temporary vectorial force */
1506 tx = _mm_mul_ps(fscal,dx12);
1507 ty = _mm_mul_ps(fscal,dy12);
1508 tz = _mm_mul_ps(fscal,dz12);
1510 /* Update vectorial force */
1511 fix1 = _mm_add_ps(fix1,tx);
1512 fiy1 = _mm_add_ps(fiy1,ty);
1513 fiz1 = _mm_add_ps(fiz1,tz);
1515 fjx2 = _mm_add_ps(fjx2,tx);
1516 fjy2 = _mm_add_ps(fjy2,ty);
1517 fjz2 = _mm_add_ps(fjz2,tz);
1519 /**************************
1520 * CALCULATE INTERACTIONS *
1521 **************************/
1523 r13 = _mm_mul_ps(rsq13,rinv13);
1525 /* EWALD ELECTROSTATICS */
1527 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1528 ewrt = _mm_mul_ps(r13,ewtabscale);
1529 ewitab = _mm_cvttps_epi32(ewrt);
1530 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1531 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1532 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1534 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1535 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1539 /* Calculate temporary vectorial force */
1540 tx = _mm_mul_ps(fscal,dx13);
1541 ty = _mm_mul_ps(fscal,dy13);
1542 tz = _mm_mul_ps(fscal,dz13);
1544 /* Update vectorial force */
1545 fix1 = _mm_add_ps(fix1,tx);
1546 fiy1 = _mm_add_ps(fiy1,ty);
1547 fiz1 = _mm_add_ps(fiz1,tz);
1549 fjx3 = _mm_add_ps(fjx3,tx);
1550 fjy3 = _mm_add_ps(fjy3,ty);
1551 fjz3 = _mm_add_ps(fjz3,tz);
1553 /**************************
1554 * CALCULATE INTERACTIONS *
1555 **************************/
1557 r21 = _mm_mul_ps(rsq21,rinv21);
1559 /* EWALD ELECTROSTATICS */
1561 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1562 ewrt = _mm_mul_ps(r21,ewtabscale);
1563 ewitab = _mm_cvttps_epi32(ewrt);
1564 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1565 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1566 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1568 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1569 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1573 /* Calculate temporary vectorial force */
1574 tx = _mm_mul_ps(fscal,dx21);
1575 ty = _mm_mul_ps(fscal,dy21);
1576 tz = _mm_mul_ps(fscal,dz21);
1578 /* Update vectorial force */
1579 fix2 = _mm_add_ps(fix2,tx);
1580 fiy2 = _mm_add_ps(fiy2,ty);
1581 fiz2 = _mm_add_ps(fiz2,tz);
1583 fjx1 = _mm_add_ps(fjx1,tx);
1584 fjy1 = _mm_add_ps(fjy1,ty);
1585 fjz1 = _mm_add_ps(fjz1,tz);
1587 /**************************
1588 * CALCULATE INTERACTIONS *
1589 **************************/
1591 r22 = _mm_mul_ps(rsq22,rinv22);
1593 /* EWALD ELECTROSTATICS */
1595 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1596 ewrt = _mm_mul_ps(r22,ewtabscale);
1597 ewitab = _mm_cvttps_epi32(ewrt);
1598 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1599 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1600 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1602 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1603 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1607 /* Calculate temporary vectorial force */
1608 tx = _mm_mul_ps(fscal,dx22);
1609 ty = _mm_mul_ps(fscal,dy22);
1610 tz = _mm_mul_ps(fscal,dz22);
1612 /* Update vectorial force */
1613 fix2 = _mm_add_ps(fix2,tx);
1614 fiy2 = _mm_add_ps(fiy2,ty);
1615 fiz2 = _mm_add_ps(fiz2,tz);
1617 fjx2 = _mm_add_ps(fjx2,tx);
1618 fjy2 = _mm_add_ps(fjy2,ty);
1619 fjz2 = _mm_add_ps(fjz2,tz);
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 r23 = _mm_mul_ps(rsq23,rinv23);
1627 /* EWALD ELECTROSTATICS */
1629 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1630 ewrt = _mm_mul_ps(r23,ewtabscale);
1631 ewitab = _mm_cvttps_epi32(ewrt);
1632 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1633 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1634 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1636 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1637 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1641 /* Calculate temporary vectorial force */
1642 tx = _mm_mul_ps(fscal,dx23);
1643 ty = _mm_mul_ps(fscal,dy23);
1644 tz = _mm_mul_ps(fscal,dz23);
1646 /* Update vectorial force */
1647 fix2 = _mm_add_ps(fix2,tx);
1648 fiy2 = _mm_add_ps(fiy2,ty);
1649 fiz2 = _mm_add_ps(fiz2,tz);
1651 fjx3 = _mm_add_ps(fjx3,tx);
1652 fjy3 = _mm_add_ps(fjy3,ty);
1653 fjz3 = _mm_add_ps(fjz3,tz);
1655 /**************************
1656 * CALCULATE INTERACTIONS *
1657 **************************/
1659 r31 = _mm_mul_ps(rsq31,rinv31);
1661 /* EWALD ELECTROSTATICS */
1663 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1664 ewrt = _mm_mul_ps(r31,ewtabscale);
1665 ewitab = _mm_cvttps_epi32(ewrt);
1666 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1667 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1668 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1670 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1671 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1675 /* Calculate temporary vectorial force */
1676 tx = _mm_mul_ps(fscal,dx31);
1677 ty = _mm_mul_ps(fscal,dy31);
1678 tz = _mm_mul_ps(fscal,dz31);
1680 /* Update vectorial force */
1681 fix3 = _mm_add_ps(fix3,tx);
1682 fiy3 = _mm_add_ps(fiy3,ty);
1683 fiz3 = _mm_add_ps(fiz3,tz);
1685 fjx1 = _mm_add_ps(fjx1,tx);
1686 fjy1 = _mm_add_ps(fjy1,ty);
1687 fjz1 = _mm_add_ps(fjz1,tz);
1689 /**************************
1690 * CALCULATE INTERACTIONS *
1691 **************************/
1693 r32 = _mm_mul_ps(rsq32,rinv32);
1695 /* EWALD ELECTROSTATICS */
1697 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1698 ewrt = _mm_mul_ps(r32,ewtabscale);
1699 ewitab = _mm_cvttps_epi32(ewrt);
1700 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1701 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1702 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1704 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1705 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1709 /* Calculate temporary vectorial force */
1710 tx = _mm_mul_ps(fscal,dx32);
1711 ty = _mm_mul_ps(fscal,dy32);
1712 tz = _mm_mul_ps(fscal,dz32);
1714 /* Update vectorial force */
1715 fix3 = _mm_add_ps(fix3,tx);
1716 fiy3 = _mm_add_ps(fiy3,ty);
1717 fiz3 = _mm_add_ps(fiz3,tz);
1719 fjx2 = _mm_add_ps(fjx2,tx);
1720 fjy2 = _mm_add_ps(fjy2,ty);
1721 fjz2 = _mm_add_ps(fjz2,tz);
1723 /**************************
1724 * CALCULATE INTERACTIONS *
1725 **************************/
1727 r33 = _mm_mul_ps(rsq33,rinv33);
1729 /* EWALD ELECTROSTATICS */
1731 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1732 ewrt = _mm_mul_ps(r33,ewtabscale);
1733 ewitab = _mm_cvttps_epi32(ewrt);
1734 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1735 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1736 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1738 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1739 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1743 /* Calculate temporary vectorial force */
1744 tx = _mm_mul_ps(fscal,dx33);
1745 ty = _mm_mul_ps(fscal,dy33);
1746 tz = _mm_mul_ps(fscal,dz33);
1748 /* Update vectorial force */
1749 fix3 = _mm_add_ps(fix3,tx);
1750 fiy3 = _mm_add_ps(fiy3,ty);
1751 fiz3 = _mm_add_ps(fiz3,tz);
1753 fjx3 = _mm_add_ps(fjx3,tx);
1754 fjy3 = _mm_add_ps(fjy3,ty);
1755 fjz3 = _mm_add_ps(fjz3,tz);
1757 fjptrA = f+j_coord_offsetA;
1758 fjptrB = f+j_coord_offsetB;
1759 fjptrC = f+j_coord_offsetC;
1760 fjptrD = f+j_coord_offsetD;
1762 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1763 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1765 /* Inner loop uses 324 flops */
1768 if(jidx<j_index_end)
1771 /* Get j neighbor index, and coordinate index */
1772 jnrlistA = jjnr[jidx];
1773 jnrlistB = jjnr[jidx+1];
1774 jnrlistC = jjnr[jidx+2];
1775 jnrlistD = jjnr[jidx+3];
1776 /* Sign of each element will be negative for non-real atoms.
1777 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1778 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1780 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1781 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1782 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1783 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1784 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1785 j_coord_offsetA = DIM*jnrA;
1786 j_coord_offsetB = DIM*jnrB;
1787 j_coord_offsetC = DIM*jnrC;
1788 j_coord_offsetD = DIM*jnrD;
1790 /* load j atom coordinates */
1791 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1792 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1793 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1795 /* Calculate displacement vector */
1796 dx11 = _mm_sub_ps(ix1,jx1);
1797 dy11 = _mm_sub_ps(iy1,jy1);
1798 dz11 = _mm_sub_ps(iz1,jz1);
1799 dx12 = _mm_sub_ps(ix1,jx2);
1800 dy12 = _mm_sub_ps(iy1,jy2);
1801 dz12 = _mm_sub_ps(iz1,jz2);
1802 dx13 = _mm_sub_ps(ix1,jx3);
1803 dy13 = _mm_sub_ps(iy1,jy3);
1804 dz13 = _mm_sub_ps(iz1,jz3);
1805 dx21 = _mm_sub_ps(ix2,jx1);
1806 dy21 = _mm_sub_ps(iy2,jy1);
1807 dz21 = _mm_sub_ps(iz2,jz1);
1808 dx22 = _mm_sub_ps(ix2,jx2);
1809 dy22 = _mm_sub_ps(iy2,jy2);
1810 dz22 = _mm_sub_ps(iz2,jz2);
1811 dx23 = _mm_sub_ps(ix2,jx3);
1812 dy23 = _mm_sub_ps(iy2,jy3);
1813 dz23 = _mm_sub_ps(iz2,jz3);
1814 dx31 = _mm_sub_ps(ix3,jx1);
1815 dy31 = _mm_sub_ps(iy3,jy1);
1816 dz31 = _mm_sub_ps(iz3,jz1);
1817 dx32 = _mm_sub_ps(ix3,jx2);
1818 dy32 = _mm_sub_ps(iy3,jy2);
1819 dz32 = _mm_sub_ps(iz3,jz2);
1820 dx33 = _mm_sub_ps(ix3,jx3);
1821 dy33 = _mm_sub_ps(iy3,jy3);
1822 dz33 = _mm_sub_ps(iz3,jz3);
1824 /* Calculate squared distance and things based on it */
1825 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1826 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1827 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1828 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1829 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1830 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1831 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1832 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1833 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1835 rinv11 = sse2_invsqrt_f(rsq11);
1836 rinv12 = sse2_invsqrt_f(rsq12);
1837 rinv13 = sse2_invsqrt_f(rsq13);
1838 rinv21 = sse2_invsqrt_f(rsq21);
1839 rinv22 = sse2_invsqrt_f(rsq22);
1840 rinv23 = sse2_invsqrt_f(rsq23);
1841 rinv31 = sse2_invsqrt_f(rsq31);
1842 rinv32 = sse2_invsqrt_f(rsq32);
1843 rinv33 = sse2_invsqrt_f(rsq33);
1845 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1846 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1847 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1848 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1849 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1850 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1851 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1852 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1853 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1855 fjx1 = _mm_setzero_ps();
1856 fjy1 = _mm_setzero_ps();
1857 fjz1 = _mm_setzero_ps();
1858 fjx2 = _mm_setzero_ps();
1859 fjy2 = _mm_setzero_ps();
1860 fjz2 = _mm_setzero_ps();
1861 fjx3 = _mm_setzero_ps();
1862 fjy3 = _mm_setzero_ps();
1863 fjz3 = _mm_setzero_ps();
1865 /**************************
1866 * CALCULATE INTERACTIONS *
1867 **************************/
1869 r11 = _mm_mul_ps(rsq11,rinv11);
1870 r11 = _mm_andnot_ps(dummy_mask,r11);
1872 /* EWALD ELECTROSTATICS */
1874 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1875 ewrt = _mm_mul_ps(r11,ewtabscale);
1876 ewitab = _mm_cvttps_epi32(ewrt);
1877 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1878 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1879 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1881 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1882 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1886 fscal = _mm_andnot_ps(dummy_mask,fscal);
1888 /* Calculate temporary vectorial force */
1889 tx = _mm_mul_ps(fscal,dx11);
1890 ty = _mm_mul_ps(fscal,dy11);
1891 tz = _mm_mul_ps(fscal,dz11);
1893 /* Update vectorial force */
1894 fix1 = _mm_add_ps(fix1,tx);
1895 fiy1 = _mm_add_ps(fiy1,ty);
1896 fiz1 = _mm_add_ps(fiz1,tz);
1898 fjx1 = _mm_add_ps(fjx1,tx);
1899 fjy1 = _mm_add_ps(fjy1,ty);
1900 fjz1 = _mm_add_ps(fjz1,tz);
1902 /**************************
1903 * CALCULATE INTERACTIONS *
1904 **************************/
1906 r12 = _mm_mul_ps(rsq12,rinv12);
1907 r12 = _mm_andnot_ps(dummy_mask,r12);
1909 /* EWALD ELECTROSTATICS */
1911 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1912 ewrt = _mm_mul_ps(r12,ewtabscale);
1913 ewitab = _mm_cvttps_epi32(ewrt);
1914 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1915 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1916 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1918 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1919 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1923 fscal = _mm_andnot_ps(dummy_mask,fscal);
1925 /* Calculate temporary vectorial force */
1926 tx = _mm_mul_ps(fscal,dx12);
1927 ty = _mm_mul_ps(fscal,dy12);
1928 tz = _mm_mul_ps(fscal,dz12);
1930 /* Update vectorial force */
1931 fix1 = _mm_add_ps(fix1,tx);
1932 fiy1 = _mm_add_ps(fiy1,ty);
1933 fiz1 = _mm_add_ps(fiz1,tz);
1935 fjx2 = _mm_add_ps(fjx2,tx);
1936 fjy2 = _mm_add_ps(fjy2,ty);
1937 fjz2 = _mm_add_ps(fjz2,tz);
1939 /**************************
1940 * CALCULATE INTERACTIONS *
1941 **************************/
1943 r13 = _mm_mul_ps(rsq13,rinv13);
1944 r13 = _mm_andnot_ps(dummy_mask,r13);
1946 /* EWALD ELECTROSTATICS */
1948 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1949 ewrt = _mm_mul_ps(r13,ewtabscale);
1950 ewitab = _mm_cvttps_epi32(ewrt);
1951 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1952 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1953 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1955 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1956 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1960 fscal = _mm_andnot_ps(dummy_mask,fscal);
1962 /* Calculate temporary vectorial force */
1963 tx = _mm_mul_ps(fscal,dx13);
1964 ty = _mm_mul_ps(fscal,dy13);
1965 tz = _mm_mul_ps(fscal,dz13);
1967 /* Update vectorial force */
1968 fix1 = _mm_add_ps(fix1,tx);
1969 fiy1 = _mm_add_ps(fiy1,ty);
1970 fiz1 = _mm_add_ps(fiz1,tz);
1972 fjx3 = _mm_add_ps(fjx3,tx);
1973 fjy3 = _mm_add_ps(fjy3,ty);
1974 fjz3 = _mm_add_ps(fjz3,tz);
1976 /**************************
1977 * CALCULATE INTERACTIONS *
1978 **************************/
1980 r21 = _mm_mul_ps(rsq21,rinv21);
1981 r21 = _mm_andnot_ps(dummy_mask,r21);
1983 /* EWALD ELECTROSTATICS */
1985 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1986 ewrt = _mm_mul_ps(r21,ewtabscale);
1987 ewitab = _mm_cvttps_epi32(ewrt);
1988 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1989 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1990 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1992 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1993 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1997 fscal = _mm_andnot_ps(dummy_mask,fscal);
1999 /* Calculate temporary vectorial force */
2000 tx = _mm_mul_ps(fscal,dx21);
2001 ty = _mm_mul_ps(fscal,dy21);
2002 tz = _mm_mul_ps(fscal,dz21);
2004 /* Update vectorial force */
2005 fix2 = _mm_add_ps(fix2,tx);
2006 fiy2 = _mm_add_ps(fiy2,ty);
2007 fiz2 = _mm_add_ps(fiz2,tz);
2009 fjx1 = _mm_add_ps(fjx1,tx);
2010 fjy1 = _mm_add_ps(fjy1,ty);
2011 fjz1 = _mm_add_ps(fjz1,tz);
2013 /**************************
2014 * CALCULATE INTERACTIONS *
2015 **************************/
2017 r22 = _mm_mul_ps(rsq22,rinv22);
2018 r22 = _mm_andnot_ps(dummy_mask,r22);
2020 /* EWALD ELECTROSTATICS */
2022 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2023 ewrt = _mm_mul_ps(r22,ewtabscale);
2024 ewitab = _mm_cvttps_epi32(ewrt);
2025 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2026 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2027 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2029 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2030 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2034 fscal = _mm_andnot_ps(dummy_mask,fscal);
2036 /* Calculate temporary vectorial force */
2037 tx = _mm_mul_ps(fscal,dx22);
2038 ty = _mm_mul_ps(fscal,dy22);
2039 tz = _mm_mul_ps(fscal,dz22);
2041 /* Update vectorial force */
2042 fix2 = _mm_add_ps(fix2,tx);
2043 fiy2 = _mm_add_ps(fiy2,ty);
2044 fiz2 = _mm_add_ps(fiz2,tz);
2046 fjx2 = _mm_add_ps(fjx2,tx);
2047 fjy2 = _mm_add_ps(fjy2,ty);
2048 fjz2 = _mm_add_ps(fjz2,tz);
2050 /**************************
2051 * CALCULATE INTERACTIONS *
2052 **************************/
2054 r23 = _mm_mul_ps(rsq23,rinv23);
2055 r23 = _mm_andnot_ps(dummy_mask,r23);
2057 /* EWALD ELECTROSTATICS */
2059 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2060 ewrt = _mm_mul_ps(r23,ewtabscale);
2061 ewitab = _mm_cvttps_epi32(ewrt);
2062 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2063 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2064 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2066 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2067 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2071 fscal = _mm_andnot_ps(dummy_mask,fscal);
2073 /* Calculate temporary vectorial force */
2074 tx = _mm_mul_ps(fscal,dx23);
2075 ty = _mm_mul_ps(fscal,dy23);
2076 tz = _mm_mul_ps(fscal,dz23);
2078 /* Update vectorial force */
2079 fix2 = _mm_add_ps(fix2,tx);
2080 fiy2 = _mm_add_ps(fiy2,ty);
2081 fiz2 = _mm_add_ps(fiz2,tz);
2083 fjx3 = _mm_add_ps(fjx3,tx);
2084 fjy3 = _mm_add_ps(fjy3,ty);
2085 fjz3 = _mm_add_ps(fjz3,tz);
2087 /**************************
2088 * CALCULATE INTERACTIONS *
2089 **************************/
2091 r31 = _mm_mul_ps(rsq31,rinv31);
2092 r31 = _mm_andnot_ps(dummy_mask,r31);
2094 /* EWALD ELECTROSTATICS */
2096 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2097 ewrt = _mm_mul_ps(r31,ewtabscale);
2098 ewitab = _mm_cvttps_epi32(ewrt);
2099 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2100 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2101 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2103 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2104 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2108 fscal = _mm_andnot_ps(dummy_mask,fscal);
2110 /* Calculate temporary vectorial force */
2111 tx = _mm_mul_ps(fscal,dx31);
2112 ty = _mm_mul_ps(fscal,dy31);
2113 tz = _mm_mul_ps(fscal,dz31);
2115 /* Update vectorial force */
2116 fix3 = _mm_add_ps(fix3,tx);
2117 fiy3 = _mm_add_ps(fiy3,ty);
2118 fiz3 = _mm_add_ps(fiz3,tz);
2120 fjx1 = _mm_add_ps(fjx1,tx);
2121 fjy1 = _mm_add_ps(fjy1,ty);
2122 fjz1 = _mm_add_ps(fjz1,tz);
2124 /**************************
2125 * CALCULATE INTERACTIONS *
2126 **************************/
2128 r32 = _mm_mul_ps(rsq32,rinv32);
2129 r32 = _mm_andnot_ps(dummy_mask,r32);
2131 /* EWALD ELECTROSTATICS */
2133 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2134 ewrt = _mm_mul_ps(r32,ewtabscale);
2135 ewitab = _mm_cvttps_epi32(ewrt);
2136 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2137 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2138 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2140 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2141 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2145 fscal = _mm_andnot_ps(dummy_mask,fscal);
2147 /* Calculate temporary vectorial force */
2148 tx = _mm_mul_ps(fscal,dx32);
2149 ty = _mm_mul_ps(fscal,dy32);
2150 tz = _mm_mul_ps(fscal,dz32);
2152 /* Update vectorial force */
2153 fix3 = _mm_add_ps(fix3,tx);
2154 fiy3 = _mm_add_ps(fiy3,ty);
2155 fiz3 = _mm_add_ps(fiz3,tz);
2157 fjx2 = _mm_add_ps(fjx2,tx);
2158 fjy2 = _mm_add_ps(fjy2,ty);
2159 fjz2 = _mm_add_ps(fjz2,tz);
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 r33 = _mm_mul_ps(rsq33,rinv33);
2166 r33 = _mm_andnot_ps(dummy_mask,r33);
2168 /* EWALD ELECTROSTATICS */
2170 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2171 ewrt = _mm_mul_ps(r33,ewtabscale);
2172 ewitab = _mm_cvttps_epi32(ewrt);
2173 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2174 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2175 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2177 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2178 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2182 fscal = _mm_andnot_ps(dummy_mask,fscal);
2184 /* Calculate temporary vectorial force */
2185 tx = _mm_mul_ps(fscal,dx33);
2186 ty = _mm_mul_ps(fscal,dy33);
2187 tz = _mm_mul_ps(fscal,dz33);
2189 /* Update vectorial force */
2190 fix3 = _mm_add_ps(fix3,tx);
2191 fiy3 = _mm_add_ps(fiy3,ty);
2192 fiz3 = _mm_add_ps(fiz3,tz);
2194 fjx3 = _mm_add_ps(fjx3,tx);
2195 fjy3 = _mm_add_ps(fjy3,ty);
2196 fjz3 = _mm_add_ps(fjz3,tz);
2198 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2199 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2200 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2201 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2203 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2204 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2206 /* Inner loop uses 333 flops */
2209 /* End of innermost loop */
2211 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2212 f+i_coord_offset+DIM,fshift+i_shift_offset);
2214 /* Increment number of inner iterations */
2215 inneriter += j_index_end - j_index_start;
2217 /* Outer loop uses 18 flops */
2220 /* Increment number of outer iterations */
2223 /* Update outer/inner flops */
2225 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*333);