2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_sse4_1_single.h"
50 #include "kernelutil_x86_sse4_1_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LJEwald
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
109 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
113 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
123 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
125 __m128 one_half = _mm_set1_ps(0.5);
126 __m128 minus_one = _mm_set1_ps(-1.0);
128 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
130 __m128 dummy_mask,cutoff_mask;
131 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
132 __m128 one = _mm_set1_ps(1.0);
133 __m128 two = _mm_set1_ps(2.0);
139 jindex = nlist->jindex;
141 shiftidx = nlist->shift;
143 shiftvec = fr->shift_vec[0];
144 fshift = fr->fshift[0];
145 facel = _mm_set1_ps(fr->epsfac);
146 charge = mdatoms->chargeA;
147 nvdwtype = fr->ntype;
149 vdwtype = mdatoms->typeA;
150 vdwgridparam = fr->ljpme_c6grid;
151 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
152 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
153 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
155 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
156 ewtab = fr->ic->tabq_coul_FDV0;
157 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
158 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
160 /* Setup water-specific parameters */
161 inr = nlist->iinr[0];
162 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
163 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
164 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
165 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
167 jq0 = _mm_set1_ps(charge[inr+0]);
168 jq1 = _mm_set1_ps(charge[inr+1]);
169 jq2 = _mm_set1_ps(charge[inr+2]);
170 vdwjidx0A = 2*vdwtype[inr+0];
171 qq00 = _mm_mul_ps(iq0,jq0);
172 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
173 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
174 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
175 qq01 = _mm_mul_ps(iq0,jq1);
176 qq02 = _mm_mul_ps(iq0,jq2);
177 qq10 = _mm_mul_ps(iq1,jq0);
178 qq11 = _mm_mul_ps(iq1,jq1);
179 qq12 = _mm_mul_ps(iq1,jq2);
180 qq20 = _mm_mul_ps(iq2,jq0);
181 qq21 = _mm_mul_ps(iq2,jq1);
182 qq22 = _mm_mul_ps(iq2,jq2);
184 /* Avoid stupid compiler warnings */
185 jnrA = jnrB = jnrC = jnrD = 0;
194 for(iidx=0;iidx<4*DIM;iidx++)
199 /* Start outer loop over neighborlists */
200 for(iidx=0; iidx<nri; iidx++)
202 /* Load shift vector for this list */
203 i_shift_offset = DIM*shiftidx[iidx];
205 /* Load limits for loop over neighbors */
206 j_index_start = jindex[iidx];
207 j_index_end = jindex[iidx+1];
209 /* Get outer coordinate index */
211 i_coord_offset = DIM*inr;
213 /* Load i particle coords and add shift vector */
214 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
215 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
217 fix0 = _mm_setzero_ps();
218 fiy0 = _mm_setzero_ps();
219 fiz0 = _mm_setzero_ps();
220 fix1 = _mm_setzero_ps();
221 fiy1 = _mm_setzero_ps();
222 fiz1 = _mm_setzero_ps();
223 fix2 = _mm_setzero_ps();
224 fiy2 = _mm_setzero_ps();
225 fiz2 = _mm_setzero_ps();
227 /* Reset potential sums */
228 velecsum = _mm_setzero_ps();
229 vvdwsum = _mm_setzero_ps();
231 /* Start inner kernel loop */
232 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
235 /* Get j neighbor index, and coordinate index */
240 j_coord_offsetA = DIM*jnrA;
241 j_coord_offsetB = DIM*jnrB;
242 j_coord_offsetC = DIM*jnrC;
243 j_coord_offsetD = DIM*jnrD;
245 /* load j atom coordinates */
246 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
247 x+j_coord_offsetC,x+j_coord_offsetD,
248 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
250 /* Calculate displacement vector */
251 dx00 = _mm_sub_ps(ix0,jx0);
252 dy00 = _mm_sub_ps(iy0,jy0);
253 dz00 = _mm_sub_ps(iz0,jz0);
254 dx01 = _mm_sub_ps(ix0,jx1);
255 dy01 = _mm_sub_ps(iy0,jy1);
256 dz01 = _mm_sub_ps(iz0,jz1);
257 dx02 = _mm_sub_ps(ix0,jx2);
258 dy02 = _mm_sub_ps(iy0,jy2);
259 dz02 = _mm_sub_ps(iz0,jz2);
260 dx10 = _mm_sub_ps(ix1,jx0);
261 dy10 = _mm_sub_ps(iy1,jy0);
262 dz10 = _mm_sub_ps(iz1,jz0);
263 dx11 = _mm_sub_ps(ix1,jx1);
264 dy11 = _mm_sub_ps(iy1,jy1);
265 dz11 = _mm_sub_ps(iz1,jz1);
266 dx12 = _mm_sub_ps(ix1,jx2);
267 dy12 = _mm_sub_ps(iy1,jy2);
268 dz12 = _mm_sub_ps(iz1,jz2);
269 dx20 = _mm_sub_ps(ix2,jx0);
270 dy20 = _mm_sub_ps(iy2,jy0);
271 dz20 = _mm_sub_ps(iz2,jz0);
272 dx21 = _mm_sub_ps(ix2,jx1);
273 dy21 = _mm_sub_ps(iy2,jy1);
274 dz21 = _mm_sub_ps(iz2,jz1);
275 dx22 = _mm_sub_ps(ix2,jx2);
276 dy22 = _mm_sub_ps(iy2,jy2);
277 dz22 = _mm_sub_ps(iz2,jz2);
279 /* Calculate squared distance and things based on it */
280 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
281 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
282 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
283 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
284 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
285 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
286 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
287 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
288 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
290 rinv00 = gmx_mm_invsqrt_ps(rsq00);
291 rinv01 = gmx_mm_invsqrt_ps(rsq01);
292 rinv02 = gmx_mm_invsqrt_ps(rsq02);
293 rinv10 = gmx_mm_invsqrt_ps(rsq10);
294 rinv11 = gmx_mm_invsqrt_ps(rsq11);
295 rinv12 = gmx_mm_invsqrt_ps(rsq12);
296 rinv20 = gmx_mm_invsqrt_ps(rsq20);
297 rinv21 = gmx_mm_invsqrt_ps(rsq21);
298 rinv22 = gmx_mm_invsqrt_ps(rsq22);
300 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
301 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
302 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
303 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
304 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
305 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
306 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
307 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
308 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
310 fjx0 = _mm_setzero_ps();
311 fjy0 = _mm_setzero_ps();
312 fjz0 = _mm_setzero_ps();
313 fjx1 = _mm_setzero_ps();
314 fjy1 = _mm_setzero_ps();
315 fjz1 = _mm_setzero_ps();
316 fjx2 = _mm_setzero_ps();
317 fjy2 = _mm_setzero_ps();
318 fjz2 = _mm_setzero_ps();
320 /**************************
321 * CALCULATE INTERACTIONS *
322 **************************/
324 r00 = _mm_mul_ps(rsq00,rinv00);
326 /* EWALD ELECTROSTATICS */
328 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
329 ewrt = _mm_mul_ps(r00,ewtabscale);
330 ewitab = _mm_cvttps_epi32(ewrt);
331 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
332 ewitab = _mm_slli_epi32(ewitab,2);
333 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
334 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
335 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
336 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
337 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
338 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
339 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
340 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
341 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
343 /* Analytical LJ-PME */
344 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
345 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
346 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
347 exponent = gmx_simd_exp_r(ewcljrsq);
348 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
349 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
350 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
351 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
352 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
353 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
354 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
355 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velecsum = _mm_add_ps(velecsum,velec);
359 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
361 fscal = _mm_add_ps(felec,fvdw);
363 /* Calculate temporary vectorial force */
364 tx = _mm_mul_ps(fscal,dx00);
365 ty = _mm_mul_ps(fscal,dy00);
366 tz = _mm_mul_ps(fscal,dz00);
368 /* Update vectorial force */
369 fix0 = _mm_add_ps(fix0,tx);
370 fiy0 = _mm_add_ps(fiy0,ty);
371 fiz0 = _mm_add_ps(fiz0,tz);
373 fjx0 = _mm_add_ps(fjx0,tx);
374 fjy0 = _mm_add_ps(fjy0,ty);
375 fjz0 = _mm_add_ps(fjz0,tz);
377 /**************************
378 * CALCULATE INTERACTIONS *
379 **************************/
381 r01 = _mm_mul_ps(rsq01,rinv01);
383 /* EWALD ELECTROSTATICS */
385 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
386 ewrt = _mm_mul_ps(r01,ewtabscale);
387 ewitab = _mm_cvttps_epi32(ewrt);
388 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
389 ewitab = _mm_slli_epi32(ewitab,2);
390 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
391 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
392 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
393 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
394 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
395 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
396 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
397 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
398 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velecsum = _mm_add_ps(velecsum,velec);
405 /* Calculate temporary vectorial force */
406 tx = _mm_mul_ps(fscal,dx01);
407 ty = _mm_mul_ps(fscal,dy01);
408 tz = _mm_mul_ps(fscal,dz01);
410 /* Update vectorial force */
411 fix0 = _mm_add_ps(fix0,tx);
412 fiy0 = _mm_add_ps(fiy0,ty);
413 fiz0 = _mm_add_ps(fiz0,tz);
415 fjx1 = _mm_add_ps(fjx1,tx);
416 fjy1 = _mm_add_ps(fjy1,ty);
417 fjz1 = _mm_add_ps(fjz1,tz);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 r02 = _mm_mul_ps(rsq02,rinv02);
425 /* EWALD ELECTROSTATICS */
427 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
428 ewrt = _mm_mul_ps(r02,ewtabscale);
429 ewitab = _mm_cvttps_epi32(ewrt);
430 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
431 ewitab = _mm_slli_epi32(ewitab,2);
432 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
433 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
434 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
435 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
436 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
437 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
438 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
439 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
440 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
442 /* Update potential sum for this i atom from the interaction with this j atom. */
443 velecsum = _mm_add_ps(velecsum,velec);
447 /* Calculate temporary vectorial force */
448 tx = _mm_mul_ps(fscal,dx02);
449 ty = _mm_mul_ps(fscal,dy02);
450 tz = _mm_mul_ps(fscal,dz02);
452 /* Update vectorial force */
453 fix0 = _mm_add_ps(fix0,tx);
454 fiy0 = _mm_add_ps(fiy0,ty);
455 fiz0 = _mm_add_ps(fiz0,tz);
457 fjx2 = _mm_add_ps(fjx2,tx);
458 fjy2 = _mm_add_ps(fjy2,ty);
459 fjz2 = _mm_add_ps(fjz2,tz);
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
465 r10 = _mm_mul_ps(rsq10,rinv10);
467 /* EWALD ELECTROSTATICS */
469 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
470 ewrt = _mm_mul_ps(r10,ewtabscale);
471 ewitab = _mm_cvttps_epi32(ewrt);
472 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
473 ewitab = _mm_slli_epi32(ewitab,2);
474 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
475 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
476 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
477 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
478 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
479 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
480 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
481 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
482 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
484 /* Update potential sum for this i atom from the interaction with this j atom. */
485 velecsum = _mm_add_ps(velecsum,velec);
489 /* Calculate temporary vectorial force */
490 tx = _mm_mul_ps(fscal,dx10);
491 ty = _mm_mul_ps(fscal,dy10);
492 tz = _mm_mul_ps(fscal,dz10);
494 /* Update vectorial force */
495 fix1 = _mm_add_ps(fix1,tx);
496 fiy1 = _mm_add_ps(fiy1,ty);
497 fiz1 = _mm_add_ps(fiz1,tz);
499 fjx0 = _mm_add_ps(fjx0,tx);
500 fjy0 = _mm_add_ps(fjy0,ty);
501 fjz0 = _mm_add_ps(fjz0,tz);
503 /**************************
504 * CALCULATE INTERACTIONS *
505 **************************/
507 r11 = _mm_mul_ps(rsq11,rinv11);
509 /* EWALD ELECTROSTATICS */
511 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
512 ewrt = _mm_mul_ps(r11,ewtabscale);
513 ewitab = _mm_cvttps_epi32(ewrt);
514 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
515 ewitab = _mm_slli_epi32(ewitab,2);
516 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
517 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
518 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
519 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
520 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
521 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
522 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
523 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
524 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
526 /* Update potential sum for this i atom from the interaction with this j atom. */
527 velecsum = _mm_add_ps(velecsum,velec);
531 /* Calculate temporary vectorial force */
532 tx = _mm_mul_ps(fscal,dx11);
533 ty = _mm_mul_ps(fscal,dy11);
534 tz = _mm_mul_ps(fscal,dz11);
536 /* Update vectorial force */
537 fix1 = _mm_add_ps(fix1,tx);
538 fiy1 = _mm_add_ps(fiy1,ty);
539 fiz1 = _mm_add_ps(fiz1,tz);
541 fjx1 = _mm_add_ps(fjx1,tx);
542 fjy1 = _mm_add_ps(fjy1,ty);
543 fjz1 = _mm_add_ps(fjz1,tz);
545 /**************************
546 * CALCULATE INTERACTIONS *
547 **************************/
549 r12 = _mm_mul_ps(rsq12,rinv12);
551 /* EWALD ELECTROSTATICS */
553 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
554 ewrt = _mm_mul_ps(r12,ewtabscale);
555 ewitab = _mm_cvttps_epi32(ewrt);
556 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
557 ewitab = _mm_slli_epi32(ewitab,2);
558 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
559 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
560 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
561 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
562 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
563 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
564 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
565 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
566 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velecsum = _mm_add_ps(velecsum,velec);
573 /* Calculate temporary vectorial force */
574 tx = _mm_mul_ps(fscal,dx12);
575 ty = _mm_mul_ps(fscal,dy12);
576 tz = _mm_mul_ps(fscal,dz12);
578 /* Update vectorial force */
579 fix1 = _mm_add_ps(fix1,tx);
580 fiy1 = _mm_add_ps(fiy1,ty);
581 fiz1 = _mm_add_ps(fiz1,tz);
583 fjx2 = _mm_add_ps(fjx2,tx);
584 fjy2 = _mm_add_ps(fjy2,ty);
585 fjz2 = _mm_add_ps(fjz2,tz);
587 /**************************
588 * CALCULATE INTERACTIONS *
589 **************************/
591 r20 = _mm_mul_ps(rsq20,rinv20);
593 /* EWALD ELECTROSTATICS */
595 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
596 ewrt = _mm_mul_ps(r20,ewtabscale);
597 ewitab = _mm_cvttps_epi32(ewrt);
598 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
599 ewitab = _mm_slli_epi32(ewitab,2);
600 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
601 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
602 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
603 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
604 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
605 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
606 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
607 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
608 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
610 /* Update potential sum for this i atom from the interaction with this j atom. */
611 velecsum = _mm_add_ps(velecsum,velec);
615 /* Calculate temporary vectorial force */
616 tx = _mm_mul_ps(fscal,dx20);
617 ty = _mm_mul_ps(fscal,dy20);
618 tz = _mm_mul_ps(fscal,dz20);
620 /* Update vectorial force */
621 fix2 = _mm_add_ps(fix2,tx);
622 fiy2 = _mm_add_ps(fiy2,ty);
623 fiz2 = _mm_add_ps(fiz2,tz);
625 fjx0 = _mm_add_ps(fjx0,tx);
626 fjy0 = _mm_add_ps(fjy0,ty);
627 fjz0 = _mm_add_ps(fjz0,tz);
629 /**************************
630 * CALCULATE INTERACTIONS *
631 **************************/
633 r21 = _mm_mul_ps(rsq21,rinv21);
635 /* EWALD ELECTROSTATICS */
637 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
638 ewrt = _mm_mul_ps(r21,ewtabscale);
639 ewitab = _mm_cvttps_epi32(ewrt);
640 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
641 ewitab = _mm_slli_epi32(ewitab,2);
642 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
643 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
644 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
645 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
646 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
647 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
648 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
649 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
650 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
652 /* Update potential sum for this i atom from the interaction with this j atom. */
653 velecsum = _mm_add_ps(velecsum,velec);
657 /* Calculate temporary vectorial force */
658 tx = _mm_mul_ps(fscal,dx21);
659 ty = _mm_mul_ps(fscal,dy21);
660 tz = _mm_mul_ps(fscal,dz21);
662 /* Update vectorial force */
663 fix2 = _mm_add_ps(fix2,tx);
664 fiy2 = _mm_add_ps(fiy2,ty);
665 fiz2 = _mm_add_ps(fiz2,tz);
667 fjx1 = _mm_add_ps(fjx1,tx);
668 fjy1 = _mm_add_ps(fjy1,ty);
669 fjz1 = _mm_add_ps(fjz1,tz);
671 /**************************
672 * CALCULATE INTERACTIONS *
673 **************************/
675 r22 = _mm_mul_ps(rsq22,rinv22);
677 /* EWALD ELECTROSTATICS */
679 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
680 ewrt = _mm_mul_ps(r22,ewtabscale);
681 ewitab = _mm_cvttps_epi32(ewrt);
682 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
683 ewitab = _mm_slli_epi32(ewitab,2);
684 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
685 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
686 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
687 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
688 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
689 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
690 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
691 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
692 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
694 /* Update potential sum for this i atom from the interaction with this j atom. */
695 velecsum = _mm_add_ps(velecsum,velec);
699 /* Calculate temporary vectorial force */
700 tx = _mm_mul_ps(fscal,dx22);
701 ty = _mm_mul_ps(fscal,dy22);
702 tz = _mm_mul_ps(fscal,dz22);
704 /* Update vectorial force */
705 fix2 = _mm_add_ps(fix2,tx);
706 fiy2 = _mm_add_ps(fiy2,ty);
707 fiz2 = _mm_add_ps(fiz2,tz);
709 fjx2 = _mm_add_ps(fjx2,tx);
710 fjy2 = _mm_add_ps(fjy2,ty);
711 fjz2 = _mm_add_ps(fjz2,tz);
713 fjptrA = f+j_coord_offsetA;
714 fjptrB = f+j_coord_offsetB;
715 fjptrC = f+j_coord_offsetC;
716 fjptrD = f+j_coord_offsetD;
718 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
719 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
721 /* Inner loop uses 397 flops */
727 /* Get j neighbor index, and coordinate index */
728 jnrlistA = jjnr[jidx];
729 jnrlistB = jjnr[jidx+1];
730 jnrlistC = jjnr[jidx+2];
731 jnrlistD = jjnr[jidx+3];
732 /* Sign of each element will be negative for non-real atoms.
733 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
734 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
736 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
737 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
738 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
739 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
740 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
741 j_coord_offsetA = DIM*jnrA;
742 j_coord_offsetB = DIM*jnrB;
743 j_coord_offsetC = DIM*jnrC;
744 j_coord_offsetD = DIM*jnrD;
746 /* load j atom coordinates */
747 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
748 x+j_coord_offsetC,x+j_coord_offsetD,
749 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
751 /* Calculate displacement vector */
752 dx00 = _mm_sub_ps(ix0,jx0);
753 dy00 = _mm_sub_ps(iy0,jy0);
754 dz00 = _mm_sub_ps(iz0,jz0);
755 dx01 = _mm_sub_ps(ix0,jx1);
756 dy01 = _mm_sub_ps(iy0,jy1);
757 dz01 = _mm_sub_ps(iz0,jz1);
758 dx02 = _mm_sub_ps(ix0,jx2);
759 dy02 = _mm_sub_ps(iy0,jy2);
760 dz02 = _mm_sub_ps(iz0,jz2);
761 dx10 = _mm_sub_ps(ix1,jx0);
762 dy10 = _mm_sub_ps(iy1,jy0);
763 dz10 = _mm_sub_ps(iz1,jz0);
764 dx11 = _mm_sub_ps(ix1,jx1);
765 dy11 = _mm_sub_ps(iy1,jy1);
766 dz11 = _mm_sub_ps(iz1,jz1);
767 dx12 = _mm_sub_ps(ix1,jx2);
768 dy12 = _mm_sub_ps(iy1,jy2);
769 dz12 = _mm_sub_ps(iz1,jz2);
770 dx20 = _mm_sub_ps(ix2,jx0);
771 dy20 = _mm_sub_ps(iy2,jy0);
772 dz20 = _mm_sub_ps(iz2,jz0);
773 dx21 = _mm_sub_ps(ix2,jx1);
774 dy21 = _mm_sub_ps(iy2,jy1);
775 dz21 = _mm_sub_ps(iz2,jz1);
776 dx22 = _mm_sub_ps(ix2,jx2);
777 dy22 = _mm_sub_ps(iy2,jy2);
778 dz22 = _mm_sub_ps(iz2,jz2);
780 /* Calculate squared distance and things based on it */
781 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
782 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
783 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
784 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
785 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
786 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
787 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
788 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
789 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
791 rinv00 = gmx_mm_invsqrt_ps(rsq00);
792 rinv01 = gmx_mm_invsqrt_ps(rsq01);
793 rinv02 = gmx_mm_invsqrt_ps(rsq02);
794 rinv10 = gmx_mm_invsqrt_ps(rsq10);
795 rinv11 = gmx_mm_invsqrt_ps(rsq11);
796 rinv12 = gmx_mm_invsqrt_ps(rsq12);
797 rinv20 = gmx_mm_invsqrt_ps(rsq20);
798 rinv21 = gmx_mm_invsqrt_ps(rsq21);
799 rinv22 = gmx_mm_invsqrt_ps(rsq22);
801 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
802 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
803 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
804 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
805 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
806 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
807 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
808 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
809 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
811 fjx0 = _mm_setzero_ps();
812 fjy0 = _mm_setzero_ps();
813 fjz0 = _mm_setzero_ps();
814 fjx1 = _mm_setzero_ps();
815 fjy1 = _mm_setzero_ps();
816 fjz1 = _mm_setzero_ps();
817 fjx2 = _mm_setzero_ps();
818 fjy2 = _mm_setzero_ps();
819 fjz2 = _mm_setzero_ps();
821 /**************************
822 * CALCULATE INTERACTIONS *
823 **************************/
825 r00 = _mm_mul_ps(rsq00,rinv00);
826 r00 = _mm_andnot_ps(dummy_mask,r00);
828 /* EWALD ELECTROSTATICS */
830 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
831 ewrt = _mm_mul_ps(r00,ewtabscale);
832 ewitab = _mm_cvttps_epi32(ewrt);
833 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
834 ewitab = _mm_slli_epi32(ewitab,2);
835 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
836 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
837 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
838 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
839 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
840 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
841 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
842 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
843 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
845 /* Analytical LJ-PME */
846 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
847 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
848 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
849 exponent = gmx_simd_exp_r(ewcljrsq);
850 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
851 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
852 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
853 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
854 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
855 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
856 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
857 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
859 /* Update potential sum for this i atom from the interaction with this j atom. */
860 velec = _mm_andnot_ps(dummy_mask,velec);
861 velecsum = _mm_add_ps(velecsum,velec);
862 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
863 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
865 fscal = _mm_add_ps(felec,fvdw);
867 fscal = _mm_andnot_ps(dummy_mask,fscal);
869 /* Calculate temporary vectorial force */
870 tx = _mm_mul_ps(fscal,dx00);
871 ty = _mm_mul_ps(fscal,dy00);
872 tz = _mm_mul_ps(fscal,dz00);
874 /* Update vectorial force */
875 fix0 = _mm_add_ps(fix0,tx);
876 fiy0 = _mm_add_ps(fiy0,ty);
877 fiz0 = _mm_add_ps(fiz0,tz);
879 fjx0 = _mm_add_ps(fjx0,tx);
880 fjy0 = _mm_add_ps(fjy0,ty);
881 fjz0 = _mm_add_ps(fjz0,tz);
883 /**************************
884 * CALCULATE INTERACTIONS *
885 **************************/
887 r01 = _mm_mul_ps(rsq01,rinv01);
888 r01 = _mm_andnot_ps(dummy_mask,r01);
890 /* EWALD ELECTROSTATICS */
892 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
893 ewrt = _mm_mul_ps(r01,ewtabscale);
894 ewitab = _mm_cvttps_epi32(ewrt);
895 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
896 ewitab = _mm_slli_epi32(ewitab,2);
897 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
898 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
899 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
900 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
901 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
902 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
903 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
904 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
905 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
907 /* Update potential sum for this i atom from the interaction with this j atom. */
908 velec = _mm_andnot_ps(dummy_mask,velec);
909 velecsum = _mm_add_ps(velecsum,velec);
913 fscal = _mm_andnot_ps(dummy_mask,fscal);
915 /* Calculate temporary vectorial force */
916 tx = _mm_mul_ps(fscal,dx01);
917 ty = _mm_mul_ps(fscal,dy01);
918 tz = _mm_mul_ps(fscal,dz01);
920 /* Update vectorial force */
921 fix0 = _mm_add_ps(fix0,tx);
922 fiy0 = _mm_add_ps(fiy0,ty);
923 fiz0 = _mm_add_ps(fiz0,tz);
925 fjx1 = _mm_add_ps(fjx1,tx);
926 fjy1 = _mm_add_ps(fjy1,ty);
927 fjz1 = _mm_add_ps(fjz1,tz);
929 /**************************
930 * CALCULATE INTERACTIONS *
931 **************************/
933 r02 = _mm_mul_ps(rsq02,rinv02);
934 r02 = _mm_andnot_ps(dummy_mask,r02);
936 /* EWALD ELECTROSTATICS */
938 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
939 ewrt = _mm_mul_ps(r02,ewtabscale);
940 ewitab = _mm_cvttps_epi32(ewrt);
941 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
942 ewitab = _mm_slli_epi32(ewitab,2);
943 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
944 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
945 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
946 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
947 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
948 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
949 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
950 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
951 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
953 /* Update potential sum for this i atom from the interaction with this j atom. */
954 velec = _mm_andnot_ps(dummy_mask,velec);
955 velecsum = _mm_add_ps(velecsum,velec);
959 fscal = _mm_andnot_ps(dummy_mask,fscal);
961 /* Calculate temporary vectorial force */
962 tx = _mm_mul_ps(fscal,dx02);
963 ty = _mm_mul_ps(fscal,dy02);
964 tz = _mm_mul_ps(fscal,dz02);
966 /* Update vectorial force */
967 fix0 = _mm_add_ps(fix0,tx);
968 fiy0 = _mm_add_ps(fiy0,ty);
969 fiz0 = _mm_add_ps(fiz0,tz);
971 fjx2 = _mm_add_ps(fjx2,tx);
972 fjy2 = _mm_add_ps(fjy2,ty);
973 fjz2 = _mm_add_ps(fjz2,tz);
975 /**************************
976 * CALCULATE INTERACTIONS *
977 **************************/
979 r10 = _mm_mul_ps(rsq10,rinv10);
980 r10 = _mm_andnot_ps(dummy_mask,r10);
982 /* EWALD ELECTROSTATICS */
984 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
985 ewrt = _mm_mul_ps(r10,ewtabscale);
986 ewitab = _mm_cvttps_epi32(ewrt);
987 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
988 ewitab = _mm_slli_epi32(ewitab,2);
989 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
990 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
991 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
992 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
993 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
994 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
995 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
996 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
997 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
999 /* Update potential sum for this i atom from the interaction with this j atom. */
1000 velec = _mm_andnot_ps(dummy_mask,velec);
1001 velecsum = _mm_add_ps(velecsum,velec);
1005 fscal = _mm_andnot_ps(dummy_mask,fscal);
1007 /* Calculate temporary vectorial force */
1008 tx = _mm_mul_ps(fscal,dx10);
1009 ty = _mm_mul_ps(fscal,dy10);
1010 tz = _mm_mul_ps(fscal,dz10);
1012 /* Update vectorial force */
1013 fix1 = _mm_add_ps(fix1,tx);
1014 fiy1 = _mm_add_ps(fiy1,ty);
1015 fiz1 = _mm_add_ps(fiz1,tz);
1017 fjx0 = _mm_add_ps(fjx0,tx);
1018 fjy0 = _mm_add_ps(fjy0,ty);
1019 fjz0 = _mm_add_ps(fjz0,tz);
1021 /**************************
1022 * CALCULATE INTERACTIONS *
1023 **************************/
1025 r11 = _mm_mul_ps(rsq11,rinv11);
1026 r11 = _mm_andnot_ps(dummy_mask,r11);
1028 /* EWALD ELECTROSTATICS */
1030 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031 ewrt = _mm_mul_ps(r11,ewtabscale);
1032 ewitab = _mm_cvttps_epi32(ewrt);
1033 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1034 ewitab = _mm_slli_epi32(ewitab,2);
1035 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1036 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1037 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1038 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1039 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1040 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1041 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1042 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1043 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1045 /* Update potential sum for this i atom from the interaction with this j atom. */
1046 velec = _mm_andnot_ps(dummy_mask,velec);
1047 velecsum = _mm_add_ps(velecsum,velec);
1051 fscal = _mm_andnot_ps(dummy_mask,fscal);
1053 /* Calculate temporary vectorial force */
1054 tx = _mm_mul_ps(fscal,dx11);
1055 ty = _mm_mul_ps(fscal,dy11);
1056 tz = _mm_mul_ps(fscal,dz11);
1058 /* Update vectorial force */
1059 fix1 = _mm_add_ps(fix1,tx);
1060 fiy1 = _mm_add_ps(fiy1,ty);
1061 fiz1 = _mm_add_ps(fiz1,tz);
1063 fjx1 = _mm_add_ps(fjx1,tx);
1064 fjy1 = _mm_add_ps(fjy1,ty);
1065 fjz1 = _mm_add_ps(fjz1,tz);
1067 /**************************
1068 * CALCULATE INTERACTIONS *
1069 **************************/
1071 r12 = _mm_mul_ps(rsq12,rinv12);
1072 r12 = _mm_andnot_ps(dummy_mask,r12);
1074 /* EWALD ELECTROSTATICS */
1076 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1077 ewrt = _mm_mul_ps(r12,ewtabscale);
1078 ewitab = _mm_cvttps_epi32(ewrt);
1079 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1080 ewitab = _mm_slli_epi32(ewitab,2);
1081 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1082 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1083 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1084 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1085 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1086 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1087 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1088 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1089 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1091 /* Update potential sum for this i atom from the interaction with this j atom. */
1092 velec = _mm_andnot_ps(dummy_mask,velec);
1093 velecsum = _mm_add_ps(velecsum,velec);
1097 fscal = _mm_andnot_ps(dummy_mask,fscal);
1099 /* Calculate temporary vectorial force */
1100 tx = _mm_mul_ps(fscal,dx12);
1101 ty = _mm_mul_ps(fscal,dy12);
1102 tz = _mm_mul_ps(fscal,dz12);
1104 /* Update vectorial force */
1105 fix1 = _mm_add_ps(fix1,tx);
1106 fiy1 = _mm_add_ps(fiy1,ty);
1107 fiz1 = _mm_add_ps(fiz1,tz);
1109 fjx2 = _mm_add_ps(fjx2,tx);
1110 fjy2 = _mm_add_ps(fjy2,ty);
1111 fjz2 = _mm_add_ps(fjz2,tz);
1113 /**************************
1114 * CALCULATE INTERACTIONS *
1115 **************************/
1117 r20 = _mm_mul_ps(rsq20,rinv20);
1118 r20 = _mm_andnot_ps(dummy_mask,r20);
1120 /* EWALD ELECTROSTATICS */
1122 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1123 ewrt = _mm_mul_ps(r20,ewtabscale);
1124 ewitab = _mm_cvttps_epi32(ewrt);
1125 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1126 ewitab = _mm_slli_epi32(ewitab,2);
1127 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1128 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1129 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1130 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1131 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1132 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1133 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1134 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1135 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1137 /* Update potential sum for this i atom from the interaction with this j atom. */
1138 velec = _mm_andnot_ps(dummy_mask,velec);
1139 velecsum = _mm_add_ps(velecsum,velec);
1143 fscal = _mm_andnot_ps(dummy_mask,fscal);
1145 /* Calculate temporary vectorial force */
1146 tx = _mm_mul_ps(fscal,dx20);
1147 ty = _mm_mul_ps(fscal,dy20);
1148 tz = _mm_mul_ps(fscal,dz20);
1150 /* Update vectorial force */
1151 fix2 = _mm_add_ps(fix2,tx);
1152 fiy2 = _mm_add_ps(fiy2,ty);
1153 fiz2 = _mm_add_ps(fiz2,tz);
1155 fjx0 = _mm_add_ps(fjx0,tx);
1156 fjy0 = _mm_add_ps(fjy0,ty);
1157 fjz0 = _mm_add_ps(fjz0,tz);
1159 /**************************
1160 * CALCULATE INTERACTIONS *
1161 **************************/
1163 r21 = _mm_mul_ps(rsq21,rinv21);
1164 r21 = _mm_andnot_ps(dummy_mask,r21);
1166 /* EWALD ELECTROSTATICS */
1168 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1169 ewrt = _mm_mul_ps(r21,ewtabscale);
1170 ewitab = _mm_cvttps_epi32(ewrt);
1171 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1172 ewitab = _mm_slli_epi32(ewitab,2);
1173 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1174 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1175 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1176 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1177 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1178 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1179 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1180 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1181 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1183 /* Update potential sum for this i atom from the interaction with this j atom. */
1184 velec = _mm_andnot_ps(dummy_mask,velec);
1185 velecsum = _mm_add_ps(velecsum,velec);
1189 fscal = _mm_andnot_ps(dummy_mask,fscal);
1191 /* Calculate temporary vectorial force */
1192 tx = _mm_mul_ps(fscal,dx21);
1193 ty = _mm_mul_ps(fscal,dy21);
1194 tz = _mm_mul_ps(fscal,dz21);
1196 /* Update vectorial force */
1197 fix2 = _mm_add_ps(fix2,tx);
1198 fiy2 = _mm_add_ps(fiy2,ty);
1199 fiz2 = _mm_add_ps(fiz2,tz);
1201 fjx1 = _mm_add_ps(fjx1,tx);
1202 fjy1 = _mm_add_ps(fjy1,ty);
1203 fjz1 = _mm_add_ps(fjz1,tz);
1205 /**************************
1206 * CALCULATE INTERACTIONS *
1207 **************************/
1209 r22 = _mm_mul_ps(rsq22,rinv22);
1210 r22 = _mm_andnot_ps(dummy_mask,r22);
1212 /* EWALD ELECTROSTATICS */
1214 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1215 ewrt = _mm_mul_ps(r22,ewtabscale);
1216 ewitab = _mm_cvttps_epi32(ewrt);
1217 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1218 ewitab = _mm_slli_epi32(ewitab,2);
1219 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1220 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1221 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1222 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1223 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1224 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1225 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1226 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1227 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1229 /* Update potential sum for this i atom from the interaction with this j atom. */
1230 velec = _mm_andnot_ps(dummy_mask,velec);
1231 velecsum = _mm_add_ps(velecsum,velec);
1235 fscal = _mm_andnot_ps(dummy_mask,fscal);
1237 /* Calculate temporary vectorial force */
1238 tx = _mm_mul_ps(fscal,dx22);
1239 ty = _mm_mul_ps(fscal,dy22);
1240 tz = _mm_mul_ps(fscal,dz22);
1242 /* Update vectorial force */
1243 fix2 = _mm_add_ps(fix2,tx);
1244 fiy2 = _mm_add_ps(fiy2,ty);
1245 fiz2 = _mm_add_ps(fiz2,tz);
1247 fjx2 = _mm_add_ps(fjx2,tx);
1248 fjy2 = _mm_add_ps(fjy2,ty);
1249 fjz2 = _mm_add_ps(fjz2,tz);
1251 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1252 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1253 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1254 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1256 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1257 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1259 /* Inner loop uses 406 flops */
1262 /* End of innermost loop */
1264 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1265 f+i_coord_offset,fshift+i_shift_offset);
1268 /* Update potential energies */
1269 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1270 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1272 /* Increment number of inner iterations */
1273 inneriter += j_index_end - j_index_start;
1275 /* Outer loop uses 20 flops */
1278 /* Increment number of outer iterations */
1281 /* Update outer/inner flops */
1283 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*406);
1286 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_single
1287 * Electrostatics interaction: Ewald
1288 * VdW interaction: LJEwald
1289 * Geometry: Water3-Water3
1290 * Calculate force/pot: Force
1293 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_single
1294 (t_nblist * gmx_restrict nlist,
1295 rvec * gmx_restrict xx,
1296 rvec * gmx_restrict ff,
1297 t_forcerec * gmx_restrict fr,
1298 t_mdatoms * gmx_restrict mdatoms,
1299 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1300 t_nrnb * gmx_restrict nrnb)
1302 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1303 * just 0 for non-waters.
1304 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1305 * jnr indices corresponding to data put in the four positions in the SIMD register.
1307 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1308 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1309 int jnrA,jnrB,jnrC,jnrD;
1310 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1311 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1312 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1313 real rcutoff_scalar;
1314 real *shiftvec,*fshift,*x,*f;
1315 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1316 real scratch[4*DIM];
1317 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1319 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1321 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1323 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1324 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1325 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1326 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1327 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1328 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1329 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1330 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1331 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1332 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1333 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1334 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1335 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1336 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1337 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1338 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1339 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1342 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1345 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1346 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1356 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1358 __m128 one_half = _mm_set1_ps(0.5);
1359 __m128 minus_one = _mm_set1_ps(-1.0);
1361 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1363 __m128 dummy_mask,cutoff_mask;
1364 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1365 __m128 one = _mm_set1_ps(1.0);
1366 __m128 two = _mm_set1_ps(2.0);
1372 jindex = nlist->jindex;
1374 shiftidx = nlist->shift;
1376 shiftvec = fr->shift_vec[0];
1377 fshift = fr->fshift[0];
1378 facel = _mm_set1_ps(fr->epsfac);
1379 charge = mdatoms->chargeA;
1380 nvdwtype = fr->ntype;
1381 vdwparam = fr->nbfp;
1382 vdwtype = mdatoms->typeA;
1383 vdwgridparam = fr->ljpme_c6grid;
1384 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
1385 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
1386 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1388 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1389 ewtab = fr->ic->tabq_coul_F;
1390 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1391 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1393 /* Setup water-specific parameters */
1394 inr = nlist->iinr[0];
1395 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1396 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1397 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1398 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1400 jq0 = _mm_set1_ps(charge[inr+0]);
1401 jq1 = _mm_set1_ps(charge[inr+1]);
1402 jq2 = _mm_set1_ps(charge[inr+2]);
1403 vdwjidx0A = 2*vdwtype[inr+0];
1404 qq00 = _mm_mul_ps(iq0,jq0);
1405 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1406 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1407 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1408 qq01 = _mm_mul_ps(iq0,jq1);
1409 qq02 = _mm_mul_ps(iq0,jq2);
1410 qq10 = _mm_mul_ps(iq1,jq0);
1411 qq11 = _mm_mul_ps(iq1,jq1);
1412 qq12 = _mm_mul_ps(iq1,jq2);
1413 qq20 = _mm_mul_ps(iq2,jq0);
1414 qq21 = _mm_mul_ps(iq2,jq1);
1415 qq22 = _mm_mul_ps(iq2,jq2);
1417 /* Avoid stupid compiler warnings */
1418 jnrA = jnrB = jnrC = jnrD = 0;
1419 j_coord_offsetA = 0;
1420 j_coord_offsetB = 0;
1421 j_coord_offsetC = 0;
1422 j_coord_offsetD = 0;
1427 for(iidx=0;iidx<4*DIM;iidx++)
1429 scratch[iidx] = 0.0;
1432 /* Start outer loop over neighborlists */
1433 for(iidx=0; iidx<nri; iidx++)
1435 /* Load shift vector for this list */
1436 i_shift_offset = DIM*shiftidx[iidx];
1438 /* Load limits for loop over neighbors */
1439 j_index_start = jindex[iidx];
1440 j_index_end = jindex[iidx+1];
1442 /* Get outer coordinate index */
1444 i_coord_offset = DIM*inr;
1446 /* Load i particle coords and add shift vector */
1447 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1448 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1450 fix0 = _mm_setzero_ps();
1451 fiy0 = _mm_setzero_ps();
1452 fiz0 = _mm_setzero_ps();
1453 fix1 = _mm_setzero_ps();
1454 fiy1 = _mm_setzero_ps();
1455 fiz1 = _mm_setzero_ps();
1456 fix2 = _mm_setzero_ps();
1457 fiy2 = _mm_setzero_ps();
1458 fiz2 = _mm_setzero_ps();
1460 /* Start inner kernel loop */
1461 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1464 /* Get j neighbor index, and coordinate index */
1466 jnrB = jjnr[jidx+1];
1467 jnrC = jjnr[jidx+2];
1468 jnrD = jjnr[jidx+3];
1469 j_coord_offsetA = DIM*jnrA;
1470 j_coord_offsetB = DIM*jnrB;
1471 j_coord_offsetC = DIM*jnrC;
1472 j_coord_offsetD = DIM*jnrD;
1474 /* load j atom coordinates */
1475 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1476 x+j_coord_offsetC,x+j_coord_offsetD,
1477 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1479 /* Calculate displacement vector */
1480 dx00 = _mm_sub_ps(ix0,jx0);
1481 dy00 = _mm_sub_ps(iy0,jy0);
1482 dz00 = _mm_sub_ps(iz0,jz0);
1483 dx01 = _mm_sub_ps(ix0,jx1);
1484 dy01 = _mm_sub_ps(iy0,jy1);
1485 dz01 = _mm_sub_ps(iz0,jz1);
1486 dx02 = _mm_sub_ps(ix0,jx2);
1487 dy02 = _mm_sub_ps(iy0,jy2);
1488 dz02 = _mm_sub_ps(iz0,jz2);
1489 dx10 = _mm_sub_ps(ix1,jx0);
1490 dy10 = _mm_sub_ps(iy1,jy0);
1491 dz10 = _mm_sub_ps(iz1,jz0);
1492 dx11 = _mm_sub_ps(ix1,jx1);
1493 dy11 = _mm_sub_ps(iy1,jy1);
1494 dz11 = _mm_sub_ps(iz1,jz1);
1495 dx12 = _mm_sub_ps(ix1,jx2);
1496 dy12 = _mm_sub_ps(iy1,jy2);
1497 dz12 = _mm_sub_ps(iz1,jz2);
1498 dx20 = _mm_sub_ps(ix2,jx0);
1499 dy20 = _mm_sub_ps(iy2,jy0);
1500 dz20 = _mm_sub_ps(iz2,jz0);
1501 dx21 = _mm_sub_ps(ix2,jx1);
1502 dy21 = _mm_sub_ps(iy2,jy1);
1503 dz21 = _mm_sub_ps(iz2,jz1);
1504 dx22 = _mm_sub_ps(ix2,jx2);
1505 dy22 = _mm_sub_ps(iy2,jy2);
1506 dz22 = _mm_sub_ps(iz2,jz2);
1508 /* Calculate squared distance and things based on it */
1509 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1510 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1511 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1512 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1513 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1514 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1515 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1516 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1517 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1519 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1520 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1521 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1522 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1523 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1524 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1525 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1526 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1527 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1529 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1530 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1531 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1532 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1533 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1534 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1535 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1536 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1537 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1539 fjx0 = _mm_setzero_ps();
1540 fjy0 = _mm_setzero_ps();
1541 fjz0 = _mm_setzero_ps();
1542 fjx1 = _mm_setzero_ps();
1543 fjy1 = _mm_setzero_ps();
1544 fjz1 = _mm_setzero_ps();
1545 fjx2 = _mm_setzero_ps();
1546 fjy2 = _mm_setzero_ps();
1547 fjz2 = _mm_setzero_ps();
1549 /**************************
1550 * CALCULATE INTERACTIONS *
1551 **************************/
1553 r00 = _mm_mul_ps(rsq00,rinv00);
1555 /* EWALD ELECTROSTATICS */
1557 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1558 ewrt = _mm_mul_ps(r00,ewtabscale);
1559 ewitab = _mm_cvttps_epi32(ewrt);
1560 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1561 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1562 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1564 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1565 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1567 /* Analytical LJ-PME */
1568 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1569 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1570 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1571 exponent = gmx_simd_exp_r(ewcljrsq);
1572 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1573 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1574 /* f6A = 6 * C6grid * (1 - poly) */
1575 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1576 /* f6B = C6grid * exponent * beta^6 */
1577 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1578 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1579 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1581 fscal = _mm_add_ps(felec,fvdw);
1583 /* Calculate temporary vectorial force */
1584 tx = _mm_mul_ps(fscal,dx00);
1585 ty = _mm_mul_ps(fscal,dy00);
1586 tz = _mm_mul_ps(fscal,dz00);
1588 /* Update vectorial force */
1589 fix0 = _mm_add_ps(fix0,tx);
1590 fiy0 = _mm_add_ps(fiy0,ty);
1591 fiz0 = _mm_add_ps(fiz0,tz);
1593 fjx0 = _mm_add_ps(fjx0,tx);
1594 fjy0 = _mm_add_ps(fjy0,ty);
1595 fjz0 = _mm_add_ps(fjz0,tz);
1597 /**************************
1598 * CALCULATE INTERACTIONS *
1599 **************************/
1601 r01 = _mm_mul_ps(rsq01,rinv01);
1603 /* EWALD ELECTROSTATICS */
1605 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1606 ewrt = _mm_mul_ps(r01,ewtabscale);
1607 ewitab = _mm_cvttps_epi32(ewrt);
1608 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1609 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1610 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1612 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1613 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1617 /* Calculate temporary vectorial force */
1618 tx = _mm_mul_ps(fscal,dx01);
1619 ty = _mm_mul_ps(fscal,dy01);
1620 tz = _mm_mul_ps(fscal,dz01);
1622 /* Update vectorial force */
1623 fix0 = _mm_add_ps(fix0,tx);
1624 fiy0 = _mm_add_ps(fiy0,ty);
1625 fiz0 = _mm_add_ps(fiz0,tz);
1627 fjx1 = _mm_add_ps(fjx1,tx);
1628 fjy1 = _mm_add_ps(fjy1,ty);
1629 fjz1 = _mm_add_ps(fjz1,tz);
1631 /**************************
1632 * CALCULATE INTERACTIONS *
1633 **************************/
1635 r02 = _mm_mul_ps(rsq02,rinv02);
1637 /* EWALD ELECTROSTATICS */
1639 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1640 ewrt = _mm_mul_ps(r02,ewtabscale);
1641 ewitab = _mm_cvttps_epi32(ewrt);
1642 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1643 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1644 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1646 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1647 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1651 /* Calculate temporary vectorial force */
1652 tx = _mm_mul_ps(fscal,dx02);
1653 ty = _mm_mul_ps(fscal,dy02);
1654 tz = _mm_mul_ps(fscal,dz02);
1656 /* Update vectorial force */
1657 fix0 = _mm_add_ps(fix0,tx);
1658 fiy0 = _mm_add_ps(fiy0,ty);
1659 fiz0 = _mm_add_ps(fiz0,tz);
1661 fjx2 = _mm_add_ps(fjx2,tx);
1662 fjy2 = _mm_add_ps(fjy2,ty);
1663 fjz2 = _mm_add_ps(fjz2,tz);
1665 /**************************
1666 * CALCULATE INTERACTIONS *
1667 **************************/
1669 r10 = _mm_mul_ps(rsq10,rinv10);
1671 /* EWALD ELECTROSTATICS */
1673 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1674 ewrt = _mm_mul_ps(r10,ewtabscale);
1675 ewitab = _mm_cvttps_epi32(ewrt);
1676 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1677 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1678 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1680 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1681 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1685 /* Calculate temporary vectorial force */
1686 tx = _mm_mul_ps(fscal,dx10);
1687 ty = _mm_mul_ps(fscal,dy10);
1688 tz = _mm_mul_ps(fscal,dz10);
1690 /* Update vectorial force */
1691 fix1 = _mm_add_ps(fix1,tx);
1692 fiy1 = _mm_add_ps(fiy1,ty);
1693 fiz1 = _mm_add_ps(fiz1,tz);
1695 fjx0 = _mm_add_ps(fjx0,tx);
1696 fjy0 = _mm_add_ps(fjy0,ty);
1697 fjz0 = _mm_add_ps(fjz0,tz);
1699 /**************************
1700 * CALCULATE INTERACTIONS *
1701 **************************/
1703 r11 = _mm_mul_ps(rsq11,rinv11);
1705 /* EWALD ELECTROSTATICS */
1707 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1708 ewrt = _mm_mul_ps(r11,ewtabscale);
1709 ewitab = _mm_cvttps_epi32(ewrt);
1710 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1711 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1712 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1714 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1715 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1719 /* Calculate temporary vectorial force */
1720 tx = _mm_mul_ps(fscal,dx11);
1721 ty = _mm_mul_ps(fscal,dy11);
1722 tz = _mm_mul_ps(fscal,dz11);
1724 /* Update vectorial force */
1725 fix1 = _mm_add_ps(fix1,tx);
1726 fiy1 = _mm_add_ps(fiy1,ty);
1727 fiz1 = _mm_add_ps(fiz1,tz);
1729 fjx1 = _mm_add_ps(fjx1,tx);
1730 fjy1 = _mm_add_ps(fjy1,ty);
1731 fjz1 = _mm_add_ps(fjz1,tz);
1733 /**************************
1734 * CALCULATE INTERACTIONS *
1735 **************************/
1737 r12 = _mm_mul_ps(rsq12,rinv12);
1739 /* EWALD ELECTROSTATICS */
1741 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1742 ewrt = _mm_mul_ps(r12,ewtabscale);
1743 ewitab = _mm_cvttps_epi32(ewrt);
1744 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1745 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1746 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1748 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1749 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1753 /* Calculate temporary vectorial force */
1754 tx = _mm_mul_ps(fscal,dx12);
1755 ty = _mm_mul_ps(fscal,dy12);
1756 tz = _mm_mul_ps(fscal,dz12);
1758 /* Update vectorial force */
1759 fix1 = _mm_add_ps(fix1,tx);
1760 fiy1 = _mm_add_ps(fiy1,ty);
1761 fiz1 = _mm_add_ps(fiz1,tz);
1763 fjx2 = _mm_add_ps(fjx2,tx);
1764 fjy2 = _mm_add_ps(fjy2,ty);
1765 fjz2 = _mm_add_ps(fjz2,tz);
1767 /**************************
1768 * CALCULATE INTERACTIONS *
1769 **************************/
1771 r20 = _mm_mul_ps(rsq20,rinv20);
1773 /* EWALD ELECTROSTATICS */
1775 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1776 ewrt = _mm_mul_ps(r20,ewtabscale);
1777 ewitab = _mm_cvttps_epi32(ewrt);
1778 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1779 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1780 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1782 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1783 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1787 /* Calculate temporary vectorial force */
1788 tx = _mm_mul_ps(fscal,dx20);
1789 ty = _mm_mul_ps(fscal,dy20);
1790 tz = _mm_mul_ps(fscal,dz20);
1792 /* Update vectorial force */
1793 fix2 = _mm_add_ps(fix2,tx);
1794 fiy2 = _mm_add_ps(fiy2,ty);
1795 fiz2 = _mm_add_ps(fiz2,tz);
1797 fjx0 = _mm_add_ps(fjx0,tx);
1798 fjy0 = _mm_add_ps(fjy0,ty);
1799 fjz0 = _mm_add_ps(fjz0,tz);
1801 /**************************
1802 * CALCULATE INTERACTIONS *
1803 **************************/
1805 r21 = _mm_mul_ps(rsq21,rinv21);
1807 /* EWALD ELECTROSTATICS */
1809 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1810 ewrt = _mm_mul_ps(r21,ewtabscale);
1811 ewitab = _mm_cvttps_epi32(ewrt);
1812 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1813 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1814 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1816 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1817 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1821 /* Calculate temporary vectorial force */
1822 tx = _mm_mul_ps(fscal,dx21);
1823 ty = _mm_mul_ps(fscal,dy21);
1824 tz = _mm_mul_ps(fscal,dz21);
1826 /* Update vectorial force */
1827 fix2 = _mm_add_ps(fix2,tx);
1828 fiy2 = _mm_add_ps(fiy2,ty);
1829 fiz2 = _mm_add_ps(fiz2,tz);
1831 fjx1 = _mm_add_ps(fjx1,tx);
1832 fjy1 = _mm_add_ps(fjy1,ty);
1833 fjz1 = _mm_add_ps(fjz1,tz);
1835 /**************************
1836 * CALCULATE INTERACTIONS *
1837 **************************/
1839 r22 = _mm_mul_ps(rsq22,rinv22);
1841 /* EWALD ELECTROSTATICS */
1843 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1844 ewrt = _mm_mul_ps(r22,ewtabscale);
1845 ewitab = _mm_cvttps_epi32(ewrt);
1846 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1847 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1848 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1850 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1851 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1855 /* Calculate temporary vectorial force */
1856 tx = _mm_mul_ps(fscal,dx22);
1857 ty = _mm_mul_ps(fscal,dy22);
1858 tz = _mm_mul_ps(fscal,dz22);
1860 /* Update vectorial force */
1861 fix2 = _mm_add_ps(fix2,tx);
1862 fiy2 = _mm_add_ps(fiy2,ty);
1863 fiz2 = _mm_add_ps(fiz2,tz);
1865 fjx2 = _mm_add_ps(fjx2,tx);
1866 fjy2 = _mm_add_ps(fjy2,ty);
1867 fjz2 = _mm_add_ps(fjz2,tz);
1869 fjptrA = f+j_coord_offsetA;
1870 fjptrB = f+j_coord_offsetB;
1871 fjptrC = f+j_coord_offsetC;
1872 fjptrD = f+j_coord_offsetD;
1874 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1875 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1877 /* Inner loop uses 347 flops */
1880 if(jidx<j_index_end)
1883 /* Get j neighbor index, and coordinate index */
1884 jnrlistA = jjnr[jidx];
1885 jnrlistB = jjnr[jidx+1];
1886 jnrlistC = jjnr[jidx+2];
1887 jnrlistD = jjnr[jidx+3];
1888 /* Sign of each element will be negative for non-real atoms.
1889 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1890 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1892 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1893 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1894 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1895 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1896 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1897 j_coord_offsetA = DIM*jnrA;
1898 j_coord_offsetB = DIM*jnrB;
1899 j_coord_offsetC = DIM*jnrC;
1900 j_coord_offsetD = DIM*jnrD;
1902 /* load j atom coordinates */
1903 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1904 x+j_coord_offsetC,x+j_coord_offsetD,
1905 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1907 /* Calculate displacement vector */
1908 dx00 = _mm_sub_ps(ix0,jx0);
1909 dy00 = _mm_sub_ps(iy0,jy0);
1910 dz00 = _mm_sub_ps(iz0,jz0);
1911 dx01 = _mm_sub_ps(ix0,jx1);
1912 dy01 = _mm_sub_ps(iy0,jy1);
1913 dz01 = _mm_sub_ps(iz0,jz1);
1914 dx02 = _mm_sub_ps(ix0,jx2);
1915 dy02 = _mm_sub_ps(iy0,jy2);
1916 dz02 = _mm_sub_ps(iz0,jz2);
1917 dx10 = _mm_sub_ps(ix1,jx0);
1918 dy10 = _mm_sub_ps(iy1,jy0);
1919 dz10 = _mm_sub_ps(iz1,jz0);
1920 dx11 = _mm_sub_ps(ix1,jx1);
1921 dy11 = _mm_sub_ps(iy1,jy1);
1922 dz11 = _mm_sub_ps(iz1,jz1);
1923 dx12 = _mm_sub_ps(ix1,jx2);
1924 dy12 = _mm_sub_ps(iy1,jy2);
1925 dz12 = _mm_sub_ps(iz1,jz2);
1926 dx20 = _mm_sub_ps(ix2,jx0);
1927 dy20 = _mm_sub_ps(iy2,jy0);
1928 dz20 = _mm_sub_ps(iz2,jz0);
1929 dx21 = _mm_sub_ps(ix2,jx1);
1930 dy21 = _mm_sub_ps(iy2,jy1);
1931 dz21 = _mm_sub_ps(iz2,jz1);
1932 dx22 = _mm_sub_ps(ix2,jx2);
1933 dy22 = _mm_sub_ps(iy2,jy2);
1934 dz22 = _mm_sub_ps(iz2,jz2);
1936 /* Calculate squared distance and things based on it */
1937 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1938 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1939 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1940 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1941 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1942 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1943 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1944 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1945 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1947 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1948 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1949 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1950 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1951 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1952 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1953 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1954 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1955 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1957 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1958 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1959 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1960 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1961 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1962 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1963 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1964 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1965 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1967 fjx0 = _mm_setzero_ps();
1968 fjy0 = _mm_setzero_ps();
1969 fjz0 = _mm_setzero_ps();
1970 fjx1 = _mm_setzero_ps();
1971 fjy1 = _mm_setzero_ps();
1972 fjz1 = _mm_setzero_ps();
1973 fjx2 = _mm_setzero_ps();
1974 fjy2 = _mm_setzero_ps();
1975 fjz2 = _mm_setzero_ps();
1977 /**************************
1978 * CALCULATE INTERACTIONS *
1979 **************************/
1981 r00 = _mm_mul_ps(rsq00,rinv00);
1982 r00 = _mm_andnot_ps(dummy_mask,r00);
1984 /* EWALD ELECTROSTATICS */
1986 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1987 ewrt = _mm_mul_ps(r00,ewtabscale);
1988 ewitab = _mm_cvttps_epi32(ewrt);
1989 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1990 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1991 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1993 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1994 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1996 /* Analytical LJ-PME */
1997 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1998 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1999 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
2000 exponent = gmx_simd_exp_r(ewcljrsq);
2001 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2002 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2003 /* f6A = 6 * C6grid * (1 - poly) */
2004 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2005 /* f6B = C6grid * exponent * beta^6 */
2006 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2007 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2008 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2010 fscal = _mm_add_ps(felec,fvdw);
2012 fscal = _mm_andnot_ps(dummy_mask,fscal);
2014 /* Calculate temporary vectorial force */
2015 tx = _mm_mul_ps(fscal,dx00);
2016 ty = _mm_mul_ps(fscal,dy00);
2017 tz = _mm_mul_ps(fscal,dz00);
2019 /* Update vectorial force */
2020 fix0 = _mm_add_ps(fix0,tx);
2021 fiy0 = _mm_add_ps(fiy0,ty);
2022 fiz0 = _mm_add_ps(fiz0,tz);
2024 fjx0 = _mm_add_ps(fjx0,tx);
2025 fjy0 = _mm_add_ps(fjy0,ty);
2026 fjz0 = _mm_add_ps(fjz0,tz);
2028 /**************************
2029 * CALCULATE INTERACTIONS *
2030 **************************/
2032 r01 = _mm_mul_ps(rsq01,rinv01);
2033 r01 = _mm_andnot_ps(dummy_mask,r01);
2035 /* EWALD ELECTROSTATICS */
2037 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2038 ewrt = _mm_mul_ps(r01,ewtabscale);
2039 ewitab = _mm_cvttps_epi32(ewrt);
2040 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2041 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2042 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2044 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2045 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2049 fscal = _mm_andnot_ps(dummy_mask,fscal);
2051 /* Calculate temporary vectorial force */
2052 tx = _mm_mul_ps(fscal,dx01);
2053 ty = _mm_mul_ps(fscal,dy01);
2054 tz = _mm_mul_ps(fscal,dz01);
2056 /* Update vectorial force */
2057 fix0 = _mm_add_ps(fix0,tx);
2058 fiy0 = _mm_add_ps(fiy0,ty);
2059 fiz0 = _mm_add_ps(fiz0,tz);
2061 fjx1 = _mm_add_ps(fjx1,tx);
2062 fjy1 = _mm_add_ps(fjy1,ty);
2063 fjz1 = _mm_add_ps(fjz1,tz);
2065 /**************************
2066 * CALCULATE INTERACTIONS *
2067 **************************/
2069 r02 = _mm_mul_ps(rsq02,rinv02);
2070 r02 = _mm_andnot_ps(dummy_mask,r02);
2072 /* EWALD ELECTROSTATICS */
2074 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2075 ewrt = _mm_mul_ps(r02,ewtabscale);
2076 ewitab = _mm_cvttps_epi32(ewrt);
2077 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2078 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2079 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2081 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2082 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2086 fscal = _mm_andnot_ps(dummy_mask,fscal);
2088 /* Calculate temporary vectorial force */
2089 tx = _mm_mul_ps(fscal,dx02);
2090 ty = _mm_mul_ps(fscal,dy02);
2091 tz = _mm_mul_ps(fscal,dz02);
2093 /* Update vectorial force */
2094 fix0 = _mm_add_ps(fix0,tx);
2095 fiy0 = _mm_add_ps(fiy0,ty);
2096 fiz0 = _mm_add_ps(fiz0,tz);
2098 fjx2 = _mm_add_ps(fjx2,tx);
2099 fjy2 = _mm_add_ps(fjy2,ty);
2100 fjz2 = _mm_add_ps(fjz2,tz);
2102 /**************************
2103 * CALCULATE INTERACTIONS *
2104 **************************/
2106 r10 = _mm_mul_ps(rsq10,rinv10);
2107 r10 = _mm_andnot_ps(dummy_mask,r10);
2109 /* EWALD ELECTROSTATICS */
2111 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2112 ewrt = _mm_mul_ps(r10,ewtabscale);
2113 ewitab = _mm_cvttps_epi32(ewrt);
2114 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2115 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2116 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2118 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2119 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2123 fscal = _mm_andnot_ps(dummy_mask,fscal);
2125 /* Calculate temporary vectorial force */
2126 tx = _mm_mul_ps(fscal,dx10);
2127 ty = _mm_mul_ps(fscal,dy10);
2128 tz = _mm_mul_ps(fscal,dz10);
2130 /* Update vectorial force */
2131 fix1 = _mm_add_ps(fix1,tx);
2132 fiy1 = _mm_add_ps(fiy1,ty);
2133 fiz1 = _mm_add_ps(fiz1,tz);
2135 fjx0 = _mm_add_ps(fjx0,tx);
2136 fjy0 = _mm_add_ps(fjy0,ty);
2137 fjz0 = _mm_add_ps(fjz0,tz);
2139 /**************************
2140 * CALCULATE INTERACTIONS *
2141 **************************/
2143 r11 = _mm_mul_ps(rsq11,rinv11);
2144 r11 = _mm_andnot_ps(dummy_mask,r11);
2146 /* EWALD ELECTROSTATICS */
2148 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2149 ewrt = _mm_mul_ps(r11,ewtabscale);
2150 ewitab = _mm_cvttps_epi32(ewrt);
2151 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2152 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2153 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2155 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2156 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2160 fscal = _mm_andnot_ps(dummy_mask,fscal);
2162 /* Calculate temporary vectorial force */
2163 tx = _mm_mul_ps(fscal,dx11);
2164 ty = _mm_mul_ps(fscal,dy11);
2165 tz = _mm_mul_ps(fscal,dz11);
2167 /* Update vectorial force */
2168 fix1 = _mm_add_ps(fix1,tx);
2169 fiy1 = _mm_add_ps(fiy1,ty);
2170 fiz1 = _mm_add_ps(fiz1,tz);
2172 fjx1 = _mm_add_ps(fjx1,tx);
2173 fjy1 = _mm_add_ps(fjy1,ty);
2174 fjz1 = _mm_add_ps(fjz1,tz);
2176 /**************************
2177 * CALCULATE INTERACTIONS *
2178 **************************/
2180 r12 = _mm_mul_ps(rsq12,rinv12);
2181 r12 = _mm_andnot_ps(dummy_mask,r12);
2183 /* EWALD ELECTROSTATICS */
2185 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2186 ewrt = _mm_mul_ps(r12,ewtabscale);
2187 ewitab = _mm_cvttps_epi32(ewrt);
2188 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2189 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2190 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2192 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2193 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2197 fscal = _mm_andnot_ps(dummy_mask,fscal);
2199 /* Calculate temporary vectorial force */
2200 tx = _mm_mul_ps(fscal,dx12);
2201 ty = _mm_mul_ps(fscal,dy12);
2202 tz = _mm_mul_ps(fscal,dz12);
2204 /* Update vectorial force */
2205 fix1 = _mm_add_ps(fix1,tx);
2206 fiy1 = _mm_add_ps(fiy1,ty);
2207 fiz1 = _mm_add_ps(fiz1,tz);
2209 fjx2 = _mm_add_ps(fjx2,tx);
2210 fjy2 = _mm_add_ps(fjy2,ty);
2211 fjz2 = _mm_add_ps(fjz2,tz);
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2217 r20 = _mm_mul_ps(rsq20,rinv20);
2218 r20 = _mm_andnot_ps(dummy_mask,r20);
2220 /* EWALD ELECTROSTATICS */
2222 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2223 ewrt = _mm_mul_ps(r20,ewtabscale);
2224 ewitab = _mm_cvttps_epi32(ewrt);
2225 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2226 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2227 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2229 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2230 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2234 fscal = _mm_andnot_ps(dummy_mask,fscal);
2236 /* Calculate temporary vectorial force */
2237 tx = _mm_mul_ps(fscal,dx20);
2238 ty = _mm_mul_ps(fscal,dy20);
2239 tz = _mm_mul_ps(fscal,dz20);
2241 /* Update vectorial force */
2242 fix2 = _mm_add_ps(fix2,tx);
2243 fiy2 = _mm_add_ps(fiy2,ty);
2244 fiz2 = _mm_add_ps(fiz2,tz);
2246 fjx0 = _mm_add_ps(fjx0,tx);
2247 fjy0 = _mm_add_ps(fjy0,ty);
2248 fjz0 = _mm_add_ps(fjz0,tz);
2250 /**************************
2251 * CALCULATE INTERACTIONS *
2252 **************************/
2254 r21 = _mm_mul_ps(rsq21,rinv21);
2255 r21 = _mm_andnot_ps(dummy_mask,r21);
2257 /* EWALD ELECTROSTATICS */
2259 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2260 ewrt = _mm_mul_ps(r21,ewtabscale);
2261 ewitab = _mm_cvttps_epi32(ewrt);
2262 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2263 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2264 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2266 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2267 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2271 fscal = _mm_andnot_ps(dummy_mask,fscal);
2273 /* Calculate temporary vectorial force */
2274 tx = _mm_mul_ps(fscal,dx21);
2275 ty = _mm_mul_ps(fscal,dy21);
2276 tz = _mm_mul_ps(fscal,dz21);
2278 /* Update vectorial force */
2279 fix2 = _mm_add_ps(fix2,tx);
2280 fiy2 = _mm_add_ps(fiy2,ty);
2281 fiz2 = _mm_add_ps(fiz2,tz);
2283 fjx1 = _mm_add_ps(fjx1,tx);
2284 fjy1 = _mm_add_ps(fjy1,ty);
2285 fjz1 = _mm_add_ps(fjz1,tz);
2287 /**************************
2288 * CALCULATE INTERACTIONS *
2289 **************************/
2291 r22 = _mm_mul_ps(rsq22,rinv22);
2292 r22 = _mm_andnot_ps(dummy_mask,r22);
2294 /* EWALD ELECTROSTATICS */
2296 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2297 ewrt = _mm_mul_ps(r22,ewtabscale);
2298 ewitab = _mm_cvttps_epi32(ewrt);
2299 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2300 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2301 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2303 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2304 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2308 fscal = _mm_andnot_ps(dummy_mask,fscal);
2310 /* Calculate temporary vectorial force */
2311 tx = _mm_mul_ps(fscal,dx22);
2312 ty = _mm_mul_ps(fscal,dy22);
2313 tz = _mm_mul_ps(fscal,dz22);
2315 /* Update vectorial force */
2316 fix2 = _mm_add_ps(fix2,tx);
2317 fiy2 = _mm_add_ps(fiy2,ty);
2318 fiz2 = _mm_add_ps(fiz2,tz);
2320 fjx2 = _mm_add_ps(fjx2,tx);
2321 fjy2 = _mm_add_ps(fjy2,ty);
2322 fjz2 = _mm_add_ps(fjz2,tz);
2324 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2325 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2326 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2327 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2329 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2330 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2332 /* Inner loop uses 356 flops */
2335 /* End of innermost loop */
2337 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2338 f+i_coord_offset,fshift+i_shift_offset);
2340 /* Increment number of inner iterations */
2341 inneriter += j_index_end - j_index_start;
2343 /* Outer loop uses 18 flops */
2346 /* Increment number of outer iterations */
2349 /* Update outer/inner flops */
2351 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*356);