2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "gromacs/math/vec.h"
49 #include "gromacs/simd/math_x86_sse4_1_double.h"
50 #include "kernelutil_x86_sse4_1_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_double
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LJEwald
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse4_1_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
110 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
120 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
122 __m128d one_half = _mm_set1_pd(0.5);
123 __m128d minus_one = _mm_set1_pd(-1.0);
125 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
127 __m128d dummy_mask,cutoff_mask;
128 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
129 __m128d one = _mm_set1_pd(1.0);
130 __m128d two = _mm_set1_pd(2.0);
136 jindex = nlist->jindex;
138 shiftidx = nlist->shift;
140 shiftvec = fr->shift_vec[0];
141 fshift = fr->fshift[0];
142 facel = _mm_set1_pd(fr->epsfac);
143 charge = mdatoms->chargeA;
144 nvdwtype = fr->ntype;
146 vdwtype = mdatoms->typeA;
147 vdwgridparam = fr->ljpme_c6grid;
148 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
149 ewclj = _mm_set1_pd(fr->ewaldcoeff_lj);
150 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
152 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
153 ewtab = fr->ic->tabq_coul_FDV0;
154 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
155 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
157 /* Setup water-specific parameters */
158 inr = nlist->iinr[0];
159 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
160 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
161 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
162 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
164 jq0 = _mm_set1_pd(charge[inr+0]);
165 jq1 = _mm_set1_pd(charge[inr+1]);
166 jq2 = _mm_set1_pd(charge[inr+2]);
167 vdwjidx0A = 2*vdwtype[inr+0];
168 qq00 = _mm_mul_pd(iq0,jq0);
169 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
170 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
171 c6grid_00 = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
172 qq01 = _mm_mul_pd(iq0,jq1);
173 qq02 = _mm_mul_pd(iq0,jq2);
174 qq10 = _mm_mul_pd(iq1,jq0);
175 qq11 = _mm_mul_pd(iq1,jq1);
176 qq12 = _mm_mul_pd(iq1,jq2);
177 qq20 = _mm_mul_pd(iq2,jq0);
178 qq21 = _mm_mul_pd(iq2,jq1);
179 qq22 = _mm_mul_pd(iq2,jq2);
181 /* Avoid stupid compiler warnings */
189 /* Start outer loop over neighborlists */
190 for(iidx=0; iidx<nri; iidx++)
192 /* Load shift vector for this list */
193 i_shift_offset = DIM*shiftidx[iidx];
195 /* Load limits for loop over neighbors */
196 j_index_start = jindex[iidx];
197 j_index_end = jindex[iidx+1];
199 /* Get outer coordinate index */
201 i_coord_offset = DIM*inr;
203 /* Load i particle coords and add shift vector */
204 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
205 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
207 fix0 = _mm_setzero_pd();
208 fiy0 = _mm_setzero_pd();
209 fiz0 = _mm_setzero_pd();
210 fix1 = _mm_setzero_pd();
211 fiy1 = _mm_setzero_pd();
212 fiz1 = _mm_setzero_pd();
213 fix2 = _mm_setzero_pd();
214 fiy2 = _mm_setzero_pd();
215 fiz2 = _mm_setzero_pd();
217 /* Reset potential sums */
218 velecsum = _mm_setzero_pd();
219 vvdwsum = _mm_setzero_pd();
221 /* Start inner kernel loop */
222 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
225 /* Get j neighbor index, and coordinate index */
228 j_coord_offsetA = DIM*jnrA;
229 j_coord_offsetB = DIM*jnrB;
231 /* load j atom coordinates */
232 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
233 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
235 /* Calculate displacement vector */
236 dx00 = _mm_sub_pd(ix0,jx0);
237 dy00 = _mm_sub_pd(iy0,jy0);
238 dz00 = _mm_sub_pd(iz0,jz0);
239 dx01 = _mm_sub_pd(ix0,jx1);
240 dy01 = _mm_sub_pd(iy0,jy1);
241 dz01 = _mm_sub_pd(iz0,jz1);
242 dx02 = _mm_sub_pd(ix0,jx2);
243 dy02 = _mm_sub_pd(iy0,jy2);
244 dz02 = _mm_sub_pd(iz0,jz2);
245 dx10 = _mm_sub_pd(ix1,jx0);
246 dy10 = _mm_sub_pd(iy1,jy0);
247 dz10 = _mm_sub_pd(iz1,jz0);
248 dx11 = _mm_sub_pd(ix1,jx1);
249 dy11 = _mm_sub_pd(iy1,jy1);
250 dz11 = _mm_sub_pd(iz1,jz1);
251 dx12 = _mm_sub_pd(ix1,jx2);
252 dy12 = _mm_sub_pd(iy1,jy2);
253 dz12 = _mm_sub_pd(iz1,jz2);
254 dx20 = _mm_sub_pd(ix2,jx0);
255 dy20 = _mm_sub_pd(iy2,jy0);
256 dz20 = _mm_sub_pd(iz2,jz0);
257 dx21 = _mm_sub_pd(ix2,jx1);
258 dy21 = _mm_sub_pd(iy2,jy1);
259 dz21 = _mm_sub_pd(iz2,jz1);
260 dx22 = _mm_sub_pd(ix2,jx2);
261 dy22 = _mm_sub_pd(iy2,jy2);
262 dz22 = _mm_sub_pd(iz2,jz2);
264 /* Calculate squared distance and things based on it */
265 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
266 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
267 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
268 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
269 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
270 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
271 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
272 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
273 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
275 rinv00 = gmx_mm_invsqrt_pd(rsq00);
276 rinv01 = gmx_mm_invsqrt_pd(rsq01);
277 rinv02 = gmx_mm_invsqrt_pd(rsq02);
278 rinv10 = gmx_mm_invsqrt_pd(rsq10);
279 rinv11 = gmx_mm_invsqrt_pd(rsq11);
280 rinv12 = gmx_mm_invsqrt_pd(rsq12);
281 rinv20 = gmx_mm_invsqrt_pd(rsq20);
282 rinv21 = gmx_mm_invsqrt_pd(rsq21);
283 rinv22 = gmx_mm_invsqrt_pd(rsq22);
285 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
286 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
287 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
288 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
289 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
290 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
291 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
292 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
293 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
295 fjx0 = _mm_setzero_pd();
296 fjy0 = _mm_setzero_pd();
297 fjz0 = _mm_setzero_pd();
298 fjx1 = _mm_setzero_pd();
299 fjy1 = _mm_setzero_pd();
300 fjz1 = _mm_setzero_pd();
301 fjx2 = _mm_setzero_pd();
302 fjy2 = _mm_setzero_pd();
303 fjz2 = _mm_setzero_pd();
305 /**************************
306 * CALCULATE INTERACTIONS *
307 **************************/
309 r00 = _mm_mul_pd(rsq00,rinv00);
311 /* EWALD ELECTROSTATICS */
313 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
314 ewrt = _mm_mul_pd(r00,ewtabscale);
315 ewitab = _mm_cvttpd_epi32(ewrt);
316 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
317 ewitab = _mm_slli_epi32(ewitab,2);
318 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
319 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
320 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
321 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
322 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
323 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
324 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
325 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
326 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
327 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
329 /* Analytical LJ-PME */
330 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
331 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
332 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
333 exponent = gmx_simd_exp_d(ewcljrsq);
334 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
335 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
336 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
337 vvdw6 = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
338 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
339 vvdw = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
340 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
341 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 velecsum = _mm_add_pd(velecsum,velec);
345 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
347 fscal = _mm_add_pd(felec,fvdw);
349 /* Calculate temporary vectorial force */
350 tx = _mm_mul_pd(fscal,dx00);
351 ty = _mm_mul_pd(fscal,dy00);
352 tz = _mm_mul_pd(fscal,dz00);
354 /* Update vectorial force */
355 fix0 = _mm_add_pd(fix0,tx);
356 fiy0 = _mm_add_pd(fiy0,ty);
357 fiz0 = _mm_add_pd(fiz0,tz);
359 fjx0 = _mm_add_pd(fjx0,tx);
360 fjy0 = _mm_add_pd(fjy0,ty);
361 fjz0 = _mm_add_pd(fjz0,tz);
363 /**************************
364 * CALCULATE INTERACTIONS *
365 **************************/
367 r01 = _mm_mul_pd(rsq01,rinv01);
369 /* EWALD ELECTROSTATICS */
371 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
372 ewrt = _mm_mul_pd(r01,ewtabscale);
373 ewitab = _mm_cvttpd_epi32(ewrt);
374 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
375 ewitab = _mm_slli_epi32(ewitab,2);
376 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
377 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
378 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
379 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
380 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
381 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
382 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
383 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
384 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
385 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
387 /* Update potential sum for this i atom from the interaction with this j atom. */
388 velecsum = _mm_add_pd(velecsum,velec);
392 /* Calculate temporary vectorial force */
393 tx = _mm_mul_pd(fscal,dx01);
394 ty = _mm_mul_pd(fscal,dy01);
395 tz = _mm_mul_pd(fscal,dz01);
397 /* Update vectorial force */
398 fix0 = _mm_add_pd(fix0,tx);
399 fiy0 = _mm_add_pd(fiy0,ty);
400 fiz0 = _mm_add_pd(fiz0,tz);
402 fjx1 = _mm_add_pd(fjx1,tx);
403 fjy1 = _mm_add_pd(fjy1,ty);
404 fjz1 = _mm_add_pd(fjz1,tz);
406 /**************************
407 * CALCULATE INTERACTIONS *
408 **************************/
410 r02 = _mm_mul_pd(rsq02,rinv02);
412 /* EWALD ELECTROSTATICS */
414 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
415 ewrt = _mm_mul_pd(r02,ewtabscale);
416 ewitab = _mm_cvttpd_epi32(ewrt);
417 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
418 ewitab = _mm_slli_epi32(ewitab,2);
419 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
420 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
421 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
422 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
423 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
424 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
425 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
426 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
427 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
428 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velecsum = _mm_add_pd(velecsum,velec);
435 /* Calculate temporary vectorial force */
436 tx = _mm_mul_pd(fscal,dx02);
437 ty = _mm_mul_pd(fscal,dy02);
438 tz = _mm_mul_pd(fscal,dz02);
440 /* Update vectorial force */
441 fix0 = _mm_add_pd(fix0,tx);
442 fiy0 = _mm_add_pd(fiy0,ty);
443 fiz0 = _mm_add_pd(fiz0,tz);
445 fjx2 = _mm_add_pd(fjx2,tx);
446 fjy2 = _mm_add_pd(fjy2,ty);
447 fjz2 = _mm_add_pd(fjz2,tz);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 r10 = _mm_mul_pd(rsq10,rinv10);
455 /* EWALD ELECTROSTATICS */
457 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
458 ewrt = _mm_mul_pd(r10,ewtabscale);
459 ewitab = _mm_cvttpd_epi32(ewrt);
460 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
461 ewitab = _mm_slli_epi32(ewitab,2);
462 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
463 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
464 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
465 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
466 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
467 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
468 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
469 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
470 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
471 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
473 /* Update potential sum for this i atom from the interaction with this j atom. */
474 velecsum = _mm_add_pd(velecsum,velec);
478 /* Calculate temporary vectorial force */
479 tx = _mm_mul_pd(fscal,dx10);
480 ty = _mm_mul_pd(fscal,dy10);
481 tz = _mm_mul_pd(fscal,dz10);
483 /* Update vectorial force */
484 fix1 = _mm_add_pd(fix1,tx);
485 fiy1 = _mm_add_pd(fiy1,ty);
486 fiz1 = _mm_add_pd(fiz1,tz);
488 fjx0 = _mm_add_pd(fjx0,tx);
489 fjy0 = _mm_add_pd(fjy0,ty);
490 fjz0 = _mm_add_pd(fjz0,tz);
492 /**************************
493 * CALCULATE INTERACTIONS *
494 **************************/
496 r11 = _mm_mul_pd(rsq11,rinv11);
498 /* EWALD ELECTROSTATICS */
500 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
501 ewrt = _mm_mul_pd(r11,ewtabscale);
502 ewitab = _mm_cvttpd_epi32(ewrt);
503 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
504 ewitab = _mm_slli_epi32(ewitab,2);
505 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
506 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
507 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
508 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
509 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
510 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
511 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
512 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
513 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
514 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
516 /* Update potential sum for this i atom from the interaction with this j atom. */
517 velecsum = _mm_add_pd(velecsum,velec);
521 /* Calculate temporary vectorial force */
522 tx = _mm_mul_pd(fscal,dx11);
523 ty = _mm_mul_pd(fscal,dy11);
524 tz = _mm_mul_pd(fscal,dz11);
526 /* Update vectorial force */
527 fix1 = _mm_add_pd(fix1,tx);
528 fiy1 = _mm_add_pd(fiy1,ty);
529 fiz1 = _mm_add_pd(fiz1,tz);
531 fjx1 = _mm_add_pd(fjx1,tx);
532 fjy1 = _mm_add_pd(fjy1,ty);
533 fjz1 = _mm_add_pd(fjz1,tz);
535 /**************************
536 * CALCULATE INTERACTIONS *
537 **************************/
539 r12 = _mm_mul_pd(rsq12,rinv12);
541 /* EWALD ELECTROSTATICS */
543 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
544 ewrt = _mm_mul_pd(r12,ewtabscale);
545 ewitab = _mm_cvttpd_epi32(ewrt);
546 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
547 ewitab = _mm_slli_epi32(ewitab,2);
548 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
549 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
550 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
551 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
552 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
553 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
554 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
555 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
556 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
557 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
559 /* Update potential sum for this i atom from the interaction with this j atom. */
560 velecsum = _mm_add_pd(velecsum,velec);
564 /* Calculate temporary vectorial force */
565 tx = _mm_mul_pd(fscal,dx12);
566 ty = _mm_mul_pd(fscal,dy12);
567 tz = _mm_mul_pd(fscal,dz12);
569 /* Update vectorial force */
570 fix1 = _mm_add_pd(fix1,tx);
571 fiy1 = _mm_add_pd(fiy1,ty);
572 fiz1 = _mm_add_pd(fiz1,tz);
574 fjx2 = _mm_add_pd(fjx2,tx);
575 fjy2 = _mm_add_pd(fjy2,ty);
576 fjz2 = _mm_add_pd(fjz2,tz);
578 /**************************
579 * CALCULATE INTERACTIONS *
580 **************************/
582 r20 = _mm_mul_pd(rsq20,rinv20);
584 /* EWALD ELECTROSTATICS */
586 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
587 ewrt = _mm_mul_pd(r20,ewtabscale);
588 ewitab = _mm_cvttpd_epi32(ewrt);
589 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
590 ewitab = _mm_slli_epi32(ewitab,2);
591 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
592 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
593 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
594 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
595 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
596 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
597 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
598 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
599 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
600 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
602 /* Update potential sum for this i atom from the interaction with this j atom. */
603 velecsum = _mm_add_pd(velecsum,velec);
607 /* Calculate temporary vectorial force */
608 tx = _mm_mul_pd(fscal,dx20);
609 ty = _mm_mul_pd(fscal,dy20);
610 tz = _mm_mul_pd(fscal,dz20);
612 /* Update vectorial force */
613 fix2 = _mm_add_pd(fix2,tx);
614 fiy2 = _mm_add_pd(fiy2,ty);
615 fiz2 = _mm_add_pd(fiz2,tz);
617 fjx0 = _mm_add_pd(fjx0,tx);
618 fjy0 = _mm_add_pd(fjy0,ty);
619 fjz0 = _mm_add_pd(fjz0,tz);
621 /**************************
622 * CALCULATE INTERACTIONS *
623 **************************/
625 r21 = _mm_mul_pd(rsq21,rinv21);
627 /* EWALD ELECTROSTATICS */
629 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
630 ewrt = _mm_mul_pd(r21,ewtabscale);
631 ewitab = _mm_cvttpd_epi32(ewrt);
632 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
633 ewitab = _mm_slli_epi32(ewitab,2);
634 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
635 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
636 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
637 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
638 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
639 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
640 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
641 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
642 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
643 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
645 /* Update potential sum for this i atom from the interaction with this j atom. */
646 velecsum = _mm_add_pd(velecsum,velec);
650 /* Calculate temporary vectorial force */
651 tx = _mm_mul_pd(fscal,dx21);
652 ty = _mm_mul_pd(fscal,dy21);
653 tz = _mm_mul_pd(fscal,dz21);
655 /* Update vectorial force */
656 fix2 = _mm_add_pd(fix2,tx);
657 fiy2 = _mm_add_pd(fiy2,ty);
658 fiz2 = _mm_add_pd(fiz2,tz);
660 fjx1 = _mm_add_pd(fjx1,tx);
661 fjy1 = _mm_add_pd(fjy1,ty);
662 fjz1 = _mm_add_pd(fjz1,tz);
664 /**************************
665 * CALCULATE INTERACTIONS *
666 **************************/
668 r22 = _mm_mul_pd(rsq22,rinv22);
670 /* EWALD ELECTROSTATICS */
672 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
673 ewrt = _mm_mul_pd(r22,ewtabscale);
674 ewitab = _mm_cvttpd_epi32(ewrt);
675 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
676 ewitab = _mm_slli_epi32(ewitab,2);
677 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
678 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
679 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
680 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
681 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
682 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
683 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
684 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
685 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
686 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
688 /* Update potential sum for this i atom from the interaction with this j atom. */
689 velecsum = _mm_add_pd(velecsum,velec);
693 /* Calculate temporary vectorial force */
694 tx = _mm_mul_pd(fscal,dx22);
695 ty = _mm_mul_pd(fscal,dy22);
696 tz = _mm_mul_pd(fscal,dz22);
698 /* Update vectorial force */
699 fix2 = _mm_add_pd(fix2,tx);
700 fiy2 = _mm_add_pd(fiy2,ty);
701 fiz2 = _mm_add_pd(fiz2,tz);
703 fjx2 = _mm_add_pd(fjx2,tx);
704 fjy2 = _mm_add_pd(fjy2,ty);
705 fjz2 = _mm_add_pd(fjz2,tz);
707 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
709 /* Inner loop uses 397 flops */
716 j_coord_offsetA = DIM*jnrA;
718 /* load j atom coordinates */
719 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
720 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
722 /* Calculate displacement vector */
723 dx00 = _mm_sub_pd(ix0,jx0);
724 dy00 = _mm_sub_pd(iy0,jy0);
725 dz00 = _mm_sub_pd(iz0,jz0);
726 dx01 = _mm_sub_pd(ix0,jx1);
727 dy01 = _mm_sub_pd(iy0,jy1);
728 dz01 = _mm_sub_pd(iz0,jz1);
729 dx02 = _mm_sub_pd(ix0,jx2);
730 dy02 = _mm_sub_pd(iy0,jy2);
731 dz02 = _mm_sub_pd(iz0,jz2);
732 dx10 = _mm_sub_pd(ix1,jx0);
733 dy10 = _mm_sub_pd(iy1,jy0);
734 dz10 = _mm_sub_pd(iz1,jz0);
735 dx11 = _mm_sub_pd(ix1,jx1);
736 dy11 = _mm_sub_pd(iy1,jy1);
737 dz11 = _mm_sub_pd(iz1,jz1);
738 dx12 = _mm_sub_pd(ix1,jx2);
739 dy12 = _mm_sub_pd(iy1,jy2);
740 dz12 = _mm_sub_pd(iz1,jz2);
741 dx20 = _mm_sub_pd(ix2,jx0);
742 dy20 = _mm_sub_pd(iy2,jy0);
743 dz20 = _mm_sub_pd(iz2,jz0);
744 dx21 = _mm_sub_pd(ix2,jx1);
745 dy21 = _mm_sub_pd(iy2,jy1);
746 dz21 = _mm_sub_pd(iz2,jz1);
747 dx22 = _mm_sub_pd(ix2,jx2);
748 dy22 = _mm_sub_pd(iy2,jy2);
749 dz22 = _mm_sub_pd(iz2,jz2);
751 /* Calculate squared distance and things based on it */
752 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
753 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
754 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
755 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
756 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
757 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
758 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
759 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
760 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
762 rinv00 = gmx_mm_invsqrt_pd(rsq00);
763 rinv01 = gmx_mm_invsqrt_pd(rsq01);
764 rinv02 = gmx_mm_invsqrt_pd(rsq02);
765 rinv10 = gmx_mm_invsqrt_pd(rsq10);
766 rinv11 = gmx_mm_invsqrt_pd(rsq11);
767 rinv12 = gmx_mm_invsqrt_pd(rsq12);
768 rinv20 = gmx_mm_invsqrt_pd(rsq20);
769 rinv21 = gmx_mm_invsqrt_pd(rsq21);
770 rinv22 = gmx_mm_invsqrt_pd(rsq22);
772 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
773 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
774 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
775 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
776 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
777 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
778 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
779 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
780 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
782 fjx0 = _mm_setzero_pd();
783 fjy0 = _mm_setzero_pd();
784 fjz0 = _mm_setzero_pd();
785 fjx1 = _mm_setzero_pd();
786 fjy1 = _mm_setzero_pd();
787 fjz1 = _mm_setzero_pd();
788 fjx2 = _mm_setzero_pd();
789 fjy2 = _mm_setzero_pd();
790 fjz2 = _mm_setzero_pd();
792 /**************************
793 * CALCULATE INTERACTIONS *
794 **************************/
796 r00 = _mm_mul_pd(rsq00,rinv00);
798 /* EWALD ELECTROSTATICS */
800 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
801 ewrt = _mm_mul_pd(r00,ewtabscale);
802 ewitab = _mm_cvttpd_epi32(ewrt);
803 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
804 ewitab = _mm_slli_epi32(ewitab,2);
805 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
806 ewtabD = _mm_setzero_pd();
807 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
808 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
809 ewtabFn = _mm_setzero_pd();
810 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
811 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
812 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
813 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
814 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
816 /* Analytical LJ-PME */
817 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
818 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
819 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
820 exponent = gmx_simd_exp_d(ewcljrsq);
821 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
822 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
823 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
824 vvdw6 = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
825 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
826 vvdw = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
827 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
828 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
830 /* Update potential sum for this i atom from the interaction with this j atom. */
831 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
832 velecsum = _mm_add_pd(velecsum,velec);
833 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
834 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
836 fscal = _mm_add_pd(felec,fvdw);
838 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
840 /* Calculate temporary vectorial force */
841 tx = _mm_mul_pd(fscal,dx00);
842 ty = _mm_mul_pd(fscal,dy00);
843 tz = _mm_mul_pd(fscal,dz00);
845 /* Update vectorial force */
846 fix0 = _mm_add_pd(fix0,tx);
847 fiy0 = _mm_add_pd(fiy0,ty);
848 fiz0 = _mm_add_pd(fiz0,tz);
850 fjx0 = _mm_add_pd(fjx0,tx);
851 fjy0 = _mm_add_pd(fjy0,ty);
852 fjz0 = _mm_add_pd(fjz0,tz);
854 /**************************
855 * CALCULATE INTERACTIONS *
856 **************************/
858 r01 = _mm_mul_pd(rsq01,rinv01);
860 /* EWALD ELECTROSTATICS */
862 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
863 ewrt = _mm_mul_pd(r01,ewtabscale);
864 ewitab = _mm_cvttpd_epi32(ewrt);
865 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
866 ewitab = _mm_slli_epi32(ewitab,2);
867 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
868 ewtabD = _mm_setzero_pd();
869 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
870 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
871 ewtabFn = _mm_setzero_pd();
872 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
873 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
874 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
875 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
876 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
878 /* Update potential sum for this i atom from the interaction with this j atom. */
879 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
880 velecsum = _mm_add_pd(velecsum,velec);
884 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
886 /* Calculate temporary vectorial force */
887 tx = _mm_mul_pd(fscal,dx01);
888 ty = _mm_mul_pd(fscal,dy01);
889 tz = _mm_mul_pd(fscal,dz01);
891 /* Update vectorial force */
892 fix0 = _mm_add_pd(fix0,tx);
893 fiy0 = _mm_add_pd(fiy0,ty);
894 fiz0 = _mm_add_pd(fiz0,tz);
896 fjx1 = _mm_add_pd(fjx1,tx);
897 fjy1 = _mm_add_pd(fjy1,ty);
898 fjz1 = _mm_add_pd(fjz1,tz);
900 /**************************
901 * CALCULATE INTERACTIONS *
902 **************************/
904 r02 = _mm_mul_pd(rsq02,rinv02);
906 /* EWALD ELECTROSTATICS */
908 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
909 ewrt = _mm_mul_pd(r02,ewtabscale);
910 ewitab = _mm_cvttpd_epi32(ewrt);
911 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
912 ewitab = _mm_slli_epi32(ewitab,2);
913 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
914 ewtabD = _mm_setzero_pd();
915 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
916 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
917 ewtabFn = _mm_setzero_pd();
918 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
919 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
920 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
921 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
922 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
924 /* Update potential sum for this i atom from the interaction with this j atom. */
925 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
926 velecsum = _mm_add_pd(velecsum,velec);
930 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
932 /* Calculate temporary vectorial force */
933 tx = _mm_mul_pd(fscal,dx02);
934 ty = _mm_mul_pd(fscal,dy02);
935 tz = _mm_mul_pd(fscal,dz02);
937 /* Update vectorial force */
938 fix0 = _mm_add_pd(fix0,tx);
939 fiy0 = _mm_add_pd(fiy0,ty);
940 fiz0 = _mm_add_pd(fiz0,tz);
942 fjx2 = _mm_add_pd(fjx2,tx);
943 fjy2 = _mm_add_pd(fjy2,ty);
944 fjz2 = _mm_add_pd(fjz2,tz);
946 /**************************
947 * CALCULATE INTERACTIONS *
948 **************************/
950 r10 = _mm_mul_pd(rsq10,rinv10);
952 /* EWALD ELECTROSTATICS */
954 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
955 ewrt = _mm_mul_pd(r10,ewtabscale);
956 ewitab = _mm_cvttpd_epi32(ewrt);
957 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
958 ewitab = _mm_slli_epi32(ewitab,2);
959 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
960 ewtabD = _mm_setzero_pd();
961 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
962 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
963 ewtabFn = _mm_setzero_pd();
964 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
965 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
966 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
967 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
968 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
970 /* Update potential sum for this i atom from the interaction with this j atom. */
971 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
972 velecsum = _mm_add_pd(velecsum,velec);
976 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
978 /* Calculate temporary vectorial force */
979 tx = _mm_mul_pd(fscal,dx10);
980 ty = _mm_mul_pd(fscal,dy10);
981 tz = _mm_mul_pd(fscal,dz10);
983 /* Update vectorial force */
984 fix1 = _mm_add_pd(fix1,tx);
985 fiy1 = _mm_add_pd(fiy1,ty);
986 fiz1 = _mm_add_pd(fiz1,tz);
988 fjx0 = _mm_add_pd(fjx0,tx);
989 fjy0 = _mm_add_pd(fjy0,ty);
990 fjz0 = _mm_add_pd(fjz0,tz);
992 /**************************
993 * CALCULATE INTERACTIONS *
994 **************************/
996 r11 = _mm_mul_pd(rsq11,rinv11);
998 /* EWALD ELECTROSTATICS */
1000 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1001 ewrt = _mm_mul_pd(r11,ewtabscale);
1002 ewitab = _mm_cvttpd_epi32(ewrt);
1003 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1004 ewitab = _mm_slli_epi32(ewitab,2);
1005 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1006 ewtabD = _mm_setzero_pd();
1007 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1008 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1009 ewtabFn = _mm_setzero_pd();
1010 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1011 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1012 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1013 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1014 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1016 /* Update potential sum for this i atom from the interaction with this j atom. */
1017 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1018 velecsum = _mm_add_pd(velecsum,velec);
1022 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1024 /* Calculate temporary vectorial force */
1025 tx = _mm_mul_pd(fscal,dx11);
1026 ty = _mm_mul_pd(fscal,dy11);
1027 tz = _mm_mul_pd(fscal,dz11);
1029 /* Update vectorial force */
1030 fix1 = _mm_add_pd(fix1,tx);
1031 fiy1 = _mm_add_pd(fiy1,ty);
1032 fiz1 = _mm_add_pd(fiz1,tz);
1034 fjx1 = _mm_add_pd(fjx1,tx);
1035 fjy1 = _mm_add_pd(fjy1,ty);
1036 fjz1 = _mm_add_pd(fjz1,tz);
1038 /**************************
1039 * CALCULATE INTERACTIONS *
1040 **************************/
1042 r12 = _mm_mul_pd(rsq12,rinv12);
1044 /* EWALD ELECTROSTATICS */
1046 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1047 ewrt = _mm_mul_pd(r12,ewtabscale);
1048 ewitab = _mm_cvttpd_epi32(ewrt);
1049 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1050 ewitab = _mm_slli_epi32(ewitab,2);
1051 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1052 ewtabD = _mm_setzero_pd();
1053 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1054 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1055 ewtabFn = _mm_setzero_pd();
1056 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1057 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1058 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1059 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1060 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1062 /* Update potential sum for this i atom from the interaction with this j atom. */
1063 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1064 velecsum = _mm_add_pd(velecsum,velec);
1068 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1070 /* Calculate temporary vectorial force */
1071 tx = _mm_mul_pd(fscal,dx12);
1072 ty = _mm_mul_pd(fscal,dy12);
1073 tz = _mm_mul_pd(fscal,dz12);
1075 /* Update vectorial force */
1076 fix1 = _mm_add_pd(fix1,tx);
1077 fiy1 = _mm_add_pd(fiy1,ty);
1078 fiz1 = _mm_add_pd(fiz1,tz);
1080 fjx2 = _mm_add_pd(fjx2,tx);
1081 fjy2 = _mm_add_pd(fjy2,ty);
1082 fjz2 = _mm_add_pd(fjz2,tz);
1084 /**************************
1085 * CALCULATE INTERACTIONS *
1086 **************************/
1088 r20 = _mm_mul_pd(rsq20,rinv20);
1090 /* EWALD ELECTROSTATICS */
1092 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1093 ewrt = _mm_mul_pd(r20,ewtabscale);
1094 ewitab = _mm_cvttpd_epi32(ewrt);
1095 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1096 ewitab = _mm_slli_epi32(ewitab,2);
1097 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1098 ewtabD = _mm_setzero_pd();
1099 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1100 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1101 ewtabFn = _mm_setzero_pd();
1102 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1103 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1104 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1105 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1106 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1108 /* Update potential sum for this i atom from the interaction with this j atom. */
1109 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1110 velecsum = _mm_add_pd(velecsum,velec);
1114 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1116 /* Calculate temporary vectorial force */
1117 tx = _mm_mul_pd(fscal,dx20);
1118 ty = _mm_mul_pd(fscal,dy20);
1119 tz = _mm_mul_pd(fscal,dz20);
1121 /* Update vectorial force */
1122 fix2 = _mm_add_pd(fix2,tx);
1123 fiy2 = _mm_add_pd(fiy2,ty);
1124 fiz2 = _mm_add_pd(fiz2,tz);
1126 fjx0 = _mm_add_pd(fjx0,tx);
1127 fjy0 = _mm_add_pd(fjy0,ty);
1128 fjz0 = _mm_add_pd(fjz0,tz);
1130 /**************************
1131 * CALCULATE INTERACTIONS *
1132 **************************/
1134 r21 = _mm_mul_pd(rsq21,rinv21);
1136 /* EWALD ELECTROSTATICS */
1138 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1139 ewrt = _mm_mul_pd(r21,ewtabscale);
1140 ewitab = _mm_cvttpd_epi32(ewrt);
1141 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1142 ewitab = _mm_slli_epi32(ewitab,2);
1143 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1144 ewtabD = _mm_setzero_pd();
1145 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1146 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1147 ewtabFn = _mm_setzero_pd();
1148 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1149 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1150 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1151 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1152 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1154 /* Update potential sum for this i atom from the interaction with this j atom. */
1155 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1156 velecsum = _mm_add_pd(velecsum,velec);
1160 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1162 /* Calculate temporary vectorial force */
1163 tx = _mm_mul_pd(fscal,dx21);
1164 ty = _mm_mul_pd(fscal,dy21);
1165 tz = _mm_mul_pd(fscal,dz21);
1167 /* Update vectorial force */
1168 fix2 = _mm_add_pd(fix2,tx);
1169 fiy2 = _mm_add_pd(fiy2,ty);
1170 fiz2 = _mm_add_pd(fiz2,tz);
1172 fjx1 = _mm_add_pd(fjx1,tx);
1173 fjy1 = _mm_add_pd(fjy1,ty);
1174 fjz1 = _mm_add_pd(fjz1,tz);
1176 /**************************
1177 * CALCULATE INTERACTIONS *
1178 **************************/
1180 r22 = _mm_mul_pd(rsq22,rinv22);
1182 /* EWALD ELECTROSTATICS */
1184 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1185 ewrt = _mm_mul_pd(r22,ewtabscale);
1186 ewitab = _mm_cvttpd_epi32(ewrt);
1187 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1188 ewitab = _mm_slli_epi32(ewitab,2);
1189 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1190 ewtabD = _mm_setzero_pd();
1191 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1192 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1193 ewtabFn = _mm_setzero_pd();
1194 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1195 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1196 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1197 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1198 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1200 /* Update potential sum for this i atom from the interaction with this j atom. */
1201 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1202 velecsum = _mm_add_pd(velecsum,velec);
1206 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1208 /* Calculate temporary vectorial force */
1209 tx = _mm_mul_pd(fscal,dx22);
1210 ty = _mm_mul_pd(fscal,dy22);
1211 tz = _mm_mul_pd(fscal,dz22);
1213 /* Update vectorial force */
1214 fix2 = _mm_add_pd(fix2,tx);
1215 fiy2 = _mm_add_pd(fiy2,ty);
1216 fiz2 = _mm_add_pd(fiz2,tz);
1218 fjx2 = _mm_add_pd(fjx2,tx);
1219 fjy2 = _mm_add_pd(fjy2,ty);
1220 fjz2 = _mm_add_pd(fjz2,tz);
1222 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1224 /* Inner loop uses 397 flops */
1227 /* End of innermost loop */
1229 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1230 f+i_coord_offset,fshift+i_shift_offset);
1233 /* Update potential energies */
1234 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1235 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1237 /* Increment number of inner iterations */
1238 inneriter += j_index_end - j_index_start;
1240 /* Outer loop uses 20 flops */
1243 /* Increment number of outer iterations */
1246 /* Update outer/inner flops */
1248 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*397);
1251 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_double
1252 * Electrostatics interaction: Ewald
1253 * VdW interaction: LJEwald
1254 * Geometry: Water3-Water3
1255 * Calculate force/pot: Force
1258 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse4_1_double
1259 (t_nblist * gmx_restrict nlist,
1260 rvec * gmx_restrict xx,
1261 rvec * gmx_restrict ff,
1262 t_forcerec * gmx_restrict fr,
1263 t_mdatoms * gmx_restrict mdatoms,
1264 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1265 t_nrnb * gmx_restrict nrnb)
1267 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1268 * just 0 for non-waters.
1269 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1270 * jnr indices corresponding to data put in the four positions in the SIMD register.
1272 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1273 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1275 int j_coord_offsetA,j_coord_offsetB;
1276 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1277 real rcutoff_scalar;
1278 real *shiftvec,*fshift,*x,*f;
1279 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1281 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1283 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1285 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1286 int vdwjidx0A,vdwjidx0B;
1287 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1288 int vdwjidx1A,vdwjidx1B;
1289 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1290 int vdwjidx2A,vdwjidx2B;
1291 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1292 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1293 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1294 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1295 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1296 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1297 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1298 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1299 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1300 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1301 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1304 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1307 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1308 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1318 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1320 __m128d one_half = _mm_set1_pd(0.5);
1321 __m128d minus_one = _mm_set1_pd(-1.0);
1323 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1325 __m128d dummy_mask,cutoff_mask;
1326 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1327 __m128d one = _mm_set1_pd(1.0);
1328 __m128d two = _mm_set1_pd(2.0);
1334 jindex = nlist->jindex;
1336 shiftidx = nlist->shift;
1338 shiftvec = fr->shift_vec[0];
1339 fshift = fr->fshift[0];
1340 facel = _mm_set1_pd(fr->epsfac);
1341 charge = mdatoms->chargeA;
1342 nvdwtype = fr->ntype;
1343 vdwparam = fr->nbfp;
1344 vdwtype = mdatoms->typeA;
1345 vdwgridparam = fr->ljpme_c6grid;
1346 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
1347 ewclj = _mm_set1_pd(fr->ewaldcoeff_lj);
1348 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
1350 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1351 ewtab = fr->ic->tabq_coul_F;
1352 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1353 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1355 /* Setup water-specific parameters */
1356 inr = nlist->iinr[0];
1357 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1358 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1359 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1360 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1362 jq0 = _mm_set1_pd(charge[inr+0]);
1363 jq1 = _mm_set1_pd(charge[inr+1]);
1364 jq2 = _mm_set1_pd(charge[inr+2]);
1365 vdwjidx0A = 2*vdwtype[inr+0];
1366 qq00 = _mm_mul_pd(iq0,jq0);
1367 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1368 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1369 c6grid_00 = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
1370 qq01 = _mm_mul_pd(iq0,jq1);
1371 qq02 = _mm_mul_pd(iq0,jq2);
1372 qq10 = _mm_mul_pd(iq1,jq0);
1373 qq11 = _mm_mul_pd(iq1,jq1);
1374 qq12 = _mm_mul_pd(iq1,jq2);
1375 qq20 = _mm_mul_pd(iq2,jq0);
1376 qq21 = _mm_mul_pd(iq2,jq1);
1377 qq22 = _mm_mul_pd(iq2,jq2);
1379 /* Avoid stupid compiler warnings */
1381 j_coord_offsetA = 0;
1382 j_coord_offsetB = 0;
1387 /* Start outer loop over neighborlists */
1388 for(iidx=0; iidx<nri; iidx++)
1390 /* Load shift vector for this list */
1391 i_shift_offset = DIM*shiftidx[iidx];
1393 /* Load limits for loop over neighbors */
1394 j_index_start = jindex[iidx];
1395 j_index_end = jindex[iidx+1];
1397 /* Get outer coordinate index */
1399 i_coord_offset = DIM*inr;
1401 /* Load i particle coords and add shift vector */
1402 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1403 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1405 fix0 = _mm_setzero_pd();
1406 fiy0 = _mm_setzero_pd();
1407 fiz0 = _mm_setzero_pd();
1408 fix1 = _mm_setzero_pd();
1409 fiy1 = _mm_setzero_pd();
1410 fiz1 = _mm_setzero_pd();
1411 fix2 = _mm_setzero_pd();
1412 fiy2 = _mm_setzero_pd();
1413 fiz2 = _mm_setzero_pd();
1415 /* Start inner kernel loop */
1416 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1419 /* Get j neighbor index, and coordinate index */
1421 jnrB = jjnr[jidx+1];
1422 j_coord_offsetA = DIM*jnrA;
1423 j_coord_offsetB = DIM*jnrB;
1425 /* load j atom coordinates */
1426 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1427 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1429 /* Calculate displacement vector */
1430 dx00 = _mm_sub_pd(ix0,jx0);
1431 dy00 = _mm_sub_pd(iy0,jy0);
1432 dz00 = _mm_sub_pd(iz0,jz0);
1433 dx01 = _mm_sub_pd(ix0,jx1);
1434 dy01 = _mm_sub_pd(iy0,jy1);
1435 dz01 = _mm_sub_pd(iz0,jz1);
1436 dx02 = _mm_sub_pd(ix0,jx2);
1437 dy02 = _mm_sub_pd(iy0,jy2);
1438 dz02 = _mm_sub_pd(iz0,jz2);
1439 dx10 = _mm_sub_pd(ix1,jx0);
1440 dy10 = _mm_sub_pd(iy1,jy0);
1441 dz10 = _mm_sub_pd(iz1,jz0);
1442 dx11 = _mm_sub_pd(ix1,jx1);
1443 dy11 = _mm_sub_pd(iy1,jy1);
1444 dz11 = _mm_sub_pd(iz1,jz1);
1445 dx12 = _mm_sub_pd(ix1,jx2);
1446 dy12 = _mm_sub_pd(iy1,jy2);
1447 dz12 = _mm_sub_pd(iz1,jz2);
1448 dx20 = _mm_sub_pd(ix2,jx0);
1449 dy20 = _mm_sub_pd(iy2,jy0);
1450 dz20 = _mm_sub_pd(iz2,jz0);
1451 dx21 = _mm_sub_pd(ix2,jx1);
1452 dy21 = _mm_sub_pd(iy2,jy1);
1453 dz21 = _mm_sub_pd(iz2,jz1);
1454 dx22 = _mm_sub_pd(ix2,jx2);
1455 dy22 = _mm_sub_pd(iy2,jy2);
1456 dz22 = _mm_sub_pd(iz2,jz2);
1458 /* Calculate squared distance and things based on it */
1459 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1460 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1461 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1462 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1463 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1464 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1465 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1466 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1467 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1469 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1470 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1471 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1472 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1473 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1474 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1475 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1476 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1477 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1479 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1480 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1481 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1482 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1483 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1484 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1485 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1486 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1487 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1489 fjx0 = _mm_setzero_pd();
1490 fjy0 = _mm_setzero_pd();
1491 fjz0 = _mm_setzero_pd();
1492 fjx1 = _mm_setzero_pd();
1493 fjy1 = _mm_setzero_pd();
1494 fjz1 = _mm_setzero_pd();
1495 fjx2 = _mm_setzero_pd();
1496 fjy2 = _mm_setzero_pd();
1497 fjz2 = _mm_setzero_pd();
1499 /**************************
1500 * CALCULATE INTERACTIONS *
1501 **************************/
1503 r00 = _mm_mul_pd(rsq00,rinv00);
1505 /* EWALD ELECTROSTATICS */
1507 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1508 ewrt = _mm_mul_pd(r00,ewtabscale);
1509 ewitab = _mm_cvttpd_epi32(ewrt);
1510 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1511 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1513 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1514 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1516 /* Analytical LJ-PME */
1517 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1518 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
1519 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
1520 exponent = gmx_simd_exp_d(ewcljrsq);
1521 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1522 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1523 /* f6A = 6 * C6grid * (1 - poly) */
1524 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
1525 /* f6B = C6grid * exponent * beta^6 */
1526 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
1527 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1528 fvdw = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1530 fscal = _mm_add_pd(felec,fvdw);
1532 /* Calculate temporary vectorial force */
1533 tx = _mm_mul_pd(fscal,dx00);
1534 ty = _mm_mul_pd(fscal,dy00);
1535 tz = _mm_mul_pd(fscal,dz00);
1537 /* Update vectorial force */
1538 fix0 = _mm_add_pd(fix0,tx);
1539 fiy0 = _mm_add_pd(fiy0,ty);
1540 fiz0 = _mm_add_pd(fiz0,tz);
1542 fjx0 = _mm_add_pd(fjx0,tx);
1543 fjy0 = _mm_add_pd(fjy0,ty);
1544 fjz0 = _mm_add_pd(fjz0,tz);
1546 /**************************
1547 * CALCULATE INTERACTIONS *
1548 **************************/
1550 r01 = _mm_mul_pd(rsq01,rinv01);
1552 /* EWALD ELECTROSTATICS */
1554 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1555 ewrt = _mm_mul_pd(r01,ewtabscale);
1556 ewitab = _mm_cvttpd_epi32(ewrt);
1557 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1558 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1560 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1561 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1565 /* Calculate temporary vectorial force */
1566 tx = _mm_mul_pd(fscal,dx01);
1567 ty = _mm_mul_pd(fscal,dy01);
1568 tz = _mm_mul_pd(fscal,dz01);
1570 /* Update vectorial force */
1571 fix0 = _mm_add_pd(fix0,tx);
1572 fiy0 = _mm_add_pd(fiy0,ty);
1573 fiz0 = _mm_add_pd(fiz0,tz);
1575 fjx1 = _mm_add_pd(fjx1,tx);
1576 fjy1 = _mm_add_pd(fjy1,ty);
1577 fjz1 = _mm_add_pd(fjz1,tz);
1579 /**************************
1580 * CALCULATE INTERACTIONS *
1581 **************************/
1583 r02 = _mm_mul_pd(rsq02,rinv02);
1585 /* EWALD ELECTROSTATICS */
1587 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1588 ewrt = _mm_mul_pd(r02,ewtabscale);
1589 ewitab = _mm_cvttpd_epi32(ewrt);
1590 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1591 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1593 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1594 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1598 /* Calculate temporary vectorial force */
1599 tx = _mm_mul_pd(fscal,dx02);
1600 ty = _mm_mul_pd(fscal,dy02);
1601 tz = _mm_mul_pd(fscal,dz02);
1603 /* Update vectorial force */
1604 fix0 = _mm_add_pd(fix0,tx);
1605 fiy0 = _mm_add_pd(fiy0,ty);
1606 fiz0 = _mm_add_pd(fiz0,tz);
1608 fjx2 = _mm_add_pd(fjx2,tx);
1609 fjy2 = _mm_add_pd(fjy2,ty);
1610 fjz2 = _mm_add_pd(fjz2,tz);
1612 /**************************
1613 * CALCULATE INTERACTIONS *
1614 **************************/
1616 r10 = _mm_mul_pd(rsq10,rinv10);
1618 /* EWALD ELECTROSTATICS */
1620 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1621 ewrt = _mm_mul_pd(r10,ewtabscale);
1622 ewitab = _mm_cvttpd_epi32(ewrt);
1623 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1624 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1626 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1627 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1631 /* Calculate temporary vectorial force */
1632 tx = _mm_mul_pd(fscal,dx10);
1633 ty = _mm_mul_pd(fscal,dy10);
1634 tz = _mm_mul_pd(fscal,dz10);
1636 /* Update vectorial force */
1637 fix1 = _mm_add_pd(fix1,tx);
1638 fiy1 = _mm_add_pd(fiy1,ty);
1639 fiz1 = _mm_add_pd(fiz1,tz);
1641 fjx0 = _mm_add_pd(fjx0,tx);
1642 fjy0 = _mm_add_pd(fjy0,ty);
1643 fjz0 = _mm_add_pd(fjz0,tz);
1645 /**************************
1646 * CALCULATE INTERACTIONS *
1647 **************************/
1649 r11 = _mm_mul_pd(rsq11,rinv11);
1651 /* EWALD ELECTROSTATICS */
1653 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1654 ewrt = _mm_mul_pd(r11,ewtabscale);
1655 ewitab = _mm_cvttpd_epi32(ewrt);
1656 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1657 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1659 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1660 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1664 /* Calculate temporary vectorial force */
1665 tx = _mm_mul_pd(fscal,dx11);
1666 ty = _mm_mul_pd(fscal,dy11);
1667 tz = _mm_mul_pd(fscal,dz11);
1669 /* Update vectorial force */
1670 fix1 = _mm_add_pd(fix1,tx);
1671 fiy1 = _mm_add_pd(fiy1,ty);
1672 fiz1 = _mm_add_pd(fiz1,tz);
1674 fjx1 = _mm_add_pd(fjx1,tx);
1675 fjy1 = _mm_add_pd(fjy1,ty);
1676 fjz1 = _mm_add_pd(fjz1,tz);
1678 /**************************
1679 * CALCULATE INTERACTIONS *
1680 **************************/
1682 r12 = _mm_mul_pd(rsq12,rinv12);
1684 /* EWALD ELECTROSTATICS */
1686 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1687 ewrt = _mm_mul_pd(r12,ewtabscale);
1688 ewitab = _mm_cvttpd_epi32(ewrt);
1689 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1690 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1692 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1693 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1697 /* Calculate temporary vectorial force */
1698 tx = _mm_mul_pd(fscal,dx12);
1699 ty = _mm_mul_pd(fscal,dy12);
1700 tz = _mm_mul_pd(fscal,dz12);
1702 /* Update vectorial force */
1703 fix1 = _mm_add_pd(fix1,tx);
1704 fiy1 = _mm_add_pd(fiy1,ty);
1705 fiz1 = _mm_add_pd(fiz1,tz);
1707 fjx2 = _mm_add_pd(fjx2,tx);
1708 fjy2 = _mm_add_pd(fjy2,ty);
1709 fjz2 = _mm_add_pd(fjz2,tz);
1711 /**************************
1712 * CALCULATE INTERACTIONS *
1713 **************************/
1715 r20 = _mm_mul_pd(rsq20,rinv20);
1717 /* EWALD ELECTROSTATICS */
1719 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1720 ewrt = _mm_mul_pd(r20,ewtabscale);
1721 ewitab = _mm_cvttpd_epi32(ewrt);
1722 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1723 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1725 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1726 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1730 /* Calculate temporary vectorial force */
1731 tx = _mm_mul_pd(fscal,dx20);
1732 ty = _mm_mul_pd(fscal,dy20);
1733 tz = _mm_mul_pd(fscal,dz20);
1735 /* Update vectorial force */
1736 fix2 = _mm_add_pd(fix2,tx);
1737 fiy2 = _mm_add_pd(fiy2,ty);
1738 fiz2 = _mm_add_pd(fiz2,tz);
1740 fjx0 = _mm_add_pd(fjx0,tx);
1741 fjy0 = _mm_add_pd(fjy0,ty);
1742 fjz0 = _mm_add_pd(fjz0,tz);
1744 /**************************
1745 * CALCULATE INTERACTIONS *
1746 **************************/
1748 r21 = _mm_mul_pd(rsq21,rinv21);
1750 /* EWALD ELECTROSTATICS */
1752 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1753 ewrt = _mm_mul_pd(r21,ewtabscale);
1754 ewitab = _mm_cvttpd_epi32(ewrt);
1755 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1756 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1758 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1759 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1763 /* Calculate temporary vectorial force */
1764 tx = _mm_mul_pd(fscal,dx21);
1765 ty = _mm_mul_pd(fscal,dy21);
1766 tz = _mm_mul_pd(fscal,dz21);
1768 /* Update vectorial force */
1769 fix2 = _mm_add_pd(fix2,tx);
1770 fiy2 = _mm_add_pd(fiy2,ty);
1771 fiz2 = _mm_add_pd(fiz2,tz);
1773 fjx1 = _mm_add_pd(fjx1,tx);
1774 fjy1 = _mm_add_pd(fjy1,ty);
1775 fjz1 = _mm_add_pd(fjz1,tz);
1777 /**************************
1778 * CALCULATE INTERACTIONS *
1779 **************************/
1781 r22 = _mm_mul_pd(rsq22,rinv22);
1783 /* EWALD ELECTROSTATICS */
1785 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1786 ewrt = _mm_mul_pd(r22,ewtabscale);
1787 ewitab = _mm_cvttpd_epi32(ewrt);
1788 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1789 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1791 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1792 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1796 /* Calculate temporary vectorial force */
1797 tx = _mm_mul_pd(fscal,dx22);
1798 ty = _mm_mul_pd(fscal,dy22);
1799 tz = _mm_mul_pd(fscal,dz22);
1801 /* Update vectorial force */
1802 fix2 = _mm_add_pd(fix2,tx);
1803 fiy2 = _mm_add_pd(fiy2,ty);
1804 fiz2 = _mm_add_pd(fiz2,tz);
1806 fjx2 = _mm_add_pd(fjx2,tx);
1807 fjy2 = _mm_add_pd(fjy2,ty);
1808 fjz2 = _mm_add_pd(fjz2,tz);
1810 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1812 /* Inner loop uses 347 flops */
1815 if(jidx<j_index_end)
1819 j_coord_offsetA = DIM*jnrA;
1821 /* load j atom coordinates */
1822 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1823 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1825 /* Calculate displacement vector */
1826 dx00 = _mm_sub_pd(ix0,jx0);
1827 dy00 = _mm_sub_pd(iy0,jy0);
1828 dz00 = _mm_sub_pd(iz0,jz0);
1829 dx01 = _mm_sub_pd(ix0,jx1);
1830 dy01 = _mm_sub_pd(iy0,jy1);
1831 dz01 = _mm_sub_pd(iz0,jz1);
1832 dx02 = _mm_sub_pd(ix0,jx2);
1833 dy02 = _mm_sub_pd(iy0,jy2);
1834 dz02 = _mm_sub_pd(iz0,jz2);
1835 dx10 = _mm_sub_pd(ix1,jx0);
1836 dy10 = _mm_sub_pd(iy1,jy0);
1837 dz10 = _mm_sub_pd(iz1,jz0);
1838 dx11 = _mm_sub_pd(ix1,jx1);
1839 dy11 = _mm_sub_pd(iy1,jy1);
1840 dz11 = _mm_sub_pd(iz1,jz1);
1841 dx12 = _mm_sub_pd(ix1,jx2);
1842 dy12 = _mm_sub_pd(iy1,jy2);
1843 dz12 = _mm_sub_pd(iz1,jz2);
1844 dx20 = _mm_sub_pd(ix2,jx0);
1845 dy20 = _mm_sub_pd(iy2,jy0);
1846 dz20 = _mm_sub_pd(iz2,jz0);
1847 dx21 = _mm_sub_pd(ix2,jx1);
1848 dy21 = _mm_sub_pd(iy2,jy1);
1849 dz21 = _mm_sub_pd(iz2,jz1);
1850 dx22 = _mm_sub_pd(ix2,jx2);
1851 dy22 = _mm_sub_pd(iy2,jy2);
1852 dz22 = _mm_sub_pd(iz2,jz2);
1854 /* Calculate squared distance and things based on it */
1855 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1856 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1857 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1858 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1859 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1860 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1861 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1862 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1863 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1865 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1866 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1867 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1868 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1869 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1870 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1871 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1872 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1873 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1875 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1876 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1877 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1878 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1879 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1880 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1881 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1882 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1883 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1885 fjx0 = _mm_setzero_pd();
1886 fjy0 = _mm_setzero_pd();
1887 fjz0 = _mm_setzero_pd();
1888 fjx1 = _mm_setzero_pd();
1889 fjy1 = _mm_setzero_pd();
1890 fjz1 = _mm_setzero_pd();
1891 fjx2 = _mm_setzero_pd();
1892 fjy2 = _mm_setzero_pd();
1893 fjz2 = _mm_setzero_pd();
1895 /**************************
1896 * CALCULATE INTERACTIONS *
1897 **************************/
1899 r00 = _mm_mul_pd(rsq00,rinv00);
1901 /* EWALD ELECTROSTATICS */
1903 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1904 ewrt = _mm_mul_pd(r00,ewtabscale);
1905 ewitab = _mm_cvttpd_epi32(ewrt);
1906 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1907 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1908 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1909 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1911 /* Analytical LJ-PME */
1912 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1913 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
1914 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
1915 exponent = gmx_simd_exp_d(ewcljrsq);
1916 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1917 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1918 /* f6A = 6 * C6grid * (1 - poly) */
1919 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
1920 /* f6B = C6grid * exponent * beta^6 */
1921 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
1922 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1923 fvdw = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1925 fscal = _mm_add_pd(felec,fvdw);
1927 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1929 /* Calculate temporary vectorial force */
1930 tx = _mm_mul_pd(fscal,dx00);
1931 ty = _mm_mul_pd(fscal,dy00);
1932 tz = _mm_mul_pd(fscal,dz00);
1934 /* Update vectorial force */
1935 fix0 = _mm_add_pd(fix0,tx);
1936 fiy0 = _mm_add_pd(fiy0,ty);
1937 fiz0 = _mm_add_pd(fiz0,tz);
1939 fjx0 = _mm_add_pd(fjx0,tx);
1940 fjy0 = _mm_add_pd(fjy0,ty);
1941 fjz0 = _mm_add_pd(fjz0,tz);
1943 /**************************
1944 * CALCULATE INTERACTIONS *
1945 **************************/
1947 r01 = _mm_mul_pd(rsq01,rinv01);
1949 /* EWALD ELECTROSTATICS */
1951 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1952 ewrt = _mm_mul_pd(r01,ewtabscale);
1953 ewitab = _mm_cvttpd_epi32(ewrt);
1954 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1955 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1956 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1957 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1961 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1963 /* Calculate temporary vectorial force */
1964 tx = _mm_mul_pd(fscal,dx01);
1965 ty = _mm_mul_pd(fscal,dy01);
1966 tz = _mm_mul_pd(fscal,dz01);
1968 /* Update vectorial force */
1969 fix0 = _mm_add_pd(fix0,tx);
1970 fiy0 = _mm_add_pd(fiy0,ty);
1971 fiz0 = _mm_add_pd(fiz0,tz);
1973 fjx1 = _mm_add_pd(fjx1,tx);
1974 fjy1 = _mm_add_pd(fjy1,ty);
1975 fjz1 = _mm_add_pd(fjz1,tz);
1977 /**************************
1978 * CALCULATE INTERACTIONS *
1979 **************************/
1981 r02 = _mm_mul_pd(rsq02,rinv02);
1983 /* EWALD ELECTROSTATICS */
1985 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1986 ewrt = _mm_mul_pd(r02,ewtabscale);
1987 ewitab = _mm_cvttpd_epi32(ewrt);
1988 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1989 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1990 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1991 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1995 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1997 /* Calculate temporary vectorial force */
1998 tx = _mm_mul_pd(fscal,dx02);
1999 ty = _mm_mul_pd(fscal,dy02);
2000 tz = _mm_mul_pd(fscal,dz02);
2002 /* Update vectorial force */
2003 fix0 = _mm_add_pd(fix0,tx);
2004 fiy0 = _mm_add_pd(fiy0,ty);
2005 fiz0 = _mm_add_pd(fiz0,tz);
2007 fjx2 = _mm_add_pd(fjx2,tx);
2008 fjy2 = _mm_add_pd(fjy2,ty);
2009 fjz2 = _mm_add_pd(fjz2,tz);
2011 /**************************
2012 * CALCULATE INTERACTIONS *
2013 **************************/
2015 r10 = _mm_mul_pd(rsq10,rinv10);
2017 /* EWALD ELECTROSTATICS */
2019 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2020 ewrt = _mm_mul_pd(r10,ewtabscale);
2021 ewitab = _mm_cvttpd_epi32(ewrt);
2022 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2023 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2024 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2025 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2029 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2031 /* Calculate temporary vectorial force */
2032 tx = _mm_mul_pd(fscal,dx10);
2033 ty = _mm_mul_pd(fscal,dy10);
2034 tz = _mm_mul_pd(fscal,dz10);
2036 /* Update vectorial force */
2037 fix1 = _mm_add_pd(fix1,tx);
2038 fiy1 = _mm_add_pd(fiy1,ty);
2039 fiz1 = _mm_add_pd(fiz1,tz);
2041 fjx0 = _mm_add_pd(fjx0,tx);
2042 fjy0 = _mm_add_pd(fjy0,ty);
2043 fjz0 = _mm_add_pd(fjz0,tz);
2045 /**************************
2046 * CALCULATE INTERACTIONS *
2047 **************************/
2049 r11 = _mm_mul_pd(rsq11,rinv11);
2051 /* EWALD ELECTROSTATICS */
2053 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2054 ewrt = _mm_mul_pd(r11,ewtabscale);
2055 ewitab = _mm_cvttpd_epi32(ewrt);
2056 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2057 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2058 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2059 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2063 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2065 /* Calculate temporary vectorial force */
2066 tx = _mm_mul_pd(fscal,dx11);
2067 ty = _mm_mul_pd(fscal,dy11);
2068 tz = _mm_mul_pd(fscal,dz11);
2070 /* Update vectorial force */
2071 fix1 = _mm_add_pd(fix1,tx);
2072 fiy1 = _mm_add_pd(fiy1,ty);
2073 fiz1 = _mm_add_pd(fiz1,tz);
2075 fjx1 = _mm_add_pd(fjx1,tx);
2076 fjy1 = _mm_add_pd(fjy1,ty);
2077 fjz1 = _mm_add_pd(fjz1,tz);
2079 /**************************
2080 * CALCULATE INTERACTIONS *
2081 **************************/
2083 r12 = _mm_mul_pd(rsq12,rinv12);
2085 /* EWALD ELECTROSTATICS */
2087 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2088 ewrt = _mm_mul_pd(r12,ewtabscale);
2089 ewitab = _mm_cvttpd_epi32(ewrt);
2090 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2091 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2092 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2093 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2097 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2099 /* Calculate temporary vectorial force */
2100 tx = _mm_mul_pd(fscal,dx12);
2101 ty = _mm_mul_pd(fscal,dy12);
2102 tz = _mm_mul_pd(fscal,dz12);
2104 /* Update vectorial force */
2105 fix1 = _mm_add_pd(fix1,tx);
2106 fiy1 = _mm_add_pd(fiy1,ty);
2107 fiz1 = _mm_add_pd(fiz1,tz);
2109 fjx2 = _mm_add_pd(fjx2,tx);
2110 fjy2 = _mm_add_pd(fjy2,ty);
2111 fjz2 = _mm_add_pd(fjz2,tz);
2113 /**************************
2114 * CALCULATE INTERACTIONS *
2115 **************************/
2117 r20 = _mm_mul_pd(rsq20,rinv20);
2119 /* EWALD ELECTROSTATICS */
2121 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2122 ewrt = _mm_mul_pd(r20,ewtabscale);
2123 ewitab = _mm_cvttpd_epi32(ewrt);
2124 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2125 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2126 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2127 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2131 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2133 /* Calculate temporary vectorial force */
2134 tx = _mm_mul_pd(fscal,dx20);
2135 ty = _mm_mul_pd(fscal,dy20);
2136 tz = _mm_mul_pd(fscal,dz20);
2138 /* Update vectorial force */
2139 fix2 = _mm_add_pd(fix2,tx);
2140 fiy2 = _mm_add_pd(fiy2,ty);
2141 fiz2 = _mm_add_pd(fiz2,tz);
2143 fjx0 = _mm_add_pd(fjx0,tx);
2144 fjy0 = _mm_add_pd(fjy0,ty);
2145 fjz0 = _mm_add_pd(fjz0,tz);
2147 /**************************
2148 * CALCULATE INTERACTIONS *
2149 **************************/
2151 r21 = _mm_mul_pd(rsq21,rinv21);
2153 /* EWALD ELECTROSTATICS */
2155 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2156 ewrt = _mm_mul_pd(r21,ewtabscale);
2157 ewitab = _mm_cvttpd_epi32(ewrt);
2158 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2159 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2160 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2161 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2165 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2167 /* Calculate temporary vectorial force */
2168 tx = _mm_mul_pd(fscal,dx21);
2169 ty = _mm_mul_pd(fscal,dy21);
2170 tz = _mm_mul_pd(fscal,dz21);
2172 /* Update vectorial force */
2173 fix2 = _mm_add_pd(fix2,tx);
2174 fiy2 = _mm_add_pd(fiy2,ty);
2175 fiz2 = _mm_add_pd(fiz2,tz);
2177 fjx1 = _mm_add_pd(fjx1,tx);
2178 fjy1 = _mm_add_pd(fjy1,ty);
2179 fjz1 = _mm_add_pd(fjz1,tz);
2181 /**************************
2182 * CALCULATE INTERACTIONS *
2183 **************************/
2185 r22 = _mm_mul_pd(rsq22,rinv22);
2187 /* EWALD ELECTROSTATICS */
2189 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2190 ewrt = _mm_mul_pd(r22,ewtabscale);
2191 ewitab = _mm_cvttpd_epi32(ewrt);
2192 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2193 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2194 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2195 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2199 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2201 /* Calculate temporary vectorial force */
2202 tx = _mm_mul_pd(fscal,dx22);
2203 ty = _mm_mul_pd(fscal,dy22);
2204 tz = _mm_mul_pd(fscal,dz22);
2206 /* Update vectorial force */
2207 fix2 = _mm_add_pd(fix2,tx);
2208 fiy2 = _mm_add_pd(fiy2,ty);
2209 fiz2 = _mm_add_pd(fiz2,tz);
2211 fjx2 = _mm_add_pd(fjx2,tx);
2212 fjy2 = _mm_add_pd(fjy2,ty);
2213 fjz2 = _mm_add_pd(fjz2,tz);
2215 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2217 /* Inner loop uses 347 flops */
2220 /* End of innermost loop */
2222 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2223 f+i_coord_offset,fshift+i_shift_offset);
2225 /* Increment number of inner iterations */
2226 inneriter += j_index_end - j_index_start;
2228 /* Outer loop uses 18 flops */
2231 /* Increment number of outer iterations */
2234 /* Update outer/inner flops */
2236 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*347);