2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse4_1_double.h"
48 #include "kernelutil_x86_sse4_1_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_double
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: None
54 * Geometry: Water4-Particle
55 * Calculate force/pot: PotentialAndForce
/*
 * Nonbonded kernel: tabulated (cubic spline) electrostatics, no Van der Waals
 * term, water i-molecule against single j particles. This variant accumulates
 * both the electrostatic potential (velecsum) and the forces.
 *
 * Only i sites 1, 2 and 3 are used: their charges are read from
 * charge[inr+1..inr+3] and their coordinates from x+i_coord_offset+DIM,
 * so the first site of the molecule is never touched here.
 *
 * j particles are processed two per iteration (suffixes A and B, one per
 * lane of a __m128d). That loop is followed by a section that works on a
 * single j particle (only jnrA) with the upper lane forced to zero;
 * presumably this is the leftover when the neighbour count is odd.
 *
 * NOTE(review): this copy of the kernel appears to have lost lines. No
 * braces, no return type, and no assignments to fscal, jnrA/jnrB, nri, x, f,
 * ggid, outeriter or a declaration of vfitab/vftab/charge are visible, and
 * inr is only seen being set once before the outer loop. Confirm against the
 * generated original before changing any logic.
 */
58 nb_kernel_ElecCSTab_VdwNone_GeomW4P1_VF_sse4_1_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
86 int vdwjidx0A,vdwjidx0B;
87 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
88 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
89 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
90 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
91 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m128i ifour = _mm_set1_epi32(4);
95 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
97 __m128d dummy_mask,cutoff_mask;
98 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
99 __m128d one = _mm_set1_pd(1.0);
100 __m128d two = _mm_set1_pd(2.0);
106 jindex = nlist->jindex;
108 shiftidx = nlist->shift;
110 shiftvec = fr->shift_vec[0];
111 fshift = fr->fshift[0];
112 facel = _mm_set1_pd(fr->epsfac);
113 charge = mdatoms->chargeA;
115 vftab = kernel_data->table_elec->data;
116 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
118 /* Setup water-specific parameters */
/* The i charges are taken once, from the first list entry, and pre-multiplied by facel (fr->epsfac) */
119 inr = nlist->iinr[0];
120 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
121 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
122 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
124 /* Avoid stupid compiler warnings */
132 /* Start outer loop over neighborlists */
133 for(iidx=0; iidx<nri; iidx++)
135 /* Load shift vector for this list */
136 i_shift_offset = DIM*shiftidx[iidx];
138 /* Load limits for loop over neighbors */
139 j_index_start = jindex[iidx];
140 j_index_end = jindex[iidx+1];
142 /* Get outer coordinate index */
/* NOTE(review): no per-iteration update of inr is visible in this copy; presumably inr = iinr[iidx] belongs here - confirm */
144 i_coord_offset = DIM*inr;
146 /* Load i particle coords and add shift vector */
147 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
148 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
150 fix1 = _mm_setzero_pd();
151 fiy1 = _mm_setzero_pd();
152 fiz1 = _mm_setzero_pd();
153 fix2 = _mm_setzero_pd();
154 fiy2 = _mm_setzero_pd();
155 fiz2 = _mm_setzero_pd();
156 fix3 = _mm_setzero_pd();
157 fiy3 = _mm_setzero_pd();
158 fiz3 = _mm_setzero_pd();
160 /* Reset potential sums */
161 velecsum = _mm_setzero_pd();
163 /* Start inner kernel loop */
/* Two j particles per iteration; the bound j_index_end-1 stops before a lone trailing entry */
164 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
167 /* Get j neighbor index, and coordinate index */
170 j_coord_offsetA = DIM*jnrA;
171 j_coord_offsetB = DIM*jnrB;
173 /* load j atom coordinates */
174 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
177 /* Calculate displacement vector */
178 dx10 = _mm_sub_pd(ix1,jx0);
179 dy10 = _mm_sub_pd(iy1,jy0);
180 dz10 = _mm_sub_pd(iz1,jz0);
181 dx20 = _mm_sub_pd(ix2,jx0);
182 dy20 = _mm_sub_pd(iy2,jy0);
183 dz20 = _mm_sub_pd(iz2,jz0);
184 dx30 = _mm_sub_pd(ix3,jx0);
185 dy30 = _mm_sub_pd(iy3,jy0);
186 dz30 = _mm_sub_pd(iz3,jz0);
188 /* Calculate squared distance and things based on it */
189 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
190 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
191 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
193 rinv10 = gmx_mm_invsqrt_pd(rsq10);
194 rinv20 = gmx_mm_invsqrt_pd(rsq20);
195 rinv30 = gmx_mm_invsqrt_pd(rsq30);
197 /* Load parameters for j particles */
198 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
200 fjx0 = _mm_setzero_pd();
201 fjy0 = _mm_setzero_pd();
202 fjz0 = _mm_setzero_pd();
204 /**************************
205 * CALCULATE INTERACTIONS *
206 **************************/
/* r is recovered as rsq*rinv rather than with a separate square root */
208 r10 = _mm_mul_pd(rsq10,rinv10);
210 /* Compute parameters for interactions between i and j atoms */
211 qq10 = _mm_mul_pd(iq1,jq0);
213 /* Calculate table index by multiplying r with table scale and truncate to integer */
214 rt = _mm_mul_pd(r10,vftabscale);
215 vfitab = _mm_cvttpd_epi32(rt);
/* vfeps is the fractional part of rt, i.e. the position inside the table interval */
216 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
/* Shift left by 2 = index*4: four doubles (Y,F at +0 and G,H at +2) are read per table point below */
217 vfitab = _mm_slli_epi32(vfitab,2);
219 /* CUBIC SPLINE TABLE ELECTROSTATICS */
220 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
221 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
222 GMX_MM_TRANSPOSE2_PD(Y,F);
223 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
224 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
225 GMX_MM_TRANSPOSE2_PD(G,H);
/* Spline evaluation: Fp = F + eps*(G + eps*H), VV = Y + eps*Fp, FF = Fp + eps*(G + 2*eps*H) */
226 Heps = _mm_mul_pd(vfeps,H);
227 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
228 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
229 velec = _mm_mul_pd(qq10,VV);
230 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
/* XOR with signbit flips the sign of both lanes: felec = -(qq10*FF*vftabscale*rinv10) */
231 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
233 /* Update potential sum for this i atom from the interaction with this j atom. */
234 velecsum = _mm_add_pd(velecsum,velec);
/* NOTE(review): fscal is read below but no assignment to it is visible in this copy; presumably fscal = felec - confirm */
238 /* Calculate temporary vectorial force */
239 tx = _mm_mul_pd(fscal,dx10);
240 ty = _mm_mul_pd(fscal,dy10);
241 tz = _mm_mul_pd(fscal,dz10);
243 /* Update vectorial force */
244 fix1 = _mm_add_pd(fix1,tx);
245 fiy1 = _mm_add_pd(fiy1,ty);
246 fiz1 = _mm_add_pd(fiz1,tz);
/* The same contribution is summed for j here and subtracted from f in one go after the third interaction */
248 fjx0 = _mm_add_pd(fjx0,tx);
249 fjy0 = _mm_add_pd(fjy0,ty);
250 fjz0 = _mm_add_pd(fjz0,tz);
252 /**************************
253 * CALCULATE INTERACTIONS *
254 **************************/
256 r20 = _mm_mul_pd(rsq20,rinv20);
258 /* Compute parameters for interactions between i and j atoms */
259 qq20 = _mm_mul_pd(iq2,jq0);
261 /* Calculate table index by multiplying r with table scale and truncate to integer */
262 rt = _mm_mul_pd(r20,vftabscale);
263 vfitab = _mm_cvttpd_epi32(rt);
264 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
265 vfitab = _mm_slli_epi32(vfitab,2);
267 /* CUBIC SPLINE TABLE ELECTROSTATICS */
268 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
269 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
270 GMX_MM_TRANSPOSE2_PD(Y,F);
271 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
272 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
273 GMX_MM_TRANSPOSE2_PD(G,H);
274 Heps = _mm_mul_pd(vfeps,H);
275 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
276 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
277 velec = _mm_mul_pd(qq20,VV);
278 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
279 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
281 /* Update potential sum for this i atom from the interaction with this j atom. */
282 velecsum = _mm_add_pd(velecsum,velec);
286 /* Calculate temporary vectorial force */
287 tx = _mm_mul_pd(fscal,dx20);
288 ty = _mm_mul_pd(fscal,dy20);
289 tz = _mm_mul_pd(fscal,dz20);
291 /* Update vectorial force */
292 fix2 = _mm_add_pd(fix2,tx);
293 fiy2 = _mm_add_pd(fiy2,ty);
294 fiz2 = _mm_add_pd(fiz2,tz);
296 fjx0 = _mm_add_pd(fjx0,tx);
297 fjy0 = _mm_add_pd(fjy0,ty);
298 fjz0 = _mm_add_pd(fjz0,tz);
300 /**************************
301 * CALCULATE INTERACTIONS *
302 **************************/
304 r30 = _mm_mul_pd(rsq30,rinv30);
306 /* Compute parameters for interactions between i and j atoms */
307 qq30 = _mm_mul_pd(iq3,jq0);
309 /* Calculate table index by multiplying r with table scale and truncate to integer */
310 rt = _mm_mul_pd(r30,vftabscale);
311 vfitab = _mm_cvttpd_epi32(rt);
312 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
313 vfitab = _mm_slli_epi32(vfitab,2);
315 /* CUBIC SPLINE TABLE ELECTROSTATICS */
316 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
317 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
318 GMX_MM_TRANSPOSE2_PD(Y,F);
319 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
320 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
321 GMX_MM_TRANSPOSE2_PD(G,H);
322 Heps = _mm_mul_pd(vfeps,H);
323 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
324 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
325 velec = _mm_mul_pd(qq30,VV);
326 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
327 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
329 /* Update potential sum for this i atom from the interaction with this j atom. */
330 velecsum = _mm_add_pd(velecsum,velec);
334 /* Calculate temporary vectorial force */
335 tx = _mm_mul_pd(fscal,dx30);
336 ty = _mm_mul_pd(fscal,dy30);
337 tz = _mm_mul_pd(fscal,dz30);
339 /* Update vectorial force */
340 fix3 = _mm_add_pd(fix3,tx);
341 fiy3 = _mm_add_pd(fiy3,ty);
342 fiz3 = _mm_add_pd(fiz3,tz);
344 fjx0 = _mm_add_pd(fjx0,tx);
345 fjy0 = _mm_add_pd(fjy0,ty);
346 fjz0 = _mm_add_pd(fjz0,tz);
348 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
350 /* Inner loop uses 132 flops */
/* Single-j section: only jnrA is used from here on. NOTE(review): the guarding condition is not visible in this copy - confirm */
357 j_coord_offsetA = DIM*jnrA;
359 /* load j atom coordinates */
360 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
363 /* Calculate displacement vector */
364 dx10 = _mm_sub_pd(ix1,jx0);
365 dy10 = _mm_sub_pd(iy1,jy0);
366 dz10 = _mm_sub_pd(iz1,jz0);
367 dx20 = _mm_sub_pd(ix2,jx0);
368 dy20 = _mm_sub_pd(iy2,jy0);
369 dz20 = _mm_sub_pd(iz2,jz0);
370 dx30 = _mm_sub_pd(ix3,jx0);
371 dy30 = _mm_sub_pd(iy3,jy0);
372 dz30 = _mm_sub_pd(iz3,jz0);
374 /* Calculate squared distance and things based on it */
375 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
376 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
377 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
379 rinv10 = gmx_mm_invsqrt_pd(rsq10);
380 rinv20 = gmx_mm_invsqrt_pd(rsq20);
381 rinv30 = gmx_mm_invsqrt_pd(rsq30);
383 /* Load parameters for j particles */
384 jq0 = _mm_load_sd(charge+jnrA+0);
386 fjx0 = _mm_setzero_pd();
387 fjy0 = _mm_setzero_pd();
388 fjz0 = _mm_setzero_pd();
390 /**************************
391 * CALCULATE INTERACTIONS *
392 **************************/
394 r10 = _mm_mul_pd(rsq10,rinv10);
396 /* Compute parameters for interactions between i and j atoms */
397 qq10 = _mm_mul_pd(iq1,jq0);
399 /* Calculate table index by multiplying r with table scale and truncate to integer */
400 rt = _mm_mul_pd(r10,vftabscale);
401 vfitab = _mm_cvttpd_epi32(rt);
402 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
403 vfitab = _mm_slli_epi32(vfitab,2);
405 /* CUBIC SPLINE TABLE ELECTROSTATICS */
/* Only the table row for index 0 is loaded; the second transpose input is zero because there is no second j particle */
406 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
407 F = _mm_setzero_pd();
408 GMX_MM_TRANSPOSE2_PD(Y,F);
409 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
410 H = _mm_setzero_pd();
411 GMX_MM_TRANSPOSE2_PD(G,H);
412 Heps = _mm_mul_pd(vfeps,H);
413 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
414 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
415 velec = _mm_mul_pd(qq10,VV);
416 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
417 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
419 /* Update potential sum for this i atom from the interaction with this j atom. */
/* Keep the low lane and zero the upper one so the unused lane adds nothing to the sum */
420 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
421 velecsum = _mm_add_pd(velecsum,velec);
/* Same lane clearing for the force scale, so tx/ty/tz are zero in the upper lane */
425 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
427 /* Calculate temporary vectorial force */
428 tx = _mm_mul_pd(fscal,dx10);
429 ty = _mm_mul_pd(fscal,dy10);
430 tz = _mm_mul_pd(fscal,dz10);
432 /* Update vectorial force */
433 fix1 = _mm_add_pd(fix1,tx);
434 fiy1 = _mm_add_pd(fiy1,ty);
435 fiz1 = _mm_add_pd(fiz1,tz);
437 fjx0 = _mm_add_pd(fjx0,tx);
438 fjy0 = _mm_add_pd(fjy0,ty);
439 fjz0 = _mm_add_pd(fjz0,tz);
441 /**************************
442 * CALCULATE INTERACTIONS *
443 **************************/
445 r20 = _mm_mul_pd(rsq20,rinv20);
447 /* Compute parameters for interactions between i and j atoms */
448 qq20 = _mm_mul_pd(iq2,jq0);
450 /* Calculate table index by multiplying r with table scale and truncate to integer */
451 rt = _mm_mul_pd(r20,vftabscale);
452 vfitab = _mm_cvttpd_epi32(rt);
453 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
454 vfitab = _mm_slli_epi32(vfitab,2);
456 /* CUBIC SPLINE TABLE ELECTROSTATICS */
457 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
458 F = _mm_setzero_pd();
459 GMX_MM_TRANSPOSE2_PD(Y,F);
460 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
461 H = _mm_setzero_pd();
462 GMX_MM_TRANSPOSE2_PD(G,H);
463 Heps = _mm_mul_pd(vfeps,H);
464 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
465 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
466 velec = _mm_mul_pd(qq20,VV);
467 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
468 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
472 velecsum = _mm_add_pd(velecsum,velec);
476 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
478 /* Calculate temporary vectorial force */
479 tx = _mm_mul_pd(fscal,dx20);
480 ty = _mm_mul_pd(fscal,dy20);
481 tz = _mm_mul_pd(fscal,dz20);
483 /* Update vectorial force */
484 fix2 = _mm_add_pd(fix2,tx);
485 fiy2 = _mm_add_pd(fiy2,ty);
486 fiz2 = _mm_add_pd(fiz2,tz);
488 fjx0 = _mm_add_pd(fjx0,tx);
489 fjy0 = _mm_add_pd(fjy0,ty);
490 fjz0 = _mm_add_pd(fjz0,tz);
492 /**************************
493 * CALCULATE INTERACTIONS *
494 **************************/
496 r30 = _mm_mul_pd(rsq30,rinv30);
498 /* Compute parameters for interactions between i and j atoms */
499 qq30 = _mm_mul_pd(iq3,jq0);
501 /* Calculate table index by multiplying r with table scale and truncate to integer */
502 rt = _mm_mul_pd(r30,vftabscale);
503 vfitab = _mm_cvttpd_epi32(rt);
504 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
505 vfitab = _mm_slli_epi32(vfitab,2);
507 /* CUBIC SPLINE TABLE ELECTROSTATICS */
508 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
509 F = _mm_setzero_pd();
510 GMX_MM_TRANSPOSE2_PD(Y,F);
511 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
512 H = _mm_setzero_pd();
513 GMX_MM_TRANSPOSE2_PD(G,H);
514 Heps = _mm_mul_pd(vfeps,H);
515 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
516 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
517 velec = _mm_mul_pd(qq30,VV);
518 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
519 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
521 /* Update potential sum for this i atom from the interaction with this j atom. */
522 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
523 velecsum = _mm_add_pd(velecsum,velec);
527 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
529 /* Calculate temporary vectorial force */
530 tx = _mm_mul_pd(fscal,dx30);
531 ty = _mm_mul_pd(fscal,dy30);
532 tz = _mm_mul_pd(fscal,dz30);
534 /* Update vectorial force */
535 fix3 = _mm_add_pd(fix3,tx);
536 fiy3 = _mm_add_pd(fiy3,ty);
537 fiz3 = _mm_add_pd(fiz3,tz);
539 fjx0 = _mm_add_pd(fjx0,tx);
540 fjy0 = _mm_add_pd(fjy0,ty);
541 fjz0 = _mm_add_pd(fjz0,tz);
543 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
545 /* Inner loop uses 132 flops */
548 /* End of innermost loop */
/* Write back the forces accumulated for i sites 1..3 (f+i_coord_offset+DIM) together with the shift-force entry */
550 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
551 f+i_coord_offset+DIM,fshift+i_shift_offset);
554 /* Update potential energies */
555 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
557 /* Increment number of inner iterations */
558 inneriter += j_index_end - j_index_start;
560 /* Outer loop uses 19 flops */
563 /* Increment number of outer iterations */
566 /* Update outer/inner flops */
/* Flop estimate reported to nrnb: 19 per outer iteration plus 132 per inner iteration */
568 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*132);
571 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_double
572 * Electrostatics interaction: CubicSplineTable
573 * VdW interaction: None
574 * Geometry: Water4-Particle
575 * Calculate force/pot: Force
/*
 * Nonbonded kernel: tabulated (cubic spline) electrostatics, no Van der Waals
 * term, water i-molecule against single j particles. This is the force-only
 * variant: no potential (VV/velec) is evaluated and no energy is written.
 *
 * Only i sites 1, 2 and 3 are used: their charges are read from
 * charge[inr+1..inr+3] and their coordinates from x+i_coord_offset+DIM,
 * so the first site of the molecule is never touched here.
 *
 * j particles are processed two per iteration (suffixes A and B, one per
 * lane of a __m128d). That loop is followed by a section that works on a
 * single j particle (only jnrA) with the upper lane forced to zero;
 * presumably this is the leftover when the neighbour count is odd.
 *
 * NOTE(review): this copy of the kernel appears to have lost lines. No
 * braces, no return type, and no assignments to fscal, jnrA/jnrB, nri, x, f,
 * outeriter or a declaration of vfitab/vftab/charge are visible, and inr is
 * only seen being set once before the outer loop. Confirm against the
 * generated original before changing any logic.
 */
578 nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_double
579 (t_nblist * gmx_restrict nlist,
580 rvec * gmx_restrict xx,
581 rvec * gmx_restrict ff,
582 t_forcerec * gmx_restrict fr,
583 t_mdatoms * gmx_restrict mdatoms,
584 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
585 t_nrnb * gmx_restrict nrnb)
587 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
588 * just 0 for non-waters.
589 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
590 * jnr indices corresponding to data put in the four positions in the SIMD register.
592 int i_shift_offset,i_coord_offset,outeriter,inneriter;
593 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
595 int j_coord_offsetA,j_coord_offsetB;
596 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
598 real *shiftvec,*fshift,*x,*f;
599 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
601 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
603 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
605 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
606 int vdwjidx0A,vdwjidx0B;
607 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
608 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
609 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
610 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
611 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
614 __m128i ifour = _mm_set1_epi32(4);
615 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
617 __m128d dummy_mask,cutoff_mask;
618 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
619 __m128d one = _mm_set1_pd(1.0);
620 __m128d two = _mm_set1_pd(2.0);
626 jindex = nlist->jindex;
628 shiftidx = nlist->shift;
630 shiftvec = fr->shift_vec[0];
631 fshift = fr->fshift[0];
632 facel = _mm_set1_pd(fr->epsfac);
633 charge = mdatoms->chargeA;
635 vftab = kernel_data->table_elec->data;
636 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
638 /* Setup water-specific parameters */
/* The i charges are taken once, from the first list entry, and pre-multiplied by facel (fr->epsfac) */
639 inr = nlist->iinr[0];
640 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
641 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
642 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
644 /* Avoid stupid compiler warnings */
652 /* Start outer loop over neighborlists */
653 for(iidx=0; iidx<nri; iidx++)
655 /* Load shift vector for this list */
656 i_shift_offset = DIM*shiftidx[iidx];
658 /* Load limits for loop over neighbors */
659 j_index_start = jindex[iidx];
660 j_index_end = jindex[iidx+1];
662 /* Get outer coordinate index */
/* NOTE(review): no per-iteration update of inr is visible in this copy; presumably inr = iinr[iidx] belongs here - confirm */
664 i_coord_offset = DIM*inr;
666 /* Load i particle coords and add shift vector */
667 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
668 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
670 fix1 = _mm_setzero_pd();
671 fiy1 = _mm_setzero_pd();
672 fiz1 = _mm_setzero_pd();
673 fix2 = _mm_setzero_pd();
674 fiy2 = _mm_setzero_pd();
675 fiz2 = _mm_setzero_pd();
676 fix3 = _mm_setzero_pd();
677 fiy3 = _mm_setzero_pd();
678 fiz3 = _mm_setzero_pd();
680 /* Start inner kernel loop */
/* Two j particles per iteration; the bound j_index_end-1 stops before a lone trailing entry */
681 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
684 /* Get j neighbor index, and coordinate index */
687 j_coord_offsetA = DIM*jnrA;
688 j_coord_offsetB = DIM*jnrB;
690 /* load j atom coordinates */
691 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
694 /* Calculate displacement vector */
695 dx10 = _mm_sub_pd(ix1,jx0);
696 dy10 = _mm_sub_pd(iy1,jy0);
697 dz10 = _mm_sub_pd(iz1,jz0);
698 dx20 = _mm_sub_pd(ix2,jx0);
699 dy20 = _mm_sub_pd(iy2,jy0);
700 dz20 = _mm_sub_pd(iz2,jz0);
701 dx30 = _mm_sub_pd(ix3,jx0);
702 dy30 = _mm_sub_pd(iy3,jy0);
703 dz30 = _mm_sub_pd(iz3,jz0);
705 /* Calculate squared distance and things based on it */
706 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
707 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
708 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
710 rinv10 = gmx_mm_invsqrt_pd(rsq10);
711 rinv20 = gmx_mm_invsqrt_pd(rsq20);
712 rinv30 = gmx_mm_invsqrt_pd(rsq30);
714 /* Load parameters for j particles */
715 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
717 fjx0 = _mm_setzero_pd();
718 fjy0 = _mm_setzero_pd();
719 fjz0 = _mm_setzero_pd();
721 /**************************
722 * CALCULATE INTERACTIONS *
723 **************************/
/* r is recovered as rsq*rinv rather than with a separate square root */
725 r10 = _mm_mul_pd(rsq10,rinv10);
727 /* Compute parameters for interactions between i and j atoms */
728 qq10 = _mm_mul_pd(iq1,jq0);
730 /* Calculate table index by multiplying r with table scale and truncate to integer */
731 rt = _mm_mul_pd(r10,vftabscale);
732 vfitab = _mm_cvttpd_epi32(rt);
/* vfeps is the fractional part of rt, i.e. the position inside the table interval */
733 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
/* Shift left by 2 = index*4: four doubles (Y,F at +0 and G,H at +2) are read per table point below */
734 vfitab = _mm_slli_epi32(vfitab,2);
736 /* CUBIC SPLINE TABLE ELECTROSTATICS */
737 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
738 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
739 GMX_MM_TRANSPOSE2_PD(Y,F);
740 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
741 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
742 GMX_MM_TRANSPOSE2_PD(G,H);
/* Force-only spline evaluation: Fp = F + eps*(G + eps*H), FF = Fp + eps*(G + 2*eps*H); Y is loaded but not used afterwards */
743 Heps = _mm_mul_pd(vfeps,H);
744 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
745 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
/* XOR with signbit flips the sign of both lanes: felec = -(qq10*FF*vftabscale*rinv10) */
746 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
/* NOTE(review): fscal is read below but no assignment to it is visible in this copy; presumably fscal = felec - confirm */
750 /* Calculate temporary vectorial force */
751 tx = _mm_mul_pd(fscal,dx10);
752 ty = _mm_mul_pd(fscal,dy10);
753 tz = _mm_mul_pd(fscal,dz10);
755 /* Update vectorial force */
756 fix1 = _mm_add_pd(fix1,tx);
757 fiy1 = _mm_add_pd(fiy1,ty);
758 fiz1 = _mm_add_pd(fiz1,tz);
/* The same contribution is summed for j here and subtracted from f in one go after the third interaction */
760 fjx0 = _mm_add_pd(fjx0,tx);
761 fjy0 = _mm_add_pd(fjy0,ty);
762 fjz0 = _mm_add_pd(fjz0,tz);
764 /**************************
765 * CALCULATE INTERACTIONS *
766 **************************/
768 r20 = _mm_mul_pd(rsq20,rinv20);
770 /* Compute parameters for interactions between i and j atoms */
771 qq20 = _mm_mul_pd(iq2,jq0);
773 /* Calculate table index by multiplying r with table scale and truncate to integer */
774 rt = _mm_mul_pd(r20,vftabscale);
775 vfitab = _mm_cvttpd_epi32(rt);
776 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
777 vfitab = _mm_slli_epi32(vfitab,2);
779 /* CUBIC SPLINE TABLE ELECTROSTATICS */
780 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
781 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
782 GMX_MM_TRANSPOSE2_PD(Y,F);
783 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
784 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
785 GMX_MM_TRANSPOSE2_PD(G,H);
786 Heps = _mm_mul_pd(vfeps,H);
787 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
788 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
789 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
793 /* Calculate temporary vectorial force */
794 tx = _mm_mul_pd(fscal,dx20);
795 ty = _mm_mul_pd(fscal,dy20);
796 tz = _mm_mul_pd(fscal,dz20);
798 /* Update vectorial force */
799 fix2 = _mm_add_pd(fix2,tx);
800 fiy2 = _mm_add_pd(fiy2,ty);
801 fiz2 = _mm_add_pd(fiz2,tz);
803 fjx0 = _mm_add_pd(fjx0,tx);
804 fjy0 = _mm_add_pd(fjy0,ty);
805 fjz0 = _mm_add_pd(fjz0,tz);
807 /**************************
808 * CALCULATE INTERACTIONS *
809 **************************/
811 r30 = _mm_mul_pd(rsq30,rinv30);
813 /* Compute parameters for interactions between i and j atoms */
814 qq30 = _mm_mul_pd(iq3,jq0);
816 /* Calculate table index by multiplying r with table scale and truncate to integer */
817 rt = _mm_mul_pd(r30,vftabscale);
818 vfitab = _mm_cvttpd_epi32(rt);
819 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
820 vfitab = _mm_slli_epi32(vfitab,2);
822 /* CUBIC SPLINE TABLE ELECTROSTATICS */
823 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
824 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
825 GMX_MM_TRANSPOSE2_PD(Y,F);
826 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
827 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
828 GMX_MM_TRANSPOSE2_PD(G,H);
829 Heps = _mm_mul_pd(vfeps,H);
830 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
831 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
832 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
836 /* Calculate temporary vectorial force */
837 tx = _mm_mul_pd(fscal,dx30);
838 ty = _mm_mul_pd(fscal,dy30);
839 tz = _mm_mul_pd(fscal,dz30);
841 /* Update vectorial force */
842 fix3 = _mm_add_pd(fix3,tx);
843 fiy3 = _mm_add_pd(fiy3,ty);
844 fiz3 = _mm_add_pd(fiz3,tz);
846 fjx0 = _mm_add_pd(fjx0,tx);
847 fjy0 = _mm_add_pd(fjy0,ty);
848 fjz0 = _mm_add_pd(fjz0,tz);
850 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
852 /* Inner loop uses 120 flops */
/* Single-j section: only jnrA is used from here on. NOTE(review): the guarding condition is not visible in this copy - confirm */
859 j_coord_offsetA = DIM*jnrA;
861 /* load j atom coordinates */
862 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
865 /* Calculate displacement vector */
866 dx10 = _mm_sub_pd(ix1,jx0);
867 dy10 = _mm_sub_pd(iy1,jy0);
868 dz10 = _mm_sub_pd(iz1,jz0);
869 dx20 = _mm_sub_pd(ix2,jx0);
870 dy20 = _mm_sub_pd(iy2,jy0);
871 dz20 = _mm_sub_pd(iz2,jz0);
872 dx30 = _mm_sub_pd(ix3,jx0);
873 dy30 = _mm_sub_pd(iy3,jy0);
874 dz30 = _mm_sub_pd(iz3,jz0);
876 /* Calculate squared distance and things based on it */
877 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
878 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
879 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
881 rinv10 = gmx_mm_invsqrt_pd(rsq10);
882 rinv20 = gmx_mm_invsqrt_pd(rsq20);
883 rinv30 = gmx_mm_invsqrt_pd(rsq30);
885 /* Load parameters for j particles */
886 jq0 = _mm_load_sd(charge+jnrA+0);
888 fjx0 = _mm_setzero_pd();
889 fjy0 = _mm_setzero_pd();
890 fjz0 = _mm_setzero_pd();
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 r10 = _mm_mul_pd(rsq10,rinv10);
898 /* Compute parameters for interactions between i and j atoms */
899 qq10 = _mm_mul_pd(iq1,jq0);
901 /* Calculate table index by multiplying r with table scale and truncate to integer */
902 rt = _mm_mul_pd(r10,vftabscale);
903 vfitab = _mm_cvttpd_epi32(rt);
904 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
905 vfitab = _mm_slli_epi32(vfitab,2);
907 /* CUBIC SPLINE TABLE ELECTROSTATICS */
/* Only the table row for index 0 is loaded; the second transpose input is zero because there is no second j particle */
908 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
909 F = _mm_setzero_pd();
910 GMX_MM_TRANSPOSE2_PD(Y,F);
911 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
912 H = _mm_setzero_pd();
913 GMX_MM_TRANSPOSE2_PD(G,H);
914 Heps = _mm_mul_pd(vfeps,H);
915 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
916 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
917 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
/* Keep the low lane and zero the upper one, so tx/ty/tz are zero in the unused lane */
921 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
923 /* Calculate temporary vectorial force */
924 tx = _mm_mul_pd(fscal,dx10);
925 ty = _mm_mul_pd(fscal,dy10);
926 tz = _mm_mul_pd(fscal,dz10);
928 /* Update vectorial force */
929 fix1 = _mm_add_pd(fix1,tx);
930 fiy1 = _mm_add_pd(fiy1,ty);
931 fiz1 = _mm_add_pd(fiz1,tz);
933 fjx0 = _mm_add_pd(fjx0,tx);
934 fjy0 = _mm_add_pd(fjy0,ty);
935 fjz0 = _mm_add_pd(fjz0,tz);
937 /**************************
938 * CALCULATE INTERACTIONS *
939 **************************/
941 r20 = _mm_mul_pd(rsq20,rinv20);
943 /* Compute parameters for interactions between i and j atoms */
944 qq20 = _mm_mul_pd(iq2,jq0);
946 /* Calculate table index by multiplying r with table scale and truncate to integer */
947 rt = _mm_mul_pd(r20,vftabscale);
948 vfitab = _mm_cvttpd_epi32(rt);
949 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
950 vfitab = _mm_slli_epi32(vfitab,2);
952 /* CUBIC SPLINE TABLE ELECTROSTATICS */
953 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
954 F = _mm_setzero_pd();
955 GMX_MM_TRANSPOSE2_PD(Y,F);
956 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
957 H = _mm_setzero_pd();
958 GMX_MM_TRANSPOSE2_PD(G,H);
959 Heps = _mm_mul_pd(vfeps,H);
960 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
961 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
962 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
966 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
968 /* Calculate temporary vectorial force */
969 tx = _mm_mul_pd(fscal,dx20);
970 ty = _mm_mul_pd(fscal,dy20);
971 tz = _mm_mul_pd(fscal,dz20);
973 /* Update vectorial force */
974 fix2 = _mm_add_pd(fix2,tx);
975 fiy2 = _mm_add_pd(fiy2,ty);
976 fiz2 = _mm_add_pd(fiz2,tz);
978 fjx0 = _mm_add_pd(fjx0,tx);
979 fjy0 = _mm_add_pd(fjy0,ty);
980 fjz0 = _mm_add_pd(fjz0,tz);
982 /**************************
983 * CALCULATE INTERACTIONS *
984 **************************/
986 r30 = _mm_mul_pd(rsq30,rinv30);
988 /* Compute parameters for interactions between i and j atoms */
989 qq30 = _mm_mul_pd(iq3,jq0);
991 /* Calculate table index by multiplying r with table scale and truncate to integer */
992 rt = _mm_mul_pd(r30,vftabscale);
993 vfitab = _mm_cvttpd_epi32(rt);
994 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
995 vfitab = _mm_slli_epi32(vfitab,2);
997 /* CUBIC SPLINE TABLE ELECTROSTATICS */
998 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
999 F = _mm_setzero_pd();
1000 GMX_MM_TRANSPOSE2_PD(Y,F);
1001 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1002 H = _mm_setzero_pd();
1003 GMX_MM_TRANSPOSE2_PD(G,H);
1004 Heps = _mm_mul_pd(vfeps,H);
1005 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1006 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1007 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
1011 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1013 /* Calculate temporary vectorial force */
1014 tx = _mm_mul_pd(fscal,dx30);
1015 ty = _mm_mul_pd(fscal,dy30);
1016 tz = _mm_mul_pd(fscal,dz30);
1018 /* Update vectorial force */
1019 fix3 = _mm_add_pd(fix3,tx);
1020 fiy3 = _mm_add_pd(fiy3,ty);
1021 fiz3 = _mm_add_pd(fiz3,tz);
1023 fjx0 = _mm_add_pd(fjx0,tx);
1024 fjy0 = _mm_add_pd(fjy0,ty);
1025 fjz0 = _mm_add_pd(fjz0,tz);
1027 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1029 /* Inner loop uses 120 flops */
1032 /* End of innermost loop */
/* Write back the forces accumulated for i sites 1..3 (f+i_coord_offset+DIM) together with the shift-force entry */
1034 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1035 f+i_coord_offset+DIM,fshift+i_shift_offset);
1037 /* Increment number of inner iterations */
1038 inneriter += j_index_end - j_index_start;
1040 /* Outer loop uses 18 flops */
1043 /* Increment number of outer iterations */
1046 /* Update outer/inner flops */
/* Flop estimate reported to nrnb: 18 per outer iteration plus 120 per inner iteration */
1048 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*120);