640fe433ce6c0050ba77b8c5f9b190e774c78b33
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sse4_1_double.c
1 /*
2  * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse4_1_double.h"
34 #include "kernelutil_x86_sse4_1_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_double
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water4-Particle
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_VF_sse4_1_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwioffset3;
73     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74     int              vdwjidx0A,vdwjidx0B;
75     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
77     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79     __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
81     real             *charge;
82     int              nvdwtype;
83     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
84     int              *vdwtype;
85     real             *vdwparam;
86     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
87     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
88     __m128i          vfitab;
89     __m128i          ifour       = _mm_set1_epi32(4);
90     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
91     real             *vftab;
92     __m128d          dummy_mask,cutoff_mask;
93     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
94     __m128d          one     = _mm_set1_pd(1.0);
95     __m128d          two     = _mm_set1_pd(2.0);
96     x                = xx[0];
97     f                = ff[0];
98
99     nri              = nlist->nri;
100     iinr             = nlist->iinr;
101     jindex           = nlist->jindex;
102     jjnr             = nlist->jjnr;
103     shiftidx         = nlist->shift;
104     gid              = nlist->gid;
105     shiftvec         = fr->shift_vec[0];
106     fshift           = fr->fshift[0];
107     facel            = _mm_set1_pd(fr->epsfac);
108     charge           = mdatoms->chargeA;
109     nvdwtype         = fr->ntype;
110     vdwparam         = fr->nbfp;
111     vdwtype          = mdatoms->typeA;
112
113     vftab            = kernel_data->table_elec_vdw->data;
114     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
115
116     /* Setup water-specific parameters */
117     inr              = nlist->iinr[0];
118     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
119     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
120     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
121     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
122
123     /* Avoid stupid compiler warnings */
124     jnrA = jnrB = 0;
125     j_coord_offsetA = 0;
126     j_coord_offsetB = 0;
127
128     outeriter        = 0;
129     inneriter        = 0;
130
131     /* Start outer loop over neighborlists */
132     for(iidx=0; iidx<nri; iidx++)
133     {
134         /* Load shift vector for this list */
135         i_shift_offset   = DIM*shiftidx[iidx];
136
137         /* Load limits for loop over neighbors */
138         j_index_start    = jindex[iidx];
139         j_index_end      = jindex[iidx+1];
140
141         /* Get outer coordinate index */
142         inr              = iinr[iidx];
143         i_coord_offset   = DIM*inr;
144
145         /* Load i particle coords and add shift vector */
146         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
147                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
148
149         fix0             = _mm_setzero_pd();
150         fiy0             = _mm_setzero_pd();
151         fiz0             = _mm_setzero_pd();
152         fix1             = _mm_setzero_pd();
153         fiy1             = _mm_setzero_pd();
154         fiz1             = _mm_setzero_pd();
155         fix2             = _mm_setzero_pd();
156         fiy2             = _mm_setzero_pd();
157         fiz2             = _mm_setzero_pd();
158         fix3             = _mm_setzero_pd();
159         fiy3             = _mm_setzero_pd();
160         fiz3             = _mm_setzero_pd();
161
162         /* Reset potential sums */
163         velecsum         = _mm_setzero_pd();
164         vvdwsum          = _mm_setzero_pd();
165
166         /* Start inner kernel loop */
167         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
168         {
169
170             /* Get j neighbor index, and coordinate index */
171             jnrA             = jjnr[jidx];
172             jnrB             = jjnr[jidx+1];
173             j_coord_offsetA  = DIM*jnrA;
174             j_coord_offsetB  = DIM*jnrB;
175
176             /* load j atom coordinates */
177             gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
178                                               &jx0,&jy0,&jz0);
179
180             /* Calculate displacement vector */
181             dx00             = _mm_sub_pd(ix0,jx0);
182             dy00             = _mm_sub_pd(iy0,jy0);
183             dz00             = _mm_sub_pd(iz0,jz0);
184             dx10             = _mm_sub_pd(ix1,jx0);
185             dy10             = _mm_sub_pd(iy1,jy0);
186             dz10             = _mm_sub_pd(iz1,jz0);
187             dx20             = _mm_sub_pd(ix2,jx0);
188             dy20             = _mm_sub_pd(iy2,jy0);
189             dz20             = _mm_sub_pd(iz2,jz0);
190             dx30             = _mm_sub_pd(ix3,jx0);
191             dy30             = _mm_sub_pd(iy3,jy0);
192             dz30             = _mm_sub_pd(iz3,jz0);
193
194             /* Calculate squared distance and things based on it */
195             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
196             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
197             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
198             rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
199
200             rinv00           = gmx_mm_invsqrt_pd(rsq00);
201             rinv10           = gmx_mm_invsqrt_pd(rsq10);
202             rinv20           = gmx_mm_invsqrt_pd(rsq20);
203             rinv30           = gmx_mm_invsqrt_pd(rsq30);
204
205             /* Load parameters for j particles */
206             jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
207             vdwjidx0A        = 2*vdwtype[jnrA+0];
208             vdwjidx0B        = 2*vdwtype[jnrB+0];
209
210             fjx0             = _mm_setzero_pd();
211             fjy0             = _mm_setzero_pd();
212             fjz0             = _mm_setzero_pd();
213
214             /**************************
215              * CALCULATE INTERACTIONS *
216              **************************/
217
218             r00              = _mm_mul_pd(rsq00,rinv00);
219
220             /* Compute parameters for interactions between i and j atoms */
221             gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
222                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
223
224             /* Calculate table index by multiplying r with table scale and truncate to integer */
225             rt               = _mm_mul_pd(r00,vftabscale);
226             vfitab           = _mm_cvttpd_epi32(rt);
227             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
228             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
229
230             /* CUBIC SPLINE TABLE DISPERSION */
231             vfitab           = _mm_add_epi32(vfitab,ifour);
232             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
233             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
234             GMX_MM_TRANSPOSE2_PD(Y,F);
235             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
236             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
237             GMX_MM_TRANSPOSE2_PD(G,H);
238             Heps             = _mm_mul_pd(vfeps,H);
239             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
240             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
241             vvdw6            = _mm_mul_pd(c6_00,VV);
242             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
243             fvdw6            = _mm_mul_pd(c6_00,FF);
244
245             /* CUBIC SPLINE TABLE REPULSION */
246             vfitab           = _mm_add_epi32(vfitab,ifour);
247             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
248             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
249             GMX_MM_TRANSPOSE2_PD(Y,F);
250             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
251             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
252             GMX_MM_TRANSPOSE2_PD(G,H);
253             Heps             = _mm_mul_pd(vfeps,H);
254             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
255             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
256             vvdw12           = _mm_mul_pd(c12_00,VV);
257             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
258             fvdw12           = _mm_mul_pd(c12_00,FF);
259             vvdw             = _mm_add_pd(vvdw12,vvdw6);
260             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
261
262             /* Update potential sum for this i atom from the interaction with this j atom. */
263             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
264
265             fscal            = fvdw;
266
267             /* Calculate temporary vectorial force */
268             tx               = _mm_mul_pd(fscal,dx00);
269             ty               = _mm_mul_pd(fscal,dy00);
270             tz               = _mm_mul_pd(fscal,dz00);
271
272             /* Update vectorial force */
273             fix0             = _mm_add_pd(fix0,tx);
274             fiy0             = _mm_add_pd(fiy0,ty);
275             fiz0             = _mm_add_pd(fiz0,tz);
276
277             fjx0             = _mm_add_pd(fjx0,tx);
278             fjy0             = _mm_add_pd(fjy0,ty);
279             fjz0             = _mm_add_pd(fjz0,tz);
280
281             /**************************
282              * CALCULATE INTERACTIONS *
283              **************************/
284
285             r10              = _mm_mul_pd(rsq10,rinv10);
286
287             /* Compute parameters for interactions between i and j atoms */
288             qq10             = _mm_mul_pd(iq1,jq0);
289
290             /* Calculate table index by multiplying r with table scale and truncate to integer */
291             rt               = _mm_mul_pd(r10,vftabscale);
292             vfitab           = _mm_cvttpd_epi32(rt);
293             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
294             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
295
296             /* CUBIC SPLINE TABLE ELECTROSTATICS */
297             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
298             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
299             GMX_MM_TRANSPOSE2_PD(Y,F);
300             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
301             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
302             GMX_MM_TRANSPOSE2_PD(G,H);
303             Heps             = _mm_mul_pd(vfeps,H);
304             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
305             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
306             velec            = _mm_mul_pd(qq10,VV);
307             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
308             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
309
310             /* Update potential sum for this i atom from the interaction with this j atom. */
311             velecsum         = _mm_add_pd(velecsum,velec);
312
313             fscal            = felec;
314
315             /* Calculate temporary vectorial force */
316             tx               = _mm_mul_pd(fscal,dx10);
317             ty               = _mm_mul_pd(fscal,dy10);
318             tz               = _mm_mul_pd(fscal,dz10);
319
320             /* Update vectorial force */
321             fix1             = _mm_add_pd(fix1,tx);
322             fiy1             = _mm_add_pd(fiy1,ty);
323             fiz1             = _mm_add_pd(fiz1,tz);
324
325             fjx0             = _mm_add_pd(fjx0,tx);
326             fjy0             = _mm_add_pd(fjy0,ty);
327             fjz0             = _mm_add_pd(fjz0,tz);
328
329             /**************************
330              * CALCULATE INTERACTIONS *
331              **************************/
332
333             r20              = _mm_mul_pd(rsq20,rinv20);
334
335             /* Compute parameters for interactions between i and j atoms */
336             qq20             = _mm_mul_pd(iq2,jq0);
337
338             /* Calculate table index by multiplying r with table scale and truncate to integer */
339             rt               = _mm_mul_pd(r20,vftabscale);
340             vfitab           = _mm_cvttpd_epi32(rt);
341             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
342             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
343
344             /* CUBIC SPLINE TABLE ELECTROSTATICS */
345             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
346             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
347             GMX_MM_TRANSPOSE2_PD(Y,F);
348             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
349             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
350             GMX_MM_TRANSPOSE2_PD(G,H);
351             Heps             = _mm_mul_pd(vfeps,H);
352             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
353             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
354             velec            = _mm_mul_pd(qq20,VV);
355             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
356             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
357
358             /* Update potential sum for this i atom from the interaction with this j atom. */
359             velecsum         = _mm_add_pd(velecsum,velec);
360
361             fscal            = felec;
362
363             /* Calculate temporary vectorial force */
364             tx               = _mm_mul_pd(fscal,dx20);
365             ty               = _mm_mul_pd(fscal,dy20);
366             tz               = _mm_mul_pd(fscal,dz20);
367
368             /* Update vectorial force */
369             fix2             = _mm_add_pd(fix2,tx);
370             fiy2             = _mm_add_pd(fiy2,ty);
371             fiz2             = _mm_add_pd(fiz2,tz);
372
373             fjx0             = _mm_add_pd(fjx0,tx);
374             fjy0             = _mm_add_pd(fjy0,ty);
375             fjz0             = _mm_add_pd(fjz0,tz);
376
377             /**************************
378              * CALCULATE INTERACTIONS *
379              **************************/
380
381             r30              = _mm_mul_pd(rsq30,rinv30);
382
383             /* Compute parameters for interactions between i and j atoms */
384             qq30             = _mm_mul_pd(iq3,jq0);
385
386             /* Calculate table index by multiplying r with table scale and truncate to integer */
387             rt               = _mm_mul_pd(r30,vftabscale);
388             vfitab           = _mm_cvttpd_epi32(rt);
389             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
390             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
391
392             /* CUBIC SPLINE TABLE ELECTROSTATICS */
393             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
394             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
395             GMX_MM_TRANSPOSE2_PD(Y,F);
396             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
397             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
398             GMX_MM_TRANSPOSE2_PD(G,H);
399             Heps             = _mm_mul_pd(vfeps,H);
400             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
401             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
402             velec            = _mm_mul_pd(qq30,VV);
403             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
404             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
405
406             /* Update potential sum for this i atom from the interaction with this j atom. */
407             velecsum         = _mm_add_pd(velecsum,velec);
408
409             fscal            = felec;
410
411             /* Calculate temporary vectorial force */
412             tx               = _mm_mul_pd(fscal,dx30);
413             ty               = _mm_mul_pd(fscal,dy30);
414             tz               = _mm_mul_pd(fscal,dz30);
415
416             /* Update vectorial force */
417             fix3             = _mm_add_pd(fix3,tx);
418             fiy3             = _mm_add_pd(fiy3,ty);
419             fiz3             = _mm_add_pd(fiz3,tz);
420
421             fjx0             = _mm_add_pd(fjx0,tx);
422             fjy0             = _mm_add_pd(fjy0,ty);
423             fjz0             = _mm_add_pd(fjz0,tz);
424
425             gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
426
427             /* Inner loop uses 188 flops */
428         }
429
430         if(jidx<j_index_end)
431         {
432
433             jnrA             = jjnr[jidx];
434             j_coord_offsetA  = DIM*jnrA;
435
436             /* load j atom coordinates */
437             gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
438                                               &jx0,&jy0,&jz0);
439
440             /* Calculate displacement vector */
441             dx00             = _mm_sub_pd(ix0,jx0);
442             dy00             = _mm_sub_pd(iy0,jy0);
443             dz00             = _mm_sub_pd(iz0,jz0);
444             dx10             = _mm_sub_pd(ix1,jx0);
445             dy10             = _mm_sub_pd(iy1,jy0);
446             dz10             = _mm_sub_pd(iz1,jz0);
447             dx20             = _mm_sub_pd(ix2,jx0);
448             dy20             = _mm_sub_pd(iy2,jy0);
449             dz20             = _mm_sub_pd(iz2,jz0);
450             dx30             = _mm_sub_pd(ix3,jx0);
451             dy30             = _mm_sub_pd(iy3,jy0);
452             dz30             = _mm_sub_pd(iz3,jz0);
453
454             /* Calculate squared distance and things based on it */
455             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
456             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
457             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
458             rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
459
460             rinv00           = gmx_mm_invsqrt_pd(rsq00);
461             rinv10           = gmx_mm_invsqrt_pd(rsq10);
462             rinv20           = gmx_mm_invsqrt_pd(rsq20);
463             rinv30           = gmx_mm_invsqrt_pd(rsq30);
464
465             /* Load parameters for j particles */
466             jq0              = _mm_load_sd(charge+jnrA+0);
467             vdwjidx0A        = 2*vdwtype[jnrA+0];
468
469             fjx0             = _mm_setzero_pd();
470             fjy0             = _mm_setzero_pd();
471             fjz0             = _mm_setzero_pd();
472
473             /**************************
474              * CALCULATE INTERACTIONS *
475              **************************/
476
477             r00              = _mm_mul_pd(rsq00,rinv00);
478
479             /* Compute parameters for interactions between i and j atoms */
480             gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
481
482             /* Calculate table index by multiplying r with table scale and truncate to integer */
483             rt               = _mm_mul_pd(r00,vftabscale);
484             vfitab           = _mm_cvttpd_epi32(rt);
485             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
486             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
487
488             /* CUBIC SPLINE TABLE DISPERSION */
489             vfitab           = _mm_add_epi32(vfitab,ifour);
490             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
491             F                = _mm_setzero_pd();
492             GMX_MM_TRANSPOSE2_PD(Y,F);
493             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
494             H                = _mm_setzero_pd();
495             GMX_MM_TRANSPOSE2_PD(G,H);
496             Heps             = _mm_mul_pd(vfeps,H);
497             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
498             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
499             vvdw6            = _mm_mul_pd(c6_00,VV);
500             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
501             fvdw6            = _mm_mul_pd(c6_00,FF);
502
503             /* CUBIC SPLINE TABLE REPULSION */
504             vfitab           = _mm_add_epi32(vfitab,ifour);
505             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
506             F                = _mm_setzero_pd();
507             GMX_MM_TRANSPOSE2_PD(Y,F);
508             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
509             H                = _mm_setzero_pd();
510             GMX_MM_TRANSPOSE2_PD(G,H);
511             Heps             = _mm_mul_pd(vfeps,H);
512             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
513             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
514             vvdw12           = _mm_mul_pd(c12_00,VV);
515             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
516             fvdw12           = _mm_mul_pd(c12_00,FF);
517             vvdw             = _mm_add_pd(vvdw12,vvdw6);
518             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
519
520             /* Update potential sum for this i atom from the interaction with this j atom. */
521             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
522             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
523
524             fscal            = fvdw;
525
526             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
527
528             /* Calculate temporary vectorial force */
529             tx               = _mm_mul_pd(fscal,dx00);
530             ty               = _mm_mul_pd(fscal,dy00);
531             tz               = _mm_mul_pd(fscal,dz00);
532
533             /* Update vectorial force */
534             fix0             = _mm_add_pd(fix0,tx);
535             fiy0             = _mm_add_pd(fiy0,ty);
536             fiz0             = _mm_add_pd(fiz0,tz);
537
538             fjx0             = _mm_add_pd(fjx0,tx);
539             fjy0             = _mm_add_pd(fjy0,ty);
540             fjz0             = _mm_add_pd(fjz0,tz);
541
542             /**************************
543              * CALCULATE INTERACTIONS *
544              **************************/
545
546             r10              = _mm_mul_pd(rsq10,rinv10);
547
548             /* Compute parameters for interactions between i and j atoms */
549             qq10             = _mm_mul_pd(iq1,jq0);
550
551             /* Calculate table index by multiplying r with table scale and truncate to integer */
552             rt               = _mm_mul_pd(r10,vftabscale);
553             vfitab           = _mm_cvttpd_epi32(rt);
554             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
555             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
556
557             /* CUBIC SPLINE TABLE ELECTROSTATICS */
558             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
559             F                = _mm_setzero_pd();
560             GMX_MM_TRANSPOSE2_PD(Y,F);
561             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
562             H                = _mm_setzero_pd();
563             GMX_MM_TRANSPOSE2_PD(G,H);
564             Heps             = _mm_mul_pd(vfeps,H);
565             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
566             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
567             velec            = _mm_mul_pd(qq10,VV);
568             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
569             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
570
571             /* Update potential sum for this i atom from the interaction with this j atom. */
572             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
573             velecsum         = _mm_add_pd(velecsum,velec);
574
575             fscal            = felec;
576
577             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
578
579             /* Calculate temporary vectorial force */
580             tx               = _mm_mul_pd(fscal,dx10);
581             ty               = _mm_mul_pd(fscal,dy10);
582             tz               = _mm_mul_pd(fscal,dz10);
583
584             /* Update vectorial force */
585             fix1             = _mm_add_pd(fix1,tx);
586             fiy1             = _mm_add_pd(fiy1,ty);
587             fiz1             = _mm_add_pd(fiz1,tz);
588
589             fjx0             = _mm_add_pd(fjx0,tx);
590             fjy0             = _mm_add_pd(fjy0,ty);
591             fjz0             = _mm_add_pd(fjz0,tz);
592
593             /**************************
594              * CALCULATE INTERACTIONS *
595              **************************/
596
597             r20              = _mm_mul_pd(rsq20,rinv20);
598
599             /* Compute parameters for interactions between i and j atoms */
600             qq20             = _mm_mul_pd(iq2,jq0);
601
602             /* Calculate table index by multiplying r with table scale and truncate to integer */
603             rt               = _mm_mul_pd(r20,vftabscale);
604             vfitab           = _mm_cvttpd_epi32(rt);
605             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
606             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
607
608             /* CUBIC SPLINE TABLE ELECTROSTATICS */
609             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
610             F                = _mm_setzero_pd();
611             GMX_MM_TRANSPOSE2_PD(Y,F);
612             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
613             H                = _mm_setzero_pd();
614             GMX_MM_TRANSPOSE2_PD(G,H);
615             Heps             = _mm_mul_pd(vfeps,H);
616             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
617             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
618             velec            = _mm_mul_pd(qq20,VV);
619             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
620             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
621
622             /* Update potential sum for this i atom from the interaction with this j atom. */
623             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
624             velecsum         = _mm_add_pd(velecsum,velec);
625
626             fscal            = felec;
627
628             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
629
630             /* Calculate temporary vectorial force */
631             tx               = _mm_mul_pd(fscal,dx20);
632             ty               = _mm_mul_pd(fscal,dy20);
633             tz               = _mm_mul_pd(fscal,dz20);
634
635             /* Update vectorial force */
636             fix2             = _mm_add_pd(fix2,tx);
637             fiy2             = _mm_add_pd(fiy2,ty);
638             fiz2             = _mm_add_pd(fiz2,tz);
639
640             fjx0             = _mm_add_pd(fjx0,tx);
641             fjy0             = _mm_add_pd(fjy0,ty);
642             fjz0             = _mm_add_pd(fjz0,tz);
643
644             /**************************
645              * CALCULATE INTERACTIONS *
646              **************************/
647
648             r30              = _mm_mul_pd(rsq30,rinv30);
649
650             /* Compute parameters for interactions between i and j atoms */
651             qq30             = _mm_mul_pd(iq3,jq0);
652
653             /* Calculate table index by multiplying r with table scale and truncate to integer */
654             rt               = _mm_mul_pd(r30,vftabscale);
655             vfitab           = _mm_cvttpd_epi32(rt);
656             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
657             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
658
659             /* CUBIC SPLINE TABLE ELECTROSTATICS */
660             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
661             F                = _mm_setzero_pd();
662             GMX_MM_TRANSPOSE2_PD(Y,F);
663             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
664             H                = _mm_setzero_pd();
665             GMX_MM_TRANSPOSE2_PD(G,H);
666             Heps             = _mm_mul_pd(vfeps,H);
667             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
668             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
669             velec            = _mm_mul_pd(qq30,VV);
670             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
671             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
672
673             /* Update potential sum for this i atom from the interaction with this j atom. */
674             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
675             velecsum         = _mm_add_pd(velecsum,velec);
676
677             fscal            = felec;
678
679             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
680
681             /* Calculate temporary vectorial force */
682             tx               = _mm_mul_pd(fscal,dx30);
683             ty               = _mm_mul_pd(fscal,dy30);
684             tz               = _mm_mul_pd(fscal,dz30);
685
686             /* Update vectorial force */
687             fix3             = _mm_add_pd(fix3,tx);
688             fiy3             = _mm_add_pd(fiy3,ty);
689             fiz3             = _mm_add_pd(fiz3,tz);
690
691             fjx0             = _mm_add_pd(fjx0,tx);
692             fjy0             = _mm_add_pd(fjy0,ty);
693             fjz0             = _mm_add_pd(fjz0,tz);
694
695             gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
696
697             /* Inner loop uses 188 flops */
698         }
699
700         /* End of innermost loop */
701
702         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
703                                               f+i_coord_offset,fshift+i_shift_offset);
704
705         ggid                        = gid[iidx];
706         /* Update potential energies */
707         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
708         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
709
710         /* Increment number of inner iterations */
711         inneriter                  += j_index_end - j_index_start;
712
713         /* Outer loop uses 26 flops */
714     }
715
716     /* Increment number of outer iterations */
717     outeriter        += nri;
718
719     /* Update outer/inner flops */
720
721     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*188);
722 }
723 /*
724  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_double
725  * Electrostatics interaction: CubicSplineTable
726  * VdW interaction:            CubicSplineTable
727  * Geometry:                   Water4-Particle
728  * Calculate force/pot:        Force
729  */
730 void
731 nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_double
732                     (t_nblist * gmx_restrict                nlist,
733                      rvec * gmx_restrict                    xx,
734                      rvec * gmx_restrict                    ff,
735                      t_forcerec * gmx_restrict              fr,
736                      t_mdatoms * gmx_restrict               mdatoms,
737                      nb_kernel_data_t * gmx_restrict        kernel_data,
738                      t_nrnb * gmx_restrict                  nrnb)
739 {
740     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
741      * just 0 for non-waters.
742      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
743      * jnr indices corresponding to data put in the four positions in the SIMD register.
744      */
745     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
746     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
747     int              jnrA,jnrB;
748     int              j_coord_offsetA,j_coord_offsetB;
749     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
750     real             rcutoff_scalar;
751     real             *shiftvec,*fshift,*x,*f;
752     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
753     int              vdwioffset0;
754     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
755     int              vdwioffset1;
756     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
757     int              vdwioffset2;
758     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
759     int              vdwioffset3;
760     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
761     int              vdwjidx0A,vdwjidx0B;
762     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
763     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
764     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
765     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
766     __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
767     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
768     real             *charge;
769     int              nvdwtype;
770     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
771     int              *vdwtype;
772     real             *vdwparam;
773     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
774     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
775     __m128i          vfitab;
776     __m128i          ifour       = _mm_set1_epi32(4);
777     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
778     real             *vftab;
779     __m128d          dummy_mask,cutoff_mask;
780     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
781     __m128d          one     = _mm_set1_pd(1.0);
782     __m128d          two     = _mm_set1_pd(2.0);
783     x                = xx[0];
784     f                = ff[0];
785
786     nri              = nlist->nri;
787     iinr             = nlist->iinr;
788     jindex           = nlist->jindex;
789     jjnr             = nlist->jjnr;
790     shiftidx         = nlist->shift;
791     gid              = nlist->gid;
792     shiftvec         = fr->shift_vec[0];
793     fshift           = fr->fshift[0];
794     facel            = _mm_set1_pd(fr->epsfac);
795     charge           = mdatoms->chargeA;
796     nvdwtype         = fr->ntype;
797     vdwparam         = fr->nbfp;
798     vdwtype          = mdatoms->typeA;
799
800     vftab            = kernel_data->table_elec_vdw->data;
801     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
802
803     /* Setup water-specific parameters */
804     inr              = nlist->iinr[0];
805     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
806     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
807     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
808     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
809
810     /* Avoid stupid compiler warnings */
811     jnrA = jnrB = 0;
812     j_coord_offsetA = 0;
813     j_coord_offsetB = 0;
814
815     outeriter        = 0;
816     inneriter        = 0;
817
818     /* Start outer loop over neighborlists */
819     for(iidx=0; iidx<nri; iidx++)
820     {
821         /* Load shift vector for this list */
822         i_shift_offset   = DIM*shiftidx[iidx];
823
824         /* Load limits for loop over neighbors */
825         j_index_start    = jindex[iidx];
826         j_index_end      = jindex[iidx+1];
827
828         /* Get outer coordinate index */
829         inr              = iinr[iidx];
830         i_coord_offset   = DIM*inr;
831
832         /* Load i particle coords and add shift vector */
833         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
834                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
835
836         fix0             = _mm_setzero_pd();
837         fiy0             = _mm_setzero_pd();
838         fiz0             = _mm_setzero_pd();
839         fix1             = _mm_setzero_pd();
840         fiy1             = _mm_setzero_pd();
841         fiz1             = _mm_setzero_pd();
842         fix2             = _mm_setzero_pd();
843         fiy2             = _mm_setzero_pd();
844         fiz2             = _mm_setzero_pd();
845         fix3             = _mm_setzero_pd();
846         fiy3             = _mm_setzero_pd();
847         fiz3             = _mm_setzero_pd();
848
849         /* Start inner kernel loop */
850         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
851         {
852
853             /* Get j neighbor index, and coordinate index */
854             jnrA             = jjnr[jidx];
855             jnrB             = jjnr[jidx+1];
856             j_coord_offsetA  = DIM*jnrA;
857             j_coord_offsetB  = DIM*jnrB;
858
859             /* load j atom coordinates */
860             gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
861                                               &jx0,&jy0,&jz0);
862
863             /* Calculate displacement vector */
864             dx00             = _mm_sub_pd(ix0,jx0);
865             dy00             = _mm_sub_pd(iy0,jy0);
866             dz00             = _mm_sub_pd(iz0,jz0);
867             dx10             = _mm_sub_pd(ix1,jx0);
868             dy10             = _mm_sub_pd(iy1,jy0);
869             dz10             = _mm_sub_pd(iz1,jz0);
870             dx20             = _mm_sub_pd(ix2,jx0);
871             dy20             = _mm_sub_pd(iy2,jy0);
872             dz20             = _mm_sub_pd(iz2,jz0);
873             dx30             = _mm_sub_pd(ix3,jx0);
874             dy30             = _mm_sub_pd(iy3,jy0);
875             dz30             = _mm_sub_pd(iz3,jz0);
876
877             /* Calculate squared distance and things based on it */
878             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
879             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
880             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
881             rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
882
883             rinv00           = gmx_mm_invsqrt_pd(rsq00);
884             rinv10           = gmx_mm_invsqrt_pd(rsq10);
885             rinv20           = gmx_mm_invsqrt_pd(rsq20);
886             rinv30           = gmx_mm_invsqrt_pd(rsq30);
887
888             /* Load parameters for j particles */
889             jq0              = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
890             vdwjidx0A        = 2*vdwtype[jnrA+0];
891             vdwjidx0B        = 2*vdwtype[jnrB+0];
892
893             fjx0             = _mm_setzero_pd();
894             fjy0             = _mm_setzero_pd();
895             fjz0             = _mm_setzero_pd();
896
897             /**************************
898              * CALCULATE INTERACTIONS *
899              **************************/
900
901             r00              = _mm_mul_pd(rsq00,rinv00);
902
903             /* Compute parameters for interactions between i and j atoms */
904             gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
905                                          vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
906
907             /* Calculate table index by multiplying r with table scale and truncate to integer */
908             rt               = _mm_mul_pd(r00,vftabscale);
909             vfitab           = _mm_cvttpd_epi32(rt);
910             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
911             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
912
913             /* CUBIC SPLINE TABLE DISPERSION */
914             vfitab           = _mm_add_epi32(vfitab,ifour);
915             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
916             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
917             GMX_MM_TRANSPOSE2_PD(Y,F);
918             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
919             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
920             GMX_MM_TRANSPOSE2_PD(G,H);
921             Heps             = _mm_mul_pd(vfeps,H);
922             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
923             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
924             fvdw6            = _mm_mul_pd(c6_00,FF);
925
926             /* CUBIC SPLINE TABLE REPULSION */
927             vfitab           = _mm_add_epi32(vfitab,ifour);
928             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
929             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
930             GMX_MM_TRANSPOSE2_PD(Y,F);
931             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
932             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
933             GMX_MM_TRANSPOSE2_PD(G,H);
934             Heps             = _mm_mul_pd(vfeps,H);
935             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
936             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
937             fvdw12           = _mm_mul_pd(c12_00,FF);
938             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
939
940             fscal            = fvdw;
941
942             /* Calculate temporary vectorial force */
943             tx               = _mm_mul_pd(fscal,dx00);
944             ty               = _mm_mul_pd(fscal,dy00);
945             tz               = _mm_mul_pd(fscal,dz00);
946
947             /* Update vectorial force */
948             fix0             = _mm_add_pd(fix0,tx);
949             fiy0             = _mm_add_pd(fiy0,ty);
950             fiz0             = _mm_add_pd(fiz0,tz);
951
952             fjx0             = _mm_add_pd(fjx0,tx);
953             fjy0             = _mm_add_pd(fjy0,ty);
954             fjz0             = _mm_add_pd(fjz0,tz);
955
956             /**************************
957              * CALCULATE INTERACTIONS *
958              **************************/
959
960             r10              = _mm_mul_pd(rsq10,rinv10);
961
962             /* Compute parameters for interactions between i and j atoms */
963             qq10             = _mm_mul_pd(iq1,jq0);
964
965             /* Calculate table index by multiplying r with table scale and truncate to integer */
966             rt               = _mm_mul_pd(r10,vftabscale);
967             vfitab           = _mm_cvttpd_epi32(rt);
968             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
969             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
970
971             /* CUBIC SPLINE TABLE ELECTROSTATICS */
972             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
973             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
974             GMX_MM_TRANSPOSE2_PD(Y,F);
975             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
976             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
977             GMX_MM_TRANSPOSE2_PD(G,H);
978             Heps             = _mm_mul_pd(vfeps,H);
979             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
980             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
981             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
982
983             fscal            = felec;
984
985             /* Calculate temporary vectorial force */
986             tx               = _mm_mul_pd(fscal,dx10);
987             ty               = _mm_mul_pd(fscal,dy10);
988             tz               = _mm_mul_pd(fscal,dz10);
989
990             /* Update vectorial force */
991             fix1             = _mm_add_pd(fix1,tx);
992             fiy1             = _mm_add_pd(fiy1,ty);
993             fiz1             = _mm_add_pd(fiz1,tz);
994
995             fjx0             = _mm_add_pd(fjx0,tx);
996             fjy0             = _mm_add_pd(fjy0,ty);
997             fjz0             = _mm_add_pd(fjz0,tz);
998
999             /**************************
1000              * CALCULATE INTERACTIONS *
1001              **************************/
1002
1003             r20              = _mm_mul_pd(rsq20,rinv20);
1004
1005             /* Compute parameters for interactions between i and j atoms */
1006             qq20             = _mm_mul_pd(iq2,jq0);
1007
1008             /* Calculate table index by multiplying r with table scale and truncate to integer */
1009             rt               = _mm_mul_pd(r20,vftabscale);
1010             vfitab           = _mm_cvttpd_epi32(rt);
1011             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1012             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1013
1014             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1015             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1016             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1017             GMX_MM_TRANSPOSE2_PD(Y,F);
1018             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1019             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1020             GMX_MM_TRANSPOSE2_PD(G,H);
1021             Heps             = _mm_mul_pd(vfeps,H);
1022             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1023             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1024             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1025
1026             fscal            = felec;
1027
1028             /* Calculate temporary vectorial force */
1029             tx               = _mm_mul_pd(fscal,dx20);
1030             ty               = _mm_mul_pd(fscal,dy20);
1031             tz               = _mm_mul_pd(fscal,dz20);
1032
1033             /* Update vectorial force */
1034             fix2             = _mm_add_pd(fix2,tx);
1035             fiy2             = _mm_add_pd(fiy2,ty);
1036             fiz2             = _mm_add_pd(fiz2,tz);
1037
1038             fjx0             = _mm_add_pd(fjx0,tx);
1039             fjy0             = _mm_add_pd(fjy0,ty);
1040             fjz0             = _mm_add_pd(fjz0,tz);
1041
1042             /**************************
1043              * CALCULATE INTERACTIONS *
1044              **************************/
1045
1046             r30              = _mm_mul_pd(rsq30,rinv30);
1047
1048             /* Compute parameters for interactions between i and j atoms */
1049             qq30             = _mm_mul_pd(iq3,jq0);
1050
1051             /* Calculate table index by multiplying r with table scale and truncate to integer */
1052             rt               = _mm_mul_pd(r30,vftabscale);
1053             vfitab           = _mm_cvttpd_epi32(rt);
1054             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1055             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1056
1057             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1058             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1059             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1060             GMX_MM_TRANSPOSE2_PD(Y,F);
1061             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1062             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1063             GMX_MM_TRANSPOSE2_PD(G,H);
1064             Heps             = _mm_mul_pd(vfeps,H);
1065             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1066             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1067             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
1068
1069             fscal            = felec;
1070
1071             /* Calculate temporary vectorial force */
1072             tx               = _mm_mul_pd(fscal,dx30);
1073             ty               = _mm_mul_pd(fscal,dy30);
1074             tz               = _mm_mul_pd(fscal,dz30);
1075
1076             /* Update vectorial force */
1077             fix3             = _mm_add_pd(fix3,tx);
1078             fiy3             = _mm_add_pd(fiy3,ty);
1079             fiz3             = _mm_add_pd(fiz3,tz);
1080
1081             fjx0             = _mm_add_pd(fjx0,tx);
1082             fjy0             = _mm_add_pd(fjy0,ty);
1083             fjz0             = _mm_add_pd(fjz0,tz);
1084
1085             gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1086
1087             /* Inner loop uses 168 flops */
1088         }
1089
1090         if(jidx<j_index_end)
1091         {
1092
1093             jnrA             = jjnr[jidx];
1094             j_coord_offsetA  = DIM*jnrA;
1095
1096             /* load j atom coordinates */
1097             gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1098                                               &jx0,&jy0,&jz0);
1099
1100             /* Calculate displacement vector */
1101             dx00             = _mm_sub_pd(ix0,jx0);
1102             dy00             = _mm_sub_pd(iy0,jy0);
1103             dz00             = _mm_sub_pd(iz0,jz0);
1104             dx10             = _mm_sub_pd(ix1,jx0);
1105             dy10             = _mm_sub_pd(iy1,jy0);
1106             dz10             = _mm_sub_pd(iz1,jz0);
1107             dx20             = _mm_sub_pd(ix2,jx0);
1108             dy20             = _mm_sub_pd(iy2,jy0);
1109             dz20             = _mm_sub_pd(iz2,jz0);
1110             dx30             = _mm_sub_pd(ix3,jx0);
1111             dy30             = _mm_sub_pd(iy3,jy0);
1112             dz30             = _mm_sub_pd(iz3,jz0);
1113
1114             /* Calculate squared distance and things based on it */
1115             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1116             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1117             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1118             rsq30            = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1119
1120             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1121             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1122             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1123             rinv30           = gmx_mm_invsqrt_pd(rsq30);
1124
1125             /* Load parameters for j particles */
1126             jq0              = _mm_load_sd(charge+jnrA+0);
1127             vdwjidx0A        = 2*vdwtype[jnrA+0];
1128
1129             fjx0             = _mm_setzero_pd();
1130             fjy0             = _mm_setzero_pd();
1131             fjz0             = _mm_setzero_pd();
1132
1133             /**************************
1134              * CALCULATE INTERACTIONS *
1135              **************************/
1136
1137             r00              = _mm_mul_pd(rsq00,rinv00);
1138
1139             /* Compute parameters for interactions between i and j atoms */
1140             gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1141
1142             /* Calculate table index by multiplying r with table scale and truncate to integer */
1143             rt               = _mm_mul_pd(r00,vftabscale);
1144             vfitab           = _mm_cvttpd_epi32(rt);
1145             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1146             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1147
1148             /* CUBIC SPLINE TABLE DISPERSION */
1149             vfitab           = _mm_add_epi32(vfitab,ifour);
1150             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1151             F                = _mm_setzero_pd();
1152             GMX_MM_TRANSPOSE2_PD(Y,F);
1153             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1154             H                = _mm_setzero_pd();
1155             GMX_MM_TRANSPOSE2_PD(G,H);
1156             Heps             = _mm_mul_pd(vfeps,H);
1157             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1158             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1159             fvdw6            = _mm_mul_pd(c6_00,FF);
1160
1161             /* CUBIC SPLINE TABLE REPULSION */
1162             vfitab           = _mm_add_epi32(vfitab,ifour);
1163             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1164             F                = _mm_setzero_pd();
1165             GMX_MM_TRANSPOSE2_PD(Y,F);
1166             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1167             H                = _mm_setzero_pd();
1168             GMX_MM_TRANSPOSE2_PD(G,H);
1169             Heps             = _mm_mul_pd(vfeps,H);
1170             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1171             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1172             fvdw12           = _mm_mul_pd(c12_00,FF);
1173             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1174
1175             fscal            = fvdw;
1176
1177             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1178
1179             /* Calculate temporary vectorial force */
1180             tx               = _mm_mul_pd(fscal,dx00);
1181             ty               = _mm_mul_pd(fscal,dy00);
1182             tz               = _mm_mul_pd(fscal,dz00);
1183
1184             /* Update vectorial force */
1185             fix0             = _mm_add_pd(fix0,tx);
1186             fiy0             = _mm_add_pd(fiy0,ty);
1187             fiz0             = _mm_add_pd(fiz0,tz);
1188
1189             fjx0             = _mm_add_pd(fjx0,tx);
1190             fjy0             = _mm_add_pd(fjy0,ty);
1191             fjz0             = _mm_add_pd(fjz0,tz);
1192
1193             /**************************
1194              * CALCULATE INTERACTIONS *
1195              **************************/
1196
1197             r10              = _mm_mul_pd(rsq10,rinv10);
1198
1199             /* Compute parameters for interactions between i and j atoms */
1200             qq10             = _mm_mul_pd(iq1,jq0);
1201
1202             /* Calculate table index by multiplying r with table scale and truncate to integer */
1203             rt               = _mm_mul_pd(r10,vftabscale);
1204             vfitab           = _mm_cvttpd_epi32(rt);
1205             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1206             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1207
1208             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1209             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1210             F                = _mm_setzero_pd();
1211             GMX_MM_TRANSPOSE2_PD(Y,F);
1212             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1213             H                = _mm_setzero_pd();
1214             GMX_MM_TRANSPOSE2_PD(G,H);
1215             Heps             = _mm_mul_pd(vfeps,H);
1216             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1217             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1218             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1219
1220             fscal            = felec;
1221
1222             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1223
1224             /* Calculate temporary vectorial force */
1225             tx               = _mm_mul_pd(fscal,dx10);
1226             ty               = _mm_mul_pd(fscal,dy10);
1227             tz               = _mm_mul_pd(fscal,dz10);
1228
1229             /* Update vectorial force */
1230             fix1             = _mm_add_pd(fix1,tx);
1231             fiy1             = _mm_add_pd(fiy1,ty);
1232             fiz1             = _mm_add_pd(fiz1,tz);
1233
1234             fjx0             = _mm_add_pd(fjx0,tx);
1235             fjy0             = _mm_add_pd(fjy0,ty);
1236             fjz0             = _mm_add_pd(fjz0,tz);
1237
1238             /**************************
1239              * CALCULATE INTERACTIONS *
1240              **************************/
1241
1242             r20              = _mm_mul_pd(rsq20,rinv20);
1243
1244             /* Compute parameters for interactions between i and j atoms */
1245             qq20             = _mm_mul_pd(iq2,jq0);
1246
1247             /* Calculate table index by multiplying r with table scale and truncate to integer */
1248             rt               = _mm_mul_pd(r20,vftabscale);
1249             vfitab           = _mm_cvttpd_epi32(rt);
1250             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1251             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1252
1253             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1254             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1255             F                = _mm_setzero_pd();
1256             GMX_MM_TRANSPOSE2_PD(Y,F);
1257             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1258             H                = _mm_setzero_pd();
1259             GMX_MM_TRANSPOSE2_PD(G,H);
1260             Heps             = _mm_mul_pd(vfeps,H);
1261             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1262             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1263             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1264
1265             fscal            = felec;
1266
1267             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1268
1269             /* Calculate temporary vectorial force */
1270             tx               = _mm_mul_pd(fscal,dx20);
1271             ty               = _mm_mul_pd(fscal,dy20);
1272             tz               = _mm_mul_pd(fscal,dz20);
1273
1274             /* Update vectorial force */
1275             fix2             = _mm_add_pd(fix2,tx);
1276             fiy2             = _mm_add_pd(fiy2,ty);
1277             fiz2             = _mm_add_pd(fiz2,tz);
1278
1279             fjx0             = _mm_add_pd(fjx0,tx);
1280             fjy0             = _mm_add_pd(fjy0,ty);
1281             fjz0             = _mm_add_pd(fjz0,tz);
1282
1283             /**************************
1284              * CALCULATE INTERACTIONS *
1285              **************************/
1286
1287             r30              = _mm_mul_pd(rsq30,rinv30);
1288
1289             /* Compute parameters for interactions between i and j atoms */
1290             qq30             = _mm_mul_pd(iq3,jq0);
1291
1292             /* Calculate table index by multiplying r with table scale and truncate to integer */
1293             rt               = _mm_mul_pd(r30,vftabscale);
1294             vfitab           = _mm_cvttpd_epi32(rt);
1295             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1296             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1297
1298             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1299             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1300             F                = _mm_setzero_pd();
1301             GMX_MM_TRANSPOSE2_PD(Y,F);
1302             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1303             H                = _mm_setzero_pd();
1304             GMX_MM_TRANSPOSE2_PD(G,H);
1305             Heps             = _mm_mul_pd(vfeps,H);
1306             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1307             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1308             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq30,FF),_mm_mul_pd(vftabscale,rinv30)));
1309
1310             fscal            = felec;
1311
1312             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1313
1314             /* Calculate temporary vectorial force */
1315             tx               = _mm_mul_pd(fscal,dx30);
1316             ty               = _mm_mul_pd(fscal,dy30);
1317             tz               = _mm_mul_pd(fscal,dz30);
1318
1319             /* Update vectorial force */
1320             fix3             = _mm_add_pd(fix3,tx);
1321             fiy3             = _mm_add_pd(fiy3,ty);
1322             fiz3             = _mm_add_pd(fiz3,tz);
1323
1324             fjx0             = _mm_add_pd(fjx0,tx);
1325             fjy0             = _mm_add_pd(fjy0,ty);
1326             fjz0             = _mm_add_pd(fjz0,tz);
1327
1328             gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1329
1330             /* Inner loop uses 168 flops */
1331         }
1332
1333         /* End of innermost loop */
1334
1335         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1336                                               f+i_coord_offset,fshift+i_shift_offset);
1337
1338         /* Increment number of inner iterations */
1339         inneriter                  += j_index_end - j_index_start;
1340
1341         /* Outer loop uses 24 flops */
1342     }
1343
1344     /* Increment number of outer iterations */
1345     outeriter        += nri;
1346
1347     /* Update outer/inner flops */
1348
1349     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*168);
1350 }