Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse4_1_double.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse4_1_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_double
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB;
74     int              j_coord_offsetA,j_coord_offsetB;
75     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
76     real             rcutoff_scalar;
77     real             *shiftvec,*fshift,*x,*f;
78     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79     int              vdwioffset0;
80     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
81     int              vdwioffset1;
82     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwjidx0A,vdwjidx0B;
86     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
87     int              vdwjidx1A,vdwjidx1B;
88     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
89     int              vdwjidx2A,vdwjidx2B;
90     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
91     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
93     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
94     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
101     real             *charge;
102     int              nvdwtype;
103     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
104     int              *vdwtype;
105     real             *vdwparam;
106     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
107     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
108     __m128i          vfitab;
109     __m128i          ifour       = _mm_set1_epi32(4);
110     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
111     real             *vftab;
112     __m128d          dummy_mask,cutoff_mask;
113     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
114     __m128d          one     = _mm_set1_pd(1.0);
115     __m128d          two     = _mm_set1_pd(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm_set1_pd(fr->ic->epsfac);
128     charge           = mdatoms->chargeA;
129     nvdwtype         = fr->ntype;
130     vdwparam         = fr->nbfp;
131     vdwtype          = mdatoms->typeA;
132
133     vftab            = kernel_data->table_elec->data;
134     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
135
136     /* Setup water-specific parameters */
137     inr              = nlist->iinr[0];
138     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
139     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
140     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
141     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
142
143     jq0              = _mm_set1_pd(charge[inr+0]);
144     jq1              = _mm_set1_pd(charge[inr+1]);
145     jq2              = _mm_set1_pd(charge[inr+2]);
146     vdwjidx0A        = 2*vdwtype[inr+0];
147     qq00             = _mm_mul_pd(iq0,jq0);
148     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
149     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
150     qq01             = _mm_mul_pd(iq0,jq1);
151     qq02             = _mm_mul_pd(iq0,jq2);
152     qq10             = _mm_mul_pd(iq1,jq0);
153     qq11             = _mm_mul_pd(iq1,jq1);
154     qq12             = _mm_mul_pd(iq1,jq2);
155     qq20             = _mm_mul_pd(iq2,jq0);
156     qq21             = _mm_mul_pd(iq2,jq1);
157     qq22             = _mm_mul_pd(iq2,jq2);
158
159     /* Avoid stupid compiler warnings */
160     jnrA = jnrB = 0;
161     j_coord_offsetA = 0;
162     j_coord_offsetB = 0;
163
164     outeriter        = 0;
165     inneriter        = 0;
166
167     /* Start outer loop over neighborlists */
168     for(iidx=0; iidx<nri; iidx++)
169     {
170         /* Load shift vector for this list */
171         i_shift_offset   = DIM*shiftidx[iidx];
172
173         /* Load limits for loop over neighbors */
174         j_index_start    = jindex[iidx];
175         j_index_end      = jindex[iidx+1];
176
177         /* Get outer coordinate index */
178         inr              = iinr[iidx];
179         i_coord_offset   = DIM*inr;
180
181         /* Load i particle coords and add shift vector */
182         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
184
185         fix0             = _mm_setzero_pd();
186         fiy0             = _mm_setzero_pd();
187         fiz0             = _mm_setzero_pd();
188         fix1             = _mm_setzero_pd();
189         fiy1             = _mm_setzero_pd();
190         fiz1             = _mm_setzero_pd();
191         fix2             = _mm_setzero_pd();
192         fiy2             = _mm_setzero_pd();
193         fiz2             = _mm_setzero_pd();
194
195         /* Reset potential sums */
196         velecsum         = _mm_setzero_pd();
197         vvdwsum          = _mm_setzero_pd();
198
199         /* Start inner kernel loop */
200         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
201         {
202
203             /* Get j neighbor index, and coordinate index */
204             jnrA             = jjnr[jidx];
205             jnrB             = jjnr[jidx+1];
206             j_coord_offsetA  = DIM*jnrA;
207             j_coord_offsetB  = DIM*jnrB;
208
209             /* load j atom coordinates */
210             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
211                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
212
213             /* Calculate displacement vector */
214             dx00             = _mm_sub_pd(ix0,jx0);
215             dy00             = _mm_sub_pd(iy0,jy0);
216             dz00             = _mm_sub_pd(iz0,jz0);
217             dx01             = _mm_sub_pd(ix0,jx1);
218             dy01             = _mm_sub_pd(iy0,jy1);
219             dz01             = _mm_sub_pd(iz0,jz1);
220             dx02             = _mm_sub_pd(ix0,jx2);
221             dy02             = _mm_sub_pd(iy0,jy2);
222             dz02             = _mm_sub_pd(iz0,jz2);
223             dx10             = _mm_sub_pd(ix1,jx0);
224             dy10             = _mm_sub_pd(iy1,jy0);
225             dz10             = _mm_sub_pd(iz1,jz0);
226             dx11             = _mm_sub_pd(ix1,jx1);
227             dy11             = _mm_sub_pd(iy1,jy1);
228             dz11             = _mm_sub_pd(iz1,jz1);
229             dx12             = _mm_sub_pd(ix1,jx2);
230             dy12             = _mm_sub_pd(iy1,jy2);
231             dz12             = _mm_sub_pd(iz1,jz2);
232             dx20             = _mm_sub_pd(ix2,jx0);
233             dy20             = _mm_sub_pd(iy2,jy0);
234             dz20             = _mm_sub_pd(iz2,jz0);
235             dx21             = _mm_sub_pd(ix2,jx1);
236             dy21             = _mm_sub_pd(iy2,jy1);
237             dz21             = _mm_sub_pd(iz2,jz1);
238             dx22             = _mm_sub_pd(ix2,jx2);
239             dy22             = _mm_sub_pd(iy2,jy2);
240             dz22             = _mm_sub_pd(iz2,jz2);
241
242             /* Calculate squared distance and things based on it */
243             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
244             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
245             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
246             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
247             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
248             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
249             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
250             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
251             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
252
253             rinv00           = sse41_invsqrt_d(rsq00);
254             rinv01           = sse41_invsqrt_d(rsq01);
255             rinv02           = sse41_invsqrt_d(rsq02);
256             rinv10           = sse41_invsqrt_d(rsq10);
257             rinv11           = sse41_invsqrt_d(rsq11);
258             rinv12           = sse41_invsqrt_d(rsq12);
259             rinv20           = sse41_invsqrt_d(rsq20);
260             rinv21           = sse41_invsqrt_d(rsq21);
261             rinv22           = sse41_invsqrt_d(rsq22);
262
263             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
264
265             fjx0             = _mm_setzero_pd();
266             fjy0             = _mm_setzero_pd();
267             fjz0             = _mm_setzero_pd();
268             fjx1             = _mm_setzero_pd();
269             fjy1             = _mm_setzero_pd();
270             fjz1             = _mm_setzero_pd();
271             fjx2             = _mm_setzero_pd();
272             fjy2             = _mm_setzero_pd();
273             fjz2             = _mm_setzero_pd();
274
275             /**************************
276              * CALCULATE INTERACTIONS *
277              **************************/
278
279             r00              = _mm_mul_pd(rsq00,rinv00);
280
281             /* Calculate table index by multiplying r with table scale and truncate to integer */
282             rt               = _mm_mul_pd(r00,vftabscale);
283             vfitab           = _mm_cvttpd_epi32(rt);
284             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
285             vfitab           = _mm_slli_epi32(vfitab,2);
286
287             /* CUBIC SPLINE TABLE ELECTROSTATICS */
288             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
289             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
290             GMX_MM_TRANSPOSE2_PD(Y,F);
291             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
292             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
293             GMX_MM_TRANSPOSE2_PD(G,H);
294             Heps             = _mm_mul_pd(vfeps,H);
295             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
296             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
297             velec            = _mm_mul_pd(qq00,VV);
298             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
299             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
300
301             /* LENNARD-JONES DISPERSION/REPULSION */
302
303             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
304             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
305             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
306             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
307             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
308
309             /* Update potential sum for this i atom from the interaction with this j atom. */
310             velecsum         = _mm_add_pd(velecsum,velec);
311             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
312
313             fscal            = _mm_add_pd(felec,fvdw);
314
315             /* Calculate temporary vectorial force */
316             tx               = _mm_mul_pd(fscal,dx00);
317             ty               = _mm_mul_pd(fscal,dy00);
318             tz               = _mm_mul_pd(fscal,dz00);
319
320             /* Update vectorial force */
321             fix0             = _mm_add_pd(fix0,tx);
322             fiy0             = _mm_add_pd(fiy0,ty);
323             fiz0             = _mm_add_pd(fiz0,tz);
324
325             fjx0             = _mm_add_pd(fjx0,tx);
326             fjy0             = _mm_add_pd(fjy0,ty);
327             fjz0             = _mm_add_pd(fjz0,tz);
328
329             /**************************
330              * CALCULATE INTERACTIONS *
331              **************************/
332
333             r01              = _mm_mul_pd(rsq01,rinv01);
334
335             /* Calculate table index by multiplying r with table scale and truncate to integer */
336             rt               = _mm_mul_pd(r01,vftabscale);
337             vfitab           = _mm_cvttpd_epi32(rt);
338             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
339             vfitab           = _mm_slli_epi32(vfitab,2);
340
341             /* CUBIC SPLINE TABLE ELECTROSTATICS */
342             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
343             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
344             GMX_MM_TRANSPOSE2_PD(Y,F);
345             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
346             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
347             GMX_MM_TRANSPOSE2_PD(G,H);
348             Heps             = _mm_mul_pd(vfeps,H);
349             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
350             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
351             velec            = _mm_mul_pd(qq01,VV);
352             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
353             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
354
355             /* Update potential sum for this i atom from the interaction with this j atom. */
356             velecsum         = _mm_add_pd(velecsum,velec);
357
358             fscal            = felec;
359
360             /* Calculate temporary vectorial force */
361             tx               = _mm_mul_pd(fscal,dx01);
362             ty               = _mm_mul_pd(fscal,dy01);
363             tz               = _mm_mul_pd(fscal,dz01);
364
365             /* Update vectorial force */
366             fix0             = _mm_add_pd(fix0,tx);
367             fiy0             = _mm_add_pd(fiy0,ty);
368             fiz0             = _mm_add_pd(fiz0,tz);
369
370             fjx1             = _mm_add_pd(fjx1,tx);
371             fjy1             = _mm_add_pd(fjy1,ty);
372             fjz1             = _mm_add_pd(fjz1,tz);
373
374             /**************************
375              * CALCULATE INTERACTIONS *
376              **************************/
377
378             r02              = _mm_mul_pd(rsq02,rinv02);
379
380             /* Calculate table index by multiplying r with table scale and truncate to integer */
381             rt               = _mm_mul_pd(r02,vftabscale);
382             vfitab           = _mm_cvttpd_epi32(rt);
383             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
384             vfitab           = _mm_slli_epi32(vfitab,2);
385
386             /* CUBIC SPLINE TABLE ELECTROSTATICS */
387             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
388             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
389             GMX_MM_TRANSPOSE2_PD(Y,F);
390             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
391             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
392             GMX_MM_TRANSPOSE2_PD(G,H);
393             Heps             = _mm_mul_pd(vfeps,H);
394             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
395             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
396             velec            = _mm_mul_pd(qq02,VV);
397             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
398             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
399
400             /* Update potential sum for this i atom from the interaction with this j atom. */
401             velecsum         = _mm_add_pd(velecsum,velec);
402
403             fscal            = felec;
404
405             /* Calculate temporary vectorial force */
406             tx               = _mm_mul_pd(fscal,dx02);
407             ty               = _mm_mul_pd(fscal,dy02);
408             tz               = _mm_mul_pd(fscal,dz02);
409
410             /* Update vectorial force */
411             fix0             = _mm_add_pd(fix0,tx);
412             fiy0             = _mm_add_pd(fiy0,ty);
413             fiz0             = _mm_add_pd(fiz0,tz);
414
415             fjx2             = _mm_add_pd(fjx2,tx);
416             fjy2             = _mm_add_pd(fjy2,ty);
417             fjz2             = _mm_add_pd(fjz2,tz);
418
419             /**************************
420              * CALCULATE INTERACTIONS *
421              **************************/
422
423             r10              = _mm_mul_pd(rsq10,rinv10);
424
425             /* Calculate table index by multiplying r with table scale and truncate to integer */
426             rt               = _mm_mul_pd(r10,vftabscale);
427             vfitab           = _mm_cvttpd_epi32(rt);
428             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
429             vfitab           = _mm_slli_epi32(vfitab,2);
430
431             /* CUBIC SPLINE TABLE ELECTROSTATICS */
432             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
433             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
434             GMX_MM_TRANSPOSE2_PD(Y,F);
435             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
436             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
437             GMX_MM_TRANSPOSE2_PD(G,H);
438             Heps             = _mm_mul_pd(vfeps,H);
439             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
440             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
441             velec            = _mm_mul_pd(qq10,VV);
442             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
443             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
444
445             /* Update potential sum for this i atom from the interaction with this j atom. */
446             velecsum         = _mm_add_pd(velecsum,velec);
447
448             fscal            = felec;
449
450             /* Calculate temporary vectorial force */
451             tx               = _mm_mul_pd(fscal,dx10);
452             ty               = _mm_mul_pd(fscal,dy10);
453             tz               = _mm_mul_pd(fscal,dz10);
454
455             /* Update vectorial force */
456             fix1             = _mm_add_pd(fix1,tx);
457             fiy1             = _mm_add_pd(fiy1,ty);
458             fiz1             = _mm_add_pd(fiz1,tz);
459
460             fjx0             = _mm_add_pd(fjx0,tx);
461             fjy0             = _mm_add_pd(fjy0,ty);
462             fjz0             = _mm_add_pd(fjz0,tz);
463
464             /**************************
465              * CALCULATE INTERACTIONS *
466              **************************/
467
468             r11              = _mm_mul_pd(rsq11,rinv11);
469
470             /* Calculate table index by multiplying r with table scale and truncate to integer */
471             rt               = _mm_mul_pd(r11,vftabscale);
472             vfitab           = _mm_cvttpd_epi32(rt);
473             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
474             vfitab           = _mm_slli_epi32(vfitab,2);
475
476             /* CUBIC SPLINE TABLE ELECTROSTATICS */
477             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
478             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
479             GMX_MM_TRANSPOSE2_PD(Y,F);
480             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
481             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
482             GMX_MM_TRANSPOSE2_PD(G,H);
483             Heps             = _mm_mul_pd(vfeps,H);
484             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
485             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
486             velec            = _mm_mul_pd(qq11,VV);
487             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
488             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
489
490             /* Update potential sum for this i atom from the interaction with this j atom. */
491             velecsum         = _mm_add_pd(velecsum,velec);
492
493             fscal            = felec;
494
495             /* Calculate temporary vectorial force */
496             tx               = _mm_mul_pd(fscal,dx11);
497             ty               = _mm_mul_pd(fscal,dy11);
498             tz               = _mm_mul_pd(fscal,dz11);
499
500             /* Update vectorial force */
501             fix1             = _mm_add_pd(fix1,tx);
502             fiy1             = _mm_add_pd(fiy1,ty);
503             fiz1             = _mm_add_pd(fiz1,tz);
504
505             fjx1             = _mm_add_pd(fjx1,tx);
506             fjy1             = _mm_add_pd(fjy1,ty);
507             fjz1             = _mm_add_pd(fjz1,tz);
508
509             /**************************
510              * CALCULATE INTERACTIONS *
511              **************************/
512
513             r12              = _mm_mul_pd(rsq12,rinv12);
514
515             /* Calculate table index by multiplying r with table scale and truncate to integer */
516             rt               = _mm_mul_pd(r12,vftabscale);
517             vfitab           = _mm_cvttpd_epi32(rt);
518             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
519             vfitab           = _mm_slli_epi32(vfitab,2);
520
521             /* CUBIC SPLINE TABLE ELECTROSTATICS */
522             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
523             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
524             GMX_MM_TRANSPOSE2_PD(Y,F);
525             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
526             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
527             GMX_MM_TRANSPOSE2_PD(G,H);
528             Heps             = _mm_mul_pd(vfeps,H);
529             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
530             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
531             velec            = _mm_mul_pd(qq12,VV);
532             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
533             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
534
535             /* Update potential sum for this i atom from the interaction with this j atom. */
536             velecsum         = _mm_add_pd(velecsum,velec);
537
538             fscal            = felec;
539
540             /* Calculate temporary vectorial force */
541             tx               = _mm_mul_pd(fscal,dx12);
542             ty               = _mm_mul_pd(fscal,dy12);
543             tz               = _mm_mul_pd(fscal,dz12);
544
545             /* Update vectorial force */
546             fix1             = _mm_add_pd(fix1,tx);
547             fiy1             = _mm_add_pd(fiy1,ty);
548             fiz1             = _mm_add_pd(fiz1,tz);
549
550             fjx2             = _mm_add_pd(fjx2,tx);
551             fjy2             = _mm_add_pd(fjy2,ty);
552             fjz2             = _mm_add_pd(fjz2,tz);
553
554             /**************************
555              * CALCULATE INTERACTIONS *
556              **************************/
557
558             r20              = _mm_mul_pd(rsq20,rinv20);
559
560             /* Calculate table index by multiplying r with table scale and truncate to integer */
561             rt               = _mm_mul_pd(r20,vftabscale);
562             vfitab           = _mm_cvttpd_epi32(rt);
563             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
564             vfitab           = _mm_slli_epi32(vfitab,2);
565
566             /* CUBIC SPLINE TABLE ELECTROSTATICS */
567             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
568             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
569             GMX_MM_TRANSPOSE2_PD(Y,F);
570             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
571             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
572             GMX_MM_TRANSPOSE2_PD(G,H);
573             Heps             = _mm_mul_pd(vfeps,H);
574             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
575             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
576             velec            = _mm_mul_pd(qq20,VV);
577             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
578             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
579
580             /* Update potential sum for this i atom from the interaction with this j atom. */
581             velecsum         = _mm_add_pd(velecsum,velec);
582
583             fscal            = felec;
584
585             /* Calculate temporary vectorial force */
586             tx               = _mm_mul_pd(fscal,dx20);
587             ty               = _mm_mul_pd(fscal,dy20);
588             tz               = _mm_mul_pd(fscal,dz20);
589
590             /* Update vectorial force */
591             fix2             = _mm_add_pd(fix2,tx);
592             fiy2             = _mm_add_pd(fiy2,ty);
593             fiz2             = _mm_add_pd(fiz2,tz);
594
595             fjx0             = _mm_add_pd(fjx0,tx);
596             fjy0             = _mm_add_pd(fjy0,ty);
597             fjz0             = _mm_add_pd(fjz0,tz);
598
599             /**************************
600              * CALCULATE INTERACTIONS *
601              **************************/
602
603             r21              = _mm_mul_pd(rsq21,rinv21);
604
605             /* Calculate table index by multiplying r with table scale and truncate to integer */
606             rt               = _mm_mul_pd(r21,vftabscale);
607             vfitab           = _mm_cvttpd_epi32(rt);
608             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
609             vfitab           = _mm_slli_epi32(vfitab,2);
610
611             /* CUBIC SPLINE TABLE ELECTROSTATICS */
612             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
613             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
614             GMX_MM_TRANSPOSE2_PD(Y,F);
615             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
616             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
617             GMX_MM_TRANSPOSE2_PD(G,H);
618             Heps             = _mm_mul_pd(vfeps,H);
619             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
620             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
621             velec            = _mm_mul_pd(qq21,VV);
622             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
623             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
624
625             /* Update potential sum for this i atom from the interaction with this j atom. */
626             velecsum         = _mm_add_pd(velecsum,velec);
627
628             fscal            = felec;
629
630             /* Calculate temporary vectorial force */
631             tx               = _mm_mul_pd(fscal,dx21);
632             ty               = _mm_mul_pd(fscal,dy21);
633             tz               = _mm_mul_pd(fscal,dz21);
634
635             /* Update vectorial force */
636             fix2             = _mm_add_pd(fix2,tx);
637             fiy2             = _mm_add_pd(fiy2,ty);
638             fiz2             = _mm_add_pd(fiz2,tz);
639
640             fjx1             = _mm_add_pd(fjx1,tx);
641             fjy1             = _mm_add_pd(fjy1,ty);
642             fjz1             = _mm_add_pd(fjz1,tz);
643
644             /**************************
645              * CALCULATE INTERACTIONS *
646              **************************/
647
648             r22              = _mm_mul_pd(rsq22,rinv22);
649
650             /* Calculate table index by multiplying r with table scale and truncate to integer */
651             rt               = _mm_mul_pd(r22,vftabscale);
652             vfitab           = _mm_cvttpd_epi32(rt);
653             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
654             vfitab           = _mm_slli_epi32(vfitab,2);
655
656             /* CUBIC SPLINE TABLE ELECTROSTATICS */
657             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
658             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
659             GMX_MM_TRANSPOSE2_PD(Y,F);
660             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
661             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
662             GMX_MM_TRANSPOSE2_PD(G,H);
663             Heps             = _mm_mul_pd(vfeps,H);
664             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
665             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
666             velec            = _mm_mul_pd(qq22,VV);
667             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
668             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
669
670             /* Update potential sum for this i atom from the interaction with this j atom. */
671             velecsum         = _mm_add_pd(velecsum,velec);
672
673             fscal            = felec;
674
675             /* Calculate temporary vectorial force */
676             tx               = _mm_mul_pd(fscal,dx22);
677             ty               = _mm_mul_pd(fscal,dy22);
678             tz               = _mm_mul_pd(fscal,dz22);
679
680             /* Update vectorial force */
681             fix2             = _mm_add_pd(fix2,tx);
682             fiy2             = _mm_add_pd(fiy2,ty);
683             fiz2             = _mm_add_pd(fiz2,tz);
684
685             fjx2             = _mm_add_pd(fjx2,tx);
686             fjy2             = _mm_add_pd(fjy2,ty);
687             fjz2             = _mm_add_pd(fjz2,tz);
688
689             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
690
691             /* Inner loop uses 400 flops */
692         }
693
694         if(jidx<j_index_end)
695         {
696
697             jnrA             = jjnr[jidx];
698             j_coord_offsetA  = DIM*jnrA;
699
700             /* load j atom coordinates */
701             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
702                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
703
704             /* Calculate displacement vector */
705             dx00             = _mm_sub_pd(ix0,jx0);
706             dy00             = _mm_sub_pd(iy0,jy0);
707             dz00             = _mm_sub_pd(iz0,jz0);
708             dx01             = _mm_sub_pd(ix0,jx1);
709             dy01             = _mm_sub_pd(iy0,jy1);
710             dz01             = _mm_sub_pd(iz0,jz1);
711             dx02             = _mm_sub_pd(ix0,jx2);
712             dy02             = _mm_sub_pd(iy0,jy2);
713             dz02             = _mm_sub_pd(iz0,jz2);
714             dx10             = _mm_sub_pd(ix1,jx0);
715             dy10             = _mm_sub_pd(iy1,jy0);
716             dz10             = _mm_sub_pd(iz1,jz0);
717             dx11             = _mm_sub_pd(ix1,jx1);
718             dy11             = _mm_sub_pd(iy1,jy1);
719             dz11             = _mm_sub_pd(iz1,jz1);
720             dx12             = _mm_sub_pd(ix1,jx2);
721             dy12             = _mm_sub_pd(iy1,jy2);
722             dz12             = _mm_sub_pd(iz1,jz2);
723             dx20             = _mm_sub_pd(ix2,jx0);
724             dy20             = _mm_sub_pd(iy2,jy0);
725             dz20             = _mm_sub_pd(iz2,jz0);
726             dx21             = _mm_sub_pd(ix2,jx1);
727             dy21             = _mm_sub_pd(iy2,jy1);
728             dz21             = _mm_sub_pd(iz2,jz1);
729             dx22             = _mm_sub_pd(ix2,jx2);
730             dy22             = _mm_sub_pd(iy2,jy2);
731             dz22             = _mm_sub_pd(iz2,jz2);
732
733             /* Calculate squared distance and things based on it */
734             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
735             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
736             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
737             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
738             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
739             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
740             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
741             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
742             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
743
744             rinv00           = sse41_invsqrt_d(rsq00);
745             rinv01           = sse41_invsqrt_d(rsq01);
746             rinv02           = sse41_invsqrt_d(rsq02);
747             rinv10           = sse41_invsqrt_d(rsq10);
748             rinv11           = sse41_invsqrt_d(rsq11);
749             rinv12           = sse41_invsqrt_d(rsq12);
750             rinv20           = sse41_invsqrt_d(rsq20);
751             rinv21           = sse41_invsqrt_d(rsq21);
752             rinv22           = sse41_invsqrt_d(rsq22);
753
754             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
755
756             fjx0             = _mm_setzero_pd();
757             fjy0             = _mm_setzero_pd();
758             fjz0             = _mm_setzero_pd();
759             fjx1             = _mm_setzero_pd();
760             fjy1             = _mm_setzero_pd();
761             fjz1             = _mm_setzero_pd();
762             fjx2             = _mm_setzero_pd();
763             fjy2             = _mm_setzero_pd();
764             fjz2             = _mm_setzero_pd();
765
766             /**************************
767              * CALCULATE INTERACTIONS *
768              **************************/
769
770             r00              = _mm_mul_pd(rsq00,rinv00);
771
772             /* Calculate table index by multiplying r with table scale and truncate to integer */
773             rt               = _mm_mul_pd(r00,vftabscale);
774             vfitab           = _mm_cvttpd_epi32(rt);
775             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
776             vfitab           = _mm_slli_epi32(vfitab,2);
777
778             /* CUBIC SPLINE TABLE ELECTROSTATICS */
779             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
780             F                = _mm_setzero_pd();
781             GMX_MM_TRANSPOSE2_PD(Y,F);
782             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
783             H                = _mm_setzero_pd();
784             GMX_MM_TRANSPOSE2_PD(G,H);
785             Heps             = _mm_mul_pd(vfeps,H);
786             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
787             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
788             velec            = _mm_mul_pd(qq00,VV);
789             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
790             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
791
792             /* LENNARD-JONES DISPERSION/REPULSION */
793
794             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
795             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
796             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
797             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
798             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
799
800             /* Update potential sum for this i atom from the interaction with this j atom. */
801             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
802             velecsum         = _mm_add_pd(velecsum,velec);
803             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
804             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
805
806             fscal            = _mm_add_pd(felec,fvdw);
807
808             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
809
810             /* Calculate temporary vectorial force */
811             tx               = _mm_mul_pd(fscal,dx00);
812             ty               = _mm_mul_pd(fscal,dy00);
813             tz               = _mm_mul_pd(fscal,dz00);
814
815             /* Update vectorial force */
816             fix0             = _mm_add_pd(fix0,tx);
817             fiy0             = _mm_add_pd(fiy0,ty);
818             fiz0             = _mm_add_pd(fiz0,tz);
819
820             fjx0             = _mm_add_pd(fjx0,tx);
821             fjy0             = _mm_add_pd(fjy0,ty);
822             fjz0             = _mm_add_pd(fjz0,tz);
823
824             /**************************
825              * CALCULATE INTERACTIONS *
826              **************************/
827
828             r01              = _mm_mul_pd(rsq01,rinv01);
829
830             /* Calculate table index by multiplying r with table scale and truncate to integer */
831             rt               = _mm_mul_pd(r01,vftabscale);
832             vfitab           = _mm_cvttpd_epi32(rt);
833             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
834             vfitab           = _mm_slli_epi32(vfitab,2);
835
836             /* CUBIC SPLINE TABLE ELECTROSTATICS */
837             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
838             F                = _mm_setzero_pd();
839             GMX_MM_TRANSPOSE2_PD(Y,F);
840             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
841             H                = _mm_setzero_pd();
842             GMX_MM_TRANSPOSE2_PD(G,H);
843             Heps             = _mm_mul_pd(vfeps,H);
844             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
845             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
846             velec            = _mm_mul_pd(qq01,VV);
847             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
848             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
849
850             /* Update potential sum for this i atom from the interaction with this j atom. */
851             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
852             velecsum         = _mm_add_pd(velecsum,velec);
853
854             fscal            = felec;
855
856             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
857
858             /* Calculate temporary vectorial force */
859             tx               = _mm_mul_pd(fscal,dx01);
860             ty               = _mm_mul_pd(fscal,dy01);
861             tz               = _mm_mul_pd(fscal,dz01);
862
863             /* Update vectorial force */
864             fix0             = _mm_add_pd(fix0,tx);
865             fiy0             = _mm_add_pd(fiy0,ty);
866             fiz0             = _mm_add_pd(fiz0,tz);
867
868             fjx1             = _mm_add_pd(fjx1,tx);
869             fjy1             = _mm_add_pd(fjy1,ty);
870             fjz1             = _mm_add_pd(fjz1,tz);
871
872             /**************************
873              * CALCULATE INTERACTIONS *
874              **************************/
875
876             r02              = _mm_mul_pd(rsq02,rinv02);
877
878             /* Calculate table index by multiplying r with table scale and truncate to integer */
879             rt               = _mm_mul_pd(r02,vftabscale);
880             vfitab           = _mm_cvttpd_epi32(rt);
881             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
882             vfitab           = _mm_slli_epi32(vfitab,2);
883
884             /* CUBIC SPLINE TABLE ELECTROSTATICS */
885             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
886             F                = _mm_setzero_pd();
887             GMX_MM_TRANSPOSE2_PD(Y,F);
888             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
889             H                = _mm_setzero_pd();
890             GMX_MM_TRANSPOSE2_PD(G,H);
891             Heps             = _mm_mul_pd(vfeps,H);
892             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
893             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
894             velec            = _mm_mul_pd(qq02,VV);
895             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
896             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
897
898             /* Update potential sum for this i atom from the interaction with this j atom. */
899             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
900             velecsum         = _mm_add_pd(velecsum,velec);
901
902             fscal            = felec;
903
904             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
905
906             /* Calculate temporary vectorial force */
907             tx               = _mm_mul_pd(fscal,dx02);
908             ty               = _mm_mul_pd(fscal,dy02);
909             tz               = _mm_mul_pd(fscal,dz02);
910
911             /* Update vectorial force */
912             fix0             = _mm_add_pd(fix0,tx);
913             fiy0             = _mm_add_pd(fiy0,ty);
914             fiz0             = _mm_add_pd(fiz0,tz);
915
916             fjx2             = _mm_add_pd(fjx2,tx);
917             fjy2             = _mm_add_pd(fjy2,ty);
918             fjz2             = _mm_add_pd(fjz2,tz);
919
920             /**************************
921              * CALCULATE INTERACTIONS *
922              **************************/
923
924             r10              = _mm_mul_pd(rsq10,rinv10);
925
926             /* Calculate table index by multiplying r with table scale and truncate to integer */
927             rt               = _mm_mul_pd(r10,vftabscale);
928             vfitab           = _mm_cvttpd_epi32(rt);
929             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
930             vfitab           = _mm_slli_epi32(vfitab,2);
931
932             /* CUBIC SPLINE TABLE ELECTROSTATICS */
933             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
934             F                = _mm_setzero_pd();
935             GMX_MM_TRANSPOSE2_PD(Y,F);
936             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
937             H                = _mm_setzero_pd();
938             GMX_MM_TRANSPOSE2_PD(G,H);
939             Heps             = _mm_mul_pd(vfeps,H);
940             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
941             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
942             velec            = _mm_mul_pd(qq10,VV);
943             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
944             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
945
946             /* Update potential sum for this i atom from the interaction with this j atom. */
947             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
948             velecsum         = _mm_add_pd(velecsum,velec);
949
950             fscal            = felec;
951
952             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
953
954             /* Calculate temporary vectorial force */
955             tx               = _mm_mul_pd(fscal,dx10);
956             ty               = _mm_mul_pd(fscal,dy10);
957             tz               = _mm_mul_pd(fscal,dz10);
958
959             /* Update vectorial force */
960             fix1             = _mm_add_pd(fix1,tx);
961             fiy1             = _mm_add_pd(fiy1,ty);
962             fiz1             = _mm_add_pd(fiz1,tz);
963
964             fjx0             = _mm_add_pd(fjx0,tx);
965             fjy0             = _mm_add_pd(fjy0,ty);
966             fjz0             = _mm_add_pd(fjz0,tz);
967
968             /**************************
969              * CALCULATE INTERACTIONS *
970              **************************/
971
972             r11              = _mm_mul_pd(rsq11,rinv11);
973
974             /* Calculate table index by multiplying r with table scale and truncate to integer */
975             rt               = _mm_mul_pd(r11,vftabscale);
976             vfitab           = _mm_cvttpd_epi32(rt);
977             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
978             vfitab           = _mm_slli_epi32(vfitab,2);
979
980             /* CUBIC SPLINE TABLE ELECTROSTATICS */
981             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
982             F                = _mm_setzero_pd();
983             GMX_MM_TRANSPOSE2_PD(Y,F);
984             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
985             H                = _mm_setzero_pd();
986             GMX_MM_TRANSPOSE2_PD(G,H);
987             Heps             = _mm_mul_pd(vfeps,H);
988             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
989             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
990             velec            = _mm_mul_pd(qq11,VV);
991             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
992             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
993
994             /* Update potential sum for this i atom from the interaction with this j atom. */
995             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
996             velecsum         = _mm_add_pd(velecsum,velec);
997
998             fscal            = felec;
999
1000             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1001
1002             /* Calculate temporary vectorial force */
1003             tx               = _mm_mul_pd(fscal,dx11);
1004             ty               = _mm_mul_pd(fscal,dy11);
1005             tz               = _mm_mul_pd(fscal,dz11);
1006
1007             /* Update vectorial force */
1008             fix1             = _mm_add_pd(fix1,tx);
1009             fiy1             = _mm_add_pd(fiy1,ty);
1010             fiz1             = _mm_add_pd(fiz1,tz);
1011
1012             fjx1             = _mm_add_pd(fjx1,tx);
1013             fjy1             = _mm_add_pd(fjy1,ty);
1014             fjz1             = _mm_add_pd(fjz1,tz);
1015
1016             /**************************
1017              * CALCULATE INTERACTIONS *
1018              **************************/
1019
1020             r12              = _mm_mul_pd(rsq12,rinv12);
1021
1022             /* Calculate table index by multiplying r with table scale and truncate to integer */
1023             rt               = _mm_mul_pd(r12,vftabscale);
1024             vfitab           = _mm_cvttpd_epi32(rt);
1025             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1026             vfitab           = _mm_slli_epi32(vfitab,2);
1027
1028             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1029             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1030             F                = _mm_setzero_pd();
1031             GMX_MM_TRANSPOSE2_PD(Y,F);
1032             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1033             H                = _mm_setzero_pd();
1034             GMX_MM_TRANSPOSE2_PD(G,H);
1035             Heps             = _mm_mul_pd(vfeps,H);
1036             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1037             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1038             velec            = _mm_mul_pd(qq12,VV);
1039             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1040             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1041
1042             /* Update potential sum for this i atom from the interaction with this j atom. */
1043             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1044             velecsum         = _mm_add_pd(velecsum,velec);
1045
1046             fscal            = felec;
1047
1048             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1049
1050             /* Calculate temporary vectorial force */
1051             tx               = _mm_mul_pd(fscal,dx12);
1052             ty               = _mm_mul_pd(fscal,dy12);
1053             tz               = _mm_mul_pd(fscal,dz12);
1054
1055             /* Update vectorial force */
1056             fix1             = _mm_add_pd(fix1,tx);
1057             fiy1             = _mm_add_pd(fiy1,ty);
1058             fiz1             = _mm_add_pd(fiz1,tz);
1059
1060             fjx2             = _mm_add_pd(fjx2,tx);
1061             fjy2             = _mm_add_pd(fjy2,ty);
1062             fjz2             = _mm_add_pd(fjz2,tz);
1063
1064             /**************************
1065              * CALCULATE INTERACTIONS *
1066              **************************/
1067
1068             r20              = _mm_mul_pd(rsq20,rinv20);
1069
1070             /* Calculate table index by multiplying r with table scale and truncate to integer */
1071             rt               = _mm_mul_pd(r20,vftabscale);
1072             vfitab           = _mm_cvttpd_epi32(rt);
1073             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1074             vfitab           = _mm_slli_epi32(vfitab,2);
1075
1076             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1077             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1078             F                = _mm_setzero_pd();
1079             GMX_MM_TRANSPOSE2_PD(Y,F);
1080             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1081             H                = _mm_setzero_pd();
1082             GMX_MM_TRANSPOSE2_PD(G,H);
1083             Heps             = _mm_mul_pd(vfeps,H);
1084             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1085             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1086             velec            = _mm_mul_pd(qq20,VV);
1087             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1088             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1089
1090             /* Update potential sum for this i atom from the interaction with this j atom. */
1091             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1092             velecsum         = _mm_add_pd(velecsum,velec);
1093
1094             fscal            = felec;
1095
1096             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1097
1098             /* Calculate temporary vectorial force */
1099             tx               = _mm_mul_pd(fscal,dx20);
1100             ty               = _mm_mul_pd(fscal,dy20);
1101             tz               = _mm_mul_pd(fscal,dz20);
1102
1103             /* Update vectorial force */
1104             fix2             = _mm_add_pd(fix2,tx);
1105             fiy2             = _mm_add_pd(fiy2,ty);
1106             fiz2             = _mm_add_pd(fiz2,tz);
1107
1108             fjx0             = _mm_add_pd(fjx0,tx);
1109             fjy0             = _mm_add_pd(fjy0,ty);
1110             fjz0             = _mm_add_pd(fjz0,tz);
1111
1112             /**************************
1113              * CALCULATE INTERACTIONS *
1114              **************************/
1115
1116             r21              = _mm_mul_pd(rsq21,rinv21);
1117
1118             /* Calculate table index by multiplying r with table scale and truncate to integer */
1119             rt               = _mm_mul_pd(r21,vftabscale);
1120             vfitab           = _mm_cvttpd_epi32(rt);
1121             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1122             vfitab           = _mm_slli_epi32(vfitab,2);
1123
1124             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1125             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1126             F                = _mm_setzero_pd();
1127             GMX_MM_TRANSPOSE2_PD(Y,F);
1128             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1129             H                = _mm_setzero_pd();
1130             GMX_MM_TRANSPOSE2_PD(G,H);
1131             Heps             = _mm_mul_pd(vfeps,H);
1132             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1133             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1134             velec            = _mm_mul_pd(qq21,VV);
1135             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1136             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1137
1138             /* Update potential sum for this i atom from the interaction with this j atom. */
1139             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1140             velecsum         = _mm_add_pd(velecsum,velec);
1141
1142             fscal            = felec;
1143
1144             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1145
1146             /* Calculate temporary vectorial force */
1147             tx               = _mm_mul_pd(fscal,dx21);
1148             ty               = _mm_mul_pd(fscal,dy21);
1149             tz               = _mm_mul_pd(fscal,dz21);
1150
1151             /* Update vectorial force */
1152             fix2             = _mm_add_pd(fix2,tx);
1153             fiy2             = _mm_add_pd(fiy2,ty);
1154             fiz2             = _mm_add_pd(fiz2,tz);
1155
1156             fjx1             = _mm_add_pd(fjx1,tx);
1157             fjy1             = _mm_add_pd(fjy1,ty);
1158             fjz1             = _mm_add_pd(fjz1,tz);
1159
1160             /**************************
1161              * CALCULATE INTERACTIONS *
1162              **************************/
1163
1164             r22              = _mm_mul_pd(rsq22,rinv22);
1165
1166             /* Calculate table index by multiplying r with table scale and truncate to integer */
1167             rt               = _mm_mul_pd(r22,vftabscale);
1168             vfitab           = _mm_cvttpd_epi32(rt);
1169             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1170             vfitab           = _mm_slli_epi32(vfitab,2);
1171
1172             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1173             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1174             F                = _mm_setzero_pd();
1175             GMX_MM_TRANSPOSE2_PD(Y,F);
1176             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1177             H                = _mm_setzero_pd();
1178             GMX_MM_TRANSPOSE2_PD(G,H);
1179             Heps             = _mm_mul_pd(vfeps,H);
1180             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1181             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1182             velec            = _mm_mul_pd(qq22,VV);
1183             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1184             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1185
1186             /* Update potential sum for this i atom from the interaction with this j atom. */
1187             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1188             velecsum         = _mm_add_pd(velecsum,velec);
1189
1190             fscal            = felec;
1191
1192             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1193
1194             /* Calculate temporary vectorial force */
1195             tx               = _mm_mul_pd(fscal,dx22);
1196             ty               = _mm_mul_pd(fscal,dy22);
1197             tz               = _mm_mul_pd(fscal,dz22);
1198
1199             /* Update vectorial force */
1200             fix2             = _mm_add_pd(fix2,tx);
1201             fiy2             = _mm_add_pd(fiy2,ty);
1202             fiz2             = _mm_add_pd(fiz2,tz);
1203
1204             fjx2             = _mm_add_pd(fjx2,tx);
1205             fjy2             = _mm_add_pd(fjy2,ty);
1206             fjz2             = _mm_add_pd(fjz2,tz);
1207
1208             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1209
1210             /* Inner loop uses 400 flops */
1211         }
1212
1213         /* End of innermost loop */
1214
1215         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1216                                               f+i_coord_offset,fshift+i_shift_offset);
1217
1218         ggid                        = gid[iidx];
1219         /* Update potential energies */
1220         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1221         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1222
1223         /* Increment number of inner iterations */
1224         inneriter                  += j_index_end - j_index_start;
1225
1226         /* Outer loop uses 20 flops */
1227     }
1228
1229     /* Increment number of outer iterations */
1230     outeriter        += nri;
1231
1232     /* Update outer/inner flops */
1233
1234     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*400);
1235 }
1236 /*
1237  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_double
1238  * Electrostatics interaction: CubicSplineTable
1239  * VdW interaction:            LennardJones
1240  * Geometry:                   Water3-Water3
1241  * Calculate force/pot:        Force
1242  */
1243 void
1244 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_double
1245                     (t_nblist                    * gmx_restrict       nlist,
1246                      rvec                        * gmx_restrict          xx,
1247                      rvec                        * gmx_restrict          ff,
1248                      struct t_forcerec           * gmx_restrict          fr,
1249                      t_mdatoms                   * gmx_restrict     mdatoms,
1250                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1251                      t_nrnb                      * gmx_restrict        nrnb)
1252 {
1253     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1254      * just 0 for non-waters.
1255      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1256      * jnr indices corresponding to data put in the four positions in the SIMD register.
1257      */
1258     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1259     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1260     int              jnrA,jnrB;
1261     int              j_coord_offsetA,j_coord_offsetB;
1262     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1263     real             rcutoff_scalar;
1264     real             *shiftvec,*fshift,*x,*f;
1265     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1266     int              vdwioffset0;
1267     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1268     int              vdwioffset1;
1269     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1270     int              vdwioffset2;
1271     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1272     int              vdwjidx0A,vdwjidx0B;
1273     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1274     int              vdwjidx1A,vdwjidx1B;
1275     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1276     int              vdwjidx2A,vdwjidx2B;
1277     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1278     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1279     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1280     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1281     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1282     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1283     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1284     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1285     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1286     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1287     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1288     real             *charge;
1289     int              nvdwtype;
1290     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1291     int              *vdwtype;
1292     real             *vdwparam;
1293     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1294     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1295     __m128i          vfitab;
1296     __m128i          ifour       = _mm_set1_epi32(4);
1297     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1298     real             *vftab;
1299     __m128d          dummy_mask,cutoff_mask;
1300     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1301     __m128d          one     = _mm_set1_pd(1.0);
1302     __m128d          two     = _mm_set1_pd(2.0);
1303     x                = xx[0];
1304     f                = ff[0];
1305
1306     nri              = nlist->nri;
1307     iinr             = nlist->iinr;
1308     jindex           = nlist->jindex;
1309     jjnr             = nlist->jjnr;
1310     shiftidx         = nlist->shift;
1311     gid              = nlist->gid;
1312     shiftvec         = fr->shift_vec[0];
1313     fshift           = fr->fshift[0];
1314     facel            = _mm_set1_pd(fr->ic->epsfac);
1315     charge           = mdatoms->chargeA;
1316     nvdwtype         = fr->ntype;
1317     vdwparam         = fr->nbfp;
1318     vdwtype          = mdatoms->typeA;
1319
1320     vftab            = kernel_data->table_elec->data;
1321     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
1322
1323     /* Setup water-specific parameters */
1324     inr              = nlist->iinr[0];
1325     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1326     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1327     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1328     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1329
1330     jq0              = _mm_set1_pd(charge[inr+0]);
1331     jq1              = _mm_set1_pd(charge[inr+1]);
1332     jq2              = _mm_set1_pd(charge[inr+2]);
1333     vdwjidx0A        = 2*vdwtype[inr+0];
1334     qq00             = _mm_mul_pd(iq0,jq0);
1335     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1336     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1337     qq01             = _mm_mul_pd(iq0,jq1);
1338     qq02             = _mm_mul_pd(iq0,jq2);
1339     qq10             = _mm_mul_pd(iq1,jq0);
1340     qq11             = _mm_mul_pd(iq1,jq1);
1341     qq12             = _mm_mul_pd(iq1,jq2);
1342     qq20             = _mm_mul_pd(iq2,jq0);
1343     qq21             = _mm_mul_pd(iq2,jq1);
1344     qq22             = _mm_mul_pd(iq2,jq2);
1345
1346     /* Avoid stupid compiler warnings */
1347     jnrA = jnrB = 0;
1348     j_coord_offsetA = 0;
1349     j_coord_offsetB = 0;
1350
1351     outeriter        = 0;
1352     inneriter        = 0;
1353
1354     /* Start outer loop over neighborlists */
1355     for(iidx=0; iidx<nri; iidx++)
1356     {
1357         /* Load shift vector for this list */
1358         i_shift_offset   = DIM*shiftidx[iidx];
1359
1360         /* Load limits for loop over neighbors */
1361         j_index_start    = jindex[iidx];
1362         j_index_end      = jindex[iidx+1];
1363
1364         /* Get outer coordinate index */
1365         inr              = iinr[iidx];
1366         i_coord_offset   = DIM*inr;
1367
1368         /* Load i particle coords and add shift vector */
1369         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1370                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1371
1372         fix0             = _mm_setzero_pd();
1373         fiy0             = _mm_setzero_pd();
1374         fiz0             = _mm_setzero_pd();
1375         fix1             = _mm_setzero_pd();
1376         fiy1             = _mm_setzero_pd();
1377         fiz1             = _mm_setzero_pd();
1378         fix2             = _mm_setzero_pd();
1379         fiy2             = _mm_setzero_pd();
1380         fiz2             = _mm_setzero_pd();
1381
1382         /* Start inner kernel loop */
1383         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1384         {
1385
1386             /* Get j neighbor index, and coordinate index */
1387             jnrA             = jjnr[jidx];
1388             jnrB             = jjnr[jidx+1];
1389             j_coord_offsetA  = DIM*jnrA;
1390             j_coord_offsetB  = DIM*jnrB;
1391
1392             /* load j atom coordinates */
1393             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1394                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1395
1396             /* Calculate displacement vector */
1397             dx00             = _mm_sub_pd(ix0,jx0);
1398             dy00             = _mm_sub_pd(iy0,jy0);
1399             dz00             = _mm_sub_pd(iz0,jz0);
1400             dx01             = _mm_sub_pd(ix0,jx1);
1401             dy01             = _mm_sub_pd(iy0,jy1);
1402             dz01             = _mm_sub_pd(iz0,jz1);
1403             dx02             = _mm_sub_pd(ix0,jx2);
1404             dy02             = _mm_sub_pd(iy0,jy2);
1405             dz02             = _mm_sub_pd(iz0,jz2);
1406             dx10             = _mm_sub_pd(ix1,jx0);
1407             dy10             = _mm_sub_pd(iy1,jy0);
1408             dz10             = _mm_sub_pd(iz1,jz0);
1409             dx11             = _mm_sub_pd(ix1,jx1);
1410             dy11             = _mm_sub_pd(iy1,jy1);
1411             dz11             = _mm_sub_pd(iz1,jz1);
1412             dx12             = _mm_sub_pd(ix1,jx2);
1413             dy12             = _mm_sub_pd(iy1,jy2);
1414             dz12             = _mm_sub_pd(iz1,jz2);
1415             dx20             = _mm_sub_pd(ix2,jx0);
1416             dy20             = _mm_sub_pd(iy2,jy0);
1417             dz20             = _mm_sub_pd(iz2,jz0);
1418             dx21             = _mm_sub_pd(ix2,jx1);
1419             dy21             = _mm_sub_pd(iy2,jy1);
1420             dz21             = _mm_sub_pd(iz2,jz1);
1421             dx22             = _mm_sub_pd(ix2,jx2);
1422             dy22             = _mm_sub_pd(iy2,jy2);
1423             dz22             = _mm_sub_pd(iz2,jz2);
1424
1425             /* Calculate squared distance and things based on it */
1426             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1427             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1428             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1429             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1430             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1431             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1432             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1433             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1434             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1435
1436             rinv00           = sse41_invsqrt_d(rsq00);
1437             rinv01           = sse41_invsqrt_d(rsq01);
1438             rinv02           = sse41_invsqrt_d(rsq02);
1439             rinv10           = sse41_invsqrt_d(rsq10);
1440             rinv11           = sse41_invsqrt_d(rsq11);
1441             rinv12           = sse41_invsqrt_d(rsq12);
1442             rinv20           = sse41_invsqrt_d(rsq20);
1443             rinv21           = sse41_invsqrt_d(rsq21);
1444             rinv22           = sse41_invsqrt_d(rsq22);
1445
1446             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1447
1448             fjx0             = _mm_setzero_pd();
1449             fjy0             = _mm_setzero_pd();
1450             fjz0             = _mm_setzero_pd();
1451             fjx1             = _mm_setzero_pd();
1452             fjy1             = _mm_setzero_pd();
1453             fjz1             = _mm_setzero_pd();
1454             fjx2             = _mm_setzero_pd();
1455             fjy2             = _mm_setzero_pd();
1456             fjz2             = _mm_setzero_pd();
1457
1458             /**************************
1459              * CALCULATE INTERACTIONS *
1460              **************************/
1461
1462             r00              = _mm_mul_pd(rsq00,rinv00);
1463
1464             /* Calculate table index by multiplying r with table scale and truncate to integer */
1465             rt               = _mm_mul_pd(r00,vftabscale);
1466             vfitab           = _mm_cvttpd_epi32(rt);
1467             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1468             vfitab           = _mm_slli_epi32(vfitab,2);
1469
1470             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1471             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1472             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1473             GMX_MM_TRANSPOSE2_PD(Y,F);
1474             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1475             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1476             GMX_MM_TRANSPOSE2_PD(G,H);
1477             Heps             = _mm_mul_pd(vfeps,H);
1478             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1479             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1480             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1481
1482             /* LENNARD-JONES DISPERSION/REPULSION */
1483
1484             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1485             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1486
1487             fscal            = _mm_add_pd(felec,fvdw);
1488
1489             /* Calculate temporary vectorial force */
1490             tx               = _mm_mul_pd(fscal,dx00);
1491             ty               = _mm_mul_pd(fscal,dy00);
1492             tz               = _mm_mul_pd(fscal,dz00);
1493
1494             /* Update vectorial force */
1495             fix0             = _mm_add_pd(fix0,tx);
1496             fiy0             = _mm_add_pd(fiy0,ty);
1497             fiz0             = _mm_add_pd(fiz0,tz);
1498
1499             fjx0             = _mm_add_pd(fjx0,tx);
1500             fjy0             = _mm_add_pd(fjy0,ty);
1501             fjz0             = _mm_add_pd(fjz0,tz);
1502
1503             /**************************
1504              * CALCULATE INTERACTIONS *
1505              **************************/
1506
1507             r01              = _mm_mul_pd(rsq01,rinv01);
1508
1509             /* Calculate table index by multiplying r with table scale and truncate to integer */
1510             rt               = _mm_mul_pd(r01,vftabscale);
1511             vfitab           = _mm_cvttpd_epi32(rt);
1512             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1513             vfitab           = _mm_slli_epi32(vfitab,2);
1514
1515             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1516             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1517             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1518             GMX_MM_TRANSPOSE2_PD(Y,F);
1519             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1520             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1521             GMX_MM_TRANSPOSE2_PD(G,H);
1522             Heps             = _mm_mul_pd(vfeps,H);
1523             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1524             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1525             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1526
1527             fscal            = felec;
1528
1529             /* Calculate temporary vectorial force */
1530             tx               = _mm_mul_pd(fscal,dx01);
1531             ty               = _mm_mul_pd(fscal,dy01);
1532             tz               = _mm_mul_pd(fscal,dz01);
1533
1534             /* Update vectorial force */
1535             fix0             = _mm_add_pd(fix0,tx);
1536             fiy0             = _mm_add_pd(fiy0,ty);
1537             fiz0             = _mm_add_pd(fiz0,tz);
1538
1539             fjx1             = _mm_add_pd(fjx1,tx);
1540             fjy1             = _mm_add_pd(fjy1,ty);
1541             fjz1             = _mm_add_pd(fjz1,tz);
1542
1543             /**************************
1544              * CALCULATE INTERACTIONS *
1545              **************************/
1546
1547             r02              = _mm_mul_pd(rsq02,rinv02);
1548
1549             /* Calculate table index by multiplying r with table scale and truncate to integer */
1550             rt               = _mm_mul_pd(r02,vftabscale);
1551             vfitab           = _mm_cvttpd_epi32(rt);
1552             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1553             vfitab           = _mm_slli_epi32(vfitab,2);
1554
1555             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1556             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1557             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1558             GMX_MM_TRANSPOSE2_PD(Y,F);
1559             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1560             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1561             GMX_MM_TRANSPOSE2_PD(G,H);
1562             Heps             = _mm_mul_pd(vfeps,H);
1563             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1564             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1565             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1566
1567             fscal            = felec;
1568
1569             /* Calculate temporary vectorial force */
1570             tx               = _mm_mul_pd(fscal,dx02);
1571             ty               = _mm_mul_pd(fscal,dy02);
1572             tz               = _mm_mul_pd(fscal,dz02);
1573
1574             /* Update vectorial force */
1575             fix0             = _mm_add_pd(fix0,tx);
1576             fiy0             = _mm_add_pd(fiy0,ty);
1577             fiz0             = _mm_add_pd(fiz0,tz);
1578
1579             fjx2             = _mm_add_pd(fjx2,tx);
1580             fjy2             = _mm_add_pd(fjy2,ty);
1581             fjz2             = _mm_add_pd(fjz2,tz);
1582
1583             /**************************
1584              * CALCULATE INTERACTIONS *
1585              **************************/
1586
1587             r10              = _mm_mul_pd(rsq10,rinv10);
1588
1589             /* Calculate table index by multiplying r with table scale and truncate to integer */
1590             rt               = _mm_mul_pd(r10,vftabscale);
1591             vfitab           = _mm_cvttpd_epi32(rt);
1592             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1593             vfitab           = _mm_slli_epi32(vfitab,2);
1594
1595             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1596             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1597             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1598             GMX_MM_TRANSPOSE2_PD(Y,F);
1599             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1600             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1601             GMX_MM_TRANSPOSE2_PD(G,H);
1602             Heps             = _mm_mul_pd(vfeps,H);
1603             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1604             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1605             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1606
1607             fscal            = felec;
1608
1609             /* Calculate temporary vectorial force */
1610             tx               = _mm_mul_pd(fscal,dx10);
1611             ty               = _mm_mul_pd(fscal,dy10);
1612             tz               = _mm_mul_pd(fscal,dz10);
1613
1614             /* Update vectorial force */
1615             fix1             = _mm_add_pd(fix1,tx);
1616             fiy1             = _mm_add_pd(fiy1,ty);
1617             fiz1             = _mm_add_pd(fiz1,tz);
1618
1619             fjx0             = _mm_add_pd(fjx0,tx);
1620             fjy0             = _mm_add_pd(fjy0,ty);
1621             fjz0             = _mm_add_pd(fjz0,tz);
1622
1623             /**************************
1624              * CALCULATE INTERACTIONS *
1625              **************************/
1626
1627             r11              = _mm_mul_pd(rsq11,rinv11);
1628
1629             /* Calculate table index by multiplying r with table scale and truncate to integer */
1630             rt               = _mm_mul_pd(r11,vftabscale);
1631             vfitab           = _mm_cvttpd_epi32(rt);
1632             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1633             vfitab           = _mm_slli_epi32(vfitab,2);
1634
1635             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1636             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1637             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1638             GMX_MM_TRANSPOSE2_PD(Y,F);
1639             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1640             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1641             GMX_MM_TRANSPOSE2_PD(G,H);
1642             Heps             = _mm_mul_pd(vfeps,H);
1643             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1644             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1645             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1646
1647             fscal            = felec;
1648
1649             /* Calculate temporary vectorial force */
1650             tx               = _mm_mul_pd(fscal,dx11);
1651             ty               = _mm_mul_pd(fscal,dy11);
1652             tz               = _mm_mul_pd(fscal,dz11);
1653
1654             /* Update vectorial force */
1655             fix1             = _mm_add_pd(fix1,tx);
1656             fiy1             = _mm_add_pd(fiy1,ty);
1657             fiz1             = _mm_add_pd(fiz1,tz);
1658
1659             fjx1             = _mm_add_pd(fjx1,tx);
1660             fjy1             = _mm_add_pd(fjy1,ty);
1661             fjz1             = _mm_add_pd(fjz1,tz);
1662
1663             /**************************
1664              * CALCULATE INTERACTIONS *
1665              **************************/
1666
1667             r12              = _mm_mul_pd(rsq12,rinv12);
1668
1669             /* Calculate table index by multiplying r with table scale and truncate to integer */
1670             rt               = _mm_mul_pd(r12,vftabscale);
1671             vfitab           = _mm_cvttpd_epi32(rt);
1672             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1673             vfitab           = _mm_slli_epi32(vfitab,2);
1674
1675             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1676             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1677             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1678             GMX_MM_TRANSPOSE2_PD(Y,F);
1679             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1680             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1681             GMX_MM_TRANSPOSE2_PD(G,H);
1682             Heps             = _mm_mul_pd(vfeps,H);
1683             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1684             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1685             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1686
1687             fscal            = felec;
1688
1689             /* Calculate temporary vectorial force */
1690             tx               = _mm_mul_pd(fscal,dx12);
1691             ty               = _mm_mul_pd(fscal,dy12);
1692             tz               = _mm_mul_pd(fscal,dz12);
1693
1694             /* Update vectorial force */
1695             fix1             = _mm_add_pd(fix1,tx);
1696             fiy1             = _mm_add_pd(fiy1,ty);
1697             fiz1             = _mm_add_pd(fiz1,tz);
1698
1699             fjx2             = _mm_add_pd(fjx2,tx);
1700             fjy2             = _mm_add_pd(fjy2,ty);
1701             fjz2             = _mm_add_pd(fjz2,tz);
1702
1703             /**************************
1704              * CALCULATE INTERACTIONS *
1705              **************************/
1706
1707             r20              = _mm_mul_pd(rsq20,rinv20);
1708
1709             /* Calculate table index by multiplying r with table scale and truncate to integer */
1710             rt               = _mm_mul_pd(r20,vftabscale);
1711             vfitab           = _mm_cvttpd_epi32(rt);
1712             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1713             vfitab           = _mm_slli_epi32(vfitab,2);
1714
1715             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1716             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1717             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1718             GMX_MM_TRANSPOSE2_PD(Y,F);
1719             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1720             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1721             GMX_MM_TRANSPOSE2_PD(G,H);
1722             Heps             = _mm_mul_pd(vfeps,H);
1723             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1724             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1725             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1726
1727             fscal            = felec;
1728
1729             /* Calculate temporary vectorial force */
1730             tx               = _mm_mul_pd(fscal,dx20);
1731             ty               = _mm_mul_pd(fscal,dy20);
1732             tz               = _mm_mul_pd(fscal,dz20);
1733
1734             /* Update vectorial force */
1735             fix2             = _mm_add_pd(fix2,tx);
1736             fiy2             = _mm_add_pd(fiy2,ty);
1737             fiz2             = _mm_add_pd(fiz2,tz);
1738
1739             fjx0             = _mm_add_pd(fjx0,tx);
1740             fjy0             = _mm_add_pd(fjy0,ty);
1741             fjz0             = _mm_add_pd(fjz0,tz);
1742
1743             /**************************
1744              * CALCULATE INTERACTIONS *
1745              **************************/
1746
1747             r21              = _mm_mul_pd(rsq21,rinv21);
1748
1749             /* Calculate table index by multiplying r with table scale and truncate to integer */
1750             rt               = _mm_mul_pd(r21,vftabscale);
1751             vfitab           = _mm_cvttpd_epi32(rt);
1752             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1753             vfitab           = _mm_slli_epi32(vfitab,2);
1754
1755             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1756             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1757             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1758             GMX_MM_TRANSPOSE2_PD(Y,F);
1759             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1760             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1761             GMX_MM_TRANSPOSE2_PD(G,H);
1762             Heps             = _mm_mul_pd(vfeps,H);
1763             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1764             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1765             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1766
1767             fscal            = felec;
1768
1769             /* Calculate temporary vectorial force */
1770             tx               = _mm_mul_pd(fscal,dx21);
1771             ty               = _mm_mul_pd(fscal,dy21);
1772             tz               = _mm_mul_pd(fscal,dz21);
1773
1774             /* Update vectorial force */
1775             fix2             = _mm_add_pd(fix2,tx);
1776             fiy2             = _mm_add_pd(fiy2,ty);
1777             fiz2             = _mm_add_pd(fiz2,tz);
1778
1779             fjx1             = _mm_add_pd(fjx1,tx);
1780             fjy1             = _mm_add_pd(fjy1,ty);
1781             fjz1             = _mm_add_pd(fjz1,tz);
1782
1783             /**************************
1784              * CALCULATE INTERACTIONS *
1785              **************************/
1786
1787             r22              = _mm_mul_pd(rsq22,rinv22);
1788
1789             /* Calculate table index by multiplying r with table scale and truncate to integer */
1790             rt               = _mm_mul_pd(r22,vftabscale);
1791             vfitab           = _mm_cvttpd_epi32(rt);
1792             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1793             vfitab           = _mm_slli_epi32(vfitab,2);
1794
1795             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1796             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1797             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1798             GMX_MM_TRANSPOSE2_PD(Y,F);
1799             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1800             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1801             GMX_MM_TRANSPOSE2_PD(G,H);
1802             Heps             = _mm_mul_pd(vfeps,H);
1803             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1804             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1805             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1806
1807             fscal            = felec;
1808
1809             /* Calculate temporary vectorial force */
1810             tx               = _mm_mul_pd(fscal,dx22);
1811             ty               = _mm_mul_pd(fscal,dy22);
1812             tz               = _mm_mul_pd(fscal,dz22);
1813
1814             /* Update vectorial force */
1815             fix2             = _mm_add_pd(fix2,tx);
1816             fiy2             = _mm_add_pd(fiy2,ty);
1817             fiz2             = _mm_add_pd(fiz2,tz);
1818
1819             fjx2             = _mm_add_pd(fjx2,tx);
1820             fjy2             = _mm_add_pd(fjy2,ty);
1821             fjz2             = _mm_add_pd(fjz2,tz);
1822
1823             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1824
1825             /* Inner loop uses 359 flops */
1826         }
1827
1828         if(jidx<j_index_end)
1829         {
1830
1831             jnrA             = jjnr[jidx];
1832             j_coord_offsetA  = DIM*jnrA;
1833
1834             /* load j atom coordinates */
1835             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1836                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1837
1838             /* Calculate displacement vector */
1839             dx00             = _mm_sub_pd(ix0,jx0);
1840             dy00             = _mm_sub_pd(iy0,jy0);
1841             dz00             = _mm_sub_pd(iz0,jz0);
1842             dx01             = _mm_sub_pd(ix0,jx1);
1843             dy01             = _mm_sub_pd(iy0,jy1);
1844             dz01             = _mm_sub_pd(iz0,jz1);
1845             dx02             = _mm_sub_pd(ix0,jx2);
1846             dy02             = _mm_sub_pd(iy0,jy2);
1847             dz02             = _mm_sub_pd(iz0,jz2);
1848             dx10             = _mm_sub_pd(ix1,jx0);
1849             dy10             = _mm_sub_pd(iy1,jy0);
1850             dz10             = _mm_sub_pd(iz1,jz0);
1851             dx11             = _mm_sub_pd(ix1,jx1);
1852             dy11             = _mm_sub_pd(iy1,jy1);
1853             dz11             = _mm_sub_pd(iz1,jz1);
1854             dx12             = _mm_sub_pd(ix1,jx2);
1855             dy12             = _mm_sub_pd(iy1,jy2);
1856             dz12             = _mm_sub_pd(iz1,jz2);
1857             dx20             = _mm_sub_pd(ix2,jx0);
1858             dy20             = _mm_sub_pd(iy2,jy0);
1859             dz20             = _mm_sub_pd(iz2,jz0);
1860             dx21             = _mm_sub_pd(ix2,jx1);
1861             dy21             = _mm_sub_pd(iy2,jy1);
1862             dz21             = _mm_sub_pd(iz2,jz1);
1863             dx22             = _mm_sub_pd(ix2,jx2);
1864             dy22             = _mm_sub_pd(iy2,jy2);
1865             dz22             = _mm_sub_pd(iz2,jz2);
1866
1867             /* Calculate squared distance and things based on it */
1868             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1869             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1870             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1871             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1872             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1873             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1874             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1875             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1876             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1877
1878             rinv00           = sse41_invsqrt_d(rsq00);
1879             rinv01           = sse41_invsqrt_d(rsq01);
1880             rinv02           = sse41_invsqrt_d(rsq02);
1881             rinv10           = sse41_invsqrt_d(rsq10);
1882             rinv11           = sse41_invsqrt_d(rsq11);
1883             rinv12           = sse41_invsqrt_d(rsq12);
1884             rinv20           = sse41_invsqrt_d(rsq20);
1885             rinv21           = sse41_invsqrt_d(rsq21);
1886             rinv22           = sse41_invsqrt_d(rsq22);
1887
1888             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1889
1890             fjx0             = _mm_setzero_pd();
1891             fjy0             = _mm_setzero_pd();
1892             fjz0             = _mm_setzero_pd();
1893             fjx1             = _mm_setzero_pd();
1894             fjy1             = _mm_setzero_pd();
1895             fjz1             = _mm_setzero_pd();
1896             fjx2             = _mm_setzero_pd();
1897             fjy2             = _mm_setzero_pd();
1898             fjz2             = _mm_setzero_pd();
1899
1900             /**************************
1901              * CALCULATE INTERACTIONS *
1902              **************************/
1903
1904             r00              = _mm_mul_pd(rsq00,rinv00);
1905
1906             /* Calculate table index by multiplying r with table scale and truncate to integer */
1907             rt               = _mm_mul_pd(r00,vftabscale);
1908             vfitab           = _mm_cvttpd_epi32(rt);
1909             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1910             vfitab           = _mm_slli_epi32(vfitab,2);
1911
1912             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1913             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1914             F                = _mm_setzero_pd();
1915             GMX_MM_TRANSPOSE2_PD(Y,F);
1916             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1917             H                = _mm_setzero_pd();
1918             GMX_MM_TRANSPOSE2_PD(G,H);
1919             Heps             = _mm_mul_pd(vfeps,H);
1920             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1921             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1922             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1923
1924             /* LENNARD-JONES DISPERSION/REPULSION */
1925
1926             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1927             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1928
1929             fscal            = _mm_add_pd(felec,fvdw);
1930
1931             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1932
1933             /* Calculate temporary vectorial force */
1934             tx               = _mm_mul_pd(fscal,dx00);
1935             ty               = _mm_mul_pd(fscal,dy00);
1936             tz               = _mm_mul_pd(fscal,dz00);
1937
1938             /* Update vectorial force */
1939             fix0             = _mm_add_pd(fix0,tx);
1940             fiy0             = _mm_add_pd(fiy0,ty);
1941             fiz0             = _mm_add_pd(fiz0,tz);
1942
1943             fjx0             = _mm_add_pd(fjx0,tx);
1944             fjy0             = _mm_add_pd(fjy0,ty);
1945             fjz0             = _mm_add_pd(fjz0,tz);
1946
1947             /**************************
1948              * CALCULATE INTERACTIONS *
1949              **************************/
1950
1951             r01              = _mm_mul_pd(rsq01,rinv01);
1952
1953             /* Calculate table index by multiplying r with table scale and truncate to integer */
1954             rt               = _mm_mul_pd(r01,vftabscale);
1955             vfitab           = _mm_cvttpd_epi32(rt);
1956             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1957             vfitab           = _mm_slli_epi32(vfitab,2);
1958
1959             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1960             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1961             F                = _mm_setzero_pd();
1962             GMX_MM_TRANSPOSE2_PD(Y,F);
1963             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1964             H                = _mm_setzero_pd();
1965             GMX_MM_TRANSPOSE2_PD(G,H);
1966             Heps             = _mm_mul_pd(vfeps,H);
1967             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1968             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1969             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1970
1971             fscal            = felec;
1972
1973             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1974
1975             /* Calculate temporary vectorial force */
1976             tx               = _mm_mul_pd(fscal,dx01);
1977             ty               = _mm_mul_pd(fscal,dy01);
1978             tz               = _mm_mul_pd(fscal,dz01);
1979
1980             /* Update vectorial force */
1981             fix0             = _mm_add_pd(fix0,tx);
1982             fiy0             = _mm_add_pd(fiy0,ty);
1983             fiz0             = _mm_add_pd(fiz0,tz);
1984
1985             fjx1             = _mm_add_pd(fjx1,tx);
1986             fjy1             = _mm_add_pd(fjy1,ty);
1987             fjz1             = _mm_add_pd(fjz1,tz);
1988
1989             /**************************
1990              * CALCULATE INTERACTIONS *
1991              **************************/
1992
1993             r02              = _mm_mul_pd(rsq02,rinv02);
1994
1995             /* Calculate table index by multiplying r with table scale and truncate to integer */
1996             rt               = _mm_mul_pd(r02,vftabscale);
1997             vfitab           = _mm_cvttpd_epi32(rt);
1998             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1999             vfitab           = _mm_slli_epi32(vfitab,2);
2000
2001             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2002             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2003             F                = _mm_setzero_pd();
2004             GMX_MM_TRANSPOSE2_PD(Y,F);
2005             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2006             H                = _mm_setzero_pd();
2007             GMX_MM_TRANSPOSE2_PD(G,H);
2008             Heps             = _mm_mul_pd(vfeps,H);
2009             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2010             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2011             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2012
2013             fscal            = felec;
2014
2015             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2016
2017             /* Calculate temporary vectorial force */
2018             tx               = _mm_mul_pd(fscal,dx02);
2019             ty               = _mm_mul_pd(fscal,dy02);
2020             tz               = _mm_mul_pd(fscal,dz02);
2021
2022             /* Update vectorial force */
2023             fix0             = _mm_add_pd(fix0,tx);
2024             fiy0             = _mm_add_pd(fiy0,ty);
2025             fiz0             = _mm_add_pd(fiz0,tz);
2026
2027             fjx2             = _mm_add_pd(fjx2,tx);
2028             fjy2             = _mm_add_pd(fjy2,ty);
2029             fjz2             = _mm_add_pd(fjz2,tz);
2030
2031             /**************************
2032              * CALCULATE INTERACTIONS *
2033              **************************/
2034
2035             r10              = _mm_mul_pd(rsq10,rinv10);
2036
2037             /* Calculate table index by multiplying r with table scale and truncate to integer */
2038             rt               = _mm_mul_pd(r10,vftabscale);
2039             vfitab           = _mm_cvttpd_epi32(rt);
2040             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2041             vfitab           = _mm_slli_epi32(vfitab,2);
2042
2043             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2044             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2045             F                = _mm_setzero_pd();
2046             GMX_MM_TRANSPOSE2_PD(Y,F);
2047             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2048             H                = _mm_setzero_pd();
2049             GMX_MM_TRANSPOSE2_PD(G,H);
2050             Heps             = _mm_mul_pd(vfeps,H);
2051             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2052             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2053             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2054
2055             fscal            = felec;
2056
2057             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2058
2059             /* Calculate temporary vectorial force */
2060             tx               = _mm_mul_pd(fscal,dx10);
2061             ty               = _mm_mul_pd(fscal,dy10);
2062             tz               = _mm_mul_pd(fscal,dz10);
2063
2064             /* Update vectorial force */
2065             fix1             = _mm_add_pd(fix1,tx);
2066             fiy1             = _mm_add_pd(fiy1,ty);
2067             fiz1             = _mm_add_pd(fiz1,tz);
2068
2069             fjx0             = _mm_add_pd(fjx0,tx);
2070             fjy0             = _mm_add_pd(fjy0,ty);
2071             fjz0             = _mm_add_pd(fjz0,tz);
2072
2073             /**************************
2074              * CALCULATE INTERACTIONS *
2075              **************************/
2076
2077             r11              = _mm_mul_pd(rsq11,rinv11);
2078
2079             /* Calculate table index by multiplying r with table scale and truncate to integer */
2080             rt               = _mm_mul_pd(r11,vftabscale);
2081             vfitab           = _mm_cvttpd_epi32(rt);
2082             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2083             vfitab           = _mm_slli_epi32(vfitab,2);
2084
2085             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2086             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2087             F                = _mm_setzero_pd();
2088             GMX_MM_TRANSPOSE2_PD(Y,F);
2089             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2090             H                = _mm_setzero_pd();
2091             GMX_MM_TRANSPOSE2_PD(G,H);
2092             Heps             = _mm_mul_pd(vfeps,H);
2093             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2094             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2095             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2096
2097             fscal            = felec;
2098
2099             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2100
2101             /* Calculate temporary vectorial force */
2102             tx               = _mm_mul_pd(fscal,dx11);
2103             ty               = _mm_mul_pd(fscal,dy11);
2104             tz               = _mm_mul_pd(fscal,dz11);
2105
2106             /* Update vectorial force */
2107             fix1             = _mm_add_pd(fix1,tx);
2108             fiy1             = _mm_add_pd(fiy1,ty);
2109             fiz1             = _mm_add_pd(fiz1,tz);
2110
2111             fjx1             = _mm_add_pd(fjx1,tx);
2112             fjy1             = _mm_add_pd(fjy1,ty);
2113             fjz1             = _mm_add_pd(fjz1,tz);
2114
2115             /**************************
2116              * CALCULATE INTERACTIONS *
2117              **************************/
2118
2119             r12              = _mm_mul_pd(rsq12,rinv12);
2120
2121             /* Calculate table index by multiplying r with table scale and truncate to integer */
2122             rt               = _mm_mul_pd(r12,vftabscale);
2123             vfitab           = _mm_cvttpd_epi32(rt);
2124             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2125             vfitab           = _mm_slli_epi32(vfitab,2);
2126
2127             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2128             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2129             F                = _mm_setzero_pd();
2130             GMX_MM_TRANSPOSE2_PD(Y,F);
2131             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2132             H                = _mm_setzero_pd();
2133             GMX_MM_TRANSPOSE2_PD(G,H);
2134             Heps             = _mm_mul_pd(vfeps,H);
2135             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2136             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2137             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2138
2139             fscal            = felec;
2140
2141             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2142
2143             /* Calculate temporary vectorial force */
2144             tx               = _mm_mul_pd(fscal,dx12);
2145             ty               = _mm_mul_pd(fscal,dy12);
2146             tz               = _mm_mul_pd(fscal,dz12);
2147
2148             /* Update vectorial force */
2149             fix1             = _mm_add_pd(fix1,tx);
2150             fiy1             = _mm_add_pd(fiy1,ty);
2151             fiz1             = _mm_add_pd(fiz1,tz);
2152
2153             fjx2             = _mm_add_pd(fjx2,tx);
2154             fjy2             = _mm_add_pd(fjy2,ty);
2155             fjz2             = _mm_add_pd(fjz2,tz);
2156
2157             /**************************
2158              * CALCULATE INTERACTIONS *
2159              **************************/
2160
2161             r20              = _mm_mul_pd(rsq20,rinv20);
2162
2163             /* Calculate table index by multiplying r with table scale and truncate to integer */
2164             rt               = _mm_mul_pd(r20,vftabscale);
2165             vfitab           = _mm_cvttpd_epi32(rt);
2166             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2167             vfitab           = _mm_slli_epi32(vfitab,2);
2168
2169             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2170             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2171             F                = _mm_setzero_pd();
2172             GMX_MM_TRANSPOSE2_PD(Y,F);
2173             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2174             H                = _mm_setzero_pd();
2175             GMX_MM_TRANSPOSE2_PD(G,H);
2176             Heps             = _mm_mul_pd(vfeps,H);
2177             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2178             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2179             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2180
2181             fscal            = felec;
2182
2183             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2184
2185             /* Calculate temporary vectorial force */
2186             tx               = _mm_mul_pd(fscal,dx20);
2187             ty               = _mm_mul_pd(fscal,dy20);
2188             tz               = _mm_mul_pd(fscal,dz20);
2189
2190             /* Update vectorial force */
2191             fix2             = _mm_add_pd(fix2,tx);
2192             fiy2             = _mm_add_pd(fiy2,ty);
2193             fiz2             = _mm_add_pd(fiz2,tz);
2194
2195             fjx0             = _mm_add_pd(fjx0,tx);
2196             fjy0             = _mm_add_pd(fjy0,ty);
2197             fjz0             = _mm_add_pd(fjz0,tz);
2198
2199             /**************************
2200              * CALCULATE INTERACTIONS *
2201              **************************/
2202
2203             r21              = _mm_mul_pd(rsq21,rinv21);
2204
2205             /* Calculate table index by multiplying r with table scale and truncate to integer */
2206             rt               = _mm_mul_pd(r21,vftabscale);
2207             vfitab           = _mm_cvttpd_epi32(rt);
2208             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2209             vfitab           = _mm_slli_epi32(vfitab,2);
2210
2211             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2212             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2213             F                = _mm_setzero_pd();
2214             GMX_MM_TRANSPOSE2_PD(Y,F);
2215             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2216             H                = _mm_setzero_pd();
2217             GMX_MM_TRANSPOSE2_PD(G,H);
2218             Heps             = _mm_mul_pd(vfeps,H);
2219             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2220             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2221             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2222
2223             fscal            = felec;
2224
2225             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2226
2227             /* Calculate temporary vectorial force */
2228             tx               = _mm_mul_pd(fscal,dx21);
2229             ty               = _mm_mul_pd(fscal,dy21);
2230             tz               = _mm_mul_pd(fscal,dz21);
2231
2232             /* Update vectorial force */
2233             fix2             = _mm_add_pd(fix2,tx);
2234             fiy2             = _mm_add_pd(fiy2,ty);
2235             fiz2             = _mm_add_pd(fiz2,tz);
2236
2237             fjx1             = _mm_add_pd(fjx1,tx);
2238             fjy1             = _mm_add_pd(fjy1,ty);
2239             fjz1             = _mm_add_pd(fjz1,tz);
2240
2241             /**************************
2242              * CALCULATE INTERACTIONS *
2243              **************************/
2244
2245             r22              = _mm_mul_pd(rsq22,rinv22);
2246
2247             /* Calculate table index by multiplying r with table scale and truncate to integer */
2248             rt               = _mm_mul_pd(r22,vftabscale);
2249             vfitab           = _mm_cvttpd_epi32(rt);
2250             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2251             vfitab           = _mm_slli_epi32(vfitab,2);
2252
2253             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2254             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2255             F                = _mm_setzero_pd();
2256             GMX_MM_TRANSPOSE2_PD(Y,F);
2257             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2258             H                = _mm_setzero_pd();
2259             GMX_MM_TRANSPOSE2_PD(G,H);
2260             Heps             = _mm_mul_pd(vfeps,H);
2261             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2262             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2263             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2264
2265             fscal            = felec;
2266
2267             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2268
2269             /* Calculate temporary vectorial force */
2270             tx               = _mm_mul_pd(fscal,dx22);
2271             ty               = _mm_mul_pd(fscal,dy22);
2272             tz               = _mm_mul_pd(fscal,dz22);
2273
2274             /* Update vectorial force */
2275             fix2             = _mm_add_pd(fix2,tx);
2276             fiy2             = _mm_add_pd(fiy2,ty);
2277             fiz2             = _mm_add_pd(fiz2,tz);
2278
2279             fjx2             = _mm_add_pd(fjx2,tx);
2280             fjy2             = _mm_add_pd(fjy2,ty);
2281             fjz2             = _mm_add_pd(fjz2,tz);
2282
2283             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2284
2285             /* Inner loop uses 359 flops */
2286         }
2287
2288         /* End of innermost loop */
2289
2290         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2291                                               f+i_coord_offset,fshift+i_shift_offset);
2292
2293         /* Increment number of inner iterations */
2294         inneriter                  += j_index_end - j_index_start;
2295
2296         /* Outer loop uses 18 flops */
2297     }
2298
2299     /* Increment number of outer iterations */
2300     outeriter        += nri;
2301
2302     /* Update outer/inner flops */
2303
2304     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);
2305 }