Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_sse2_double.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse2_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB;
74     int              j_coord_offsetA,j_coord_offsetB;
75     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
76     real             rcutoff_scalar;
77     real             *shiftvec,*fshift,*x,*f;
78     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79     int              vdwioffset0;
80     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
81     int              vdwioffset1;
82     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwioffset3;
86     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87     int              vdwjidx0A,vdwjidx0B;
88     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89     int              vdwjidx1A,vdwjidx1B;
90     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91     int              vdwjidx2A,vdwjidx2B;
92     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93     int              vdwjidx3A,vdwjidx3B;
94     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
99     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
102     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
103     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
104     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
105     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
106     real             *charge;
107     int              nvdwtype;
108     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109     int              *vdwtype;
110     real             *vdwparam;
111     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
112     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
113     __m128i          vfitab;
114     __m128i          ifour       = _mm_set1_epi32(4);
115     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
116     real             *vftab;
117     __m128d          dummy_mask,cutoff_mask;
118     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
119     __m128d          one     = _mm_set1_pd(1.0);
120     __m128d          two     = _mm_set1_pd(2.0);
121     x                = xx[0];
122     f                = ff[0];
123
124     nri              = nlist->nri;
125     iinr             = nlist->iinr;
126     jindex           = nlist->jindex;
127     jjnr             = nlist->jjnr;
128     shiftidx         = nlist->shift;
129     gid              = nlist->gid;
130     shiftvec         = fr->shift_vec[0];
131     fshift           = fr->fshift[0];
132     facel            = _mm_set1_pd(fr->ic->epsfac);
133     charge           = mdatoms->chargeA;
134     nvdwtype         = fr->ntype;
135     vdwparam         = fr->nbfp;
136     vdwtype          = mdatoms->typeA;
137
138     vftab            = kernel_data->table_elec->data;
139     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
140
141     /* Setup water-specific parameters */
142     inr              = nlist->iinr[0];
143     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
144     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
145     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
146     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
147
148     jq1              = _mm_set1_pd(charge[inr+1]);
149     jq2              = _mm_set1_pd(charge[inr+2]);
150     jq3              = _mm_set1_pd(charge[inr+3]);
151     vdwjidx0A        = 2*vdwtype[inr+0];
152     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
153     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
154     qq11             = _mm_mul_pd(iq1,jq1);
155     qq12             = _mm_mul_pd(iq1,jq2);
156     qq13             = _mm_mul_pd(iq1,jq3);
157     qq21             = _mm_mul_pd(iq2,jq1);
158     qq22             = _mm_mul_pd(iq2,jq2);
159     qq23             = _mm_mul_pd(iq2,jq3);
160     qq31             = _mm_mul_pd(iq3,jq1);
161     qq32             = _mm_mul_pd(iq3,jq2);
162     qq33             = _mm_mul_pd(iq3,jq3);
163
164     /* Avoid stupid compiler warnings */
165     jnrA = jnrB = 0;
166     j_coord_offsetA = 0;
167     j_coord_offsetB = 0;
168
169     outeriter        = 0;
170     inneriter        = 0;
171
172     /* Start outer loop over neighborlists */
173     for(iidx=0; iidx<nri; iidx++)
174     {
175         /* Load shift vector for this list */
176         i_shift_offset   = DIM*shiftidx[iidx];
177
178         /* Load limits for loop over neighbors */
179         j_index_start    = jindex[iidx];
180         j_index_end      = jindex[iidx+1];
181
182         /* Get outer coordinate index */
183         inr              = iinr[iidx];
184         i_coord_offset   = DIM*inr;
185
186         /* Load i particle coords and add shift vector */
187         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
188                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
189
190         fix0             = _mm_setzero_pd();
191         fiy0             = _mm_setzero_pd();
192         fiz0             = _mm_setzero_pd();
193         fix1             = _mm_setzero_pd();
194         fiy1             = _mm_setzero_pd();
195         fiz1             = _mm_setzero_pd();
196         fix2             = _mm_setzero_pd();
197         fiy2             = _mm_setzero_pd();
198         fiz2             = _mm_setzero_pd();
199         fix3             = _mm_setzero_pd();
200         fiy3             = _mm_setzero_pd();
201         fiz3             = _mm_setzero_pd();
202
203         /* Reset potential sums */
204         velecsum         = _mm_setzero_pd();
205         vvdwsum          = _mm_setzero_pd();
206
207         /* Start inner kernel loop */
208         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
209         {
210
211             /* Get j neighbor index, and coordinate index */
212             jnrA             = jjnr[jidx];
213             jnrB             = jjnr[jidx+1];
214             j_coord_offsetA  = DIM*jnrA;
215             j_coord_offsetB  = DIM*jnrB;
216
217             /* load j atom coordinates */
218             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
219                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
220                                               &jy2,&jz2,&jx3,&jy3,&jz3);
221
222             /* Calculate displacement vector */
223             dx00             = _mm_sub_pd(ix0,jx0);
224             dy00             = _mm_sub_pd(iy0,jy0);
225             dz00             = _mm_sub_pd(iz0,jz0);
226             dx11             = _mm_sub_pd(ix1,jx1);
227             dy11             = _mm_sub_pd(iy1,jy1);
228             dz11             = _mm_sub_pd(iz1,jz1);
229             dx12             = _mm_sub_pd(ix1,jx2);
230             dy12             = _mm_sub_pd(iy1,jy2);
231             dz12             = _mm_sub_pd(iz1,jz2);
232             dx13             = _mm_sub_pd(ix1,jx3);
233             dy13             = _mm_sub_pd(iy1,jy3);
234             dz13             = _mm_sub_pd(iz1,jz3);
235             dx21             = _mm_sub_pd(ix2,jx1);
236             dy21             = _mm_sub_pd(iy2,jy1);
237             dz21             = _mm_sub_pd(iz2,jz1);
238             dx22             = _mm_sub_pd(ix2,jx2);
239             dy22             = _mm_sub_pd(iy2,jy2);
240             dz22             = _mm_sub_pd(iz2,jz2);
241             dx23             = _mm_sub_pd(ix2,jx3);
242             dy23             = _mm_sub_pd(iy2,jy3);
243             dz23             = _mm_sub_pd(iz2,jz3);
244             dx31             = _mm_sub_pd(ix3,jx1);
245             dy31             = _mm_sub_pd(iy3,jy1);
246             dz31             = _mm_sub_pd(iz3,jz1);
247             dx32             = _mm_sub_pd(ix3,jx2);
248             dy32             = _mm_sub_pd(iy3,jy2);
249             dz32             = _mm_sub_pd(iz3,jz2);
250             dx33             = _mm_sub_pd(ix3,jx3);
251             dy33             = _mm_sub_pd(iy3,jy3);
252             dz33             = _mm_sub_pd(iz3,jz3);
253
254             /* Calculate squared distance and things based on it */
255             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
256             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
257             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
258             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
259             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
260             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
261             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
262             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
263             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
264             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
265
266             rinv11           = sse2_invsqrt_d(rsq11);
267             rinv12           = sse2_invsqrt_d(rsq12);
268             rinv13           = sse2_invsqrt_d(rsq13);
269             rinv21           = sse2_invsqrt_d(rsq21);
270             rinv22           = sse2_invsqrt_d(rsq22);
271             rinv23           = sse2_invsqrt_d(rsq23);
272             rinv31           = sse2_invsqrt_d(rsq31);
273             rinv32           = sse2_invsqrt_d(rsq32);
274             rinv33           = sse2_invsqrt_d(rsq33);
275
276             rinvsq00         = sse2_inv_d(rsq00);
277
278             fjx0             = _mm_setzero_pd();
279             fjy0             = _mm_setzero_pd();
280             fjz0             = _mm_setzero_pd();
281             fjx1             = _mm_setzero_pd();
282             fjy1             = _mm_setzero_pd();
283             fjz1             = _mm_setzero_pd();
284             fjx2             = _mm_setzero_pd();
285             fjy2             = _mm_setzero_pd();
286             fjz2             = _mm_setzero_pd();
287             fjx3             = _mm_setzero_pd();
288             fjy3             = _mm_setzero_pd();
289             fjz3             = _mm_setzero_pd();
290
291             /**************************
292              * CALCULATE INTERACTIONS *
293              **************************/
294
295             /* LENNARD-JONES DISPERSION/REPULSION */
296
297             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
298             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
299             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
300             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
301             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
302
303             /* Update potential sum for this i atom from the interaction with this j atom. */
304             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
305
306             fscal            = fvdw;
307
308             /* Calculate temporary vectorial force */
309             tx               = _mm_mul_pd(fscal,dx00);
310             ty               = _mm_mul_pd(fscal,dy00);
311             tz               = _mm_mul_pd(fscal,dz00);
312
313             /* Update vectorial force */
314             fix0             = _mm_add_pd(fix0,tx);
315             fiy0             = _mm_add_pd(fiy0,ty);
316             fiz0             = _mm_add_pd(fiz0,tz);
317
318             fjx0             = _mm_add_pd(fjx0,tx);
319             fjy0             = _mm_add_pd(fjy0,ty);
320             fjz0             = _mm_add_pd(fjz0,tz);
321
322             /**************************
323              * CALCULATE INTERACTIONS *
324              **************************/
325
326             r11              = _mm_mul_pd(rsq11,rinv11);
327
328             /* Calculate table index by multiplying r with table scale and truncate to integer */
329             rt               = _mm_mul_pd(r11,vftabscale);
330             vfitab           = _mm_cvttpd_epi32(rt);
331             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
332             vfitab           = _mm_slli_epi32(vfitab,2);
333
334             /* CUBIC SPLINE TABLE ELECTROSTATICS */
335             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
336             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
337             GMX_MM_TRANSPOSE2_PD(Y,F);
338             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
339             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
340             GMX_MM_TRANSPOSE2_PD(G,H);
341             Heps             = _mm_mul_pd(vfeps,H);
342             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
343             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
344             velec            = _mm_mul_pd(qq11,VV);
345             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
346             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
347
348             /* Update potential sum for this i atom from the interaction with this j atom. */
349             velecsum         = _mm_add_pd(velecsum,velec);
350
351             fscal            = felec;
352
353             /* Calculate temporary vectorial force */
354             tx               = _mm_mul_pd(fscal,dx11);
355             ty               = _mm_mul_pd(fscal,dy11);
356             tz               = _mm_mul_pd(fscal,dz11);
357
358             /* Update vectorial force */
359             fix1             = _mm_add_pd(fix1,tx);
360             fiy1             = _mm_add_pd(fiy1,ty);
361             fiz1             = _mm_add_pd(fiz1,tz);
362
363             fjx1             = _mm_add_pd(fjx1,tx);
364             fjy1             = _mm_add_pd(fjy1,ty);
365             fjz1             = _mm_add_pd(fjz1,tz);
366
367             /**************************
368              * CALCULATE INTERACTIONS *
369              **************************/
370
371             r12              = _mm_mul_pd(rsq12,rinv12);
372
373             /* Calculate table index by multiplying r with table scale and truncate to integer */
374             rt               = _mm_mul_pd(r12,vftabscale);
375             vfitab           = _mm_cvttpd_epi32(rt);
376             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
377             vfitab           = _mm_slli_epi32(vfitab,2);
378
379             /* CUBIC SPLINE TABLE ELECTROSTATICS */
380             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
381             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
382             GMX_MM_TRANSPOSE2_PD(Y,F);
383             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
384             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
385             GMX_MM_TRANSPOSE2_PD(G,H);
386             Heps             = _mm_mul_pd(vfeps,H);
387             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
388             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
389             velec            = _mm_mul_pd(qq12,VV);
390             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
391             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
392
393             /* Update potential sum for this i atom from the interaction with this j atom. */
394             velecsum         = _mm_add_pd(velecsum,velec);
395
396             fscal            = felec;
397
398             /* Calculate temporary vectorial force */
399             tx               = _mm_mul_pd(fscal,dx12);
400             ty               = _mm_mul_pd(fscal,dy12);
401             tz               = _mm_mul_pd(fscal,dz12);
402
403             /* Update vectorial force */
404             fix1             = _mm_add_pd(fix1,tx);
405             fiy1             = _mm_add_pd(fiy1,ty);
406             fiz1             = _mm_add_pd(fiz1,tz);
407
408             fjx2             = _mm_add_pd(fjx2,tx);
409             fjy2             = _mm_add_pd(fjy2,ty);
410             fjz2             = _mm_add_pd(fjz2,tz);
411
412             /**************************
413              * CALCULATE INTERACTIONS *
414              **************************/
415
416             r13              = _mm_mul_pd(rsq13,rinv13);
417
418             /* Calculate table index by multiplying r with table scale and truncate to integer */
419             rt               = _mm_mul_pd(r13,vftabscale);
420             vfitab           = _mm_cvttpd_epi32(rt);
421             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
422             vfitab           = _mm_slli_epi32(vfitab,2);
423
424             /* CUBIC SPLINE TABLE ELECTROSTATICS */
425             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
426             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
427             GMX_MM_TRANSPOSE2_PD(Y,F);
428             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
429             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
430             GMX_MM_TRANSPOSE2_PD(G,H);
431             Heps             = _mm_mul_pd(vfeps,H);
432             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
433             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
434             velec            = _mm_mul_pd(qq13,VV);
435             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
436             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
437
438             /* Update potential sum for this i atom from the interaction with this j atom. */
439             velecsum         = _mm_add_pd(velecsum,velec);
440
441             fscal            = felec;
442
443             /* Calculate temporary vectorial force */
444             tx               = _mm_mul_pd(fscal,dx13);
445             ty               = _mm_mul_pd(fscal,dy13);
446             tz               = _mm_mul_pd(fscal,dz13);
447
448             /* Update vectorial force */
449             fix1             = _mm_add_pd(fix1,tx);
450             fiy1             = _mm_add_pd(fiy1,ty);
451             fiz1             = _mm_add_pd(fiz1,tz);
452
453             fjx3             = _mm_add_pd(fjx3,tx);
454             fjy3             = _mm_add_pd(fjy3,ty);
455             fjz3             = _mm_add_pd(fjz3,tz);
456
457             /**************************
458              * CALCULATE INTERACTIONS *
459              **************************/
460
461             r21              = _mm_mul_pd(rsq21,rinv21);
462
463             /* Calculate table index by multiplying r with table scale and truncate to integer */
464             rt               = _mm_mul_pd(r21,vftabscale);
465             vfitab           = _mm_cvttpd_epi32(rt);
466             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
467             vfitab           = _mm_slli_epi32(vfitab,2);
468
469             /* CUBIC SPLINE TABLE ELECTROSTATICS */
470             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
471             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
472             GMX_MM_TRANSPOSE2_PD(Y,F);
473             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
474             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
475             GMX_MM_TRANSPOSE2_PD(G,H);
476             Heps             = _mm_mul_pd(vfeps,H);
477             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
478             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
479             velec            = _mm_mul_pd(qq21,VV);
480             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
481             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
482
483             /* Update potential sum for this i atom from the interaction with this j atom. */
484             velecsum         = _mm_add_pd(velecsum,velec);
485
486             fscal            = felec;
487
488             /* Calculate temporary vectorial force */
489             tx               = _mm_mul_pd(fscal,dx21);
490             ty               = _mm_mul_pd(fscal,dy21);
491             tz               = _mm_mul_pd(fscal,dz21);
492
493             /* Update vectorial force */
494             fix2             = _mm_add_pd(fix2,tx);
495             fiy2             = _mm_add_pd(fiy2,ty);
496             fiz2             = _mm_add_pd(fiz2,tz);
497
498             fjx1             = _mm_add_pd(fjx1,tx);
499             fjy1             = _mm_add_pd(fjy1,ty);
500             fjz1             = _mm_add_pd(fjz1,tz);
501
502             /**************************
503              * CALCULATE INTERACTIONS *
504              **************************/
505
506             r22              = _mm_mul_pd(rsq22,rinv22);
507
508             /* Calculate table index by multiplying r with table scale and truncate to integer */
509             rt               = _mm_mul_pd(r22,vftabscale);
510             vfitab           = _mm_cvttpd_epi32(rt);
511             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
512             vfitab           = _mm_slli_epi32(vfitab,2);
513
514             /* CUBIC SPLINE TABLE ELECTROSTATICS */
515             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
516             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
517             GMX_MM_TRANSPOSE2_PD(Y,F);
518             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
519             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
520             GMX_MM_TRANSPOSE2_PD(G,H);
521             Heps             = _mm_mul_pd(vfeps,H);
522             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
523             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
524             velec            = _mm_mul_pd(qq22,VV);
525             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
526             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
527
528             /* Update potential sum for this i atom from the interaction with this j atom. */
529             velecsum         = _mm_add_pd(velecsum,velec);
530
531             fscal            = felec;
532
533             /* Calculate temporary vectorial force */
534             tx               = _mm_mul_pd(fscal,dx22);
535             ty               = _mm_mul_pd(fscal,dy22);
536             tz               = _mm_mul_pd(fscal,dz22);
537
538             /* Update vectorial force */
539             fix2             = _mm_add_pd(fix2,tx);
540             fiy2             = _mm_add_pd(fiy2,ty);
541             fiz2             = _mm_add_pd(fiz2,tz);
542
543             fjx2             = _mm_add_pd(fjx2,tx);
544             fjy2             = _mm_add_pd(fjy2,ty);
545             fjz2             = _mm_add_pd(fjz2,tz);
546
547             /**************************
548              * CALCULATE INTERACTIONS *
549              **************************/
550
551             r23              = _mm_mul_pd(rsq23,rinv23);
552
553             /* Calculate table index by multiplying r with table scale and truncate to integer */
554             rt               = _mm_mul_pd(r23,vftabscale);
555             vfitab           = _mm_cvttpd_epi32(rt);
556             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
557             vfitab           = _mm_slli_epi32(vfitab,2);
558
559             /* CUBIC SPLINE TABLE ELECTROSTATICS */
560             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
561             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
562             GMX_MM_TRANSPOSE2_PD(Y,F);
563             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
564             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
565             GMX_MM_TRANSPOSE2_PD(G,H);
566             Heps             = _mm_mul_pd(vfeps,H);
567             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
568             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
569             velec            = _mm_mul_pd(qq23,VV);
570             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
571             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
572
573             /* Update potential sum for this i atom from the interaction with this j atom. */
574             velecsum         = _mm_add_pd(velecsum,velec);
575
576             fscal            = felec;
577
578             /* Calculate temporary vectorial force */
579             tx               = _mm_mul_pd(fscal,dx23);
580             ty               = _mm_mul_pd(fscal,dy23);
581             tz               = _mm_mul_pd(fscal,dz23);
582
583             /* Update vectorial force */
584             fix2             = _mm_add_pd(fix2,tx);
585             fiy2             = _mm_add_pd(fiy2,ty);
586             fiz2             = _mm_add_pd(fiz2,tz);
587
588             fjx3             = _mm_add_pd(fjx3,tx);
589             fjy3             = _mm_add_pd(fjy3,ty);
590             fjz3             = _mm_add_pd(fjz3,tz);
591
592             /**************************
593              * CALCULATE INTERACTIONS *
594              **************************/
595
596             r31              = _mm_mul_pd(rsq31,rinv31);
597
598             /* Calculate table index by multiplying r with table scale and truncate to integer */
599             rt               = _mm_mul_pd(r31,vftabscale);
600             vfitab           = _mm_cvttpd_epi32(rt);
601             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
602             vfitab           = _mm_slli_epi32(vfitab,2);
603
604             /* CUBIC SPLINE TABLE ELECTROSTATICS */
605             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
606             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
607             GMX_MM_TRANSPOSE2_PD(Y,F);
608             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
609             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
610             GMX_MM_TRANSPOSE2_PD(G,H);
611             Heps             = _mm_mul_pd(vfeps,H);
612             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
613             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
614             velec            = _mm_mul_pd(qq31,VV);
615             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
616             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
617
618             /* Update potential sum for this i atom from the interaction with this j atom. */
619             velecsum         = _mm_add_pd(velecsum,velec);
620
621             fscal            = felec;
622
623             /* Calculate temporary vectorial force */
624             tx               = _mm_mul_pd(fscal,dx31);
625             ty               = _mm_mul_pd(fscal,dy31);
626             tz               = _mm_mul_pd(fscal,dz31);
627
628             /* Update vectorial force */
629             fix3             = _mm_add_pd(fix3,tx);
630             fiy3             = _mm_add_pd(fiy3,ty);
631             fiz3             = _mm_add_pd(fiz3,tz);
632
633             fjx1             = _mm_add_pd(fjx1,tx);
634             fjy1             = _mm_add_pd(fjy1,ty);
635             fjz1             = _mm_add_pd(fjz1,tz);
636
637             /**************************
638              * CALCULATE INTERACTIONS *
639              **************************/
640
641             r32              = _mm_mul_pd(rsq32,rinv32);
642
643             /* Calculate table index by multiplying r with table scale and truncate to integer */
644             rt               = _mm_mul_pd(r32,vftabscale);
645             vfitab           = _mm_cvttpd_epi32(rt);
646             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
647             vfitab           = _mm_slli_epi32(vfitab,2);
648
649             /* CUBIC SPLINE TABLE ELECTROSTATICS */
650             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
651             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
652             GMX_MM_TRANSPOSE2_PD(Y,F);
653             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
654             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
655             GMX_MM_TRANSPOSE2_PD(G,H);
656             Heps             = _mm_mul_pd(vfeps,H);
657             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
658             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
659             velec            = _mm_mul_pd(qq32,VV);
660             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
661             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
662
663             /* Update potential sum for this i atom from the interaction with this j atom. */
664             velecsum         = _mm_add_pd(velecsum,velec);
665
666             fscal            = felec;
667
668             /* Calculate temporary vectorial force */
669             tx               = _mm_mul_pd(fscal,dx32);
670             ty               = _mm_mul_pd(fscal,dy32);
671             tz               = _mm_mul_pd(fscal,dz32);
672
673             /* Update vectorial force */
674             fix3             = _mm_add_pd(fix3,tx);
675             fiy3             = _mm_add_pd(fiy3,ty);
676             fiz3             = _mm_add_pd(fiz3,tz);
677
678             fjx2             = _mm_add_pd(fjx2,tx);
679             fjy2             = _mm_add_pd(fjy2,ty);
680             fjz2             = _mm_add_pd(fjz2,tz);
681
682             /**************************
683              * CALCULATE INTERACTIONS *
684              **************************/
685
686             r33              = _mm_mul_pd(rsq33,rinv33);
687
688             /* Calculate table index by multiplying r with table scale and truncate to integer */
689             rt               = _mm_mul_pd(r33,vftabscale);
690             vfitab           = _mm_cvttpd_epi32(rt);
691             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
692             vfitab           = _mm_slli_epi32(vfitab,2);
693
694             /* CUBIC SPLINE TABLE ELECTROSTATICS */
695             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
696             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
697             GMX_MM_TRANSPOSE2_PD(Y,F);
698             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
699             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
700             GMX_MM_TRANSPOSE2_PD(G,H);
701             Heps             = _mm_mul_pd(vfeps,H);
702             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
703             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
704             velec            = _mm_mul_pd(qq33,VV);
705             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
706             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
707
708             /* Update potential sum for this i atom from the interaction with this j atom. */
709             velecsum         = _mm_add_pd(velecsum,velec);
710
711             fscal            = felec;
712
713             /* Calculate temporary vectorial force */
714             tx               = _mm_mul_pd(fscal,dx33);
715             ty               = _mm_mul_pd(fscal,dy33);
716             tz               = _mm_mul_pd(fscal,dz33);
717
718             /* Update vectorial force */
719             fix3             = _mm_add_pd(fix3,tx);
720             fiy3             = _mm_add_pd(fiy3,ty);
721             fiz3             = _mm_add_pd(fiz3,tz);
722
723             fjx3             = _mm_add_pd(fjx3,tx);
724             fjy3             = _mm_add_pd(fjy3,ty);
725             fjz3             = _mm_add_pd(fjz3,tz);
726
727             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
728
729             /* Inner loop uses 422 flops */
730         }
731
732         if(jidx<j_index_end)
733         {
734
735             jnrA             = jjnr[jidx];
736             j_coord_offsetA  = DIM*jnrA;
737
738             /* load j atom coordinates */
739             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
740                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
741                                               &jy2,&jz2,&jx3,&jy3,&jz3);
742
743             /* Calculate displacement vector */
744             dx00             = _mm_sub_pd(ix0,jx0);
745             dy00             = _mm_sub_pd(iy0,jy0);
746             dz00             = _mm_sub_pd(iz0,jz0);
747             dx11             = _mm_sub_pd(ix1,jx1);
748             dy11             = _mm_sub_pd(iy1,jy1);
749             dz11             = _mm_sub_pd(iz1,jz1);
750             dx12             = _mm_sub_pd(ix1,jx2);
751             dy12             = _mm_sub_pd(iy1,jy2);
752             dz12             = _mm_sub_pd(iz1,jz2);
753             dx13             = _mm_sub_pd(ix1,jx3);
754             dy13             = _mm_sub_pd(iy1,jy3);
755             dz13             = _mm_sub_pd(iz1,jz3);
756             dx21             = _mm_sub_pd(ix2,jx1);
757             dy21             = _mm_sub_pd(iy2,jy1);
758             dz21             = _mm_sub_pd(iz2,jz1);
759             dx22             = _mm_sub_pd(ix2,jx2);
760             dy22             = _mm_sub_pd(iy2,jy2);
761             dz22             = _mm_sub_pd(iz2,jz2);
762             dx23             = _mm_sub_pd(ix2,jx3);
763             dy23             = _mm_sub_pd(iy2,jy3);
764             dz23             = _mm_sub_pd(iz2,jz3);
765             dx31             = _mm_sub_pd(ix3,jx1);
766             dy31             = _mm_sub_pd(iy3,jy1);
767             dz31             = _mm_sub_pd(iz3,jz1);
768             dx32             = _mm_sub_pd(ix3,jx2);
769             dy32             = _mm_sub_pd(iy3,jy2);
770             dz32             = _mm_sub_pd(iz3,jz2);
771             dx33             = _mm_sub_pd(ix3,jx3);
772             dy33             = _mm_sub_pd(iy3,jy3);
773             dz33             = _mm_sub_pd(iz3,jz3);
774
775             /* Calculate squared distance and things based on it */
776             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
777             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
778             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
779             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
780             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
781             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
782             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
783             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
784             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
785             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
786
787             rinv11           = sse2_invsqrt_d(rsq11);
788             rinv12           = sse2_invsqrt_d(rsq12);
789             rinv13           = sse2_invsqrt_d(rsq13);
790             rinv21           = sse2_invsqrt_d(rsq21);
791             rinv22           = sse2_invsqrt_d(rsq22);
792             rinv23           = sse2_invsqrt_d(rsq23);
793             rinv31           = sse2_invsqrt_d(rsq31);
794             rinv32           = sse2_invsqrt_d(rsq32);
795             rinv33           = sse2_invsqrt_d(rsq33);
796
797             rinvsq00         = sse2_inv_d(rsq00);
798
799             fjx0             = _mm_setzero_pd();
800             fjy0             = _mm_setzero_pd();
801             fjz0             = _mm_setzero_pd();
802             fjx1             = _mm_setzero_pd();
803             fjy1             = _mm_setzero_pd();
804             fjz1             = _mm_setzero_pd();
805             fjx2             = _mm_setzero_pd();
806             fjy2             = _mm_setzero_pd();
807             fjz2             = _mm_setzero_pd();
808             fjx3             = _mm_setzero_pd();
809             fjy3             = _mm_setzero_pd();
810             fjz3             = _mm_setzero_pd();
811
812             /**************************
813              * CALCULATE INTERACTIONS *
814              **************************/
815
816             /* LENNARD-JONES DISPERSION/REPULSION */
817
818             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
819             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
820             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
821             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
822             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
823
824             /* Update potential sum for this i atom from the interaction with this j atom. */
825             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
826             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
827
828             fscal            = fvdw;
829
830             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
831
832             /* Calculate temporary vectorial force */
833             tx               = _mm_mul_pd(fscal,dx00);
834             ty               = _mm_mul_pd(fscal,dy00);
835             tz               = _mm_mul_pd(fscal,dz00);
836
837             /* Update vectorial force */
838             fix0             = _mm_add_pd(fix0,tx);
839             fiy0             = _mm_add_pd(fiy0,ty);
840             fiz0             = _mm_add_pd(fiz0,tz);
841
842             fjx0             = _mm_add_pd(fjx0,tx);
843             fjy0             = _mm_add_pd(fjy0,ty);
844             fjz0             = _mm_add_pd(fjz0,tz);
845
846             /**************************
847              * CALCULATE INTERACTIONS *
848              **************************/
849
850             r11              = _mm_mul_pd(rsq11,rinv11);
851
852             /* Calculate table index by multiplying r with table scale and truncate to integer */
853             rt               = _mm_mul_pd(r11,vftabscale);
854             vfitab           = _mm_cvttpd_epi32(rt);
855             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
856             vfitab           = _mm_slli_epi32(vfitab,2);
857
858             /* CUBIC SPLINE TABLE ELECTROSTATICS */
859             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
860             F                = _mm_setzero_pd();
861             GMX_MM_TRANSPOSE2_PD(Y,F);
862             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
863             H                = _mm_setzero_pd();
864             GMX_MM_TRANSPOSE2_PD(G,H);
865             Heps             = _mm_mul_pd(vfeps,H);
866             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
867             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
868             velec            = _mm_mul_pd(qq11,VV);
869             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
870             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
871
872             /* Update potential sum for this i atom from the interaction with this j atom. */
873             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
874             velecsum         = _mm_add_pd(velecsum,velec);
875
876             fscal            = felec;
877
878             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
879
880             /* Calculate temporary vectorial force */
881             tx               = _mm_mul_pd(fscal,dx11);
882             ty               = _mm_mul_pd(fscal,dy11);
883             tz               = _mm_mul_pd(fscal,dz11);
884
885             /* Update vectorial force */
886             fix1             = _mm_add_pd(fix1,tx);
887             fiy1             = _mm_add_pd(fiy1,ty);
888             fiz1             = _mm_add_pd(fiz1,tz);
889
890             fjx1             = _mm_add_pd(fjx1,tx);
891             fjy1             = _mm_add_pd(fjy1,ty);
892             fjz1             = _mm_add_pd(fjz1,tz);
893
894             /**************************
895              * CALCULATE INTERACTIONS *
896              **************************/
897
898             r12              = _mm_mul_pd(rsq12,rinv12);
899
900             /* Calculate table index by multiplying r with table scale and truncate to integer */
901             rt               = _mm_mul_pd(r12,vftabscale);
902             vfitab           = _mm_cvttpd_epi32(rt);
903             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
904             vfitab           = _mm_slli_epi32(vfitab,2);
905
906             /* CUBIC SPLINE TABLE ELECTROSTATICS */
907             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
908             F                = _mm_setzero_pd();
909             GMX_MM_TRANSPOSE2_PD(Y,F);
910             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
911             H                = _mm_setzero_pd();
912             GMX_MM_TRANSPOSE2_PD(G,H);
913             Heps             = _mm_mul_pd(vfeps,H);
914             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
915             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
916             velec            = _mm_mul_pd(qq12,VV);
917             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
918             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
919
920             /* Update potential sum for this i atom from the interaction with this j atom. */
921             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
922             velecsum         = _mm_add_pd(velecsum,velec);
923
924             fscal            = felec;
925
926             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
927
928             /* Calculate temporary vectorial force */
929             tx               = _mm_mul_pd(fscal,dx12);
930             ty               = _mm_mul_pd(fscal,dy12);
931             tz               = _mm_mul_pd(fscal,dz12);
932
933             /* Update vectorial force */
934             fix1             = _mm_add_pd(fix1,tx);
935             fiy1             = _mm_add_pd(fiy1,ty);
936             fiz1             = _mm_add_pd(fiz1,tz);
937
938             fjx2             = _mm_add_pd(fjx2,tx);
939             fjy2             = _mm_add_pd(fjy2,ty);
940             fjz2             = _mm_add_pd(fjz2,tz);
941
942             /**************************
943              * CALCULATE INTERACTIONS *
944              **************************/
945
946             r13              = _mm_mul_pd(rsq13,rinv13);
947
948             /* Calculate table index by multiplying r with table scale and truncate to integer */
949             rt               = _mm_mul_pd(r13,vftabscale);
950             vfitab           = _mm_cvttpd_epi32(rt);
951             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
952             vfitab           = _mm_slli_epi32(vfitab,2);
953
954             /* CUBIC SPLINE TABLE ELECTROSTATICS */
955             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
956             F                = _mm_setzero_pd();
957             GMX_MM_TRANSPOSE2_PD(Y,F);
958             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
959             H                = _mm_setzero_pd();
960             GMX_MM_TRANSPOSE2_PD(G,H);
961             Heps             = _mm_mul_pd(vfeps,H);
962             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
963             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
964             velec            = _mm_mul_pd(qq13,VV);
965             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
966             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
967
968             /* Update potential sum for this i atom from the interaction with this j atom. */
969             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
970             velecsum         = _mm_add_pd(velecsum,velec);
971
972             fscal            = felec;
973
974             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
975
976             /* Calculate temporary vectorial force */
977             tx               = _mm_mul_pd(fscal,dx13);
978             ty               = _mm_mul_pd(fscal,dy13);
979             tz               = _mm_mul_pd(fscal,dz13);
980
981             /* Update vectorial force */
982             fix1             = _mm_add_pd(fix1,tx);
983             fiy1             = _mm_add_pd(fiy1,ty);
984             fiz1             = _mm_add_pd(fiz1,tz);
985
986             fjx3             = _mm_add_pd(fjx3,tx);
987             fjy3             = _mm_add_pd(fjy3,ty);
988             fjz3             = _mm_add_pd(fjz3,tz);
989
990             /**************************
991              * CALCULATE INTERACTIONS *
992              **************************/
993
994             r21              = _mm_mul_pd(rsq21,rinv21);
995
996             /* Calculate table index by multiplying r with table scale and truncate to integer */
997             rt               = _mm_mul_pd(r21,vftabscale);
998             vfitab           = _mm_cvttpd_epi32(rt);
999             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1000             vfitab           = _mm_slli_epi32(vfitab,2);
1001
1002             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1003             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1004             F                = _mm_setzero_pd();
1005             GMX_MM_TRANSPOSE2_PD(Y,F);
1006             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1007             H                = _mm_setzero_pd();
1008             GMX_MM_TRANSPOSE2_PD(G,H);
1009             Heps             = _mm_mul_pd(vfeps,H);
1010             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1011             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1012             velec            = _mm_mul_pd(qq21,VV);
1013             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1014             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1015
1016             /* Update potential sum for this i atom from the interaction with this j atom. */
1017             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1018             velecsum         = _mm_add_pd(velecsum,velec);
1019
1020             fscal            = felec;
1021
1022             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1023
1024             /* Calculate temporary vectorial force */
1025             tx               = _mm_mul_pd(fscal,dx21);
1026             ty               = _mm_mul_pd(fscal,dy21);
1027             tz               = _mm_mul_pd(fscal,dz21);
1028
1029             /* Update vectorial force */
1030             fix2             = _mm_add_pd(fix2,tx);
1031             fiy2             = _mm_add_pd(fiy2,ty);
1032             fiz2             = _mm_add_pd(fiz2,tz);
1033
1034             fjx1             = _mm_add_pd(fjx1,tx);
1035             fjy1             = _mm_add_pd(fjy1,ty);
1036             fjz1             = _mm_add_pd(fjz1,tz);
1037
1038             /**************************
1039              * CALCULATE INTERACTIONS *
1040              **************************/
1041
1042             r22              = _mm_mul_pd(rsq22,rinv22);
1043
1044             /* Calculate table index by multiplying r with table scale and truncate to integer */
1045             rt               = _mm_mul_pd(r22,vftabscale);
1046             vfitab           = _mm_cvttpd_epi32(rt);
1047             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1048             vfitab           = _mm_slli_epi32(vfitab,2);
1049
1050             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1051             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1052             F                = _mm_setzero_pd();
1053             GMX_MM_TRANSPOSE2_PD(Y,F);
1054             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1055             H                = _mm_setzero_pd();
1056             GMX_MM_TRANSPOSE2_PD(G,H);
1057             Heps             = _mm_mul_pd(vfeps,H);
1058             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1059             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1060             velec            = _mm_mul_pd(qq22,VV);
1061             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1062             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1063
1064             /* Update potential sum for this i atom from the interaction with this j atom. */
1065             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1066             velecsum         = _mm_add_pd(velecsum,velec);
1067
1068             fscal            = felec;
1069
1070             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1071
1072             /* Calculate temporary vectorial force */
1073             tx               = _mm_mul_pd(fscal,dx22);
1074             ty               = _mm_mul_pd(fscal,dy22);
1075             tz               = _mm_mul_pd(fscal,dz22);
1076
1077             /* Update vectorial force */
1078             fix2             = _mm_add_pd(fix2,tx);
1079             fiy2             = _mm_add_pd(fiy2,ty);
1080             fiz2             = _mm_add_pd(fiz2,tz);
1081
1082             fjx2             = _mm_add_pd(fjx2,tx);
1083             fjy2             = _mm_add_pd(fjy2,ty);
1084             fjz2             = _mm_add_pd(fjz2,tz);
1085
1086             /**************************
1087              * CALCULATE INTERACTIONS *
1088              **************************/
1089
1090             r23              = _mm_mul_pd(rsq23,rinv23);
1091
1092             /* Calculate table index by multiplying r with table scale and truncate to integer */
1093             rt               = _mm_mul_pd(r23,vftabscale);
1094             vfitab           = _mm_cvttpd_epi32(rt);
1095             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1096             vfitab           = _mm_slli_epi32(vfitab,2);
1097
1098             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1099             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1100             F                = _mm_setzero_pd();
1101             GMX_MM_TRANSPOSE2_PD(Y,F);
1102             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1103             H                = _mm_setzero_pd();
1104             GMX_MM_TRANSPOSE2_PD(G,H);
1105             Heps             = _mm_mul_pd(vfeps,H);
1106             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1107             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1108             velec            = _mm_mul_pd(qq23,VV);
1109             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1110             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1111
1112             /* Update potential sum for this i atom from the interaction with this j atom. */
1113             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1114             velecsum         = _mm_add_pd(velecsum,velec);
1115
1116             fscal            = felec;
1117
1118             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1119
1120             /* Calculate temporary vectorial force */
1121             tx               = _mm_mul_pd(fscal,dx23);
1122             ty               = _mm_mul_pd(fscal,dy23);
1123             tz               = _mm_mul_pd(fscal,dz23);
1124
1125             /* Update vectorial force */
1126             fix2             = _mm_add_pd(fix2,tx);
1127             fiy2             = _mm_add_pd(fiy2,ty);
1128             fiz2             = _mm_add_pd(fiz2,tz);
1129
1130             fjx3             = _mm_add_pd(fjx3,tx);
1131             fjy3             = _mm_add_pd(fjy3,ty);
1132             fjz3             = _mm_add_pd(fjz3,tz);
1133
1134             /**************************
1135              * CALCULATE INTERACTIONS *
1136              **************************/
1137
1138             r31              = _mm_mul_pd(rsq31,rinv31);
1139
1140             /* Calculate table index by multiplying r with table scale and truncate to integer */
1141             rt               = _mm_mul_pd(r31,vftabscale);
1142             vfitab           = _mm_cvttpd_epi32(rt);
1143             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1144             vfitab           = _mm_slli_epi32(vfitab,2);
1145
1146             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1147             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1148             F                = _mm_setzero_pd();
1149             GMX_MM_TRANSPOSE2_PD(Y,F);
1150             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1151             H                = _mm_setzero_pd();
1152             GMX_MM_TRANSPOSE2_PD(G,H);
1153             Heps             = _mm_mul_pd(vfeps,H);
1154             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1155             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1156             velec            = _mm_mul_pd(qq31,VV);
1157             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1158             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1159
1160             /* Update potential sum for this i atom from the interaction with this j atom. */
1161             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1162             velecsum         = _mm_add_pd(velecsum,velec);
1163
1164             fscal            = felec;
1165
1166             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1167
1168             /* Calculate temporary vectorial force */
1169             tx               = _mm_mul_pd(fscal,dx31);
1170             ty               = _mm_mul_pd(fscal,dy31);
1171             tz               = _mm_mul_pd(fscal,dz31);
1172
1173             /* Update vectorial force */
1174             fix3             = _mm_add_pd(fix3,tx);
1175             fiy3             = _mm_add_pd(fiy3,ty);
1176             fiz3             = _mm_add_pd(fiz3,tz);
1177
1178             fjx1             = _mm_add_pd(fjx1,tx);
1179             fjy1             = _mm_add_pd(fjy1,ty);
1180             fjz1             = _mm_add_pd(fjz1,tz);
1181
1182             /**************************
1183              * CALCULATE INTERACTIONS *
1184              **************************/
1185
1186             r32              = _mm_mul_pd(rsq32,rinv32);
1187
1188             /* Calculate table index by multiplying r with table scale and truncate to integer */
1189             rt               = _mm_mul_pd(r32,vftabscale);
1190             vfitab           = _mm_cvttpd_epi32(rt);
1191             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1192             vfitab           = _mm_slli_epi32(vfitab,2);
1193
1194             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1195             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1196             F                = _mm_setzero_pd();
1197             GMX_MM_TRANSPOSE2_PD(Y,F);
1198             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1199             H                = _mm_setzero_pd();
1200             GMX_MM_TRANSPOSE2_PD(G,H);
1201             Heps             = _mm_mul_pd(vfeps,H);
1202             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1203             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1204             velec            = _mm_mul_pd(qq32,VV);
1205             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1206             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1207
1208             /* Update potential sum for this i atom from the interaction with this j atom. */
1209             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1210             velecsum         = _mm_add_pd(velecsum,velec);
1211
1212             fscal            = felec;
1213
1214             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1215
1216             /* Calculate temporary vectorial force */
1217             tx               = _mm_mul_pd(fscal,dx32);
1218             ty               = _mm_mul_pd(fscal,dy32);
1219             tz               = _mm_mul_pd(fscal,dz32);
1220
1221             /* Update vectorial force */
1222             fix3             = _mm_add_pd(fix3,tx);
1223             fiy3             = _mm_add_pd(fiy3,ty);
1224             fiz3             = _mm_add_pd(fiz3,tz);
1225
1226             fjx2             = _mm_add_pd(fjx2,tx);
1227             fjy2             = _mm_add_pd(fjy2,ty);
1228             fjz2             = _mm_add_pd(fjz2,tz);
1229
1230             /**************************
1231              * CALCULATE INTERACTIONS *
1232              **************************/
1233
1234             r33              = _mm_mul_pd(rsq33,rinv33);
1235
1236             /* Calculate table index by multiplying r with table scale and truncate to integer */
1237             rt               = _mm_mul_pd(r33,vftabscale);
1238             vfitab           = _mm_cvttpd_epi32(rt);
1239             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1240             vfitab           = _mm_slli_epi32(vfitab,2);
1241
1242             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1243             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1244             F                = _mm_setzero_pd();
1245             GMX_MM_TRANSPOSE2_PD(Y,F);
1246             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1247             H                = _mm_setzero_pd();
1248             GMX_MM_TRANSPOSE2_PD(G,H);
1249             Heps             = _mm_mul_pd(vfeps,H);
1250             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1251             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1252             velec            = _mm_mul_pd(qq33,VV);
1253             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1254             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1255
1256             /* Update potential sum for this i atom from the interaction with this j atom. */
1257             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1258             velecsum         = _mm_add_pd(velecsum,velec);
1259
1260             fscal            = felec;
1261
1262             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1263
1264             /* Calculate temporary vectorial force */
1265             tx               = _mm_mul_pd(fscal,dx33);
1266             ty               = _mm_mul_pd(fscal,dy33);
1267             tz               = _mm_mul_pd(fscal,dz33);
1268
1269             /* Update vectorial force */
1270             fix3             = _mm_add_pd(fix3,tx);
1271             fiy3             = _mm_add_pd(fiy3,ty);
1272             fiz3             = _mm_add_pd(fiz3,tz);
1273
1274             fjx3             = _mm_add_pd(fjx3,tx);
1275             fjy3             = _mm_add_pd(fjy3,ty);
1276             fjz3             = _mm_add_pd(fjz3,tz);
1277
1278             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1279
1280             /* Inner loop uses 422 flops */
1281         }
1282
1283         /* End of innermost loop */
1284
1285         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1286                                               f+i_coord_offset,fshift+i_shift_offset);
1287
1288         ggid                        = gid[iidx];
1289         /* Update potential energies */
1290         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1291         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1292
1293         /* Increment number of inner iterations */
1294         inneriter                  += j_index_end - j_index_start;
1295
1296         /* Outer loop uses 26 flops */
1297     }
1298
1299     /* Increment number of outer iterations */
1300     outeriter        += nri;
1301
1302     /* Update outer/inner flops */
1303
1304     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*422);
1305 }
1306 /*
1307  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double
1308  * Electrostatics interaction: CubicSplineTable
1309  * VdW interaction:            LennardJones
1310  * Geometry:                   Water4-Water4
1311  * Calculate force/pot:        Force
1312  */
1313 void
1314 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double
1315                     (t_nblist                    * gmx_restrict       nlist,
1316                      rvec                        * gmx_restrict          xx,
1317                      rvec                        * gmx_restrict          ff,
1318                      struct t_forcerec           * gmx_restrict          fr,
1319                      t_mdatoms                   * gmx_restrict     mdatoms,
1320                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1321                      t_nrnb                      * gmx_restrict        nrnb)
1322 {
1323     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1324      * just 0 for non-waters.
1325      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1326      * jnr indices corresponding to data put in the four positions in the SIMD register.
1327      */
1328     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1329     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1330     int              jnrA,jnrB;
1331     int              j_coord_offsetA,j_coord_offsetB;
1332     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1333     real             rcutoff_scalar;
1334     real             *shiftvec,*fshift,*x,*f;
1335     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1336     int              vdwioffset0;
1337     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1338     int              vdwioffset1;
1339     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1340     int              vdwioffset2;
1341     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1342     int              vdwioffset3;
1343     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1344     int              vdwjidx0A,vdwjidx0B;
1345     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1346     int              vdwjidx1A,vdwjidx1B;
1347     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1348     int              vdwjidx2A,vdwjidx2B;
1349     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1350     int              vdwjidx3A,vdwjidx3B;
1351     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1352     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1353     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1354     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1355     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1356     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1357     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1358     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1359     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1360     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1361     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1362     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1363     real             *charge;
1364     int              nvdwtype;
1365     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1366     int              *vdwtype;
1367     real             *vdwparam;
1368     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1369     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1370     __m128i          vfitab;
1371     __m128i          ifour       = _mm_set1_epi32(4);
1372     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1373     real             *vftab;
1374     __m128d          dummy_mask,cutoff_mask;
1375     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1376     __m128d          one     = _mm_set1_pd(1.0);
1377     __m128d          two     = _mm_set1_pd(2.0);
1378     x                = xx[0];
1379     f                = ff[0];
1380
1381     nri              = nlist->nri;
1382     iinr             = nlist->iinr;
1383     jindex           = nlist->jindex;
1384     jjnr             = nlist->jjnr;
1385     shiftidx         = nlist->shift;
1386     gid              = nlist->gid;
1387     shiftvec         = fr->shift_vec[0];
1388     fshift           = fr->fshift[0];
1389     facel            = _mm_set1_pd(fr->ic->epsfac);
1390     charge           = mdatoms->chargeA;
1391     nvdwtype         = fr->ntype;
1392     vdwparam         = fr->nbfp;
1393     vdwtype          = mdatoms->typeA;
1394
1395     vftab            = kernel_data->table_elec->data;
1396     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
1397
1398     /* Setup water-specific parameters */
1399     inr              = nlist->iinr[0];
1400     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1401     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1402     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1403     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1404
1405     jq1              = _mm_set1_pd(charge[inr+1]);
1406     jq2              = _mm_set1_pd(charge[inr+2]);
1407     jq3              = _mm_set1_pd(charge[inr+3]);
1408     vdwjidx0A        = 2*vdwtype[inr+0];
1409     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1410     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1411     qq11             = _mm_mul_pd(iq1,jq1);
1412     qq12             = _mm_mul_pd(iq1,jq2);
1413     qq13             = _mm_mul_pd(iq1,jq3);
1414     qq21             = _mm_mul_pd(iq2,jq1);
1415     qq22             = _mm_mul_pd(iq2,jq2);
1416     qq23             = _mm_mul_pd(iq2,jq3);
1417     qq31             = _mm_mul_pd(iq3,jq1);
1418     qq32             = _mm_mul_pd(iq3,jq2);
1419     qq33             = _mm_mul_pd(iq3,jq3);
1420
1421     /* Avoid stupid compiler warnings */
1422     jnrA = jnrB = 0;
1423     j_coord_offsetA = 0;
1424     j_coord_offsetB = 0;
1425
1426     outeriter        = 0;
1427     inneriter        = 0;
1428
1429     /* Start outer loop over neighborlists */
1430     for(iidx=0; iidx<nri; iidx++)
1431     {
1432         /* Load shift vector for this list */
1433         i_shift_offset   = DIM*shiftidx[iidx];
1434
1435         /* Load limits for loop over neighbors */
1436         j_index_start    = jindex[iidx];
1437         j_index_end      = jindex[iidx+1];
1438
1439         /* Get outer coordinate index */
1440         inr              = iinr[iidx];
1441         i_coord_offset   = DIM*inr;
1442
1443         /* Load i particle coords and add shift vector */
1444         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1445                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1446
1447         fix0             = _mm_setzero_pd();
1448         fiy0             = _mm_setzero_pd();
1449         fiz0             = _mm_setzero_pd();
1450         fix1             = _mm_setzero_pd();
1451         fiy1             = _mm_setzero_pd();
1452         fiz1             = _mm_setzero_pd();
1453         fix2             = _mm_setzero_pd();
1454         fiy2             = _mm_setzero_pd();
1455         fiz2             = _mm_setzero_pd();
1456         fix3             = _mm_setzero_pd();
1457         fiy3             = _mm_setzero_pd();
1458         fiz3             = _mm_setzero_pd();
1459
1460         /* Start inner kernel loop */
1461         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1462         {
1463
1464             /* Get j neighbor index, and coordinate index */
1465             jnrA             = jjnr[jidx];
1466             jnrB             = jjnr[jidx+1];
1467             j_coord_offsetA  = DIM*jnrA;
1468             j_coord_offsetB  = DIM*jnrB;
1469
1470             /* load j atom coordinates */
1471             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1472                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1473                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1474
1475             /* Calculate displacement vector */
1476             dx00             = _mm_sub_pd(ix0,jx0);
1477             dy00             = _mm_sub_pd(iy0,jy0);
1478             dz00             = _mm_sub_pd(iz0,jz0);
1479             dx11             = _mm_sub_pd(ix1,jx1);
1480             dy11             = _mm_sub_pd(iy1,jy1);
1481             dz11             = _mm_sub_pd(iz1,jz1);
1482             dx12             = _mm_sub_pd(ix1,jx2);
1483             dy12             = _mm_sub_pd(iy1,jy2);
1484             dz12             = _mm_sub_pd(iz1,jz2);
1485             dx13             = _mm_sub_pd(ix1,jx3);
1486             dy13             = _mm_sub_pd(iy1,jy3);
1487             dz13             = _mm_sub_pd(iz1,jz3);
1488             dx21             = _mm_sub_pd(ix2,jx1);
1489             dy21             = _mm_sub_pd(iy2,jy1);
1490             dz21             = _mm_sub_pd(iz2,jz1);
1491             dx22             = _mm_sub_pd(ix2,jx2);
1492             dy22             = _mm_sub_pd(iy2,jy2);
1493             dz22             = _mm_sub_pd(iz2,jz2);
1494             dx23             = _mm_sub_pd(ix2,jx3);
1495             dy23             = _mm_sub_pd(iy2,jy3);
1496             dz23             = _mm_sub_pd(iz2,jz3);
1497             dx31             = _mm_sub_pd(ix3,jx1);
1498             dy31             = _mm_sub_pd(iy3,jy1);
1499             dz31             = _mm_sub_pd(iz3,jz1);
1500             dx32             = _mm_sub_pd(ix3,jx2);
1501             dy32             = _mm_sub_pd(iy3,jy2);
1502             dz32             = _mm_sub_pd(iz3,jz2);
1503             dx33             = _mm_sub_pd(ix3,jx3);
1504             dy33             = _mm_sub_pd(iy3,jy3);
1505             dz33             = _mm_sub_pd(iz3,jz3);
1506
1507             /* Calculate squared distance and things based on it */
1508             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1509             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1510             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1511             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1512             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1513             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1514             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1515             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1516             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1517             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1518
1519             rinv11           = sse2_invsqrt_d(rsq11);
1520             rinv12           = sse2_invsqrt_d(rsq12);
1521             rinv13           = sse2_invsqrt_d(rsq13);
1522             rinv21           = sse2_invsqrt_d(rsq21);
1523             rinv22           = sse2_invsqrt_d(rsq22);
1524             rinv23           = sse2_invsqrt_d(rsq23);
1525             rinv31           = sse2_invsqrt_d(rsq31);
1526             rinv32           = sse2_invsqrt_d(rsq32);
1527             rinv33           = sse2_invsqrt_d(rsq33);
1528
1529             rinvsq00         = sse2_inv_d(rsq00);
1530
1531             fjx0             = _mm_setzero_pd();
1532             fjy0             = _mm_setzero_pd();
1533             fjz0             = _mm_setzero_pd();
1534             fjx1             = _mm_setzero_pd();
1535             fjy1             = _mm_setzero_pd();
1536             fjz1             = _mm_setzero_pd();
1537             fjx2             = _mm_setzero_pd();
1538             fjy2             = _mm_setzero_pd();
1539             fjz2             = _mm_setzero_pd();
1540             fjx3             = _mm_setzero_pd();
1541             fjy3             = _mm_setzero_pd();
1542             fjz3             = _mm_setzero_pd();
1543
1544             /**************************
1545              * CALCULATE INTERACTIONS *
1546              **************************/
1547
1548             /* LENNARD-JONES DISPERSION/REPULSION */
1549
1550             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1551             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1552
1553             fscal            = fvdw;
1554
1555             /* Calculate temporary vectorial force */
1556             tx               = _mm_mul_pd(fscal,dx00);
1557             ty               = _mm_mul_pd(fscal,dy00);
1558             tz               = _mm_mul_pd(fscal,dz00);
1559
1560             /* Update vectorial force */
1561             fix0             = _mm_add_pd(fix0,tx);
1562             fiy0             = _mm_add_pd(fiy0,ty);
1563             fiz0             = _mm_add_pd(fiz0,tz);
1564
1565             fjx0             = _mm_add_pd(fjx0,tx);
1566             fjy0             = _mm_add_pd(fjy0,ty);
1567             fjz0             = _mm_add_pd(fjz0,tz);
1568
1569             /**************************
1570              * CALCULATE INTERACTIONS *
1571              **************************/
1572
1573             r11              = _mm_mul_pd(rsq11,rinv11);
1574
1575             /* Calculate table index by multiplying r with table scale and truncate to integer */
1576             rt               = _mm_mul_pd(r11,vftabscale);
1577             vfitab           = _mm_cvttpd_epi32(rt);
1578             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1579             vfitab           = _mm_slli_epi32(vfitab,2);
1580
1581             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1582             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1583             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1584             GMX_MM_TRANSPOSE2_PD(Y,F);
1585             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1586             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1587             GMX_MM_TRANSPOSE2_PD(G,H);
1588             Heps             = _mm_mul_pd(vfeps,H);
1589             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1590             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1591             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1592
1593             fscal            = felec;
1594
1595             /* Calculate temporary vectorial force */
1596             tx               = _mm_mul_pd(fscal,dx11);
1597             ty               = _mm_mul_pd(fscal,dy11);
1598             tz               = _mm_mul_pd(fscal,dz11);
1599
1600             /* Update vectorial force */
1601             fix1             = _mm_add_pd(fix1,tx);
1602             fiy1             = _mm_add_pd(fiy1,ty);
1603             fiz1             = _mm_add_pd(fiz1,tz);
1604
1605             fjx1             = _mm_add_pd(fjx1,tx);
1606             fjy1             = _mm_add_pd(fjy1,ty);
1607             fjz1             = _mm_add_pd(fjz1,tz);
1608
1609             /**************************
1610              * CALCULATE INTERACTIONS *
1611              **************************/
1612
1613             r12              = _mm_mul_pd(rsq12,rinv12);
1614
1615             /* Calculate table index by multiplying r with table scale and truncate to integer */
1616             rt               = _mm_mul_pd(r12,vftabscale);
1617             vfitab           = _mm_cvttpd_epi32(rt);
1618             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1619             vfitab           = _mm_slli_epi32(vfitab,2);
1620
1621             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1622             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1623             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1624             GMX_MM_TRANSPOSE2_PD(Y,F);
1625             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1626             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1627             GMX_MM_TRANSPOSE2_PD(G,H);
1628             Heps             = _mm_mul_pd(vfeps,H);
1629             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1630             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1631             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1632
1633             fscal            = felec;
1634
1635             /* Calculate temporary vectorial force */
1636             tx               = _mm_mul_pd(fscal,dx12);
1637             ty               = _mm_mul_pd(fscal,dy12);
1638             tz               = _mm_mul_pd(fscal,dz12);
1639
1640             /* Update vectorial force */
1641             fix1             = _mm_add_pd(fix1,tx);
1642             fiy1             = _mm_add_pd(fiy1,ty);
1643             fiz1             = _mm_add_pd(fiz1,tz);
1644
1645             fjx2             = _mm_add_pd(fjx2,tx);
1646             fjy2             = _mm_add_pd(fjy2,ty);
1647             fjz2             = _mm_add_pd(fjz2,tz);
1648
1649             /**************************
1650              * CALCULATE INTERACTIONS *
1651              **************************/
1652
1653             r13              = _mm_mul_pd(rsq13,rinv13);
1654
1655             /* Calculate table index by multiplying r with table scale and truncate to integer */
1656             rt               = _mm_mul_pd(r13,vftabscale);
1657             vfitab           = _mm_cvttpd_epi32(rt);
1658             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1659             vfitab           = _mm_slli_epi32(vfitab,2);
1660
1661             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1662             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1663             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1664             GMX_MM_TRANSPOSE2_PD(Y,F);
1665             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1666             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1667             GMX_MM_TRANSPOSE2_PD(G,H);
1668             Heps             = _mm_mul_pd(vfeps,H);
1669             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1670             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1671             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1672
1673             fscal            = felec;
1674
1675             /* Calculate temporary vectorial force */
1676             tx               = _mm_mul_pd(fscal,dx13);
1677             ty               = _mm_mul_pd(fscal,dy13);
1678             tz               = _mm_mul_pd(fscal,dz13);
1679
1680             /* Update vectorial force */
1681             fix1             = _mm_add_pd(fix1,tx);
1682             fiy1             = _mm_add_pd(fiy1,ty);
1683             fiz1             = _mm_add_pd(fiz1,tz);
1684
1685             fjx3             = _mm_add_pd(fjx3,tx);
1686             fjy3             = _mm_add_pd(fjy3,ty);
1687             fjz3             = _mm_add_pd(fjz3,tz);
1688
1689             /**************************
1690              * CALCULATE INTERACTIONS *
1691              **************************/
1692
1693             r21              = _mm_mul_pd(rsq21,rinv21);
1694
1695             /* Calculate table index by multiplying r with table scale and truncate to integer */
1696             rt               = _mm_mul_pd(r21,vftabscale);
1697             vfitab           = _mm_cvttpd_epi32(rt);
1698             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1699             vfitab           = _mm_slli_epi32(vfitab,2);
1700
1701             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1702             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1703             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1704             GMX_MM_TRANSPOSE2_PD(Y,F);
1705             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1706             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1707             GMX_MM_TRANSPOSE2_PD(G,H);
1708             Heps             = _mm_mul_pd(vfeps,H);
1709             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1710             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1711             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1712
1713             fscal            = felec;
1714
1715             /* Calculate temporary vectorial force */
1716             tx               = _mm_mul_pd(fscal,dx21);
1717             ty               = _mm_mul_pd(fscal,dy21);
1718             tz               = _mm_mul_pd(fscal,dz21);
1719
1720             /* Update vectorial force */
1721             fix2             = _mm_add_pd(fix2,tx);
1722             fiy2             = _mm_add_pd(fiy2,ty);
1723             fiz2             = _mm_add_pd(fiz2,tz);
1724
1725             fjx1             = _mm_add_pd(fjx1,tx);
1726             fjy1             = _mm_add_pd(fjy1,ty);
1727             fjz1             = _mm_add_pd(fjz1,tz);
1728
1729             /**************************
1730              * CALCULATE INTERACTIONS *
1731              **************************/
1732
1733             r22              = _mm_mul_pd(rsq22,rinv22);
1734
1735             /* Calculate table index by multiplying r with table scale and truncate to integer */
1736             rt               = _mm_mul_pd(r22,vftabscale);
1737             vfitab           = _mm_cvttpd_epi32(rt);
1738             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1739             vfitab           = _mm_slli_epi32(vfitab,2);
1740
1741             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1742             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1743             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1744             GMX_MM_TRANSPOSE2_PD(Y,F);
1745             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1746             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1747             GMX_MM_TRANSPOSE2_PD(G,H);
1748             Heps             = _mm_mul_pd(vfeps,H);
1749             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1750             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1751             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1752
1753             fscal            = felec;
1754
1755             /* Calculate temporary vectorial force */
1756             tx               = _mm_mul_pd(fscal,dx22);
1757             ty               = _mm_mul_pd(fscal,dy22);
1758             tz               = _mm_mul_pd(fscal,dz22);
1759
1760             /* Update vectorial force */
1761             fix2             = _mm_add_pd(fix2,tx);
1762             fiy2             = _mm_add_pd(fiy2,ty);
1763             fiz2             = _mm_add_pd(fiz2,tz);
1764
1765             fjx2             = _mm_add_pd(fjx2,tx);
1766             fjy2             = _mm_add_pd(fjy2,ty);
1767             fjz2             = _mm_add_pd(fjz2,tz);
1768
1769             /**************************
1770              * CALCULATE INTERACTIONS *
1771              **************************/
1772
1773             r23              = _mm_mul_pd(rsq23,rinv23);
1774
1775             /* Calculate table index by multiplying r with table scale and truncate to integer */
1776             rt               = _mm_mul_pd(r23,vftabscale);
1777             vfitab           = _mm_cvttpd_epi32(rt);
1778             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1779             vfitab           = _mm_slli_epi32(vfitab,2);
1780
1781             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1782             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1783             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1784             GMX_MM_TRANSPOSE2_PD(Y,F);
1785             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1786             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1787             GMX_MM_TRANSPOSE2_PD(G,H);
1788             Heps             = _mm_mul_pd(vfeps,H);
1789             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1790             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1791             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1792
1793             fscal            = felec;
1794
1795             /* Calculate temporary vectorial force */
1796             tx               = _mm_mul_pd(fscal,dx23);
1797             ty               = _mm_mul_pd(fscal,dy23);
1798             tz               = _mm_mul_pd(fscal,dz23);
1799
1800             /* Update vectorial force */
1801             fix2             = _mm_add_pd(fix2,tx);
1802             fiy2             = _mm_add_pd(fiy2,ty);
1803             fiz2             = _mm_add_pd(fiz2,tz);
1804
1805             fjx3             = _mm_add_pd(fjx3,tx);
1806             fjy3             = _mm_add_pd(fjy3,ty);
1807             fjz3             = _mm_add_pd(fjz3,tz);
1808
1809             /**************************
1810              * CALCULATE INTERACTIONS *
1811              **************************/
1812
1813             r31              = _mm_mul_pd(rsq31,rinv31);
1814
1815             /* Calculate table index by multiplying r with table scale and truncate to integer */
1816             rt               = _mm_mul_pd(r31,vftabscale);
1817             vfitab           = _mm_cvttpd_epi32(rt);
1818             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1819             vfitab           = _mm_slli_epi32(vfitab,2);
1820
1821             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1822             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1823             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1824             GMX_MM_TRANSPOSE2_PD(Y,F);
1825             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1826             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1827             GMX_MM_TRANSPOSE2_PD(G,H);
1828             Heps             = _mm_mul_pd(vfeps,H);
1829             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1830             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1831             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1832
1833             fscal            = felec;
1834
1835             /* Calculate temporary vectorial force */
1836             tx               = _mm_mul_pd(fscal,dx31);
1837             ty               = _mm_mul_pd(fscal,dy31);
1838             tz               = _mm_mul_pd(fscal,dz31);
1839
1840             /* Update vectorial force */
1841             fix3             = _mm_add_pd(fix3,tx);
1842             fiy3             = _mm_add_pd(fiy3,ty);
1843             fiz3             = _mm_add_pd(fiz3,tz);
1844
1845             fjx1             = _mm_add_pd(fjx1,tx);
1846             fjy1             = _mm_add_pd(fjy1,ty);
1847             fjz1             = _mm_add_pd(fjz1,tz);
1848
1849             /**************************
1850              * CALCULATE INTERACTIONS *
1851              **************************/
1852
1853             r32              = _mm_mul_pd(rsq32,rinv32);
1854
1855             /* Calculate table index by multiplying r with table scale and truncate to integer */
1856             rt               = _mm_mul_pd(r32,vftabscale);
1857             vfitab           = _mm_cvttpd_epi32(rt);
1858             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1859             vfitab           = _mm_slli_epi32(vfitab,2);
1860
1861             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1862             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1863             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1864             GMX_MM_TRANSPOSE2_PD(Y,F);
1865             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1866             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1867             GMX_MM_TRANSPOSE2_PD(G,H);
1868             Heps             = _mm_mul_pd(vfeps,H);
1869             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1870             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1871             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1872
1873             fscal            = felec;
1874
1875             /* Calculate temporary vectorial force */
1876             tx               = _mm_mul_pd(fscal,dx32);
1877             ty               = _mm_mul_pd(fscal,dy32);
1878             tz               = _mm_mul_pd(fscal,dz32);
1879
1880             /* Update vectorial force */
1881             fix3             = _mm_add_pd(fix3,tx);
1882             fiy3             = _mm_add_pd(fiy3,ty);
1883             fiz3             = _mm_add_pd(fiz3,tz);
1884
1885             fjx2             = _mm_add_pd(fjx2,tx);
1886             fjy2             = _mm_add_pd(fjy2,ty);
1887             fjz2             = _mm_add_pd(fjz2,tz);
1888
1889             /**************************
1890              * CALCULATE INTERACTIONS *
1891              **************************/
1892
1893             r33              = _mm_mul_pd(rsq33,rinv33);
1894
1895             /* Calculate table index by multiplying r with table scale and truncate to integer */
1896             rt               = _mm_mul_pd(r33,vftabscale);
1897             vfitab           = _mm_cvttpd_epi32(rt);
1898             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1899             vfitab           = _mm_slli_epi32(vfitab,2);
1900
1901             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1902             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1903             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1904             GMX_MM_TRANSPOSE2_PD(Y,F);
1905             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1906             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1907             GMX_MM_TRANSPOSE2_PD(G,H);
1908             Heps             = _mm_mul_pd(vfeps,H);
1909             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1910             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1911             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1912
1913             fscal            = felec;
1914
1915             /* Calculate temporary vectorial force */
1916             tx               = _mm_mul_pd(fscal,dx33);
1917             ty               = _mm_mul_pd(fscal,dy33);
1918             tz               = _mm_mul_pd(fscal,dz33);
1919
1920             /* Update vectorial force */
1921             fix3             = _mm_add_pd(fix3,tx);
1922             fiy3             = _mm_add_pd(fiy3,ty);
1923             fiz3             = _mm_add_pd(fiz3,tz);
1924
1925             fjx3             = _mm_add_pd(fjx3,tx);
1926             fjy3             = _mm_add_pd(fjy3,ty);
1927             fjz3             = _mm_add_pd(fjz3,tz);
1928
1929             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1930
1931             /* Inner loop uses 381 flops */
1932         }
1933
1934         if(jidx<j_index_end)
1935         {
1936
1937             jnrA             = jjnr[jidx];
1938             j_coord_offsetA  = DIM*jnrA;
1939
1940             /* load j atom coordinates */
1941             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1942                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1943                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1944
1945             /* Calculate displacement vector */
1946             dx00             = _mm_sub_pd(ix0,jx0);
1947             dy00             = _mm_sub_pd(iy0,jy0);
1948             dz00             = _mm_sub_pd(iz0,jz0);
1949             dx11             = _mm_sub_pd(ix1,jx1);
1950             dy11             = _mm_sub_pd(iy1,jy1);
1951             dz11             = _mm_sub_pd(iz1,jz1);
1952             dx12             = _mm_sub_pd(ix1,jx2);
1953             dy12             = _mm_sub_pd(iy1,jy2);
1954             dz12             = _mm_sub_pd(iz1,jz2);
1955             dx13             = _mm_sub_pd(ix1,jx3);
1956             dy13             = _mm_sub_pd(iy1,jy3);
1957             dz13             = _mm_sub_pd(iz1,jz3);
1958             dx21             = _mm_sub_pd(ix2,jx1);
1959             dy21             = _mm_sub_pd(iy2,jy1);
1960             dz21             = _mm_sub_pd(iz2,jz1);
1961             dx22             = _mm_sub_pd(ix2,jx2);
1962             dy22             = _mm_sub_pd(iy2,jy2);
1963             dz22             = _mm_sub_pd(iz2,jz2);
1964             dx23             = _mm_sub_pd(ix2,jx3);
1965             dy23             = _mm_sub_pd(iy2,jy3);
1966             dz23             = _mm_sub_pd(iz2,jz3);
1967             dx31             = _mm_sub_pd(ix3,jx1);
1968             dy31             = _mm_sub_pd(iy3,jy1);
1969             dz31             = _mm_sub_pd(iz3,jz1);
1970             dx32             = _mm_sub_pd(ix3,jx2);
1971             dy32             = _mm_sub_pd(iy3,jy2);
1972             dz32             = _mm_sub_pd(iz3,jz2);
1973             dx33             = _mm_sub_pd(ix3,jx3);
1974             dy33             = _mm_sub_pd(iy3,jy3);
1975             dz33             = _mm_sub_pd(iz3,jz3);
1976
1977             /* Calculate squared distance and things based on it */
1978             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1979             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1980             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1981             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1982             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1983             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1984             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1985             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1986             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1987             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1988
1989             rinv11           = sse2_invsqrt_d(rsq11);
1990             rinv12           = sse2_invsqrt_d(rsq12);
1991             rinv13           = sse2_invsqrt_d(rsq13);
1992             rinv21           = sse2_invsqrt_d(rsq21);
1993             rinv22           = sse2_invsqrt_d(rsq22);
1994             rinv23           = sse2_invsqrt_d(rsq23);
1995             rinv31           = sse2_invsqrt_d(rsq31);
1996             rinv32           = sse2_invsqrt_d(rsq32);
1997             rinv33           = sse2_invsqrt_d(rsq33);
1998
1999             rinvsq00         = sse2_inv_d(rsq00);
2000
2001             fjx0             = _mm_setzero_pd();
2002             fjy0             = _mm_setzero_pd();
2003             fjz0             = _mm_setzero_pd();
2004             fjx1             = _mm_setzero_pd();
2005             fjy1             = _mm_setzero_pd();
2006             fjz1             = _mm_setzero_pd();
2007             fjx2             = _mm_setzero_pd();
2008             fjy2             = _mm_setzero_pd();
2009             fjz2             = _mm_setzero_pd();
2010             fjx3             = _mm_setzero_pd();
2011             fjy3             = _mm_setzero_pd();
2012             fjz3             = _mm_setzero_pd();
2013
2014             /**************************
2015              * CALCULATE INTERACTIONS *
2016              **************************/
2017
2018             /* LENNARD-JONES DISPERSION/REPULSION */
2019
2020             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2021             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
2022
2023             fscal            = fvdw;
2024
2025             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2026
2027             /* Calculate temporary vectorial force */
2028             tx               = _mm_mul_pd(fscal,dx00);
2029             ty               = _mm_mul_pd(fscal,dy00);
2030             tz               = _mm_mul_pd(fscal,dz00);
2031
2032             /* Update vectorial force */
2033             fix0             = _mm_add_pd(fix0,tx);
2034             fiy0             = _mm_add_pd(fiy0,ty);
2035             fiz0             = _mm_add_pd(fiz0,tz);
2036
2037             fjx0             = _mm_add_pd(fjx0,tx);
2038             fjy0             = _mm_add_pd(fjy0,ty);
2039             fjz0             = _mm_add_pd(fjz0,tz);
2040
2041             /**************************
2042              * CALCULATE INTERACTIONS *
2043              **************************/
2044
2045             r11              = _mm_mul_pd(rsq11,rinv11);
2046
2047             /* Calculate table index by multiplying r with table scale and truncate to integer */
2048             rt               = _mm_mul_pd(r11,vftabscale);
2049             vfitab           = _mm_cvttpd_epi32(rt);
2050             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2051             vfitab           = _mm_slli_epi32(vfitab,2);
2052
2053             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2054             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2055             F                = _mm_setzero_pd();
2056             GMX_MM_TRANSPOSE2_PD(Y,F);
2057             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2058             H                = _mm_setzero_pd();
2059             GMX_MM_TRANSPOSE2_PD(G,H);
2060             Heps             = _mm_mul_pd(vfeps,H);
2061             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2062             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2063             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2064
2065             fscal            = felec;
2066
2067             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2068
2069             /* Calculate temporary vectorial force */
2070             tx               = _mm_mul_pd(fscal,dx11);
2071             ty               = _mm_mul_pd(fscal,dy11);
2072             tz               = _mm_mul_pd(fscal,dz11);
2073
2074             /* Update vectorial force */
2075             fix1             = _mm_add_pd(fix1,tx);
2076             fiy1             = _mm_add_pd(fiy1,ty);
2077             fiz1             = _mm_add_pd(fiz1,tz);
2078
2079             fjx1             = _mm_add_pd(fjx1,tx);
2080             fjy1             = _mm_add_pd(fjy1,ty);
2081             fjz1             = _mm_add_pd(fjz1,tz);
2082
2083             /**************************
2084              * CALCULATE INTERACTIONS *
2085              **************************/
2086
2087             r12              = _mm_mul_pd(rsq12,rinv12);
2088
2089             /* Calculate table index by multiplying r with table scale and truncate to integer */
2090             rt               = _mm_mul_pd(r12,vftabscale);
2091             vfitab           = _mm_cvttpd_epi32(rt);
2092             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2093             vfitab           = _mm_slli_epi32(vfitab,2);
2094
2095             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2096             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2097             F                = _mm_setzero_pd();
2098             GMX_MM_TRANSPOSE2_PD(Y,F);
2099             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2100             H                = _mm_setzero_pd();
2101             GMX_MM_TRANSPOSE2_PD(G,H);
2102             Heps             = _mm_mul_pd(vfeps,H);
2103             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2104             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2105             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2106
2107             fscal            = felec;
2108
2109             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2110
2111             /* Calculate temporary vectorial force */
2112             tx               = _mm_mul_pd(fscal,dx12);
2113             ty               = _mm_mul_pd(fscal,dy12);
2114             tz               = _mm_mul_pd(fscal,dz12);
2115
2116             /* Update vectorial force */
2117             fix1             = _mm_add_pd(fix1,tx);
2118             fiy1             = _mm_add_pd(fiy1,ty);
2119             fiz1             = _mm_add_pd(fiz1,tz);
2120
2121             fjx2             = _mm_add_pd(fjx2,tx);
2122             fjy2             = _mm_add_pd(fjy2,ty);
2123             fjz2             = _mm_add_pd(fjz2,tz);
2124
2125             /**************************
2126              * CALCULATE INTERACTIONS *
2127              **************************/
2128
2129             r13              = _mm_mul_pd(rsq13,rinv13);
2130
2131             /* Calculate table index by multiplying r with table scale and truncate to integer */
2132             rt               = _mm_mul_pd(r13,vftabscale);
2133             vfitab           = _mm_cvttpd_epi32(rt);
2134             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2135             vfitab           = _mm_slli_epi32(vfitab,2);
2136
2137             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2138             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2139             F                = _mm_setzero_pd();
2140             GMX_MM_TRANSPOSE2_PD(Y,F);
2141             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2142             H                = _mm_setzero_pd();
2143             GMX_MM_TRANSPOSE2_PD(G,H);
2144             Heps             = _mm_mul_pd(vfeps,H);
2145             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2146             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2147             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
2148
2149             fscal            = felec;
2150
2151             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2152
2153             /* Calculate temporary vectorial force */
2154             tx               = _mm_mul_pd(fscal,dx13);
2155             ty               = _mm_mul_pd(fscal,dy13);
2156             tz               = _mm_mul_pd(fscal,dz13);
2157
2158             /* Update vectorial force */
2159             fix1             = _mm_add_pd(fix1,tx);
2160             fiy1             = _mm_add_pd(fiy1,ty);
2161             fiz1             = _mm_add_pd(fiz1,tz);
2162
2163             fjx3             = _mm_add_pd(fjx3,tx);
2164             fjy3             = _mm_add_pd(fjy3,ty);
2165             fjz3             = _mm_add_pd(fjz3,tz);
2166
2167             /**************************
2168              * CALCULATE INTERACTIONS *
2169              **************************/
2170
2171             r21              = _mm_mul_pd(rsq21,rinv21);
2172
2173             /* Calculate table index by multiplying r with table scale and truncate to integer */
2174             rt               = _mm_mul_pd(r21,vftabscale);
2175             vfitab           = _mm_cvttpd_epi32(rt);
2176             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2177             vfitab           = _mm_slli_epi32(vfitab,2);
2178
2179             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2180             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2181             F                = _mm_setzero_pd();
2182             GMX_MM_TRANSPOSE2_PD(Y,F);
2183             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2184             H                = _mm_setzero_pd();
2185             GMX_MM_TRANSPOSE2_PD(G,H);
2186             Heps             = _mm_mul_pd(vfeps,H);
2187             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2188             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2189             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2190
2191             fscal            = felec;
2192
2193             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2194
2195             /* Calculate temporary vectorial force */
2196             tx               = _mm_mul_pd(fscal,dx21);
2197             ty               = _mm_mul_pd(fscal,dy21);
2198             tz               = _mm_mul_pd(fscal,dz21);
2199
2200             /* Update vectorial force */
2201             fix2             = _mm_add_pd(fix2,tx);
2202             fiy2             = _mm_add_pd(fiy2,ty);
2203             fiz2             = _mm_add_pd(fiz2,tz);
2204
2205             fjx1             = _mm_add_pd(fjx1,tx);
2206             fjy1             = _mm_add_pd(fjy1,ty);
2207             fjz1             = _mm_add_pd(fjz1,tz);
2208
2209             /**************************
2210              * CALCULATE INTERACTIONS *
2211              **************************/
2212
2213             r22              = _mm_mul_pd(rsq22,rinv22);
2214
2215             /* Calculate table index by multiplying r with table scale and truncate to integer */
2216             rt               = _mm_mul_pd(r22,vftabscale);
2217             vfitab           = _mm_cvttpd_epi32(rt);
2218             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2219             vfitab           = _mm_slli_epi32(vfitab,2);
2220
2221             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2222             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2223             F                = _mm_setzero_pd();
2224             GMX_MM_TRANSPOSE2_PD(Y,F);
2225             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2226             H                = _mm_setzero_pd();
2227             GMX_MM_TRANSPOSE2_PD(G,H);
2228             Heps             = _mm_mul_pd(vfeps,H);
2229             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2230             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2231             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2232
2233             fscal            = felec;
2234
2235             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2236
2237             /* Calculate temporary vectorial force */
2238             tx               = _mm_mul_pd(fscal,dx22);
2239             ty               = _mm_mul_pd(fscal,dy22);
2240             tz               = _mm_mul_pd(fscal,dz22);
2241
2242             /* Update vectorial force */
2243             fix2             = _mm_add_pd(fix2,tx);
2244             fiy2             = _mm_add_pd(fiy2,ty);
2245             fiz2             = _mm_add_pd(fiz2,tz);
2246
2247             fjx2             = _mm_add_pd(fjx2,tx);
2248             fjy2             = _mm_add_pd(fjy2,ty);
2249             fjz2             = _mm_add_pd(fjz2,tz);
2250
2251             /**************************
2252              * CALCULATE INTERACTIONS *
2253              **************************/
2254
2255             r23              = _mm_mul_pd(rsq23,rinv23);
2256
2257             /* Calculate table index by multiplying r with table scale and truncate to integer */
2258             rt               = _mm_mul_pd(r23,vftabscale);
2259             vfitab           = _mm_cvttpd_epi32(rt);
2260             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2261             vfitab           = _mm_slli_epi32(vfitab,2);
2262
2263             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2264             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2265             F                = _mm_setzero_pd();
2266             GMX_MM_TRANSPOSE2_PD(Y,F);
2267             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2268             H                = _mm_setzero_pd();
2269             GMX_MM_TRANSPOSE2_PD(G,H);
2270             Heps             = _mm_mul_pd(vfeps,H);
2271             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2272             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2273             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
2274
2275             fscal            = felec;
2276
2277             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2278
2279             /* Calculate temporary vectorial force */
2280             tx               = _mm_mul_pd(fscal,dx23);
2281             ty               = _mm_mul_pd(fscal,dy23);
2282             tz               = _mm_mul_pd(fscal,dz23);
2283
2284             /* Update vectorial force */
2285             fix2             = _mm_add_pd(fix2,tx);
2286             fiy2             = _mm_add_pd(fiy2,ty);
2287             fiz2             = _mm_add_pd(fiz2,tz);
2288
2289             fjx3             = _mm_add_pd(fjx3,tx);
2290             fjy3             = _mm_add_pd(fjy3,ty);
2291             fjz3             = _mm_add_pd(fjz3,tz);
2292
2293             /**************************
2294              * CALCULATE INTERACTIONS *
2295              **************************/
2296
2297             r31              = _mm_mul_pd(rsq31,rinv31);
2298
2299             /* Calculate table index by multiplying r with table scale and truncate to integer */
2300             rt               = _mm_mul_pd(r31,vftabscale);
2301             vfitab           = _mm_cvttpd_epi32(rt);
2302             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2303             vfitab           = _mm_slli_epi32(vfitab,2);
2304
2305             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2306             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2307             F                = _mm_setzero_pd();
2308             GMX_MM_TRANSPOSE2_PD(Y,F);
2309             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2310             H                = _mm_setzero_pd();
2311             GMX_MM_TRANSPOSE2_PD(G,H);
2312             Heps             = _mm_mul_pd(vfeps,H);
2313             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2314             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2315             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
2316
2317             fscal            = felec;
2318
2319             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2320
2321             /* Calculate temporary vectorial force */
2322             tx               = _mm_mul_pd(fscal,dx31);
2323             ty               = _mm_mul_pd(fscal,dy31);
2324             tz               = _mm_mul_pd(fscal,dz31);
2325
2326             /* Update vectorial force */
2327             fix3             = _mm_add_pd(fix3,tx);
2328             fiy3             = _mm_add_pd(fiy3,ty);
2329             fiz3             = _mm_add_pd(fiz3,tz);
2330
2331             fjx1             = _mm_add_pd(fjx1,tx);
2332             fjy1             = _mm_add_pd(fjy1,ty);
2333             fjz1             = _mm_add_pd(fjz1,tz);
2334
2335             /**************************
2336              * CALCULATE INTERACTIONS *
2337              **************************/
2338
2339             r32              = _mm_mul_pd(rsq32,rinv32);
2340
2341             /* Calculate table index by multiplying r with table scale and truncate to integer */
2342             rt               = _mm_mul_pd(r32,vftabscale);
2343             vfitab           = _mm_cvttpd_epi32(rt);
2344             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2345             vfitab           = _mm_slli_epi32(vfitab,2);
2346
2347             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2348             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2349             F                = _mm_setzero_pd();
2350             GMX_MM_TRANSPOSE2_PD(Y,F);
2351             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2352             H                = _mm_setzero_pd();
2353             GMX_MM_TRANSPOSE2_PD(G,H);
2354             Heps             = _mm_mul_pd(vfeps,H);
2355             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2356             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2357             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
2358
2359             fscal            = felec;
2360
2361             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2362
2363             /* Calculate temporary vectorial force */
2364             tx               = _mm_mul_pd(fscal,dx32);
2365             ty               = _mm_mul_pd(fscal,dy32);
2366             tz               = _mm_mul_pd(fscal,dz32);
2367
2368             /* Update vectorial force */
2369             fix3             = _mm_add_pd(fix3,tx);
2370             fiy3             = _mm_add_pd(fiy3,ty);
2371             fiz3             = _mm_add_pd(fiz3,tz);
2372
2373             fjx2             = _mm_add_pd(fjx2,tx);
2374             fjy2             = _mm_add_pd(fjy2,ty);
2375             fjz2             = _mm_add_pd(fjz2,tz);
2376
2377             /**************************
2378              * CALCULATE INTERACTIONS *
2379              **************************/
2380
2381             r33              = _mm_mul_pd(rsq33,rinv33);
2382
2383             /* Calculate table index by multiplying r with table scale and truncate to integer */
2384             rt               = _mm_mul_pd(r33,vftabscale);
2385             vfitab           = _mm_cvttpd_epi32(rt);
2386             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2387             vfitab           = _mm_slli_epi32(vfitab,2);
2388
2389             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2390             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2391             F                = _mm_setzero_pd();
2392             GMX_MM_TRANSPOSE2_PD(Y,F);
2393             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2394             H                = _mm_setzero_pd();
2395             GMX_MM_TRANSPOSE2_PD(G,H);
2396             Heps             = _mm_mul_pd(vfeps,H);
2397             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2398             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2399             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2400
2401             fscal            = felec;
2402
2403             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2404
2405             /* Calculate temporary vectorial force */
2406             tx               = _mm_mul_pd(fscal,dx33);
2407             ty               = _mm_mul_pd(fscal,dy33);
2408             tz               = _mm_mul_pd(fscal,dz33);
2409
2410             /* Update vectorial force */
2411             fix3             = _mm_add_pd(fix3,tx);
2412             fiy3             = _mm_add_pd(fiy3,ty);
2413             fiz3             = _mm_add_pd(fiz3,tz);
2414
2415             fjx3             = _mm_add_pd(fjx3,tx);
2416             fjy3             = _mm_add_pd(fjy3,ty);
2417             fjz3             = _mm_add_pd(fjz3,tz);
2418
2419             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2420
2421             /* Inner loop uses 381 flops */
2422         }
2423
2424         /* End of innermost loop */
2425
2426         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2427                                               f+i_coord_offset,fshift+i_shift_offset);
2428
2429         /* Increment number of inner iterations */
2430         inneriter                  += j_index_end - j_index_start;
2431
2432         /* Outer loop uses 24 flops */
2433     }
2434
2435     /* Increment number of outer iterations */
2436     outeriter        += nri;
2437
2438     /* Update outer/inner flops */
2439
2440     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*381);
2441 }