Created SIMD module
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecEw_VdwCSTab_GeomW4W4_sse2_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse2_double.h"
50 #include "kernelutil_x86_sse2_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            CubicSplineTable
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB;
77     int              j_coord_offsetA,j_coord_offsetB;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwioffset3;
89     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90     int              vdwjidx0A,vdwjidx0B;
91     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B;
93     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B;
95     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B;
97     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
115     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
116     __m128i          vfitab;
117     __m128i          ifour       = _mm_set1_epi32(4);
118     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
119     real             *vftab;
120     __m128i          ewitab;
121     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
122     real             *ewtab;
123     __m128d          dummy_mask,cutoff_mask;
124     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
125     __m128d          one     = _mm_set1_pd(1.0);
126     __m128d          two     = _mm_set1_pd(2.0);
127     x                = xx[0];
128     f                = ff[0];
129
130     nri              = nlist->nri;
131     iinr             = nlist->iinr;
132     jindex           = nlist->jindex;
133     jjnr             = nlist->jjnr;
134     shiftidx         = nlist->shift;
135     gid              = nlist->gid;
136     shiftvec         = fr->shift_vec[0];
137     fshift           = fr->fshift[0];
138     facel            = _mm_set1_pd(fr->epsfac);
139     charge           = mdatoms->chargeA;
140     nvdwtype         = fr->ntype;
141     vdwparam         = fr->nbfp;
142     vdwtype          = mdatoms->typeA;
143
144     vftab            = kernel_data->table_vdw->data;
145     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
146
147     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
148     ewtab            = fr->ic->tabq_coul_FDV0;
149     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
150     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
151
152     /* Setup water-specific parameters */
153     inr              = nlist->iinr[0];
154     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
155     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
156     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
157     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
158
159     jq1              = _mm_set1_pd(charge[inr+1]);
160     jq2              = _mm_set1_pd(charge[inr+2]);
161     jq3              = _mm_set1_pd(charge[inr+3]);
162     vdwjidx0A        = 2*vdwtype[inr+0];
163     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
164     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
165     qq11             = _mm_mul_pd(iq1,jq1);
166     qq12             = _mm_mul_pd(iq1,jq2);
167     qq13             = _mm_mul_pd(iq1,jq3);
168     qq21             = _mm_mul_pd(iq2,jq1);
169     qq22             = _mm_mul_pd(iq2,jq2);
170     qq23             = _mm_mul_pd(iq2,jq3);
171     qq31             = _mm_mul_pd(iq3,jq1);
172     qq32             = _mm_mul_pd(iq3,jq2);
173     qq33             = _mm_mul_pd(iq3,jq3);
174
175     /* Avoid stupid compiler warnings */
176     jnrA = jnrB = 0;
177     j_coord_offsetA = 0;
178     j_coord_offsetB = 0;
179
180     outeriter        = 0;
181     inneriter        = 0;
182
183     /* Start outer loop over neighborlists */
184     for(iidx=0; iidx<nri; iidx++)
185     {
186         /* Load shift vector for this list */
187         i_shift_offset   = DIM*shiftidx[iidx];
188
189         /* Load limits for loop over neighbors */
190         j_index_start    = jindex[iidx];
191         j_index_end      = jindex[iidx+1];
192
193         /* Get outer coordinate index */
194         inr              = iinr[iidx];
195         i_coord_offset   = DIM*inr;
196
197         /* Load i particle coords and add shift vector */
198         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
199                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
200
201         fix0             = _mm_setzero_pd();
202         fiy0             = _mm_setzero_pd();
203         fiz0             = _mm_setzero_pd();
204         fix1             = _mm_setzero_pd();
205         fiy1             = _mm_setzero_pd();
206         fiz1             = _mm_setzero_pd();
207         fix2             = _mm_setzero_pd();
208         fiy2             = _mm_setzero_pd();
209         fiz2             = _mm_setzero_pd();
210         fix3             = _mm_setzero_pd();
211         fiy3             = _mm_setzero_pd();
212         fiz3             = _mm_setzero_pd();
213
214         /* Reset potential sums */
215         velecsum         = _mm_setzero_pd();
216         vvdwsum          = _mm_setzero_pd();
217
218         /* Start inner kernel loop */
219         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
220         {
221
222             /* Get j neighbor index, and coordinate index */
223             jnrA             = jjnr[jidx];
224             jnrB             = jjnr[jidx+1];
225             j_coord_offsetA  = DIM*jnrA;
226             j_coord_offsetB  = DIM*jnrB;
227
228             /* load j atom coordinates */
229             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
230                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
231                                               &jy2,&jz2,&jx3,&jy3,&jz3);
232
233             /* Calculate displacement vector */
234             dx00             = _mm_sub_pd(ix0,jx0);
235             dy00             = _mm_sub_pd(iy0,jy0);
236             dz00             = _mm_sub_pd(iz0,jz0);
237             dx11             = _mm_sub_pd(ix1,jx1);
238             dy11             = _mm_sub_pd(iy1,jy1);
239             dz11             = _mm_sub_pd(iz1,jz1);
240             dx12             = _mm_sub_pd(ix1,jx2);
241             dy12             = _mm_sub_pd(iy1,jy2);
242             dz12             = _mm_sub_pd(iz1,jz2);
243             dx13             = _mm_sub_pd(ix1,jx3);
244             dy13             = _mm_sub_pd(iy1,jy3);
245             dz13             = _mm_sub_pd(iz1,jz3);
246             dx21             = _mm_sub_pd(ix2,jx1);
247             dy21             = _mm_sub_pd(iy2,jy1);
248             dz21             = _mm_sub_pd(iz2,jz1);
249             dx22             = _mm_sub_pd(ix2,jx2);
250             dy22             = _mm_sub_pd(iy2,jy2);
251             dz22             = _mm_sub_pd(iz2,jz2);
252             dx23             = _mm_sub_pd(ix2,jx3);
253             dy23             = _mm_sub_pd(iy2,jy3);
254             dz23             = _mm_sub_pd(iz2,jz3);
255             dx31             = _mm_sub_pd(ix3,jx1);
256             dy31             = _mm_sub_pd(iy3,jy1);
257             dz31             = _mm_sub_pd(iz3,jz1);
258             dx32             = _mm_sub_pd(ix3,jx2);
259             dy32             = _mm_sub_pd(iy3,jy2);
260             dz32             = _mm_sub_pd(iz3,jz2);
261             dx33             = _mm_sub_pd(ix3,jx3);
262             dy33             = _mm_sub_pd(iy3,jy3);
263             dz33             = _mm_sub_pd(iz3,jz3);
264
265             /* Calculate squared distance and things based on it */
266             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
267             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
268             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
269             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
270             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
271             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
272             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
273             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
274             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
275             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
276
277             rinv00           = gmx_mm_invsqrt_pd(rsq00);
278             rinv11           = gmx_mm_invsqrt_pd(rsq11);
279             rinv12           = gmx_mm_invsqrt_pd(rsq12);
280             rinv13           = gmx_mm_invsqrt_pd(rsq13);
281             rinv21           = gmx_mm_invsqrt_pd(rsq21);
282             rinv22           = gmx_mm_invsqrt_pd(rsq22);
283             rinv23           = gmx_mm_invsqrt_pd(rsq23);
284             rinv31           = gmx_mm_invsqrt_pd(rsq31);
285             rinv32           = gmx_mm_invsqrt_pd(rsq32);
286             rinv33           = gmx_mm_invsqrt_pd(rsq33);
287
288             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
289             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
290             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
291             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
292             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
293             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
294             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
295             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
296             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
297
298             fjx0             = _mm_setzero_pd();
299             fjy0             = _mm_setzero_pd();
300             fjz0             = _mm_setzero_pd();
301             fjx1             = _mm_setzero_pd();
302             fjy1             = _mm_setzero_pd();
303             fjz1             = _mm_setzero_pd();
304             fjx2             = _mm_setzero_pd();
305             fjy2             = _mm_setzero_pd();
306             fjz2             = _mm_setzero_pd();
307             fjx3             = _mm_setzero_pd();
308             fjy3             = _mm_setzero_pd();
309             fjz3             = _mm_setzero_pd();
310
311             /**************************
312              * CALCULATE INTERACTIONS *
313              **************************/
314
315             r00              = _mm_mul_pd(rsq00,rinv00);
316
317             /* Calculate table index by multiplying r with table scale and truncate to integer */
318             rt               = _mm_mul_pd(r00,vftabscale);
319             vfitab           = _mm_cvttpd_epi32(rt);
320             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
321             vfitab           = _mm_slli_epi32(vfitab,3);
322
323             /* CUBIC SPLINE TABLE DISPERSION */
324             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
325             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
326             GMX_MM_TRANSPOSE2_PD(Y,F);
327             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
328             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
329             GMX_MM_TRANSPOSE2_PD(G,H);
330             Heps             = _mm_mul_pd(vfeps,H);
331             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
332             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
333             vvdw6            = _mm_mul_pd(c6_00,VV);
334             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
335             fvdw6            = _mm_mul_pd(c6_00,FF);
336
337             /* CUBIC SPLINE TABLE REPULSION */
338             vfitab           = _mm_add_epi32(vfitab,ifour);
339             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
340             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
341             GMX_MM_TRANSPOSE2_PD(Y,F);
342             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
343             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
344             GMX_MM_TRANSPOSE2_PD(G,H);
345             Heps             = _mm_mul_pd(vfeps,H);
346             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
347             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
348             vvdw12           = _mm_mul_pd(c12_00,VV);
349             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
350             fvdw12           = _mm_mul_pd(c12_00,FF);
351             vvdw             = _mm_add_pd(vvdw12,vvdw6);
352             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
353
354             /* Update potential sum for this i atom from the interaction with this j atom. */
355             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
356
357             fscal            = fvdw;
358
359             /* Calculate temporary vectorial force */
360             tx               = _mm_mul_pd(fscal,dx00);
361             ty               = _mm_mul_pd(fscal,dy00);
362             tz               = _mm_mul_pd(fscal,dz00);
363
364             /* Update vectorial force */
365             fix0             = _mm_add_pd(fix0,tx);
366             fiy0             = _mm_add_pd(fiy0,ty);
367             fiz0             = _mm_add_pd(fiz0,tz);
368
369             fjx0             = _mm_add_pd(fjx0,tx);
370             fjy0             = _mm_add_pd(fjy0,ty);
371             fjz0             = _mm_add_pd(fjz0,tz);
372
373             /**************************
374              * CALCULATE INTERACTIONS *
375              **************************/
376
377             r11              = _mm_mul_pd(rsq11,rinv11);
378
379             /* EWALD ELECTROSTATICS */
380
381             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
382             ewrt             = _mm_mul_pd(r11,ewtabscale);
383             ewitab           = _mm_cvttpd_epi32(ewrt);
384             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
385             ewitab           = _mm_slli_epi32(ewitab,2);
386             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
387             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
388             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
389             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
390             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
391             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
392             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
393             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
394             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
395             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
396
397             /* Update potential sum for this i atom from the interaction with this j atom. */
398             velecsum         = _mm_add_pd(velecsum,velec);
399
400             fscal            = felec;
401
402             /* Calculate temporary vectorial force */
403             tx               = _mm_mul_pd(fscal,dx11);
404             ty               = _mm_mul_pd(fscal,dy11);
405             tz               = _mm_mul_pd(fscal,dz11);
406
407             /* Update vectorial force */
408             fix1             = _mm_add_pd(fix1,tx);
409             fiy1             = _mm_add_pd(fiy1,ty);
410             fiz1             = _mm_add_pd(fiz1,tz);
411
412             fjx1             = _mm_add_pd(fjx1,tx);
413             fjy1             = _mm_add_pd(fjy1,ty);
414             fjz1             = _mm_add_pd(fjz1,tz);
415
416             /**************************
417              * CALCULATE INTERACTIONS *
418              **************************/
419
420             r12              = _mm_mul_pd(rsq12,rinv12);
421
422             /* EWALD ELECTROSTATICS */
423
424             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
425             ewrt             = _mm_mul_pd(r12,ewtabscale);
426             ewitab           = _mm_cvttpd_epi32(ewrt);
427             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
428             ewitab           = _mm_slli_epi32(ewitab,2);
429             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
430             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
431             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
432             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
433             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
434             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
435             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
436             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
437             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
438             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
439
440             /* Update potential sum for this i atom from the interaction with this j atom. */
441             velecsum         = _mm_add_pd(velecsum,velec);
442
443             fscal            = felec;
444
445             /* Calculate temporary vectorial force */
446             tx               = _mm_mul_pd(fscal,dx12);
447             ty               = _mm_mul_pd(fscal,dy12);
448             tz               = _mm_mul_pd(fscal,dz12);
449
450             /* Update vectorial force */
451             fix1             = _mm_add_pd(fix1,tx);
452             fiy1             = _mm_add_pd(fiy1,ty);
453             fiz1             = _mm_add_pd(fiz1,tz);
454
455             fjx2             = _mm_add_pd(fjx2,tx);
456             fjy2             = _mm_add_pd(fjy2,ty);
457             fjz2             = _mm_add_pd(fjz2,tz);
458
459             /**************************
460              * CALCULATE INTERACTIONS *
461              **************************/
462
463             r13              = _mm_mul_pd(rsq13,rinv13);
464
465             /* EWALD ELECTROSTATICS */
466
467             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
468             ewrt             = _mm_mul_pd(r13,ewtabscale);
469             ewitab           = _mm_cvttpd_epi32(ewrt);
470             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
471             ewitab           = _mm_slli_epi32(ewitab,2);
472             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
473             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
474             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
475             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
476             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
477             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
478             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
479             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
480             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
481             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
482
483             /* Update potential sum for this i atom from the interaction with this j atom. */
484             velecsum         = _mm_add_pd(velecsum,velec);
485
486             fscal            = felec;
487
488             /* Calculate temporary vectorial force */
489             tx               = _mm_mul_pd(fscal,dx13);
490             ty               = _mm_mul_pd(fscal,dy13);
491             tz               = _mm_mul_pd(fscal,dz13);
492
493             /* Update vectorial force */
494             fix1             = _mm_add_pd(fix1,tx);
495             fiy1             = _mm_add_pd(fiy1,ty);
496             fiz1             = _mm_add_pd(fiz1,tz);
497
498             fjx3             = _mm_add_pd(fjx3,tx);
499             fjy3             = _mm_add_pd(fjy3,ty);
500             fjz3             = _mm_add_pd(fjz3,tz);
501
502             /**************************
503              * CALCULATE INTERACTIONS *
504              **************************/
505
506             r21              = _mm_mul_pd(rsq21,rinv21);
507
508             /* EWALD ELECTROSTATICS */
509
510             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511             ewrt             = _mm_mul_pd(r21,ewtabscale);
512             ewitab           = _mm_cvttpd_epi32(ewrt);
513             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
514             ewitab           = _mm_slli_epi32(ewitab,2);
515             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
516             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
517             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
518             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
519             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
520             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
521             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
522             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
523             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
524             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
525
526             /* Update potential sum for this i atom from the interaction with this j atom. */
527             velecsum         = _mm_add_pd(velecsum,velec);
528
529             fscal            = felec;
530
531             /* Calculate temporary vectorial force */
532             tx               = _mm_mul_pd(fscal,dx21);
533             ty               = _mm_mul_pd(fscal,dy21);
534             tz               = _mm_mul_pd(fscal,dz21);
535
536             /* Update vectorial force */
537             fix2             = _mm_add_pd(fix2,tx);
538             fiy2             = _mm_add_pd(fiy2,ty);
539             fiz2             = _mm_add_pd(fiz2,tz);
540
541             fjx1             = _mm_add_pd(fjx1,tx);
542             fjy1             = _mm_add_pd(fjy1,ty);
543             fjz1             = _mm_add_pd(fjz1,tz);
544
545             /**************************
546              * CALCULATE INTERACTIONS *
547              **************************/
548
549             r22              = _mm_mul_pd(rsq22,rinv22);
550
551             /* EWALD ELECTROSTATICS */
552
553             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
554             ewrt             = _mm_mul_pd(r22,ewtabscale);
555             ewitab           = _mm_cvttpd_epi32(ewrt);
556             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
557             ewitab           = _mm_slli_epi32(ewitab,2);
558             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
559             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
560             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
561             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
562             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
563             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
564             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
565             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
566             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
567             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
568
569             /* Update potential sum for this i atom from the interaction with this j atom. */
570             velecsum         = _mm_add_pd(velecsum,velec);
571
572             fscal            = felec;
573
574             /* Calculate temporary vectorial force */
575             tx               = _mm_mul_pd(fscal,dx22);
576             ty               = _mm_mul_pd(fscal,dy22);
577             tz               = _mm_mul_pd(fscal,dz22);
578
579             /* Update vectorial force */
580             fix2             = _mm_add_pd(fix2,tx);
581             fiy2             = _mm_add_pd(fiy2,ty);
582             fiz2             = _mm_add_pd(fiz2,tz);
583
584             fjx2             = _mm_add_pd(fjx2,tx);
585             fjy2             = _mm_add_pd(fjy2,ty);
586             fjz2             = _mm_add_pd(fjz2,tz);
587
588             /**************************
589              * CALCULATE INTERACTIONS *
590              **************************/
591
592             r23              = _mm_mul_pd(rsq23,rinv23);
593
594             /* EWALD ELECTROSTATICS */
595
596             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
597             ewrt             = _mm_mul_pd(r23,ewtabscale);
598             ewitab           = _mm_cvttpd_epi32(ewrt);
599             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
600             ewitab           = _mm_slli_epi32(ewitab,2);
601             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
602             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
603             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
604             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
605             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
606             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
607             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
608             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
609             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
610             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
611
612             /* Update potential sum for this i atom from the interaction with this j atom. */
613             velecsum         = _mm_add_pd(velecsum,velec);
614
615             fscal            = felec;
616
617             /* Calculate temporary vectorial force */
618             tx               = _mm_mul_pd(fscal,dx23);
619             ty               = _mm_mul_pd(fscal,dy23);
620             tz               = _mm_mul_pd(fscal,dz23);
621
622             /* Update vectorial force */
623             fix2             = _mm_add_pd(fix2,tx);
624             fiy2             = _mm_add_pd(fiy2,ty);
625             fiz2             = _mm_add_pd(fiz2,tz);
626
627             fjx3             = _mm_add_pd(fjx3,tx);
628             fjy3             = _mm_add_pd(fjy3,ty);
629             fjz3             = _mm_add_pd(fjz3,tz);
630
631             /**************************
632              * CALCULATE INTERACTIONS *
633              **************************/
634
635             r31              = _mm_mul_pd(rsq31,rinv31);
636
637             /* EWALD ELECTROSTATICS */
638
639             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
640             ewrt             = _mm_mul_pd(r31,ewtabscale);
641             ewitab           = _mm_cvttpd_epi32(ewrt);
642             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
643             ewitab           = _mm_slli_epi32(ewitab,2);
644             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
645             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
646             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
647             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
648             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
649             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
650             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
651             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
652             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
653             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
654
655             /* Update potential sum for this i atom from the interaction with this j atom. */
656             velecsum         = _mm_add_pd(velecsum,velec);
657
658             fscal            = felec;
659
660             /* Calculate temporary vectorial force */
661             tx               = _mm_mul_pd(fscal,dx31);
662             ty               = _mm_mul_pd(fscal,dy31);
663             tz               = _mm_mul_pd(fscal,dz31);
664
665             /* Update vectorial force */
666             fix3             = _mm_add_pd(fix3,tx);
667             fiy3             = _mm_add_pd(fiy3,ty);
668             fiz3             = _mm_add_pd(fiz3,tz);
669
670             fjx1             = _mm_add_pd(fjx1,tx);
671             fjy1             = _mm_add_pd(fjy1,ty);
672             fjz1             = _mm_add_pd(fjz1,tz);
673
674             /**************************
675              * CALCULATE INTERACTIONS *
676              **************************/
677
678             r32              = _mm_mul_pd(rsq32,rinv32);
679
680             /* EWALD ELECTROSTATICS */
681
682             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
683             ewrt             = _mm_mul_pd(r32,ewtabscale);
684             ewitab           = _mm_cvttpd_epi32(ewrt);
685             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
686             ewitab           = _mm_slli_epi32(ewitab,2);
687             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
688             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
689             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
690             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
691             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
692             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
693             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
694             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
695             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
696             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
697
698             /* Update potential sum for this i atom from the interaction with this j atom. */
699             velecsum         = _mm_add_pd(velecsum,velec);
700
701             fscal            = felec;
702
703             /* Calculate temporary vectorial force */
704             tx               = _mm_mul_pd(fscal,dx32);
705             ty               = _mm_mul_pd(fscal,dy32);
706             tz               = _mm_mul_pd(fscal,dz32);
707
708             /* Update vectorial force */
709             fix3             = _mm_add_pd(fix3,tx);
710             fiy3             = _mm_add_pd(fiy3,ty);
711             fiz3             = _mm_add_pd(fiz3,tz);
712
713             fjx2             = _mm_add_pd(fjx2,tx);
714             fjy2             = _mm_add_pd(fjy2,ty);
715             fjz2             = _mm_add_pd(fjz2,tz);
716
717             /**************************
718              * CALCULATE INTERACTIONS *
719              **************************/
720
721             r33              = _mm_mul_pd(rsq33,rinv33);
722
723             /* EWALD ELECTROSTATICS */
724
725             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
726             ewrt             = _mm_mul_pd(r33,ewtabscale);
727             ewitab           = _mm_cvttpd_epi32(ewrt);
728             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
729             ewitab           = _mm_slli_epi32(ewitab,2);
730             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
731             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
732             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
733             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
734             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
735             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
736             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
737             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
738             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
739             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
740
741             /* Update potential sum for this i atom from the interaction with this j atom. */
742             velecsum         = _mm_add_pd(velecsum,velec);
743
744             fscal            = felec;
745
746             /* Calculate temporary vectorial force */
747             tx               = _mm_mul_pd(fscal,dx33);
748             ty               = _mm_mul_pd(fscal,dy33);
749             tz               = _mm_mul_pd(fscal,dz33);
750
751             /* Update vectorial force */
752             fix3             = _mm_add_pd(fix3,tx);
753             fiy3             = _mm_add_pd(fiy3,ty);
754             fiz3             = _mm_add_pd(fiz3,tz);
755
756             fjx3             = _mm_add_pd(fjx3,tx);
757             fjy3             = _mm_add_pd(fjy3,ty);
758             fjz3             = _mm_add_pd(fjz3,tz);
759
760             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
761
762             /* Inner loop uses 428 flops */
763         }
764
765         if(jidx<j_index_end)
766         {
767
768             jnrA             = jjnr[jidx];
769             j_coord_offsetA  = DIM*jnrA;
770
771             /* load j atom coordinates */
772             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
773                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
774                                               &jy2,&jz2,&jx3,&jy3,&jz3);
775
776             /* Calculate displacement vector */
777             dx00             = _mm_sub_pd(ix0,jx0);
778             dy00             = _mm_sub_pd(iy0,jy0);
779             dz00             = _mm_sub_pd(iz0,jz0);
780             dx11             = _mm_sub_pd(ix1,jx1);
781             dy11             = _mm_sub_pd(iy1,jy1);
782             dz11             = _mm_sub_pd(iz1,jz1);
783             dx12             = _mm_sub_pd(ix1,jx2);
784             dy12             = _mm_sub_pd(iy1,jy2);
785             dz12             = _mm_sub_pd(iz1,jz2);
786             dx13             = _mm_sub_pd(ix1,jx3);
787             dy13             = _mm_sub_pd(iy1,jy3);
788             dz13             = _mm_sub_pd(iz1,jz3);
789             dx21             = _mm_sub_pd(ix2,jx1);
790             dy21             = _mm_sub_pd(iy2,jy1);
791             dz21             = _mm_sub_pd(iz2,jz1);
792             dx22             = _mm_sub_pd(ix2,jx2);
793             dy22             = _mm_sub_pd(iy2,jy2);
794             dz22             = _mm_sub_pd(iz2,jz2);
795             dx23             = _mm_sub_pd(ix2,jx3);
796             dy23             = _mm_sub_pd(iy2,jy3);
797             dz23             = _mm_sub_pd(iz2,jz3);
798             dx31             = _mm_sub_pd(ix3,jx1);
799             dy31             = _mm_sub_pd(iy3,jy1);
800             dz31             = _mm_sub_pd(iz3,jz1);
801             dx32             = _mm_sub_pd(ix3,jx2);
802             dy32             = _mm_sub_pd(iy3,jy2);
803             dz32             = _mm_sub_pd(iz3,jz2);
804             dx33             = _mm_sub_pd(ix3,jx3);
805             dy33             = _mm_sub_pd(iy3,jy3);
806             dz33             = _mm_sub_pd(iz3,jz3);
807
808             /* Calculate squared distance and things based on it */
809             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
810             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
811             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
812             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
813             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
814             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
815             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
816             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
817             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
818             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
819
820             rinv00           = gmx_mm_invsqrt_pd(rsq00);
821             rinv11           = gmx_mm_invsqrt_pd(rsq11);
822             rinv12           = gmx_mm_invsqrt_pd(rsq12);
823             rinv13           = gmx_mm_invsqrt_pd(rsq13);
824             rinv21           = gmx_mm_invsqrt_pd(rsq21);
825             rinv22           = gmx_mm_invsqrt_pd(rsq22);
826             rinv23           = gmx_mm_invsqrt_pd(rsq23);
827             rinv31           = gmx_mm_invsqrt_pd(rsq31);
828             rinv32           = gmx_mm_invsqrt_pd(rsq32);
829             rinv33           = gmx_mm_invsqrt_pd(rsq33);
830
831             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
832             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
833             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
834             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
835             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
836             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
837             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
838             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
839             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
840
841             fjx0             = _mm_setzero_pd();
842             fjy0             = _mm_setzero_pd();
843             fjz0             = _mm_setzero_pd();
844             fjx1             = _mm_setzero_pd();
845             fjy1             = _mm_setzero_pd();
846             fjz1             = _mm_setzero_pd();
847             fjx2             = _mm_setzero_pd();
848             fjy2             = _mm_setzero_pd();
849             fjz2             = _mm_setzero_pd();
850             fjx3             = _mm_setzero_pd();
851             fjy3             = _mm_setzero_pd();
852             fjz3             = _mm_setzero_pd();
853
854             /**************************
855              * CALCULATE INTERACTIONS *
856              **************************/
857
858             r00              = _mm_mul_pd(rsq00,rinv00);
859
860             /* Calculate table index by multiplying r with table scale and truncate to integer */
861             rt               = _mm_mul_pd(r00,vftabscale);
862             vfitab           = _mm_cvttpd_epi32(rt);
863             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
864             vfitab           = _mm_slli_epi32(vfitab,3);
865
866             /* CUBIC SPLINE TABLE DISPERSION */
867             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
868             F                = _mm_setzero_pd();
869             GMX_MM_TRANSPOSE2_PD(Y,F);
870             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
871             H                = _mm_setzero_pd();
872             GMX_MM_TRANSPOSE2_PD(G,H);
873             Heps             = _mm_mul_pd(vfeps,H);
874             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
875             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
876             vvdw6            = _mm_mul_pd(c6_00,VV);
877             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
878             fvdw6            = _mm_mul_pd(c6_00,FF);
879
880             /* CUBIC SPLINE TABLE REPULSION */
881             vfitab           = _mm_add_epi32(vfitab,ifour);
882             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
883             F                = _mm_setzero_pd();
884             GMX_MM_TRANSPOSE2_PD(Y,F);
885             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
886             H                = _mm_setzero_pd();
887             GMX_MM_TRANSPOSE2_PD(G,H);
888             Heps             = _mm_mul_pd(vfeps,H);
889             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
890             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
891             vvdw12           = _mm_mul_pd(c12_00,VV);
892             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
893             fvdw12           = _mm_mul_pd(c12_00,FF);
894             vvdw             = _mm_add_pd(vvdw12,vvdw6);
895             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
896
897             /* Update potential sum for this i atom from the interaction with this j atom. */
898             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
899             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
900
901             fscal            = fvdw;
902
903             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
904
905             /* Calculate temporary vectorial force */
906             tx               = _mm_mul_pd(fscal,dx00);
907             ty               = _mm_mul_pd(fscal,dy00);
908             tz               = _mm_mul_pd(fscal,dz00);
909
910             /* Update vectorial force */
911             fix0             = _mm_add_pd(fix0,tx);
912             fiy0             = _mm_add_pd(fiy0,ty);
913             fiz0             = _mm_add_pd(fiz0,tz);
914
915             fjx0             = _mm_add_pd(fjx0,tx);
916             fjy0             = _mm_add_pd(fjy0,ty);
917             fjz0             = _mm_add_pd(fjz0,tz);
918
919             /**************************
920              * CALCULATE INTERACTIONS *
921              **************************/
922
923             r11              = _mm_mul_pd(rsq11,rinv11);
924
925             /* EWALD ELECTROSTATICS */
926
927             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
928             ewrt             = _mm_mul_pd(r11,ewtabscale);
929             ewitab           = _mm_cvttpd_epi32(ewrt);
930             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
931             ewitab           = _mm_slli_epi32(ewitab,2);
932             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
933             ewtabD           = _mm_setzero_pd();
934             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
935             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
936             ewtabFn          = _mm_setzero_pd();
937             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
938             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
939             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
940             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
941             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
942
943             /* Update potential sum for this i atom from the interaction with this j atom. */
944             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
945             velecsum         = _mm_add_pd(velecsum,velec);
946
947             fscal            = felec;
948
949             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
950
951             /* Calculate temporary vectorial force */
952             tx               = _mm_mul_pd(fscal,dx11);
953             ty               = _mm_mul_pd(fscal,dy11);
954             tz               = _mm_mul_pd(fscal,dz11);
955
956             /* Update vectorial force */
957             fix1             = _mm_add_pd(fix1,tx);
958             fiy1             = _mm_add_pd(fiy1,ty);
959             fiz1             = _mm_add_pd(fiz1,tz);
960
961             fjx1             = _mm_add_pd(fjx1,tx);
962             fjy1             = _mm_add_pd(fjy1,ty);
963             fjz1             = _mm_add_pd(fjz1,tz);
964
965             /**************************
966              * CALCULATE INTERACTIONS *
967              **************************/
968
969             r12              = _mm_mul_pd(rsq12,rinv12);
970
971             /* EWALD ELECTROSTATICS */
972
973             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
974             ewrt             = _mm_mul_pd(r12,ewtabscale);
975             ewitab           = _mm_cvttpd_epi32(ewrt);
976             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
977             ewitab           = _mm_slli_epi32(ewitab,2);
978             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
979             ewtabD           = _mm_setzero_pd();
980             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
981             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
982             ewtabFn          = _mm_setzero_pd();
983             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
984             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
985             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
986             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
987             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
988
989             /* Update potential sum for this i atom from the interaction with this j atom. */
990             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
991             velecsum         = _mm_add_pd(velecsum,velec);
992
993             fscal            = felec;
994
995             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
996
997             /* Calculate temporary vectorial force */
998             tx               = _mm_mul_pd(fscal,dx12);
999             ty               = _mm_mul_pd(fscal,dy12);
1000             tz               = _mm_mul_pd(fscal,dz12);
1001
1002             /* Update vectorial force */
1003             fix1             = _mm_add_pd(fix1,tx);
1004             fiy1             = _mm_add_pd(fiy1,ty);
1005             fiz1             = _mm_add_pd(fiz1,tz);
1006
1007             fjx2             = _mm_add_pd(fjx2,tx);
1008             fjy2             = _mm_add_pd(fjy2,ty);
1009             fjz2             = _mm_add_pd(fjz2,tz);
1010
1011             /**************************
1012              * CALCULATE INTERACTIONS *
1013              **************************/
1014
1015             r13              = _mm_mul_pd(rsq13,rinv13);
1016
1017             /* EWALD ELECTROSTATICS */
1018
1019             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1020             ewrt             = _mm_mul_pd(r13,ewtabscale);
1021             ewitab           = _mm_cvttpd_epi32(ewrt);
1022             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1023             ewitab           = _mm_slli_epi32(ewitab,2);
1024             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1025             ewtabD           = _mm_setzero_pd();
1026             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1027             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1028             ewtabFn          = _mm_setzero_pd();
1029             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1030             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1031             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1032             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1033             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1034
1035             /* Update potential sum for this i atom from the interaction with this j atom. */
1036             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1037             velecsum         = _mm_add_pd(velecsum,velec);
1038
1039             fscal            = felec;
1040
1041             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1042
1043             /* Calculate temporary vectorial force */
1044             tx               = _mm_mul_pd(fscal,dx13);
1045             ty               = _mm_mul_pd(fscal,dy13);
1046             tz               = _mm_mul_pd(fscal,dz13);
1047
1048             /* Update vectorial force */
1049             fix1             = _mm_add_pd(fix1,tx);
1050             fiy1             = _mm_add_pd(fiy1,ty);
1051             fiz1             = _mm_add_pd(fiz1,tz);
1052
1053             fjx3             = _mm_add_pd(fjx3,tx);
1054             fjy3             = _mm_add_pd(fjy3,ty);
1055             fjz3             = _mm_add_pd(fjz3,tz);
1056
1057             /**************************
1058              * CALCULATE INTERACTIONS *
1059              **************************/
1060
1061             r21              = _mm_mul_pd(rsq21,rinv21);
1062
1063             /* EWALD ELECTROSTATICS */
1064
1065             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1066             ewrt             = _mm_mul_pd(r21,ewtabscale);
1067             ewitab           = _mm_cvttpd_epi32(ewrt);
1068             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1069             ewitab           = _mm_slli_epi32(ewitab,2);
1070             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1071             ewtabD           = _mm_setzero_pd();
1072             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1073             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1074             ewtabFn          = _mm_setzero_pd();
1075             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1076             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1077             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1078             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1079             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1080
1081             /* Update potential sum for this i atom from the interaction with this j atom. */
1082             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1083             velecsum         = _mm_add_pd(velecsum,velec);
1084
1085             fscal            = felec;
1086
1087             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1088
1089             /* Calculate temporary vectorial force */
1090             tx               = _mm_mul_pd(fscal,dx21);
1091             ty               = _mm_mul_pd(fscal,dy21);
1092             tz               = _mm_mul_pd(fscal,dz21);
1093
1094             /* Update vectorial force */
1095             fix2             = _mm_add_pd(fix2,tx);
1096             fiy2             = _mm_add_pd(fiy2,ty);
1097             fiz2             = _mm_add_pd(fiz2,tz);
1098
1099             fjx1             = _mm_add_pd(fjx1,tx);
1100             fjy1             = _mm_add_pd(fjy1,ty);
1101             fjz1             = _mm_add_pd(fjz1,tz);
1102
1103             /**************************
1104              * CALCULATE INTERACTIONS *
1105              **************************/
1106
1107             r22              = _mm_mul_pd(rsq22,rinv22);
1108
1109             /* EWALD ELECTROSTATICS */
1110
1111             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1112             ewrt             = _mm_mul_pd(r22,ewtabscale);
1113             ewitab           = _mm_cvttpd_epi32(ewrt);
1114             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1115             ewitab           = _mm_slli_epi32(ewitab,2);
1116             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1117             ewtabD           = _mm_setzero_pd();
1118             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1119             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1120             ewtabFn          = _mm_setzero_pd();
1121             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1122             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1123             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1124             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1125             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1126
1127             /* Update potential sum for this i atom from the interaction with this j atom. */
1128             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1129             velecsum         = _mm_add_pd(velecsum,velec);
1130
1131             fscal            = felec;
1132
1133             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1134
1135             /* Calculate temporary vectorial force */
1136             tx               = _mm_mul_pd(fscal,dx22);
1137             ty               = _mm_mul_pd(fscal,dy22);
1138             tz               = _mm_mul_pd(fscal,dz22);
1139
1140             /* Update vectorial force */
1141             fix2             = _mm_add_pd(fix2,tx);
1142             fiy2             = _mm_add_pd(fiy2,ty);
1143             fiz2             = _mm_add_pd(fiz2,tz);
1144
1145             fjx2             = _mm_add_pd(fjx2,tx);
1146             fjy2             = _mm_add_pd(fjy2,ty);
1147             fjz2             = _mm_add_pd(fjz2,tz);
1148
1149             /**************************
1150              * CALCULATE INTERACTIONS *
1151              **************************/
1152
1153             r23              = _mm_mul_pd(rsq23,rinv23);
1154
1155             /* EWALD ELECTROSTATICS */
1156
1157             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1158             ewrt             = _mm_mul_pd(r23,ewtabscale);
1159             ewitab           = _mm_cvttpd_epi32(ewrt);
1160             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1161             ewitab           = _mm_slli_epi32(ewitab,2);
1162             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1163             ewtabD           = _mm_setzero_pd();
1164             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1165             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1166             ewtabFn          = _mm_setzero_pd();
1167             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1168             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1169             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1170             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1171             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1172
1173             /* Update potential sum for this i atom from the interaction with this j atom. */
1174             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1175             velecsum         = _mm_add_pd(velecsum,velec);
1176
1177             fscal            = felec;
1178
1179             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1180
1181             /* Calculate temporary vectorial force */
1182             tx               = _mm_mul_pd(fscal,dx23);
1183             ty               = _mm_mul_pd(fscal,dy23);
1184             tz               = _mm_mul_pd(fscal,dz23);
1185
1186             /* Update vectorial force */
1187             fix2             = _mm_add_pd(fix2,tx);
1188             fiy2             = _mm_add_pd(fiy2,ty);
1189             fiz2             = _mm_add_pd(fiz2,tz);
1190
1191             fjx3             = _mm_add_pd(fjx3,tx);
1192             fjy3             = _mm_add_pd(fjy3,ty);
1193             fjz3             = _mm_add_pd(fjz3,tz);
1194
1195             /**************************
1196              * CALCULATE INTERACTIONS *
1197              **************************/
1198
1199             r31              = _mm_mul_pd(rsq31,rinv31);
1200
1201             /* EWALD ELECTROSTATICS */
1202
1203             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1204             ewrt             = _mm_mul_pd(r31,ewtabscale);
1205             ewitab           = _mm_cvttpd_epi32(ewrt);
1206             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1207             ewitab           = _mm_slli_epi32(ewitab,2);
1208             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1209             ewtabD           = _mm_setzero_pd();
1210             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1211             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1212             ewtabFn          = _mm_setzero_pd();
1213             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1214             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1215             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1216             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1217             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1218
1219             /* Update potential sum for this i atom from the interaction with this j atom. */
1220             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1221             velecsum         = _mm_add_pd(velecsum,velec);
1222
1223             fscal            = felec;
1224
1225             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1226
1227             /* Calculate temporary vectorial force */
1228             tx               = _mm_mul_pd(fscal,dx31);
1229             ty               = _mm_mul_pd(fscal,dy31);
1230             tz               = _mm_mul_pd(fscal,dz31);
1231
1232             /* Update vectorial force */
1233             fix3             = _mm_add_pd(fix3,tx);
1234             fiy3             = _mm_add_pd(fiy3,ty);
1235             fiz3             = _mm_add_pd(fiz3,tz);
1236
1237             fjx1             = _mm_add_pd(fjx1,tx);
1238             fjy1             = _mm_add_pd(fjy1,ty);
1239             fjz1             = _mm_add_pd(fjz1,tz);
1240
1241             /**************************
1242              * CALCULATE INTERACTIONS *
1243              **************************/
1244
1245             r32              = _mm_mul_pd(rsq32,rinv32);
1246
1247             /* EWALD ELECTROSTATICS */
1248
1249             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1250             ewrt             = _mm_mul_pd(r32,ewtabscale);
1251             ewitab           = _mm_cvttpd_epi32(ewrt);
1252             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1253             ewitab           = _mm_slli_epi32(ewitab,2);
1254             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1255             ewtabD           = _mm_setzero_pd();
1256             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1257             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1258             ewtabFn          = _mm_setzero_pd();
1259             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1260             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1261             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1262             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1263             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1264
1265             /* Update potential sum for this i atom from the interaction with this j atom. */
1266             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1267             velecsum         = _mm_add_pd(velecsum,velec);
1268
1269             fscal            = felec;
1270
1271             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1272
1273             /* Calculate temporary vectorial force */
1274             tx               = _mm_mul_pd(fscal,dx32);
1275             ty               = _mm_mul_pd(fscal,dy32);
1276             tz               = _mm_mul_pd(fscal,dz32);
1277
1278             /* Update vectorial force */
1279             fix3             = _mm_add_pd(fix3,tx);
1280             fiy3             = _mm_add_pd(fiy3,ty);
1281             fiz3             = _mm_add_pd(fiz3,tz);
1282
1283             fjx2             = _mm_add_pd(fjx2,tx);
1284             fjy2             = _mm_add_pd(fjy2,ty);
1285             fjz2             = _mm_add_pd(fjz2,tz);
1286
1287             /**************************
1288              * CALCULATE INTERACTIONS *
1289              **************************/
1290
1291             r33              = _mm_mul_pd(rsq33,rinv33);
1292
1293             /* EWALD ELECTROSTATICS */
1294
1295             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1296             ewrt             = _mm_mul_pd(r33,ewtabscale);
1297             ewitab           = _mm_cvttpd_epi32(ewrt);
1298             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1299             ewitab           = _mm_slli_epi32(ewitab,2);
1300             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1301             ewtabD           = _mm_setzero_pd();
1302             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1303             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1304             ewtabFn          = _mm_setzero_pd();
1305             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1306             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1307             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1308             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1309             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1310
1311             /* Update potential sum for this i atom from the interaction with this j atom. */
1312             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1313             velecsum         = _mm_add_pd(velecsum,velec);
1314
1315             fscal            = felec;
1316
1317             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1318
1319             /* Calculate temporary vectorial force */
1320             tx               = _mm_mul_pd(fscal,dx33);
1321             ty               = _mm_mul_pd(fscal,dy33);
1322             tz               = _mm_mul_pd(fscal,dz33);
1323
1324             /* Update vectorial force */
1325             fix3             = _mm_add_pd(fix3,tx);
1326             fiy3             = _mm_add_pd(fiy3,ty);
1327             fiz3             = _mm_add_pd(fiz3,tz);
1328
1329             fjx3             = _mm_add_pd(fjx3,tx);
1330             fjy3             = _mm_add_pd(fjy3,ty);
1331             fjz3             = _mm_add_pd(fjz3,tz);
1332
1333             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1334
1335             /* Inner loop uses 428 flops */
1336         }
1337
1338         /* End of innermost loop */
1339
1340         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1341                                               f+i_coord_offset,fshift+i_shift_offset);
1342
1343         ggid                        = gid[iidx];
1344         /* Update potential energies */
1345         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1346         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1347
1348         /* Increment number of inner iterations */
1349         inneriter                  += j_index_end - j_index_start;
1350
1351         /* Outer loop uses 26 flops */
1352     }
1353
1354     /* Increment number of outer iterations */
1355     outeriter        += nri;
1356
1357     /* Update outer/inner flops */
1358
1359     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*428);
1360 }
1361 /*
1362  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double
1363  * Electrostatics interaction: Ewald
1364  * VdW interaction:            CubicSplineTable
1365  * Geometry:                   Water4-Water4
1366  * Calculate force/pot:        Force
1367  */
1368 void
1369 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_double
1370                     (t_nblist                    * gmx_restrict       nlist,
1371                      rvec                        * gmx_restrict          xx,
1372                      rvec                        * gmx_restrict          ff,
1373                      t_forcerec                  * gmx_restrict          fr,
1374                      t_mdatoms                   * gmx_restrict     mdatoms,
1375                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1376                      t_nrnb                      * gmx_restrict        nrnb)
1377 {
1378     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1379      * just 0 for non-waters.
1380      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1381      * jnr indices corresponding to data put in the four positions in the SIMD register.
1382      */
1383     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1384     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1385     int              jnrA,jnrB;
1386     int              j_coord_offsetA,j_coord_offsetB;
1387     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1388     real             rcutoff_scalar;
1389     real             *shiftvec,*fshift,*x,*f;
1390     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1391     int              vdwioffset0;
1392     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1393     int              vdwioffset1;
1394     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1395     int              vdwioffset2;
1396     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1397     int              vdwioffset3;
1398     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1399     int              vdwjidx0A,vdwjidx0B;
1400     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1401     int              vdwjidx1A,vdwjidx1B;
1402     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1403     int              vdwjidx2A,vdwjidx2B;
1404     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1405     int              vdwjidx3A,vdwjidx3B;
1406     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1407     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1408     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1409     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1410     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1411     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1412     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1413     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1414     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1415     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1416     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1417     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1418     real             *charge;
1419     int              nvdwtype;
1420     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1421     int              *vdwtype;
1422     real             *vdwparam;
1423     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1424     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1425     __m128i          vfitab;
1426     __m128i          ifour       = _mm_set1_epi32(4);
1427     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1428     real             *vftab;
1429     __m128i          ewitab;
1430     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1431     real             *ewtab;
1432     __m128d          dummy_mask,cutoff_mask;
1433     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1434     __m128d          one     = _mm_set1_pd(1.0);
1435     __m128d          two     = _mm_set1_pd(2.0);
1436     x                = xx[0];
1437     f                = ff[0];
1438
1439     nri              = nlist->nri;
1440     iinr             = nlist->iinr;
1441     jindex           = nlist->jindex;
1442     jjnr             = nlist->jjnr;
1443     shiftidx         = nlist->shift;
1444     gid              = nlist->gid;
1445     shiftvec         = fr->shift_vec[0];
1446     fshift           = fr->fshift[0];
1447     facel            = _mm_set1_pd(fr->epsfac);
1448     charge           = mdatoms->chargeA;
1449     nvdwtype         = fr->ntype;
1450     vdwparam         = fr->nbfp;
1451     vdwtype          = mdatoms->typeA;
1452
1453     vftab            = kernel_data->table_vdw->data;
1454     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
1455
1456     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1457     ewtab            = fr->ic->tabq_coul_F;
1458     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1459     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1460
1461     /* Setup water-specific parameters */
1462     inr              = nlist->iinr[0];
1463     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1464     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1465     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1466     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1467
1468     jq1              = _mm_set1_pd(charge[inr+1]);
1469     jq2              = _mm_set1_pd(charge[inr+2]);
1470     jq3              = _mm_set1_pd(charge[inr+3]);
1471     vdwjidx0A        = 2*vdwtype[inr+0];
1472     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1473     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1474     qq11             = _mm_mul_pd(iq1,jq1);
1475     qq12             = _mm_mul_pd(iq1,jq2);
1476     qq13             = _mm_mul_pd(iq1,jq3);
1477     qq21             = _mm_mul_pd(iq2,jq1);
1478     qq22             = _mm_mul_pd(iq2,jq2);
1479     qq23             = _mm_mul_pd(iq2,jq3);
1480     qq31             = _mm_mul_pd(iq3,jq1);
1481     qq32             = _mm_mul_pd(iq3,jq2);
1482     qq33             = _mm_mul_pd(iq3,jq3);
1483
1484     /* Avoid stupid compiler warnings */
1485     jnrA = jnrB = 0;
1486     j_coord_offsetA = 0;
1487     j_coord_offsetB = 0;
1488
1489     outeriter        = 0;
1490     inneriter        = 0;
1491
1492     /* Start outer loop over neighborlists */
1493     for(iidx=0; iidx<nri; iidx++)
1494     {
1495         /* Load shift vector for this list */
1496         i_shift_offset   = DIM*shiftidx[iidx];
1497
1498         /* Load limits for loop over neighbors */
1499         j_index_start    = jindex[iidx];
1500         j_index_end      = jindex[iidx+1];
1501
1502         /* Get outer coordinate index */
1503         inr              = iinr[iidx];
1504         i_coord_offset   = DIM*inr;
1505
1506         /* Load i particle coords and add shift vector */
1507         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1508                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1509
1510         fix0             = _mm_setzero_pd();
1511         fiy0             = _mm_setzero_pd();
1512         fiz0             = _mm_setzero_pd();
1513         fix1             = _mm_setzero_pd();
1514         fiy1             = _mm_setzero_pd();
1515         fiz1             = _mm_setzero_pd();
1516         fix2             = _mm_setzero_pd();
1517         fiy2             = _mm_setzero_pd();
1518         fiz2             = _mm_setzero_pd();
1519         fix3             = _mm_setzero_pd();
1520         fiy3             = _mm_setzero_pd();
1521         fiz3             = _mm_setzero_pd();
1522
1523         /* Start inner kernel loop */
1524         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1525         {
1526
1527             /* Get j neighbor index, and coordinate index */
1528             jnrA             = jjnr[jidx];
1529             jnrB             = jjnr[jidx+1];
1530             j_coord_offsetA  = DIM*jnrA;
1531             j_coord_offsetB  = DIM*jnrB;
1532
1533             /* load j atom coordinates */
1534             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1535                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1536                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1537
1538             /* Calculate displacement vector */
1539             dx00             = _mm_sub_pd(ix0,jx0);
1540             dy00             = _mm_sub_pd(iy0,jy0);
1541             dz00             = _mm_sub_pd(iz0,jz0);
1542             dx11             = _mm_sub_pd(ix1,jx1);
1543             dy11             = _mm_sub_pd(iy1,jy1);
1544             dz11             = _mm_sub_pd(iz1,jz1);
1545             dx12             = _mm_sub_pd(ix1,jx2);
1546             dy12             = _mm_sub_pd(iy1,jy2);
1547             dz12             = _mm_sub_pd(iz1,jz2);
1548             dx13             = _mm_sub_pd(ix1,jx3);
1549             dy13             = _mm_sub_pd(iy1,jy3);
1550             dz13             = _mm_sub_pd(iz1,jz3);
1551             dx21             = _mm_sub_pd(ix2,jx1);
1552             dy21             = _mm_sub_pd(iy2,jy1);
1553             dz21             = _mm_sub_pd(iz2,jz1);
1554             dx22             = _mm_sub_pd(ix2,jx2);
1555             dy22             = _mm_sub_pd(iy2,jy2);
1556             dz22             = _mm_sub_pd(iz2,jz2);
1557             dx23             = _mm_sub_pd(ix2,jx3);
1558             dy23             = _mm_sub_pd(iy2,jy3);
1559             dz23             = _mm_sub_pd(iz2,jz3);
1560             dx31             = _mm_sub_pd(ix3,jx1);
1561             dy31             = _mm_sub_pd(iy3,jy1);
1562             dz31             = _mm_sub_pd(iz3,jz1);
1563             dx32             = _mm_sub_pd(ix3,jx2);
1564             dy32             = _mm_sub_pd(iy3,jy2);
1565             dz32             = _mm_sub_pd(iz3,jz2);
1566             dx33             = _mm_sub_pd(ix3,jx3);
1567             dy33             = _mm_sub_pd(iy3,jy3);
1568             dz33             = _mm_sub_pd(iz3,jz3);
1569
1570             /* Calculate squared distance and things based on it */
1571             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1572             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1573             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1574             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1575             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1576             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1577             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1578             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1579             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1580             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1581
1582             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1583             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1584             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1585             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1586             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1587             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1588             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1589             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1590             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1591             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1592
1593             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1594             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1595             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1596             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1597             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1598             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1599             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1600             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1601             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1602
1603             fjx0             = _mm_setzero_pd();
1604             fjy0             = _mm_setzero_pd();
1605             fjz0             = _mm_setzero_pd();
1606             fjx1             = _mm_setzero_pd();
1607             fjy1             = _mm_setzero_pd();
1608             fjz1             = _mm_setzero_pd();
1609             fjx2             = _mm_setzero_pd();
1610             fjy2             = _mm_setzero_pd();
1611             fjz2             = _mm_setzero_pd();
1612             fjx3             = _mm_setzero_pd();
1613             fjy3             = _mm_setzero_pd();
1614             fjz3             = _mm_setzero_pd();
1615
1616             /**************************
1617              * CALCULATE INTERACTIONS *
1618              **************************/
1619
1620             r00              = _mm_mul_pd(rsq00,rinv00);
1621
1622             /* Calculate table index by multiplying r with table scale and truncate to integer */
1623             rt               = _mm_mul_pd(r00,vftabscale);
1624             vfitab           = _mm_cvttpd_epi32(rt);
1625             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1626             vfitab           = _mm_slli_epi32(vfitab,3);
1627
1628             /* CUBIC SPLINE TABLE DISPERSION */
1629             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1630             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1631             GMX_MM_TRANSPOSE2_PD(Y,F);
1632             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1633             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1634             GMX_MM_TRANSPOSE2_PD(G,H);
1635             Heps             = _mm_mul_pd(vfeps,H);
1636             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1637             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1638             fvdw6            = _mm_mul_pd(c6_00,FF);
1639
1640             /* CUBIC SPLINE TABLE REPULSION */
1641             vfitab           = _mm_add_epi32(vfitab,ifour);
1642             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1643             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1644             GMX_MM_TRANSPOSE2_PD(Y,F);
1645             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1646             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1647             GMX_MM_TRANSPOSE2_PD(G,H);
1648             Heps             = _mm_mul_pd(vfeps,H);
1649             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1650             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1651             fvdw12           = _mm_mul_pd(c12_00,FF);
1652             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1653
1654             fscal            = fvdw;
1655
1656             /* Calculate temporary vectorial force */
1657             tx               = _mm_mul_pd(fscal,dx00);
1658             ty               = _mm_mul_pd(fscal,dy00);
1659             tz               = _mm_mul_pd(fscal,dz00);
1660
1661             /* Update vectorial force */
1662             fix0             = _mm_add_pd(fix0,tx);
1663             fiy0             = _mm_add_pd(fiy0,ty);
1664             fiz0             = _mm_add_pd(fiz0,tz);
1665
1666             fjx0             = _mm_add_pd(fjx0,tx);
1667             fjy0             = _mm_add_pd(fjy0,ty);
1668             fjz0             = _mm_add_pd(fjz0,tz);
1669
1670             /**************************
1671              * CALCULATE INTERACTIONS *
1672              **************************/
1673
1674             r11              = _mm_mul_pd(rsq11,rinv11);
1675
1676             /* EWALD ELECTROSTATICS */
1677
1678             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1679             ewrt             = _mm_mul_pd(r11,ewtabscale);
1680             ewitab           = _mm_cvttpd_epi32(ewrt);
1681             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1682             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1683                                          &ewtabF,&ewtabFn);
1684             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1685             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1686
1687             fscal            = felec;
1688
1689             /* Calculate temporary vectorial force */
1690             tx               = _mm_mul_pd(fscal,dx11);
1691             ty               = _mm_mul_pd(fscal,dy11);
1692             tz               = _mm_mul_pd(fscal,dz11);
1693
1694             /* Update vectorial force */
1695             fix1             = _mm_add_pd(fix1,tx);
1696             fiy1             = _mm_add_pd(fiy1,ty);
1697             fiz1             = _mm_add_pd(fiz1,tz);
1698
1699             fjx1             = _mm_add_pd(fjx1,tx);
1700             fjy1             = _mm_add_pd(fjy1,ty);
1701             fjz1             = _mm_add_pd(fjz1,tz);
1702
1703             /**************************
1704              * CALCULATE INTERACTIONS *
1705              **************************/
1706
1707             r12              = _mm_mul_pd(rsq12,rinv12);
1708
1709             /* EWALD ELECTROSTATICS */
1710
1711             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1712             ewrt             = _mm_mul_pd(r12,ewtabscale);
1713             ewitab           = _mm_cvttpd_epi32(ewrt);
1714             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1715             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1716                                          &ewtabF,&ewtabFn);
1717             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1718             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1719
1720             fscal            = felec;
1721
1722             /* Calculate temporary vectorial force */
1723             tx               = _mm_mul_pd(fscal,dx12);
1724             ty               = _mm_mul_pd(fscal,dy12);
1725             tz               = _mm_mul_pd(fscal,dz12);
1726
1727             /* Update vectorial force */
1728             fix1             = _mm_add_pd(fix1,tx);
1729             fiy1             = _mm_add_pd(fiy1,ty);
1730             fiz1             = _mm_add_pd(fiz1,tz);
1731
1732             fjx2             = _mm_add_pd(fjx2,tx);
1733             fjy2             = _mm_add_pd(fjy2,ty);
1734             fjz2             = _mm_add_pd(fjz2,tz);
1735
1736             /**************************
1737              * CALCULATE INTERACTIONS *
1738              **************************/
1739
1740             r13              = _mm_mul_pd(rsq13,rinv13);
1741
1742             /* EWALD ELECTROSTATICS */
1743
1744             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1745             ewrt             = _mm_mul_pd(r13,ewtabscale);
1746             ewitab           = _mm_cvttpd_epi32(ewrt);
1747             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1748             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1749                                          &ewtabF,&ewtabFn);
1750             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1751             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1752
1753             fscal            = felec;
1754
1755             /* Calculate temporary vectorial force */
1756             tx               = _mm_mul_pd(fscal,dx13);
1757             ty               = _mm_mul_pd(fscal,dy13);
1758             tz               = _mm_mul_pd(fscal,dz13);
1759
1760             /* Update vectorial force */
1761             fix1             = _mm_add_pd(fix1,tx);
1762             fiy1             = _mm_add_pd(fiy1,ty);
1763             fiz1             = _mm_add_pd(fiz1,tz);
1764
1765             fjx3             = _mm_add_pd(fjx3,tx);
1766             fjy3             = _mm_add_pd(fjy3,ty);
1767             fjz3             = _mm_add_pd(fjz3,tz);
1768
1769             /**************************
1770              * CALCULATE INTERACTIONS *
1771              **************************/
1772
1773             r21              = _mm_mul_pd(rsq21,rinv21);
1774
1775             /* EWALD ELECTROSTATICS */
1776
1777             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1778             ewrt             = _mm_mul_pd(r21,ewtabscale);
1779             ewitab           = _mm_cvttpd_epi32(ewrt);
1780             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1781             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1782                                          &ewtabF,&ewtabFn);
1783             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1784             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1785
1786             fscal            = felec;
1787
1788             /* Calculate temporary vectorial force */
1789             tx               = _mm_mul_pd(fscal,dx21);
1790             ty               = _mm_mul_pd(fscal,dy21);
1791             tz               = _mm_mul_pd(fscal,dz21);
1792
1793             /* Update vectorial force */
1794             fix2             = _mm_add_pd(fix2,tx);
1795             fiy2             = _mm_add_pd(fiy2,ty);
1796             fiz2             = _mm_add_pd(fiz2,tz);
1797
1798             fjx1             = _mm_add_pd(fjx1,tx);
1799             fjy1             = _mm_add_pd(fjy1,ty);
1800             fjz1             = _mm_add_pd(fjz1,tz);
1801
1802             /**************************
1803              * CALCULATE INTERACTIONS *
1804              **************************/
1805
1806             r22              = _mm_mul_pd(rsq22,rinv22);
1807
1808             /* EWALD ELECTROSTATICS */
1809
1810             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1811             ewrt             = _mm_mul_pd(r22,ewtabscale);
1812             ewitab           = _mm_cvttpd_epi32(ewrt);
1813             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1814             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1815                                          &ewtabF,&ewtabFn);
1816             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1817             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1818
1819             fscal            = felec;
1820
1821             /* Calculate temporary vectorial force */
1822             tx               = _mm_mul_pd(fscal,dx22);
1823             ty               = _mm_mul_pd(fscal,dy22);
1824             tz               = _mm_mul_pd(fscal,dz22);
1825
1826             /* Update vectorial force */
1827             fix2             = _mm_add_pd(fix2,tx);
1828             fiy2             = _mm_add_pd(fiy2,ty);
1829             fiz2             = _mm_add_pd(fiz2,tz);
1830
1831             fjx2             = _mm_add_pd(fjx2,tx);
1832             fjy2             = _mm_add_pd(fjy2,ty);
1833             fjz2             = _mm_add_pd(fjz2,tz);
1834
1835             /**************************
1836              * CALCULATE INTERACTIONS *
1837              **************************/
1838
1839             r23              = _mm_mul_pd(rsq23,rinv23);
1840
1841             /* EWALD ELECTROSTATICS */
1842
1843             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1844             ewrt             = _mm_mul_pd(r23,ewtabscale);
1845             ewitab           = _mm_cvttpd_epi32(ewrt);
1846             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1847             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1848                                          &ewtabF,&ewtabFn);
1849             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1850             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1851
1852             fscal            = felec;
1853
1854             /* Calculate temporary vectorial force */
1855             tx               = _mm_mul_pd(fscal,dx23);
1856             ty               = _mm_mul_pd(fscal,dy23);
1857             tz               = _mm_mul_pd(fscal,dz23);
1858
1859             /* Update vectorial force */
1860             fix2             = _mm_add_pd(fix2,tx);
1861             fiy2             = _mm_add_pd(fiy2,ty);
1862             fiz2             = _mm_add_pd(fiz2,tz);
1863
1864             fjx3             = _mm_add_pd(fjx3,tx);
1865             fjy3             = _mm_add_pd(fjy3,ty);
1866             fjz3             = _mm_add_pd(fjz3,tz);
1867
1868             /**************************
1869              * CALCULATE INTERACTIONS *
1870              **************************/
1871
1872             r31              = _mm_mul_pd(rsq31,rinv31);
1873
1874             /* EWALD ELECTROSTATICS */
1875
1876             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1877             ewrt             = _mm_mul_pd(r31,ewtabscale);
1878             ewitab           = _mm_cvttpd_epi32(ewrt);
1879             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1880             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1881                                          &ewtabF,&ewtabFn);
1882             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1883             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1884
1885             fscal            = felec;
1886
1887             /* Calculate temporary vectorial force */
1888             tx               = _mm_mul_pd(fscal,dx31);
1889             ty               = _mm_mul_pd(fscal,dy31);
1890             tz               = _mm_mul_pd(fscal,dz31);
1891
1892             /* Update vectorial force */
1893             fix3             = _mm_add_pd(fix3,tx);
1894             fiy3             = _mm_add_pd(fiy3,ty);
1895             fiz3             = _mm_add_pd(fiz3,tz);
1896
1897             fjx1             = _mm_add_pd(fjx1,tx);
1898             fjy1             = _mm_add_pd(fjy1,ty);
1899             fjz1             = _mm_add_pd(fjz1,tz);
1900
1901             /**************************
1902              * CALCULATE INTERACTIONS *
1903              **************************/
1904
1905             r32              = _mm_mul_pd(rsq32,rinv32);
1906
1907             /* EWALD ELECTROSTATICS */
1908
1909             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1910             ewrt             = _mm_mul_pd(r32,ewtabscale);
1911             ewitab           = _mm_cvttpd_epi32(ewrt);
1912             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1913             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1914                                          &ewtabF,&ewtabFn);
1915             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1916             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1917
1918             fscal            = felec;
1919
1920             /* Calculate temporary vectorial force */
1921             tx               = _mm_mul_pd(fscal,dx32);
1922             ty               = _mm_mul_pd(fscal,dy32);
1923             tz               = _mm_mul_pd(fscal,dz32);
1924
1925             /* Update vectorial force */
1926             fix3             = _mm_add_pd(fix3,tx);
1927             fiy3             = _mm_add_pd(fiy3,ty);
1928             fiz3             = _mm_add_pd(fiz3,tz);
1929
1930             fjx2             = _mm_add_pd(fjx2,tx);
1931             fjy2             = _mm_add_pd(fjy2,ty);
1932             fjz2             = _mm_add_pd(fjz2,tz);
1933
1934             /**************************
1935              * CALCULATE INTERACTIONS *
1936              **************************/
1937
1938             r33              = _mm_mul_pd(rsq33,rinv33);
1939
1940             /* EWALD ELECTROSTATICS */
1941
1942             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1943             ewrt             = _mm_mul_pd(r33,ewtabscale);
1944             ewitab           = _mm_cvttpd_epi32(ewrt);
1945             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1946             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1947                                          &ewtabF,&ewtabFn);
1948             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1949             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1950
1951             fscal            = felec;
1952
1953             /* Calculate temporary vectorial force */
1954             tx               = _mm_mul_pd(fscal,dx33);
1955             ty               = _mm_mul_pd(fscal,dy33);
1956             tz               = _mm_mul_pd(fscal,dz33);
1957
1958             /* Update vectorial force */
1959             fix3             = _mm_add_pd(fix3,tx);
1960             fiy3             = _mm_add_pd(fiy3,ty);
1961             fiz3             = _mm_add_pd(fiz3,tz);
1962
1963             fjx3             = _mm_add_pd(fjx3,tx);
1964             fjy3             = _mm_add_pd(fjy3,ty);
1965             fjz3             = _mm_add_pd(fjz3,tz);
1966
1967             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1968
1969             /* Inner loop uses 375 flops */
1970         }
1971
1972         if(jidx<j_index_end)
1973         {
1974
1975             jnrA             = jjnr[jidx];
1976             j_coord_offsetA  = DIM*jnrA;
1977
1978             /* load j atom coordinates */
1979             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1980                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1981                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1982
1983             /* Calculate displacement vector */
1984             dx00             = _mm_sub_pd(ix0,jx0);
1985             dy00             = _mm_sub_pd(iy0,jy0);
1986             dz00             = _mm_sub_pd(iz0,jz0);
1987             dx11             = _mm_sub_pd(ix1,jx1);
1988             dy11             = _mm_sub_pd(iy1,jy1);
1989             dz11             = _mm_sub_pd(iz1,jz1);
1990             dx12             = _mm_sub_pd(ix1,jx2);
1991             dy12             = _mm_sub_pd(iy1,jy2);
1992             dz12             = _mm_sub_pd(iz1,jz2);
1993             dx13             = _mm_sub_pd(ix1,jx3);
1994             dy13             = _mm_sub_pd(iy1,jy3);
1995             dz13             = _mm_sub_pd(iz1,jz3);
1996             dx21             = _mm_sub_pd(ix2,jx1);
1997             dy21             = _mm_sub_pd(iy2,jy1);
1998             dz21             = _mm_sub_pd(iz2,jz1);
1999             dx22             = _mm_sub_pd(ix2,jx2);
2000             dy22             = _mm_sub_pd(iy2,jy2);
2001             dz22             = _mm_sub_pd(iz2,jz2);
2002             dx23             = _mm_sub_pd(ix2,jx3);
2003             dy23             = _mm_sub_pd(iy2,jy3);
2004             dz23             = _mm_sub_pd(iz2,jz3);
2005             dx31             = _mm_sub_pd(ix3,jx1);
2006             dy31             = _mm_sub_pd(iy3,jy1);
2007             dz31             = _mm_sub_pd(iz3,jz1);
2008             dx32             = _mm_sub_pd(ix3,jx2);
2009             dy32             = _mm_sub_pd(iy3,jy2);
2010             dz32             = _mm_sub_pd(iz3,jz2);
2011             dx33             = _mm_sub_pd(ix3,jx3);
2012             dy33             = _mm_sub_pd(iy3,jy3);
2013             dz33             = _mm_sub_pd(iz3,jz3);
2014
2015             /* Calculate squared distance and things based on it */
2016             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2017             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2018             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2019             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2020             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2021             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2022             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2023             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2024             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2025             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2026
2027             rinv00           = gmx_mm_invsqrt_pd(rsq00);
2028             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2029             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2030             rinv13           = gmx_mm_invsqrt_pd(rsq13);
2031             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2032             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2033             rinv23           = gmx_mm_invsqrt_pd(rsq23);
2034             rinv31           = gmx_mm_invsqrt_pd(rsq31);
2035             rinv32           = gmx_mm_invsqrt_pd(rsq32);
2036             rinv33           = gmx_mm_invsqrt_pd(rsq33);
2037
2038             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
2039             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
2040             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
2041             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
2042             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
2043             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
2044             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
2045             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
2046             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
2047
2048             fjx0             = _mm_setzero_pd();
2049             fjy0             = _mm_setzero_pd();
2050             fjz0             = _mm_setzero_pd();
2051             fjx1             = _mm_setzero_pd();
2052             fjy1             = _mm_setzero_pd();
2053             fjz1             = _mm_setzero_pd();
2054             fjx2             = _mm_setzero_pd();
2055             fjy2             = _mm_setzero_pd();
2056             fjz2             = _mm_setzero_pd();
2057             fjx3             = _mm_setzero_pd();
2058             fjy3             = _mm_setzero_pd();
2059             fjz3             = _mm_setzero_pd();
2060
2061             /**************************
2062              * CALCULATE INTERACTIONS *
2063              **************************/
2064
2065             r00              = _mm_mul_pd(rsq00,rinv00);
2066
2067             /* Calculate table index by multiplying r with table scale and truncate to integer */
2068             rt               = _mm_mul_pd(r00,vftabscale);
2069             vfitab           = _mm_cvttpd_epi32(rt);
2070             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2071             vfitab           = _mm_slli_epi32(vfitab,3);
2072
2073             /* CUBIC SPLINE TABLE DISPERSION */
2074             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2075             F                = _mm_setzero_pd();
2076             GMX_MM_TRANSPOSE2_PD(Y,F);
2077             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2078             H                = _mm_setzero_pd();
2079             GMX_MM_TRANSPOSE2_PD(G,H);
2080             Heps             = _mm_mul_pd(vfeps,H);
2081             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2082             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2083             fvdw6            = _mm_mul_pd(c6_00,FF);
2084
2085             /* CUBIC SPLINE TABLE REPULSION */
2086             vfitab           = _mm_add_epi32(vfitab,ifour);
2087             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2088             F                = _mm_setzero_pd();
2089             GMX_MM_TRANSPOSE2_PD(Y,F);
2090             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2091             H                = _mm_setzero_pd();
2092             GMX_MM_TRANSPOSE2_PD(G,H);
2093             Heps             = _mm_mul_pd(vfeps,H);
2094             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2095             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2096             fvdw12           = _mm_mul_pd(c12_00,FF);
2097             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2098
2099             fscal            = fvdw;
2100
2101             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2102
2103             /* Calculate temporary vectorial force */
2104             tx               = _mm_mul_pd(fscal,dx00);
2105             ty               = _mm_mul_pd(fscal,dy00);
2106             tz               = _mm_mul_pd(fscal,dz00);
2107
2108             /* Update vectorial force */
2109             fix0             = _mm_add_pd(fix0,tx);
2110             fiy0             = _mm_add_pd(fiy0,ty);
2111             fiz0             = _mm_add_pd(fiz0,tz);
2112
2113             fjx0             = _mm_add_pd(fjx0,tx);
2114             fjy0             = _mm_add_pd(fjy0,ty);
2115             fjz0             = _mm_add_pd(fjz0,tz);
2116
2117             /**************************
2118              * CALCULATE INTERACTIONS *
2119              **************************/
2120
2121             r11              = _mm_mul_pd(rsq11,rinv11);
2122
2123             /* EWALD ELECTROSTATICS */
2124
2125             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2126             ewrt             = _mm_mul_pd(r11,ewtabscale);
2127             ewitab           = _mm_cvttpd_epi32(ewrt);
2128             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2129             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2130             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2131             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2132
2133             fscal            = felec;
2134
2135             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2136
2137             /* Calculate temporary vectorial force */
2138             tx               = _mm_mul_pd(fscal,dx11);
2139             ty               = _mm_mul_pd(fscal,dy11);
2140             tz               = _mm_mul_pd(fscal,dz11);
2141
2142             /* Update vectorial force */
2143             fix1             = _mm_add_pd(fix1,tx);
2144             fiy1             = _mm_add_pd(fiy1,ty);
2145             fiz1             = _mm_add_pd(fiz1,tz);
2146
2147             fjx1             = _mm_add_pd(fjx1,tx);
2148             fjy1             = _mm_add_pd(fjy1,ty);
2149             fjz1             = _mm_add_pd(fjz1,tz);
2150
2151             /**************************
2152              * CALCULATE INTERACTIONS *
2153              **************************/
2154
2155             r12              = _mm_mul_pd(rsq12,rinv12);
2156
2157             /* EWALD ELECTROSTATICS */
2158
2159             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2160             ewrt             = _mm_mul_pd(r12,ewtabscale);
2161             ewitab           = _mm_cvttpd_epi32(ewrt);
2162             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2163             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2164             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2165             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2166
2167             fscal            = felec;
2168
2169             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2170
2171             /* Calculate temporary vectorial force */
2172             tx               = _mm_mul_pd(fscal,dx12);
2173             ty               = _mm_mul_pd(fscal,dy12);
2174             tz               = _mm_mul_pd(fscal,dz12);
2175
2176             /* Update vectorial force */
2177             fix1             = _mm_add_pd(fix1,tx);
2178             fiy1             = _mm_add_pd(fiy1,ty);
2179             fiz1             = _mm_add_pd(fiz1,tz);
2180
2181             fjx2             = _mm_add_pd(fjx2,tx);
2182             fjy2             = _mm_add_pd(fjy2,ty);
2183             fjz2             = _mm_add_pd(fjz2,tz);
2184
2185             /**************************
2186              * CALCULATE INTERACTIONS *
2187              **************************/
2188
2189             r13              = _mm_mul_pd(rsq13,rinv13);
2190
2191             /* EWALD ELECTROSTATICS */
2192
2193             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2194             ewrt             = _mm_mul_pd(r13,ewtabscale);
2195             ewitab           = _mm_cvttpd_epi32(ewrt);
2196             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2197             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2198             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2199             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2200
2201             fscal            = felec;
2202
2203             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2204
2205             /* Calculate temporary vectorial force */
2206             tx               = _mm_mul_pd(fscal,dx13);
2207             ty               = _mm_mul_pd(fscal,dy13);
2208             tz               = _mm_mul_pd(fscal,dz13);
2209
2210             /* Update vectorial force */
2211             fix1             = _mm_add_pd(fix1,tx);
2212             fiy1             = _mm_add_pd(fiy1,ty);
2213             fiz1             = _mm_add_pd(fiz1,tz);
2214
2215             fjx3             = _mm_add_pd(fjx3,tx);
2216             fjy3             = _mm_add_pd(fjy3,ty);
2217             fjz3             = _mm_add_pd(fjz3,tz);
2218
2219             /**************************
2220              * CALCULATE INTERACTIONS *
2221              **************************/
2222
2223             r21              = _mm_mul_pd(rsq21,rinv21);
2224
2225             /* EWALD ELECTROSTATICS */
2226
2227             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2228             ewrt             = _mm_mul_pd(r21,ewtabscale);
2229             ewitab           = _mm_cvttpd_epi32(ewrt);
2230             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2231             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2232             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2233             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2234
2235             fscal            = felec;
2236
2237             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2238
2239             /* Calculate temporary vectorial force */
2240             tx               = _mm_mul_pd(fscal,dx21);
2241             ty               = _mm_mul_pd(fscal,dy21);
2242             tz               = _mm_mul_pd(fscal,dz21);
2243
2244             /* Update vectorial force */
2245             fix2             = _mm_add_pd(fix2,tx);
2246             fiy2             = _mm_add_pd(fiy2,ty);
2247             fiz2             = _mm_add_pd(fiz2,tz);
2248
2249             fjx1             = _mm_add_pd(fjx1,tx);
2250             fjy1             = _mm_add_pd(fjy1,ty);
2251             fjz1             = _mm_add_pd(fjz1,tz);
2252
2253             /**************************
2254              * CALCULATE INTERACTIONS *
2255              **************************/
2256
2257             r22              = _mm_mul_pd(rsq22,rinv22);
2258
2259             /* EWALD ELECTROSTATICS */
2260
2261             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2262             ewrt             = _mm_mul_pd(r22,ewtabscale);
2263             ewitab           = _mm_cvttpd_epi32(ewrt);
2264             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2265             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2266             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2267             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2268
2269             fscal            = felec;
2270
2271             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2272
2273             /* Calculate temporary vectorial force */
2274             tx               = _mm_mul_pd(fscal,dx22);
2275             ty               = _mm_mul_pd(fscal,dy22);
2276             tz               = _mm_mul_pd(fscal,dz22);
2277
2278             /* Update vectorial force */
2279             fix2             = _mm_add_pd(fix2,tx);
2280             fiy2             = _mm_add_pd(fiy2,ty);
2281             fiz2             = _mm_add_pd(fiz2,tz);
2282
2283             fjx2             = _mm_add_pd(fjx2,tx);
2284             fjy2             = _mm_add_pd(fjy2,ty);
2285             fjz2             = _mm_add_pd(fjz2,tz);
2286
2287             /**************************
2288              * CALCULATE INTERACTIONS *
2289              **************************/
2290
2291             r23              = _mm_mul_pd(rsq23,rinv23);
2292
2293             /* EWALD ELECTROSTATICS */
2294
2295             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2296             ewrt             = _mm_mul_pd(r23,ewtabscale);
2297             ewitab           = _mm_cvttpd_epi32(ewrt);
2298             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2299             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2300             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2301             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2302
2303             fscal            = felec;
2304
2305             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2306
2307             /* Calculate temporary vectorial force */
2308             tx               = _mm_mul_pd(fscal,dx23);
2309             ty               = _mm_mul_pd(fscal,dy23);
2310             tz               = _mm_mul_pd(fscal,dz23);
2311
2312             /* Update vectorial force */
2313             fix2             = _mm_add_pd(fix2,tx);
2314             fiy2             = _mm_add_pd(fiy2,ty);
2315             fiz2             = _mm_add_pd(fiz2,tz);
2316
2317             fjx3             = _mm_add_pd(fjx3,tx);
2318             fjy3             = _mm_add_pd(fjy3,ty);
2319             fjz3             = _mm_add_pd(fjz3,tz);
2320
2321             /**************************
2322              * CALCULATE INTERACTIONS *
2323              **************************/
2324
2325             r31              = _mm_mul_pd(rsq31,rinv31);
2326
2327             /* EWALD ELECTROSTATICS */
2328
2329             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2330             ewrt             = _mm_mul_pd(r31,ewtabscale);
2331             ewitab           = _mm_cvttpd_epi32(ewrt);
2332             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2333             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2334             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2335             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2336
2337             fscal            = felec;
2338
2339             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2340
2341             /* Calculate temporary vectorial force */
2342             tx               = _mm_mul_pd(fscal,dx31);
2343             ty               = _mm_mul_pd(fscal,dy31);
2344             tz               = _mm_mul_pd(fscal,dz31);
2345
2346             /* Update vectorial force */
2347             fix3             = _mm_add_pd(fix3,tx);
2348             fiy3             = _mm_add_pd(fiy3,ty);
2349             fiz3             = _mm_add_pd(fiz3,tz);
2350
2351             fjx1             = _mm_add_pd(fjx1,tx);
2352             fjy1             = _mm_add_pd(fjy1,ty);
2353             fjz1             = _mm_add_pd(fjz1,tz);
2354
2355             /**************************
2356              * CALCULATE INTERACTIONS *
2357              **************************/
2358
2359             r32              = _mm_mul_pd(rsq32,rinv32);
2360
2361             /* EWALD ELECTROSTATICS */
2362
2363             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2364             ewrt             = _mm_mul_pd(r32,ewtabscale);
2365             ewitab           = _mm_cvttpd_epi32(ewrt);
2366             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2367             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2368             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2369             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2370
2371             fscal            = felec;
2372
2373             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2374
2375             /* Calculate temporary vectorial force */
2376             tx               = _mm_mul_pd(fscal,dx32);
2377             ty               = _mm_mul_pd(fscal,dy32);
2378             tz               = _mm_mul_pd(fscal,dz32);
2379
2380             /* Update vectorial force */
2381             fix3             = _mm_add_pd(fix3,tx);
2382             fiy3             = _mm_add_pd(fiy3,ty);
2383             fiz3             = _mm_add_pd(fiz3,tz);
2384
2385             fjx2             = _mm_add_pd(fjx2,tx);
2386             fjy2             = _mm_add_pd(fjy2,ty);
2387             fjz2             = _mm_add_pd(fjz2,tz);
2388
2389             /**************************
2390              * CALCULATE INTERACTIONS *
2391              **************************/
2392
2393             r33              = _mm_mul_pd(rsq33,rinv33);
2394
2395             /* EWALD ELECTROSTATICS */
2396
2397             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2398             ewrt             = _mm_mul_pd(r33,ewtabscale);
2399             ewitab           = _mm_cvttpd_epi32(ewrt);
2400             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2401             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2402             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2403             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2404
2405             fscal            = felec;
2406
2407             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2408
2409             /* Calculate temporary vectorial force */
2410             tx               = _mm_mul_pd(fscal,dx33);
2411             ty               = _mm_mul_pd(fscal,dy33);
2412             tz               = _mm_mul_pd(fscal,dz33);
2413
2414             /* Update vectorial force */
2415             fix3             = _mm_add_pd(fix3,tx);
2416             fiy3             = _mm_add_pd(fiy3,ty);
2417             fiz3             = _mm_add_pd(fiz3,tz);
2418
2419             fjx3             = _mm_add_pd(fjx3,tx);
2420             fjy3             = _mm_add_pd(fjy3,ty);
2421             fjz3             = _mm_add_pd(fjz3,tz);
2422
2423             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2424
2425             /* Inner loop uses 375 flops */
2426         }
2427
2428         /* End of innermost loop */
2429
2430         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2431                                               f+i_coord_offset,fshift+i_shift_offset);
2432
2433         /* Increment number of inner iterations */
2434         inneriter                  += j_index_end - j_index_start;
2435
2436         /* Outer loop uses 24 flops */
2437     }
2438
2439     /* Increment number of outer iterations */
2440     outeriter        += nri;
2441
2442     /* Update outer/inner flops */
2443
2444     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*375);
2445 }