Version bumps after new release
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse2_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse2_double.h"
50 #include "kernelutil_x86_sse2_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_double
54  * Electrostatics interaction: CubicSplineTable
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB;
77     int              j_coord_offsetA,j_coord_offsetB;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B;
89     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B;
91     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B;
93     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     int              nvdwtype;
106     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107     int              *vdwtype;
108     real             *vdwparam;
109     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
110     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
111     __m128i          vfitab;
112     __m128i          ifour       = _mm_set1_epi32(4);
113     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
114     real             *vftab;
115     __m128d          dummy_mask,cutoff_mask;
116     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
117     __m128d          one     = _mm_set1_pd(1.0);
118     __m128d          two     = _mm_set1_pd(2.0);
119     x                = xx[0];
120     f                = ff[0];
121
122     nri              = nlist->nri;
123     iinr             = nlist->iinr;
124     jindex           = nlist->jindex;
125     jjnr             = nlist->jjnr;
126     shiftidx         = nlist->shift;
127     gid              = nlist->gid;
128     shiftvec         = fr->shift_vec[0];
129     fshift           = fr->fshift[0];
130     facel            = _mm_set1_pd(fr->epsfac);
131     charge           = mdatoms->chargeA;
132     nvdwtype         = fr->ntype;
133     vdwparam         = fr->nbfp;
134     vdwtype          = mdatoms->typeA;
135
136     vftab            = kernel_data->table_elec->data;
137     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
138
139     /* Setup water-specific parameters */
140     inr              = nlist->iinr[0];
141     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
142     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
143     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
144     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
145
146     jq0              = _mm_set1_pd(charge[inr+0]);
147     jq1              = _mm_set1_pd(charge[inr+1]);
148     jq2              = _mm_set1_pd(charge[inr+2]);
149     vdwjidx0A        = 2*vdwtype[inr+0];
150     qq00             = _mm_mul_pd(iq0,jq0);
151     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
152     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
153     qq01             = _mm_mul_pd(iq0,jq1);
154     qq02             = _mm_mul_pd(iq0,jq2);
155     qq10             = _mm_mul_pd(iq1,jq0);
156     qq11             = _mm_mul_pd(iq1,jq1);
157     qq12             = _mm_mul_pd(iq1,jq2);
158     qq20             = _mm_mul_pd(iq2,jq0);
159     qq21             = _mm_mul_pd(iq2,jq1);
160     qq22             = _mm_mul_pd(iq2,jq2);
161
162     /* Avoid stupid compiler warnings */
163     jnrA = jnrB = 0;
164     j_coord_offsetA = 0;
165     j_coord_offsetB = 0;
166
167     outeriter        = 0;
168     inneriter        = 0;
169
170     /* Start outer loop over neighborlists */
171     for(iidx=0; iidx<nri; iidx++)
172     {
173         /* Load shift vector for this list */
174         i_shift_offset   = DIM*shiftidx[iidx];
175
176         /* Load limits for loop over neighbors */
177         j_index_start    = jindex[iidx];
178         j_index_end      = jindex[iidx+1];
179
180         /* Get outer coordinate index */
181         inr              = iinr[iidx];
182         i_coord_offset   = DIM*inr;
183
184         /* Load i particle coords and add shift vector */
185         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
186                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
187
188         fix0             = _mm_setzero_pd();
189         fiy0             = _mm_setzero_pd();
190         fiz0             = _mm_setzero_pd();
191         fix1             = _mm_setzero_pd();
192         fiy1             = _mm_setzero_pd();
193         fiz1             = _mm_setzero_pd();
194         fix2             = _mm_setzero_pd();
195         fiy2             = _mm_setzero_pd();
196         fiz2             = _mm_setzero_pd();
197
198         /* Reset potential sums */
199         velecsum         = _mm_setzero_pd();
200         vvdwsum          = _mm_setzero_pd();
201
202         /* Start inner kernel loop */
203         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
204         {
205
206             /* Get j neighbor index, and coordinate index */
207             jnrA             = jjnr[jidx];
208             jnrB             = jjnr[jidx+1];
209             j_coord_offsetA  = DIM*jnrA;
210             j_coord_offsetB  = DIM*jnrB;
211
212             /* load j atom coordinates */
213             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
215
216             /* Calculate displacement vector */
217             dx00             = _mm_sub_pd(ix0,jx0);
218             dy00             = _mm_sub_pd(iy0,jy0);
219             dz00             = _mm_sub_pd(iz0,jz0);
220             dx01             = _mm_sub_pd(ix0,jx1);
221             dy01             = _mm_sub_pd(iy0,jy1);
222             dz01             = _mm_sub_pd(iz0,jz1);
223             dx02             = _mm_sub_pd(ix0,jx2);
224             dy02             = _mm_sub_pd(iy0,jy2);
225             dz02             = _mm_sub_pd(iz0,jz2);
226             dx10             = _mm_sub_pd(ix1,jx0);
227             dy10             = _mm_sub_pd(iy1,jy0);
228             dz10             = _mm_sub_pd(iz1,jz0);
229             dx11             = _mm_sub_pd(ix1,jx1);
230             dy11             = _mm_sub_pd(iy1,jy1);
231             dz11             = _mm_sub_pd(iz1,jz1);
232             dx12             = _mm_sub_pd(ix1,jx2);
233             dy12             = _mm_sub_pd(iy1,jy2);
234             dz12             = _mm_sub_pd(iz1,jz2);
235             dx20             = _mm_sub_pd(ix2,jx0);
236             dy20             = _mm_sub_pd(iy2,jy0);
237             dz20             = _mm_sub_pd(iz2,jz0);
238             dx21             = _mm_sub_pd(ix2,jx1);
239             dy21             = _mm_sub_pd(iy2,jy1);
240             dz21             = _mm_sub_pd(iz2,jz1);
241             dx22             = _mm_sub_pd(ix2,jx2);
242             dy22             = _mm_sub_pd(iy2,jy2);
243             dz22             = _mm_sub_pd(iz2,jz2);
244
245             /* Calculate squared distance and things based on it */
246             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
247             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
248             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
249             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
250             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
251             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
252             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
253             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
254             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
255
256             rinv00           = gmx_mm_invsqrt_pd(rsq00);
257             rinv01           = gmx_mm_invsqrt_pd(rsq01);
258             rinv02           = gmx_mm_invsqrt_pd(rsq02);
259             rinv10           = gmx_mm_invsqrt_pd(rsq10);
260             rinv11           = gmx_mm_invsqrt_pd(rsq11);
261             rinv12           = gmx_mm_invsqrt_pd(rsq12);
262             rinv20           = gmx_mm_invsqrt_pd(rsq20);
263             rinv21           = gmx_mm_invsqrt_pd(rsq21);
264             rinv22           = gmx_mm_invsqrt_pd(rsq22);
265
266             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
267
268             fjx0             = _mm_setzero_pd();
269             fjy0             = _mm_setzero_pd();
270             fjz0             = _mm_setzero_pd();
271             fjx1             = _mm_setzero_pd();
272             fjy1             = _mm_setzero_pd();
273             fjz1             = _mm_setzero_pd();
274             fjx2             = _mm_setzero_pd();
275             fjy2             = _mm_setzero_pd();
276             fjz2             = _mm_setzero_pd();
277
278             /**************************
279              * CALCULATE INTERACTIONS *
280              **************************/
281
282             r00              = _mm_mul_pd(rsq00,rinv00);
283
284             /* Calculate table index by multiplying r with table scale and truncate to integer */
285             rt               = _mm_mul_pd(r00,vftabscale);
286             vfitab           = _mm_cvttpd_epi32(rt);
287             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
288             vfitab           = _mm_slli_epi32(vfitab,2);
289
290             /* CUBIC SPLINE TABLE ELECTROSTATICS */
291             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
292             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
293             GMX_MM_TRANSPOSE2_PD(Y,F);
294             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
295             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
296             GMX_MM_TRANSPOSE2_PD(G,H);
297             Heps             = _mm_mul_pd(vfeps,H);
298             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
299             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
300             velec            = _mm_mul_pd(qq00,VV);
301             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
302             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
303
304             /* LENNARD-JONES DISPERSION/REPULSION */
305
306             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
307             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
308             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
309             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
310             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
311
312             /* Update potential sum for this i atom from the interaction with this j atom. */
313             velecsum         = _mm_add_pd(velecsum,velec);
314             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
315
316             fscal            = _mm_add_pd(felec,fvdw);
317
318             /* Calculate temporary vectorial force */
319             tx               = _mm_mul_pd(fscal,dx00);
320             ty               = _mm_mul_pd(fscal,dy00);
321             tz               = _mm_mul_pd(fscal,dz00);
322
323             /* Update vectorial force */
324             fix0             = _mm_add_pd(fix0,tx);
325             fiy0             = _mm_add_pd(fiy0,ty);
326             fiz0             = _mm_add_pd(fiz0,tz);
327
328             fjx0             = _mm_add_pd(fjx0,tx);
329             fjy0             = _mm_add_pd(fjy0,ty);
330             fjz0             = _mm_add_pd(fjz0,tz);
331
332             /**************************
333              * CALCULATE INTERACTIONS *
334              **************************/
335
336             r01              = _mm_mul_pd(rsq01,rinv01);
337
338             /* Calculate table index by multiplying r with table scale and truncate to integer */
339             rt               = _mm_mul_pd(r01,vftabscale);
340             vfitab           = _mm_cvttpd_epi32(rt);
341             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
342             vfitab           = _mm_slli_epi32(vfitab,2);
343
344             /* CUBIC SPLINE TABLE ELECTROSTATICS */
345             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
346             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
347             GMX_MM_TRANSPOSE2_PD(Y,F);
348             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
349             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
350             GMX_MM_TRANSPOSE2_PD(G,H);
351             Heps             = _mm_mul_pd(vfeps,H);
352             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
353             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
354             velec            = _mm_mul_pd(qq01,VV);
355             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
356             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
357
358             /* Update potential sum for this i atom from the interaction with this j atom. */
359             velecsum         = _mm_add_pd(velecsum,velec);
360
361             fscal            = felec;
362
363             /* Calculate temporary vectorial force */
364             tx               = _mm_mul_pd(fscal,dx01);
365             ty               = _mm_mul_pd(fscal,dy01);
366             tz               = _mm_mul_pd(fscal,dz01);
367
368             /* Update vectorial force */
369             fix0             = _mm_add_pd(fix0,tx);
370             fiy0             = _mm_add_pd(fiy0,ty);
371             fiz0             = _mm_add_pd(fiz0,tz);
372
373             fjx1             = _mm_add_pd(fjx1,tx);
374             fjy1             = _mm_add_pd(fjy1,ty);
375             fjz1             = _mm_add_pd(fjz1,tz);
376
377             /**************************
378              * CALCULATE INTERACTIONS *
379              **************************/
380
381             r02              = _mm_mul_pd(rsq02,rinv02);
382
383             /* Calculate table index by multiplying r with table scale and truncate to integer */
384             rt               = _mm_mul_pd(r02,vftabscale);
385             vfitab           = _mm_cvttpd_epi32(rt);
386             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
387             vfitab           = _mm_slli_epi32(vfitab,2);
388
389             /* CUBIC SPLINE TABLE ELECTROSTATICS */
390             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
391             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
392             GMX_MM_TRANSPOSE2_PD(Y,F);
393             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
394             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
395             GMX_MM_TRANSPOSE2_PD(G,H);
396             Heps             = _mm_mul_pd(vfeps,H);
397             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
398             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
399             velec            = _mm_mul_pd(qq02,VV);
400             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
401             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
402
403             /* Update potential sum for this i atom from the interaction with this j atom. */
404             velecsum         = _mm_add_pd(velecsum,velec);
405
406             fscal            = felec;
407
408             /* Calculate temporary vectorial force */
409             tx               = _mm_mul_pd(fscal,dx02);
410             ty               = _mm_mul_pd(fscal,dy02);
411             tz               = _mm_mul_pd(fscal,dz02);
412
413             /* Update vectorial force */
414             fix0             = _mm_add_pd(fix0,tx);
415             fiy0             = _mm_add_pd(fiy0,ty);
416             fiz0             = _mm_add_pd(fiz0,tz);
417
418             fjx2             = _mm_add_pd(fjx2,tx);
419             fjy2             = _mm_add_pd(fjy2,ty);
420             fjz2             = _mm_add_pd(fjz2,tz);
421
422             /**************************
423              * CALCULATE INTERACTIONS *
424              **************************/
425
426             r10              = _mm_mul_pd(rsq10,rinv10);
427
428             /* Calculate table index by multiplying r with table scale and truncate to integer */
429             rt               = _mm_mul_pd(r10,vftabscale);
430             vfitab           = _mm_cvttpd_epi32(rt);
431             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
432             vfitab           = _mm_slli_epi32(vfitab,2);
433
434             /* CUBIC SPLINE TABLE ELECTROSTATICS */
435             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
436             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
437             GMX_MM_TRANSPOSE2_PD(Y,F);
438             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
439             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
440             GMX_MM_TRANSPOSE2_PD(G,H);
441             Heps             = _mm_mul_pd(vfeps,H);
442             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
443             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
444             velec            = _mm_mul_pd(qq10,VV);
445             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
446             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
447
448             /* Update potential sum for this i atom from the interaction with this j atom. */
449             velecsum         = _mm_add_pd(velecsum,velec);
450
451             fscal            = felec;
452
453             /* Calculate temporary vectorial force */
454             tx               = _mm_mul_pd(fscal,dx10);
455             ty               = _mm_mul_pd(fscal,dy10);
456             tz               = _mm_mul_pd(fscal,dz10);
457
458             /* Update vectorial force */
459             fix1             = _mm_add_pd(fix1,tx);
460             fiy1             = _mm_add_pd(fiy1,ty);
461             fiz1             = _mm_add_pd(fiz1,tz);
462
463             fjx0             = _mm_add_pd(fjx0,tx);
464             fjy0             = _mm_add_pd(fjy0,ty);
465             fjz0             = _mm_add_pd(fjz0,tz);
466
467             /**************************
468              * CALCULATE INTERACTIONS *
469              **************************/
470
471             r11              = _mm_mul_pd(rsq11,rinv11);
472
473             /* Calculate table index by multiplying r with table scale and truncate to integer */
474             rt               = _mm_mul_pd(r11,vftabscale);
475             vfitab           = _mm_cvttpd_epi32(rt);
476             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
477             vfitab           = _mm_slli_epi32(vfitab,2);
478
479             /* CUBIC SPLINE TABLE ELECTROSTATICS */
480             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
481             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
482             GMX_MM_TRANSPOSE2_PD(Y,F);
483             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
484             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
485             GMX_MM_TRANSPOSE2_PD(G,H);
486             Heps             = _mm_mul_pd(vfeps,H);
487             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
488             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
489             velec            = _mm_mul_pd(qq11,VV);
490             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
491             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
492
493             /* Update potential sum for this i atom from the interaction with this j atom. */
494             velecsum         = _mm_add_pd(velecsum,velec);
495
496             fscal            = felec;
497
498             /* Calculate temporary vectorial force */
499             tx               = _mm_mul_pd(fscal,dx11);
500             ty               = _mm_mul_pd(fscal,dy11);
501             tz               = _mm_mul_pd(fscal,dz11);
502
503             /* Update vectorial force */
504             fix1             = _mm_add_pd(fix1,tx);
505             fiy1             = _mm_add_pd(fiy1,ty);
506             fiz1             = _mm_add_pd(fiz1,tz);
507
508             fjx1             = _mm_add_pd(fjx1,tx);
509             fjy1             = _mm_add_pd(fjy1,ty);
510             fjz1             = _mm_add_pd(fjz1,tz);
511
512             /**************************
513              * CALCULATE INTERACTIONS *
514              **************************/
515
516             r12              = _mm_mul_pd(rsq12,rinv12);
517
518             /* Calculate table index by multiplying r with table scale and truncate to integer */
519             rt               = _mm_mul_pd(r12,vftabscale);
520             vfitab           = _mm_cvttpd_epi32(rt);
521             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
522             vfitab           = _mm_slli_epi32(vfitab,2);
523
524             /* CUBIC SPLINE TABLE ELECTROSTATICS */
525             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
526             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
527             GMX_MM_TRANSPOSE2_PD(Y,F);
528             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
529             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
530             GMX_MM_TRANSPOSE2_PD(G,H);
531             Heps             = _mm_mul_pd(vfeps,H);
532             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
533             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
534             velec            = _mm_mul_pd(qq12,VV);
535             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
536             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
537
538             /* Update potential sum for this i atom from the interaction with this j atom. */
539             velecsum         = _mm_add_pd(velecsum,velec);
540
541             fscal            = felec;
542
543             /* Calculate temporary vectorial force */
544             tx               = _mm_mul_pd(fscal,dx12);
545             ty               = _mm_mul_pd(fscal,dy12);
546             tz               = _mm_mul_pd(fscal,dz12);
547
548             /* Update vectorial force */
549             fix1             = _mm_add_pd(fix1,tx);
550             fiy1             = _mm_add_pd(fiy1,ty);
551             fiz1             = _mm_add_pd(fiz1,tz);
552
553             fjx2             = _mm_add_pd(fjx2,tx);
554             fjy2             = _mm_add_pd(fjy2,ty);
555             fjz2             = _mm_add_pd(fjz2,tz);
556
557             /**************************
558              * CALCULATE INTERACTIONS *
559              **************************/
560
561             r20              = _mm_mul_pd(rsq20,rinv20);
562
563             /* Calculate table index by multiplying r with table scale and truncate to integer */
564             rt               = _mm_mul_pd(r20,vftabscale);
565             vfitab           = _mm_cvttpd_epi32(rt);
566             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
567             vfitab           = _mm_slli_epi32(vfitab,2);
568
569             /* CUBIC SPLINE TABLE ELECTROSTATICS */
570             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
571             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
572             GMX_MM_TRANSPOSE2_PD(Y,F);
573             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
574             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
575             GMX_MM_TRANSPOSE2_PD(G,H);
576             Heps             = _mm_mul_pd(vfeps,H);
577             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
578             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
579             velec            = _mm_mul_pd(qq20,VV);
580             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
581             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
582
583             /* Update potential sum for this i atom from the interaction with this j atom. */
584             velecsum         = _mm_add_pd(velecsum,velec);
585
586             fscal            = felec;
587
588             /* Calculate temporary vectorial force */
589             tx               = _mm_mul_pd(fscal,dx20);
590             ty               = _mm_mul_pd(fscal,dy20);
591             tz               = _mm_mul_pd(fscal,dz20);
592
593             /* Update vectorial force */
594             fix2             = _mm_add_pd(fix2,tx);
595             fiy2             = _mm_add_pd(fiy2,ty);
596             fiz2             = _mm_add_pd(fiz2,tz);
597
598             fjx0             = _mm_add_pd(fjx0,tx);
599             fjy0             = _mm_add_pd(fjy0,ty);
600             fjz0             = _mm_add_pd(fjz0,tz);
601
602             /**************************
603              * CALCULATE INTERACTIONS *
604              **************************/
605
606             r21              = _mm_mul_pd(rsq21,rinv21);
607
608             /* Calculate table index by multiplying r with table scale and truncate to integer */
609             rt               = _mm_mul_pd(r21,vftabscale);
610             vfitab           = _mm_cvttpd_epi32(rt);
611             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
612             vfitab           = _mm_slli_epi32(vfitab,2);
613
614             /* CUBIC SPLINE TABLE ELECTROSTATICS */
615             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
616             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
617             GMX_MM_TRANSPOSE2_PD(Y,F);
618             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
619             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
620             GMX_MM_TRANSPOSE2_PD(G,H);
621             Heps             = _mm_mul_pd(vfeps,H);
622             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
623             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
624             velec            = _mm_mul_pd(qq21,VV);
625             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
626             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
627
628             /* Update potential sum for this i atom from the interaction with this j atom. */
629             velecsum         = _mm_add_pd(velecsum,velec);
630
631             fscal            = felec;
632
633             /* Calculate temporary vectorial force */
634             tx               = _mm_mul_pd(fscal,dx21);
635             ty               = _mm_mul_pd(fscal,dy21);
636             tz               = _mm_mul_pd(fscal,dz21);
637
638             /* Update vectorial force */
639             fix2             = _mm_add_pd(fix2,tx);
640             fiy2             = _mm_add_pd(fiy2,ty);
641             fiz2             = _mm_add_pd(fiz2,tz);
642
643             fjx1             = _mm_add_pd(fjx1,tx);
644             fjy1             = _mm_add_pd(fjy1,ty);
645             fjz1             = _mm_add_pd(fjz1,tz);
646
647             /**************************
648              * CALCULATE INTERACTIONS *
649              **************************/
650
651             r22              = _mm_mul_pd(rsq22,rinv22);
652
653             /* Calculate table index by multiplying r with table scale and truncate to integer */
654             rt               = _mm_mul_pd(r22,vftabscale);
655             vfitab           = _mm_cvttpd_epi32(rt);
656             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
657             vfitab           = _mm_slli_epi32(vfitab,2);
658
659             /* CUBIC SPLINE TABLE ELECTROSTATICS */
660             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
661             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
662             GMX_MM_TRANSPOSE2_PD(Y,F);
663             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
664             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
665             GMX_MM_TRANSPOSE2_PD(G,H);
666             Heps             = _mm_mul_pd(vfeps,H);
667             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
668             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
669             velec            = _mm_mul_pd(qq22,VV);
670             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
671             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
672
673             /* Update potential sum for this i atom from the interaction with this j atom. */
674             velecsum         = _mm_add_pd(velecsum,velec);
675
676             fscal            = felec;
677
678             /* Calculate temporary vectorial force */
679             tx               = _mm_mul_pd(fscal,dx22);
680             ty               = _mm_mul_pd(fscal,dy22);
681             tz               = _mm_mul_pd(fscal,dz22);
682
683             /* Update vectorial force */
684             fix2             = _mm_add_pd(fix2,tx);
685             fiy2             = _mm_add_pd(fiy2,ty);
686             fiz2             = _mm_add_pd(fiz2,tz);
687
688             fjx2             = _mm_add_pd(fjx2,tx);
689             fjy2             = _mm_add_pd(fjy2,ty);
690             fjz2             = _mm_add_pd(fjz2,tz);
691
692             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
693
694             /* Inner loop uses 400 flops */
695         }
696
697         if(jidx<j_index_end)
698         {
699
700             jnrA             = jjnr[jidx];
701             j_coord_offsetA  = DIM*jnrA;
702
703             /* load j atom coordinates */
704             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
705                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
706
707             /* Calculate displacement vector */
708             dx00             = _mm_sub_pd(ix0,jx0);
709             dy00             = _mm_sub_pd(iy0,jy0);
710             dz00             = _mm_sub_pd(iz0,jz0);
711             dx01             = _mm_sub_pd(ix0,jx1);
712             dy01             = _mm_sub_pd(iy0,jy1);
713             dz01             = _mm_sub_pd(iz0,jz1);
714             dx02             = _mm_sub_pd(ix0,jx2);
715             dy02             = _mm_sub_pd(iy0,jy2);
716             dz02             = _mm_sub_pd(iz0,jz2);
717             dx10             = _mm_sub_pd(ix1,jx0);
718             dy10             = _mm_sub_pd(iy1,jy0);
719             dz10             = _mm_sub_pd(iz1,jz0);
720             dx11             = _mm_sub_pd(ix1,jx1);
721             dy11             = _mm_sub_pd(iy1,jy1);
722             dz11             = _mm_sub_pd(iz1,jz1);
723             dx12             = _mm_sub_pd(ix1,jx2);
724             dy12             = _mm_sub_pd(iy1,jy2);
725             dz12             = _mm_sub_pd(iz1,jz2);
726             dx20             = _mm_sub_pd(ix2,jx0);
727             dy20             = _mm_sub_pd(iy2,jy0);
728             dz20             = _mm_sub_pd(iz2,jz0);
729             dx21             = _mm_sub_pd(ix2,jx1);
730             dy21             = _mm_sub_pd(iy2,jy1);
731             dz21             = _mm_sub_pd(iz2,jz1);
732             dx22             = _mm_sub_pd(ix2,jx2);
733             dy22             = _mm_sub_pd(iy2,jy2);
734             dz22             = _mm_sub_pd(iz2,jz2);
735
736             /* Calculate squared distance and things based on it */
737             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
738             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
739             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
740             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
741             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
742             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
743             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
744             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
745             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
746
747             rinv00           = gmx_mm_invsqrt_pd(rsq00);
748             rinv01           = gmx_mm_invsqrt_pd(rsq01);
749             rinv02           = gmx_mm_invsqrt_pd(rsq02);
750             rinv10           = gmx_mm_invsqrt_pd(rsq10);
751             rinv11           = gmx_mm_invsqrt_pd(rsq11);
752             rinv12           = gmx_mm_invsqrt_pd(rsq12);
753             rinv20           = gmx_mm_invsqrt_pd(rsq20);
754             rinv21           = gmx_mm_invsqrt_pd(rsq21);
755             rinv22           = gmx_mm_invsqrt_pd(rsq22);
756
757             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
758
759             fjx0             = _mm_setzero_pd();
760             fjy0             = _mm_setzero_pd();
761             fjz0             = _mm_setzero_pd();
762             fjx1             = _mm_setzero_pd();
763             fjy1             = _mm_setzero_pd();
764             fjz1             = _mm_setzero_pd();
765             fjx2             = _mm_setzero_pd();
766             fjy2             = _mm_setzero_pd();
767             fjz2             = _mm_setzero_pd();
768
769             /**************************
770              * CALCULATE INTERACTIONS *
771              **************************/
772
773             r00              = _mm_mul_pd(rsq00,rinv00);
774
775             /* Calculate table index by multiplying r with table scale and truncate to integer */
776             rt               = _mm_mul_pd(r00,vftabscale);
777             vfitab           = _mm_cvttpd_epi32(rt);
778             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
779             vfitab           = _mm_slli_epi32(vfitab,2);
780
781             /* CUBIC SPLINE TABLE ELECTROSTATICS */
782             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
783             F                = _mm_setzero_pd();
784             GMX_MM_TRANSPOSE2_PD(Y,F);
785             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
786             H                = _mm_setzero_pd();
787             GMX_MM_TRANSPOSE2_PD(G,H);
788             Heps             = _mm_mul_pd(vfeps,H);
789             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
790             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
791             velec            = _mm_mul_pd(qq00,VV);
792             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
793             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
794
795             /* LENNARD-JONES DISPERSION/REPULSION */
796
797             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
798             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
799             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
800             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
801             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
802
803             /* Update potential sum for this i atom from the interaction with this j atom. */
804             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
805             velecsum         = _mm_add_pd(velecsum,velec);
806             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
807             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
808
809             fscal            = _mm_add_pd(felec,fvdw);
810
811             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
812
813             /* Calculate temporary vectorial force */
814             tx               = _mm_mul_pd(fscal,dx00);
815             ty               = _mm_mul_pd(fscal,dy00);
816             tz               = _mm_mul_pd(fscal,dz00);
817
818             /* Update vectorial force */
819             fix0             = _mm_add_pd(fix0,tx);
820             fiy0             = _mm_add_pd(fiy0,ty);
821             fiz0             = _mm_add_pd(fiz0,tz);
822
823             fjx0             = _mm_add_pd(fjx0,tx);
824             fjy0             = _mm_add_pd(fjy0,ty);
825             fjz0             = _mm_add_pd(fjz0,tz);
826
827             /**************************
828              * CALCULATE INTERACTIONS *
829              **************************/
830
831             r01              = _mm_mul_pd(rsq01,rinv01);
832
833             /* Calculate table index by multiplying r with table scale and truncate to integer */
834             rt               = _mm_mul_pd(r01,vftabscale);
835             vfitab           = _mm_cvttpd_epi32(rt);
836             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
837             vfitab           = _mm_slli_epi32(vfitab,2);
838
839             /* CUBIC SPLINE TABLE ELECTROSTATICS */
840             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
841             F                = _mm_setzero_pd();
842             GMX_MM_TRANSPOSE2_PD(Y,F);
843             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
844             H                = _mm_setzero_pd();
845             GMX_MM_TRANSPOSE2_PD(G,H);
846             Heps             = _mm_mul_pd(vfeps,H);
847             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
848             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
849             velec            = _mm_mul_pd(qq01,VV);
850             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
851             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
852
853             /* Update potential sum for this i atom from the interaction with this j atom. */
854             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
855             velecsum         = _mm_add_pd(velecsum,velec);
856
857             fscal            = felec;
858
859             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
860
861             /* Calculate temporary vectorial force */
862             tx               = _mm_mul_pd(fscal,dx01);
863             ty               = _mm_mul_pd(fscal,dy01);
864             tz               = _mm_mul_pd(fscal,dz01);
865
866             /* Update vectorial force */
867             fix0             = _mm_add_pd(fix0,tx);
868             fiy0             = _mm_add_pd(fiy0,ty);
869             fiz0             = _mm_add_pd(fiz0,tz);
870
871             fjx1             = _mm_add_pd(fjx1,tx);
872             fjy1             = _mm_add_pd(fjy1,ty);
873             fjz1             = _mm_add_pd(fjz1,tz);
874
875             /**************************
876              * CALCULATE INTERACTIONS *
877              **************************/
878
879             r02              = _mm_mul_pd(rsq02,rinv02);
880
881             /* Calculate table index by multiplying r with table scale and truncate to integer */
882             rt               = _mm_mul_pd(r02,vftabscale);
883             vfitab           = _mm_cvttpd_epi32(rt);
884             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
885             vfitab           = _mm_slli_epi32(vfitab,2);
886
887             /* CUBIC SPLINE TABLE ELECTROSTATICS */
888             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
889             F                = _mm_setzero_pd();
890             GMX_MM_TRANSPOSE2_PD(Y,F);
891             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
892             H                = _mm_setzero_pd();
893             GMX_MM_TRANSPOSE2_PD(G,H);
894             Heps             = _mm_mul_pd(vfeps,H);
895             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
896             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
897             velec            = _mm_mul_pd(qq02,VV);
898             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
899             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
900
901             /* Update potential sum for this i atom from the interaction with this j atom. */
902             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
903             velecsum         = _mm_add_pd(velecsum,velec);
904
905             fscal            = felec;
906
907             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
908
909             /* Calculate temporary vectorial force */
910             tx               = _mm_mul_pd(fscal,dx02);
911             ty               = _mm_mul_pd(fscal,dy02);
912             tz               = _mm_mul_pd(fscal,dz02);
913
914             /* Update vectorial force */
915             fix0             = _mm_add_pd(fix0,tx);
916             fiy0             = _mm_add_pd(fiy0,ty);
917             fiz0             = _mm_add_pd(fiz0,tz);
918
919             fjx2             = _mm_add_pd(fjx2,tx);
920             fjy2             = _mm_add_pd(fjy2,ty);
921             fjz2             = _mm_add_pd(fjz2,tz);
922
923             /**************************
924              * CALCULATE INTERACTIONS *
925              **************************/
926
927             r10              = _mm_mul_pd(rsq10,rinv10);
928
929             /* Calculate table index by multiplying r with table scale and truncate to integer */
930             rt               = _mm_mul_pd(r10,vftabscale);
931             vfitab           = _mm_cvttpd_epi32(rt);
932             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
933             vfitab           = _mm_slli_epi32(vfitab,2);
934
935             /* CUBIC SPLINE TABLE ELECTROSTATICS */
936             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
937             F                = _mm_setzero_pd();
938             GMX_MM_TRANSPOSE2_PD(Y,F);
939             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
940             H                = _mm_setzero_pd();
941             GMX_MM_TRANSPOSE2_PD(G,H);
942             Heps             = _mm_mul_pd(vfeps,H);
943             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
944             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
945             velec            = _mm_mul_pd(qq10,VV);
946             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
947             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
948
949             /* Update potential sum for this i atom from the interaction with this j atom. */
950             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
951             velecsum         = _mm_add_pd(velecsum,velec);
952
953             fscal            = felec;
954
955             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
956
957             /* Calculate temporary vectorial force */
958             tx               = _mm_mul_pd(fscal,dx10);
959             ty               = _mm_mul_pd(fscal,dy10);
960             tz               = _mm_mul_pd(fscal,dz10);
961
962             /* Update vectorial force */
963             fix1             = _mm_add_pd(fix1,tx);
964             fiy1             = _mm_add_pd(fiy1,ty);
965             fiz1             = _mm_add_pd(fiz1,tz);
966
967             fjx0             = _mm_add_pd(fjx0,tx);
968             fjy0             = _mm_add_pd(fjy0,ty);
969             fjz0             = _mm_add_pd(fjz0,tz);
970
971             /**************************
972              * CALCULATE INTERACTIONS *
973              **************************/
974
975             r11              = _mm_mul_pd(rsq11,rinv11);
976
977             /* Calculate table index by multiplying r with table scale and truncate to integer */
978             rt               = _mm_mul_pd(r11,vftabscale);
979             vfitab           = _mm_cvttpd_epi32(rt);
980             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
981             vfitab           = _mm_slli_epi32(vfitab,2);
982
983             /* CUBIC SPLINE TABLE ELECTROSTATICS */
984             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
985             F                = _mm_setzero_pd();
986             GMX_MM_TRANSPOSE2_PD(Y,F);
987             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
988             H                = _mm_setzero_pd();
989             GMX_MM_TRANSPOSE2_PD(G,H);
990             Heps             = _mm_mul_pd(vfeps,H);
991             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
992             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
993             velec            = _mm_mul_pd(qq11,VV);
994             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
995             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
996
997             /* Update potential sum for this i atom from the interaction with this j atom. */
998             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
999             velecsum         = _mm_add_pd(velecsum,velec);
1000
1001             fscal            = felec;
1002
1003             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1004
1005             /* Calculate temporary vectorial force */
1006             tx               = _mm_mul_pd(fscal,dx11);
1007             ty               = _mm_mul_pd(fscal,dy11);
1008             tz               = _mm_mul_pd(fscal,dz11);
1009
1010             /* Update vectorial force */
1011             fix1             = _mm_add_pd(fix1,tx);
1012             fiy1             = _mm_add_pd(fiy1,ty);
1013             fiz1             = _mm_add_pd(fiz1,tz);
1014
1015             fjx1             = _mm_add_pd(fjx1,tx);
1016             fjy1             = _mm_add_pd(fjy1,ty);
1017             fjz1             = _mm_add_pd(fjz1,tz);
1018
1019             /**************************
1020              * CALCULATE INTERACTIONS *
1021              **************************/
1022
1023             r12              = _mm_mul_pd(rsq12,rinv12);
1024
1025             /* Calculate table index by multiplying r with table scale and truncate to integer */
1026             rt               = _mm_mul_pd(r12,vftabscale);
1027             vfitab           = _mm_cvttpd_epi32(rt);
1028             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1029             vfitab           = _mm_slli_epi32(vfitab,2);
1030
1031             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1032             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1033             F                = _mm_setzero_pd();
1034             GMX_MM_TRANSPOSE2_PD(Y,F);
1035             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1036             H                = _mm_setzero_pd();
1037             GMX_MM_TRANSPOSE2_PD(G,H);
1038             Heps             = _mm_mul_pd(vfeps,H);
1039             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1040             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1041             velec            = _mm_mul_pd(qq12,VV);
1042             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1043             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1044
1045             /* Update potential sum for this i atom from the interaction with this j atom. */
1046             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1047             velecsum         = _mm_add_pd(velecsum,velec);
1048
1049             fscal            = felec;
1050
1051             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1052
1053             /* Calculate temporary vectorial force */
1054             tx               = _mm_mul_pd(fscal,dx12);
1055             ty               = _mm_mul_pd(fscal,dy12);
1056             tz               = _mm_mul_pd(fscal,dz12);
1057
1058             /* Update vectorial force */
1059             fix1             = _mm_add_pd(fix1,tx);
1060             fiy1             = _mm_add_pd(fiy1,ty);
1061             fiz1             = _mm_add_pd(fiz1,tz);
1062
1063             fjx2             = _mm_add_pd(fjx2,tx);
1064             fjy2             = _mm_add_pd(fjy2,ty);
1065             fjz2             = _mm_add_pd(fjz2,tz);
1066
1067             /**************************
1068              * CALCULATE INTERACTIONS *
1069              **************************/
1070
1071             r20              = _mm_mul_pd(rsq20,rinv20);
1072
1073             /* Calculate table index by multiplying r with table scale and truncate to integer */
1074             rt               = _mm_mul_pd(r20,vftabscale);
1075             vfitab           = _mm_cvttpd_epi32(rt);
1076             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1077             vfitab           = _mm_slli_epi32(vfitab,2);
1078
1079             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1080             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1081             F                = _mm_setzero_pd();
1082             GMX_MM_TRANSPOSE2_PD(Y,F);
1083             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1084             H                = _mm_setzero_pd();
1085             GMX_MM_TRANSPOSE2_PD(G,H);
1086             Heps             = _mm_mul_pd(vfeps,H);
1087             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1088             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1089             velec            = _mm_mul_pd(qq20,VV);
1090             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1091             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1092
1093             /* Update potential sum for this i atom from the interaction with this j atom. */
1094             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1095             velecsum         = _mm_add_pd(velecsum,velec);
1096
1097             fscal            = felec;
1098
1099             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1100
1101             /* Calculate temporary vectorial force */
1102             tx               = _mm_mul_pd(fscal,dx20);
1103             ty               = _mm_mul_pd(fscal,dy20);
1104             tz               = _mm_mul_pd(fscal,dz20);
1105
1106             /* Update vectorial force */
1107             fix2             = _mm_add_pd(fix2,tx);
1108             fiy2             = _mm_add_pd(fiy2,ty);
1109             fiz2             = _mm_add_pd(fiz2,tz);
1110
1111             fjx0             = _mm_add_pd(fjx0,tx);
1112             fjy0             = _mm_add_pd(fjy0,ty);
1113             fjz0             = _mm_add_pd(fjz0,tz);
1114
1115             /**************************
1116              * CALCULATE INTERACTIONS *
1117              **************************/
1118
1119             r21              = _mm_mul_pd(rsq21,rinv21);
1120
1121             /* Calculate table index by multiplying r with table scale and truncate to integer */
1122             rt               = _mm_mul_pd(r21,vftabscale);
1123             vfitab           = _mm_cvttpd_epi32(rt);
1124             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1125             vfitab           = _mm_slli_epi32(vfitab,2);
1126
1127             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1128             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1129             F                = _mm_setzero_pd();
1130             GMX_MM_TRANSPOSE2_PD(Y,F);
1131             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1132             H                = _mm_setzero_pd();
1133             GMX_MM_TRANSPOSE2_PD(G,H);
1134             Heps             = _mm_mul_pd(vfeps,H);
1135             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1136             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1137             velec            = _mm_mul_pd(qq21,VV);
1138             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1139             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1140
1141             /* Update potential sum for this i atom from the interaction with this j atom. */
1142             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1143             velecsum         = _mm_add_pd(velecsum,velec);
1144
1145             fscal            = felec;
1146
1147             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1148
1149             /* Calculate temporary vectorial force */
1150             tx               = _mm_mul_pd(fscal,dx21);
1151             ty               = _mm_mul_pd(fscal,dy21);
1152             tz               = _mm_mul_pd(fscal,dz21);
1153
1154             /* Update vectorial force */
1155             fix2             = _mm_add_pd(fix2,tx);
1156             fiy2             = _mm_add_pd(fiy2,ty);
1157             fiz2             = _mm_add_pd(fiz2,tz);
1158
1159             fjx1             = _mm_add_pd(fjx1,tx);
1160             fjy1             = _mm_add_pd(fjy1,ty);
1161             fjz1             = _mm_add_pd(fjz1,tz);
1162
1163             /**************************
1164              * CALCULATE INTERACTIONS *
1165              **************************/
1166
1167             r22              = _mm_mul_pd(rsq22,rinv22);
1168
1169             /* Calculate table index by multiplying r with table scale and truncate to integer */
1170             rt               = _mm_mul_pd(r22,vftabscale);
1171             vfitab           = _mm_cvttpd_epi32(rt);
1172             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1173             vfitab           = _mm_slli_epi32(vfitab,2);
1174
1175             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1176             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1177             F                = _mm_setzero_pd();
1178             GMX_MM_TRANSPOSE2_PD(Y,F);
1179             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1180             H                = _mm_setzero_pd();
1181             GMX_MM_TRANSPOSE2_PD(G,H);
1182             Heps             = _mm_mul_pd(vfeps,H);
1183             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1184             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1185             velec            = _mm_mul_pd(qq22,VV);
1186             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1187             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1188
1189             /* Update potential sum for this i atom from the interaction with this j atom. */
1190             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1191             velecsum         = _mm_add_pd(velecsum,velec);
1192
1193             fscal            = felec;
1194
1195             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1196
1197             /* Calculate temporary vectorial force */
1198             tx               = _mm_mul_pd(fscal,dx22);
1199             ty               = _mm_mul_pd(fscal,dy22);
1200             tz               = _mm_mul_pd(fscal,dz22);
1201
1202             /* Update vectorial force */
1203             fix2             = _mm_add_pd(fix2,tx);
1204             fiy2             = _mm_add_pd(fiy2,ty);
1205             fiz2             = _mm_add_pd(fiz2,tz);
1206
1207             fjx2             = _mm_add_pd(fjx2,tx);
1208             fjy2             = _mm_add_pd(fjy2,ty);
1209             fjz2             = _mm_add_pd(fjz2,tz);
1210
1211             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1212
1213             /* Inner loop uses 400 flops */
1214         }
1215
1216         /* End of innermost loop */
1217
1218         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1219                                               f+i_coord_offset,fshift+i_shift_offset);
1220
1221         ggid                        = gid[iidx];
1222         /* Update potential energies */
1223         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1224         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1225
1226         /* Increment number of inner iterations */
1227         inneriter                  += j_index_end - j_index_start;
1228
1229         /* Outer loop uses 20 flops */
1230     }
1231
1232     /* Increment number of outer iterations */
1233     outeriter        += nri;
1234
1235     /* Update outer/inner flops */
1236
1237     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*400);
1238 }
1239 /*
1240  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_double
1241  * Electrostatics interaction: CubicSplineTable
1242  * VdW interaction:            LennardJones
1243  * Geometry:                   Water3-Water3
1244  * Calculate force/pot:        Force
1245  */
1246 void
1247 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_double
1248                     (t_nblist                    * gmx_restrict       nlist,
1249                      rvec                        * gmx_restrict          xx,
1250                      rvec                        * gmx_restrict          ff,
1251                      t_forcerec                  * gmx_restrict          fr,
1252                      t_mdatoms                   * gmx_restrict     mdatoms,
1253                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1254                      t_nrnb                      * gmx_restrict        nrnb)
1255 {
1256     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1257      * just 0 for non-waters.
1258      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1259      * jnr indices corresponding to data put in the four positions in the SIMD register.
1260      */
1261     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1262     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1263     int              jnrA,jnrB;
1264     int              j_coord_offsetA,j_coord_offsetB;
1265     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1266     real             rcutoff_scalar;
1267     real             *shiftvec,*fshift,*x,*f;
1268     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1269     int              vdwioffset0;
1270     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1271     int              vdwioffset1;
1272     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1273     int              vdwioffset2;
1274     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1275     int              vdwjidx0A,vdwjidx0B;
1276     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1277     int              vdwjidx1A,vdwjidx1B;
1278     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1279     int              vdwjidx2A,vdwjidx2B;
1280     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1281     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1282     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1283     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1284     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1285     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1286     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1287     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1288     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1289     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1290     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1291     real             *charge;
1292     int              nvdwtype;
1293     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1294     int              *vdwtype;
1295     real             *vdwparam;
1296     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1297     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1298     __m128i          vfitab;
1299     __m128i          ifour       = _mm_set1_epi32(4);
1300     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1301     real             *vftab;
1302     __m128d          dummy_mask,cutoff_mask;
1303     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1304     __m128d          one     = _mm_set1_pd(1.0);
1305     __m128d          two     = _mm_set1_pd(2.0);
1306     x                = xx[0];
1307     f                = ff[0];
1308
1309     nri              = nlist->nri;
1310     iinr             = nlist->iinr;
1311     jindex           = nlist->jindex;
1312     jjnr             = nlist->jjnr;
1313     shiftidx         = nlist->shift;
1314     gid              = nlist->gid;
1315     shiftvec         = fr->shift_vec[0];
1316     fshift           = fr->fshift[0];
1317     facel            = _mm_set1_pd(fr->epsfac);
1318     charge           = mdatoms->chargeA;
1319     nvdwtype         = fr->ntype;
1320     vdwparam         = fr->nbfp;
1321     vdwtype          = mdatoms->typeA;
1322
1323     vftab            = kernel_data->table_elec->data;
1324     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
1325
1326     /* Setup water-specific parameters */
1327     inr              = nlist->iinr[0];
1328     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1329     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1330     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1331     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1332
1333     jq0              = _mm_set1_pd(charge[inr+0]);
1334     jq1              = _mm_set1_pd(charge[inr+1]);
1335     jq2              = _mm_set1_pd(charge[inr+2]);
1336     vdwjidx0A        = 2*vdwtype[inr+0];
1337     qq00             = _mm_mul_pd(iq0,jq0);
1338     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1339     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1340     qq01             = _mm_mul_pd(iq0,jq1);
1341     qq02             = _mm_mul_pd(iq0,jq2);
1342     qq10             = _mm_mul_pd(iq1,jq0);
1343     qq11             = _mm_mul_pd(iq1,jq1);
1344     qq12             = _mm_mul_pd(iq1,jq2);
1345     qq20             = _mm_mul_pd(iq2,jq0);
1346     qq21             = _mm_mul_pd(iq2,jq1);
1347     qq22             = _mm_mul_pd(iq2,jq2);
1348
1349     /* Avoid stupid compiler warnings */
1350     jnrA = jnrB = 0;
1351     j_coord_offsetA = 0;
1352     j_coord_offsetB = 0;
1353
1354     outeriter        = 0;
1355     inneriter        = 0;
1356
1357     /* Start outer loop over neighborlists */
1358     for(iidx=0; iidx<nri; iidx++)
1359     {
1360         /* Load shift vector for this list */
1361         i_shift_offset   = DIM*shiftidx[iidx];
1362
1363         /* Load limits for loop over neighbors */
1364         j_index_start    = jindex[iidx];
1365         j_index_end      = jindex[iidx+1];
1366
1367         /* Get outer coordinate index */
1368         inr              = iinr[iidx];
1369         i_coord_offset   = DIM*inr;
1370
1371         /* Load i particle coords and add shift vector */
1372         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1373                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1374
1375         fix0             = _mm_setzero_pd();
1376         fiy0             = _mm_setzero_pd();
1377         fiz0             = _mm_setzero_pd();
1378         fix1             = _mm_setzero_pd();
1379         fiy1             = _mm_setzero_pd();
1380         fiz1             = _mm_setzero_pd();
1381         fix2             = _mm_setzero_pd();
1382         fiy2             = _mm_setzero_pd();
1383         fiz2             = _mm_setzero_pd();
1384
1385         /* Start inner kernel loop */
1386         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1387         {
1388
1389             /* Get j neighbor index, and coordinate index */
1390             jnrA             = jjnr[jidx];
1391             jnrB             = jjnr[jidx+1];
1392             j_coord_offsetA  = DIM*jnrA;
1393             j_coord_offsetB  = DIM*jnrB;
1394
1395             /* load j atom coordinates */
1396             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1397                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1398
1399             /* Calculate displacement vector */
1400             dx00             = _mm_sub_pd(ix0,jx0);
1401             dy00             = _mm_sub_pd(iy0,jy0);
1402             dz00             = _mm_sub_pd(iz0,jz0);
1403             dx01             = _mm_sub_pd(ix0,jx1);
1404             dy01             = _mm_sub_pd(iy0,jy1);
1405             dz01             = _mm_sub_pd(iz0,jz1);
1406             dx02             = _mm_sub_pd(ix0,jx2);
1407             dy02             = _mm_sub_pd(iy0,jy2);
1408             dz02             = _mm_sub_pd(iz0,jz2);
1409             dx10             = _mm_sub_pd(ix1,jx0);
1410             dy10             = _mm_sub_pd(iy1,jy0);
1411             dz10             = _mm_sub_pd(iz1,jz0);
1412             dx11             = _mm_sub_pd(ix1,jx1);
1413             dy11             = _mm_sub_pd(iy1,jy1);
1414             dz11             = _mm_sub_pd(iz1,jz1);
1415             dx12             = _mm_sub_pd(ix1,jx2);
1416             dy12             = _mm_sub_pd(iy1,jy2);
1417             dz12             = _mm_sub_pd(iz1,jz2);
1418             dx20             = _mm_sub_pd(ix2,jx0);
1419             dy20             = _mm_sub_pd(iy2,jy0);
1420             dz20             = _mm_sub_pd(iz2,jz0);
1421             dx21             = _mm_sub_pd(ix2,jx1);
1422             dy21             = _mm_sub_pd(iy2,jy1);
1423             dz21             = _mm_sub_pd(iz2,jz1);
1424             dx22             = _mm_sub_pd(ix2,jx2);
1425             dy22             = _mm_sub_pd(iy2,jy2);
1426             dz22             = _mm_sub_pd(iz2,jz2);
1427
1428             /* Calculate squared distance and things based on it */
1429             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1430             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1431             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1432             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1433             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1434             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1435             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1436             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1437             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1438
1439             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1440             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1441             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1442             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1443             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1444             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1445             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1446             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1447             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1448
1449             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1450
1451             fjx0             = _mm_setzero_pd();
1452             fjy0             = _mm_setzero_pd();
1453             fjz0             = _mm_setzero_pd();
1454             fjx1             = _mm_setzero_pd();
1455             fjy1             = _mm_setzero_pd();
1456             fjz1             = _mm_setzero_pd();
1457             fjx2             = _mm_setzero_pd();
1458             fjy2             = _mm_setzero_pd();
1459             fjz2             = _mm_setzero_pd();
1460
1461             /**************************
1462              * CALCULATE INTERACTIONS *
1463              **************************/
1464
1465             r00              = _mm_mul_pd(rsq00,rinv00);
1466
1467             /* Calculate table index by multiplying r with table scale and truncate to integer */
1468             rt               = _mm_mul_pd(r00,vftabscale);
1469             vfitab           = _mm_cvttpd_epi32(rt);
1470             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1471             vfitab           = _mm_slli_epi32(vfitab,2);
1472
1473             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1474             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1475             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1476             GMX_MM_TRANSPOSE2_PD(Y,F);
1477             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1478             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1479             GMX_MM_TRANSPOSE2_PD(G,H);
1480             Heps             = _mm_mul_pd(vfeps,H);
1481             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1482             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1483             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1484
1485             /* LENNARD-JONES DISPERSION/REPULSION */
1486
1487             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1488             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1489
1490             fscal            = _mm_add_pd(felec,fvdw);
1491
1492             /* Calculate temporary vectorial force */
1493             tx               = _mm_mul_pd(fscal,dx00);
1494             ty               = _mm_mul_pd(fscal,dy00);
1495             tz               = _mm_mul_pd(fscal,dz00);
1496
1497             /* Update vectorial force */
1498             fix0             = _mm_add_pd(fix0,tx);
1499             fiy0             = _mm_add_pd(fiy0,ty);
1500             fiz0             = _mm_add_pd(fiz0,tz);
1501
1502             fjx0             = _mm_add_pd(fjx0,tx);
1503             fjy0             = _mm_add_pd(fjy0,ty);
1504             fjz0             = _mm_add_pd(fjz0,tz);
1505
1506             /**************************
1507              * CALCULATE INTERACTIONS *
1508              **************************/
1509
1510             r01              = _mm_mul_pd(rsq01,rinv01);
1511
1512             /* Calculate table index by multiplying r with table scale and truncate to integer */
1513             rt               = _mm_mul_pd(r01,vftabscale);
1514             vfitab           = _mm_cvttpd_epi32(rt);
1515             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1516             vfitab           = _mm_slli_epi32(vfitab,2);
1517
1518             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1519             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1520             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1521             GMX_MM_TRANSPOSE2_PD(Y,F);
1522             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1523             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1524             GMX_MM_TRANSPOSE2_PD(G,H);
1525             Heps             = _mm_mul_pd(vfeps,H);
1526             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1527             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1528             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1529
1530             fscal            = felec;
1531
1532             /* Calculate temporary vectorial force */
1533             tx               = _mm_mul_pd(fscal,dx01);
1534             ty               = _mm_mul_pd(fscal,dy01);
1535             tz               = _mm_mul_pd(fscal,dz01);
1536
1537             /* Update vectorial force */
1538             fix0             = _mm_add_pd(fix0,tx);
1539             fiy0             = _mm_add_pd(fiy0,ty);
1540             fiz0             = _mm_add_pd(fiz0,tz);
1541
1542             fjx1             = _mm_add_pd(fjx1,tx);
1543             fjy1             = _mm_add_pd(fjy1,ty);
1544             fjz1             = _mm_add_pd(fjz1,tz);
1545
1546             /**************************
1547              * CALCULATE INTERACTIONS *
1548              **************************/
1549
1550             r02              = _mm_mul_pd(rsq02,rinv02);
1551
1552             /* Calculate table index by multiplying r with table scale and truncate to integer */
1553             rt               = _mm_mul_pd(r02,vftabscale);
1554             vfitab           = _mm_cvttpd_epi32(rt);
1555             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1556             vfitab           = _mm_slli_epi32(vfitab,2);
1557
1558             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1559             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1560             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1561             GMX_MM_TRANSPOSE2_PD(Y,F);
1562             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1563             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1564             GMX_MM_TRANSPOSE2_PD(G,H);
1565             Heps             = _mm_mul_pd(vfeps,H);
1566             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1567             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1568             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1569
1570             fscal            = felec;
1571
1572             /* Calculate temporary vectorial force */
1573             tx               = _mm_mul_pd(fscal,dx02);
1574             ty               = _mm_mul_pd(fscal,dy02);
1575             tz               = _mm_mul_pd(fscal,dz02);
1576
1577             /* Update vectorial force */
1578             fix0             = _mm_add_pd(fix0,tx);
1579             fiy0             = _mm_add_pd(fiy0,ty);
1580             fiz0             = _mm_add_pd(fiz0,tz);
1581
1582             fjx2             = _mm_add_pd(fjx2,tx);
1583             fjy2             = _mm_add_pd(fjy2,ty);
1584             fjz2             = _mm_add_pd(fjz2,tz);
1585
1586             /**************************
1587              * CALCULATE INTERACTIONS *
1588              **************************/
1589
1590             r10              = _mm_mul_pd(rsq10,rinv10);
1591
1592             /* Calculate table index by multiplying r with table scale and truncate to integer */
1593             rt               = _mm_mul_pd(r10,vftabscale);
1594             vfitab           = _mm_cvttpd_epi32(rt);
1595             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1596             vfitab           = _mm_slli_epi32(vfitab,2);
1597
1598             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1599             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1600             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1601             GMX_MM_TRANSPOSE2_PD(Y,F);
1602             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1603             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1604             GMX_MM_TRANSPOSE2_PD(G,H);
1605             Heps             = _mm_mul_pd(vfeps,H);
1606             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1607             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1608             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1609
1610             fscal            = felec;
1611
1612             /* Calculate temporary vectorial force */
1613             tx               = _mm_mul_pd(fscal,dx10);
1614             ty               = _mm_mul_pd(fscal,dy10);
1615             tz               = _mm_mul_pd(fscal,dz10);
1616
1617             /* Update vectorial force */
1618             fix1             = _mm_add_pd(fix1,tx);
1619             fiy1             = _mm_add_pd(fiy1,ty);
1620             fiz1             = _mm_add_pd(fiz1,tz);
1621
1622             fjx0             = _mm_add_pd(fjx0,tx);
1623             fjy0             = _mm_add_pd(fjy0,ty);
1624             fjz0             = _mm_add_pd(fjz0,tz);
1625
1626             /**************************
1627              * CALCULATE INTERACTIONS *
1628              **************************/
1629
1630             r11              = _mm_mul_pd(rsq11,rinv11);
1631
1632             /* Calculate table index by multiplying r with table scale and truncate to integer */
1633             rt               = _mm_mul_pd(r11,vftabscale);
1634             vfitab           = _mm_cvttpd_epi32(rt);
1635             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1636             vfitab           = _mm_slli_epi32(vfitab,2);
1637
1638             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1639             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1640             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1641             GMX_MM_TRANSPOSE2_PD(Y,F);
1642             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1643             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1644             GMX_MM_TRANSPOSE2_PD(G,H);
1645             Heps             = _mm_mul_pd(vfeps,H);
1646             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1647             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1648             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1649
1650             fscal            = felec;
1651
1652             /* Calculate temporary vectorial force */
1653             tx               = _mm_mul_pd(fscal,dx11);
1654             ty               = _mm_mul_pd(fscal,dy11);
1655             tz               = _mm_mul_pd(fscal,dz11);
1656
1657             /* Update vectorial force */
1658             fix1             = _mm_add_pd(fix1,tx);
1659             fiy1             = _mm_add_pd(fiy1,ty);
1660             fiz1             = _mm_add_pd(fiz1,tz);
1661
1662             fjx1             = _mm_add_pd(fjx1,tx);
1663             fjy1             = _mm_add_pd(fjy1,ty);
1664             fjz1             = _mm_add_pd(fjz1,tz);
1665
1666             /**************************
1667              * CALCULATE INTERACTIONS *
1668              **************************/
1669
1670             r12              = _mm_mul_pd(rsq12,rinv12);
1671
1672             /* Calculate table index by multiplying r with table scale and truncate to integer */
1673             rt               = _mm_mul_pd(r12,vftabscale);
1674             vfitab           = _mm_cvttpd_epi32(rt);
1675             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1676             vfitab           = _mm_slli_epi32(vfitab,2);
1677
1678             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1679             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1680             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1681             GMX_MM_TRANSPOSE2_PD(Y,F);
1682             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1683             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1684             GMX_MM_TRANSPOSE2_PD(G,H);
1685             Heps             = _mm_mul_pd(vfeps,H);
1686             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1687             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1688             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1689
1690             fscal            = felec;
1691
1692             /* Calculate temporary vectorial force */
1693             tx               = _mm_mul_pd(fscal,dx12);
1694             ty               = _mm_mul_pd(fscal,dy12);
1695             tz               = _mm_mul_pd(fscal,dz12);
1696
1697             /* Update vectorial force */
1698             fix1             = _mm_add_pd(fix1,tx);
1699             fiy1             = _mm_add_pd(fiy1,ty);
1700             fiz1             = _mm_add_pd(fiz1,tz);
1701
1702             fjx2             = _mm_add_pd(fjx2,tx);
1703             fjy2             = _mm_add_pd(fjy2,ty);
1704             fjz2             = _mm_add_pd(fjz2,tz);
1705
1706             /**************************
1707              * CALCULATE INTERACTIONS *
1708              **************************/
1709
1710             r20              = _mm_mul_pd(rsq20,rinv20);
1711
1712             /* Calculate table index by multiplying r with table scale and truncate to integer */
1713             rt               = _mm_mul_pd(r20,vftabscale);
1714             vfitab           = _mm_cvttpd_epi32(rt);
1715             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1716             vfitab           = _mm_slli_epi32(vfitab,2);
1717
1718             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1719             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1720             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1721             GMX_MM_TRANSPOSE2_PD(Y,F);
1722             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1723             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1724             GMX_MM_TRANSPOSE2_PD(G,H);
1725             Heps             = _mm_mul_pd(vfeps,H);
1726             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1727             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1728             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1729
1730             fscal            = felec;
1731
1732             /* Calculate temporary vectorial force */
1733             tx               = _mm_mul_pd(fscal,dx20);
1734             ty               = _mm_mul_pd(fscal,dy20);
1735             tz               = _mm_mul_pd(fscal,dz20);
1736
1737             /* Update vectorial force */
1738             fix2             = _mm_add_pd(fix2,tx);
1739             fiy2             = _mm_add_pd(fiy2,ty);
1740             fiz2             = _mm_add_pd(fiz2,tz);
1741
1742             fjx0             = _mm_add_pd(fjx0,tx);
1743             fjy0             = _mm_add_pd(fjy0,ty);
1744             fjz0             = _mm_add_pd(fjz0,tz);
1745
1746             /**************************
1747              * CALCULATE INTERACTIONS *
1748              **************************/
1749
1750             r21              = _mm_mul_pd(rsq21,rinv21);
1751
1752             /* Calculate table index by multiplying r with table scale and truncate to integer */
1753             rt               = _mm_mul_pd(r21,vftabscale);
1754             vfitab           = _mm_cvttpd_epi32(rt);
1755             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1756             vfitab           = _mm_slli_epi32(vfitab,2);
1757
1758             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1759             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1760             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1761             GMX_MM_TRANSPOSE2_PD(Y,F);
1762             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1763             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1764             GMX_MM_TRANSPOSE2_PD(G,H);
1765             Heps             = _mm_mul_pd(vfeps,H);
1766             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1767             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1768             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1769
1770             fscal            = felec;
1771
1772             /* Calculate temporary vectorial force */
1773             tx               = _mm_mul_pd(fscal,dx21);
1774             ty               = _mm_mul_pd(fscal,dy21);
1775             tz               = _mm_mul_pd(fscal,dz21);
1776
1777             /* Update vectorial force */
1778             fix2             = _mm_add_pd(fix2,tx);
1779             fiy2             = _mm_add_pd(fiy2,ty);
1780             fiz2             = _mm_add_pd(fiz2,tz);
1781
1782             fjx1             = _mm_add_pd(fjx1,tx);
1783             fjy1             = _mm_add_pd(fjy1,ty);
1784             fjz1             = _mm_add_pd(fjz1,tz);
1785
1786             /**************************
1787              * CALCULATE INTERACTIONS *
1788              **************************/
1789
1790             r22              = _mm_mul_pd(rsq22,rinv22);
1791
1792             /* Calculate table index by multiplying r with table scale and truncate to integer */
1793             rt               = _mm_mul_pd(r22,vftabscale);
1794             vfitab           = _mm_cvttpd_epi32(rt);
1795             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1796             vfitab           = _mm_slli_epi32(vfitab,2);
1797
1798             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1799             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1800             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1801             GMX_MM_TRANSPOSE2_PD(Y,F);
1802             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1803             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1804             GMX_MM_TRANSPOSE2_PD(G,H);
1805             Heps             = _mm_mul_pd(vfeps,H);
1806             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1807             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1808             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1809
1810             fscal            = felec;
1811
1812             /* Calculate temporary vectorial force */
1813             tx               = _mm_mul_pd(fscal,dx22);
1814             ty               = _mm_mul_pd(fscal,dy22);
1815             tz               = _mm_mul_pd(fscal,dz22);
1816
1817             /* Update vectorial force */
1818             fix2             = _mm_add_pd(fix2,tx);
1819             fiy2             = _mm_add_pd(fiy2,ty);
1820             fiz2             = _mm_add_pd(fiz2,tz);
1821
1822             fjx2             = _mm_add_pd(fjx2,tx);
1823             fjy2             = _mm_add_pd(fjy2,ty);
1824             fjz2             = _mm_add_pd(fjz2,tz);
1825
1826             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1827
1828             /* Inner loop uses 359 flops */
1829         }
1830
1831         if(jidx<j_index_end)
1832         {
1833
1834             jnrA             = jjnr[jidx];
1835             j_coord_offsetA  = DIM*jnrA;
1836
1837             /* load j atom coordinates */
1838             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1839                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1840
1841             /* Calculate displacement vector */
1842             dx00             = _mm_sub_pd(ix0,jx0);
1843             dy00             = _mm_sub_pd(iy0,jy0);
1844             dz00             = _mm_sub_pd(iz0,jz0);
1845             dx01             = _mm_sub_pd(ix0,jx1);
1846             dy01             = _mm_sub_pd(iy0,jy1);
1847             dz01             = _mm_sub_pd(iz0,jz1);
1848             dx02             = _mm_sub_pd(ix0,jx2);
1849             dy02             = _mm_sub_pd(iy0,jy2);
1850             dz02             = _mm_sub_pd(iz0,jz2);
1851             dx10             = _mm_sub_pd(ix1,jx0);
1852             dy10             = _mm_sub_pd(iy1,jy0);
1853             dz10             = _mm_sub_pd(iz1,jz0);
1854             dx11             = _mm_sub_pd(ix1,jx1);
1855             dy11             = _mm_sub_pd(iy1,jy1);
1856             dz11             = _mm_sub_pd(iz1,jz1);
1857             dx12             = _mm_sub_pd(ix1,jx2);
1858             dy12             = _mm_sub_pd(iy1,jy2);
1859             dz12             = _mm_sub_pd(iz1,jz2);
1860             dx20             = _mm_sub_pd(ix2,jx0);
1861             dy20             = _mm_sub_pd(iy2,jy0);
1862             dz20             = _mm_sub_pd(iz2,jz0);
1863             dx21             = _mm_sub_pd(ix2,jx1);
1864             dy21             = _mm_sub_pd(iy2,jy1);
1865             dz21             = _mm_sub_pd(iz2,jz1);
1866             dx22             = _mm_sub_pd(ix2,jx2);
1867             dy22             = _mm_sub_pd(iy2,jy2);
1868             dz22             = _mm_sub_pd(iz2,jz2);
1869
1870             /* Calculate squared distance and things based on it */
1871             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1872             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1873             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1874             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1875             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1876             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1877             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1878             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1879             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1880
1881             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1882             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1883             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1884             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1885             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1886             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1887             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1888             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1889             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1890
1891             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1892
1893             fjx0             = _mm_setzero_pd();
1894             fjy0             = _mm_setzero_pd();
1895             fjz0             = _mm_setzero_pd();
1896             fjx1             = _mm_setzero_pd();
1897             fjy1             = _mm_setzero_pd();
1898             fjz1             = _mm_setzero_pd();
1899             fjx2             = _mm_setzero_pd();
1900             fjy2             = _mm_setzero_pd();
1901             fjz2             = _mm_setzero_pd();
1902
1903             /**************************
1904              * CALCULATE INTERACTIONS *
1905              **************************/
1906
1907             r00              = _mm_mul_pd(rsq00,rinv00);
1908
1909             /* Calculate table index by multiplying r with table scale and truncate to integer */
1910             rt               = _mm_mul_pd(r00,vftabscale);
1911             vfitab           = _mm_cvttpd_epi32(rt);
1912             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1913             vfitab           = _mm_slli_epi32(vfitab,2);
1914
1915             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1916             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1917             F                = _mm_setzero_pd();
1918             GMX_MM_TRANSPOSE2_PD(Y,F);
1919             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1920             H                = _mm_setzero_pd();
1921             GMX_MM_TRANSPOSE2_PD(G,H);
1922             Heps             = _mm_mul_pd(vfeps,H);
1923             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1924             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1925             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1926
1927             /* LENNARD-JONES DISPERSION/REPULSION */
1928
1929             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1930             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1931
1932             fscal            = _mm_add_pd(felec,fvdw);
1933
1934             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1935
1936             /* Calculate temporary vectorial force */
1937             tx               = _mm_mul_pd(fscal,dx00);
1938             ty               = _mm_mul_pd(fscal,dy00);
1939             tz               = _mm_mul_pd(fscal,dz00);
1940
1941             /* Update vectorial force */
1942             fix0             = _mm_add_pd(fix0,tx);
1943             fiy0             = _mm_add_pd(fiy0,ty);
1944             fiz0             = _mm_add_pd(fiz0,tz);
1945
1946             fjx0             = _mm_add_pd(fjx0,tx);
1947             fjy0             = _mm_add_pd(fjy0,ty);
1948             fjz0             = _mm_add_pd(fjz0,tz);
1949
1950             /**************************
1951              * CALCULATE INTERACTIONS *
1952              **************************/
1953
1954             r01              = _mm_mul_pd(rsq01,rinv01);
1955
1956             /* Calculate table index by multiplying r with table scale and truncate to integer */
1957             rt               = _mm_mul_pd(r01,vftabscale);
1958             vfitab           = _mm_cvttpd_epi32(rt);
1959             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1960             vfitab           = _mm_slli_epi32(vfitab,2);
1961
1962             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1963             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1964             F                = _mm_setzero_pd();
1965             GMX_MM_TRANSPOSE2_PD(Y,F);
1966             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1967             H                = _mm_setzero_pd();
1968             GMX_MM_TRANSPOSE2_PD(G,H);
1969             Heps             = _mm_mul_pd(vfeps,H);
1970             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1971             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1972             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1973
1974             fscal            = felec;
1975
1976             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1977
1978             /* Calculate temporary vectorial force */
1979             tx               = _mm_mul_pd(fscal,dx01);
1980             ty               = _mm_mul_pd(fscal,dy01);
1981             tz               = _mm_mul_pd(fscal,dz01);
1982
1983             /* Update vectorial force */
1984             fix0             = _mm_add_pd(fix0,tx);
1985             fiy0             = _mm_add_pd(fiy0,ty);
1986             fiz0             = _mm_add_pd(fiz0,tz);
1987
1988             fjx1             = _mm_add_pd(fjx1,tx);
1989             fjy1             = _mm_add_pd(fjy1,ty);
1990             fjz1             = _mm_add_pd(fjz1,tz);
1991
1992             /**************************
1993              * CALCULATE INTERACTIONS *
1994              **************************/
1995
1996             r02              = _mm_mul_pd(rsq02,rinv02);
1997
1998             /* Calculate table index by multiplying r with table scale and truncate to integer */
1999             rt               = _mm_mul_pd(r02,vftabscale);
2000             vfitab           = _mm_cvttpd_epi32(rt);
2001             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2002             vfitab           = _mm_slli_epi32(vfitab,2);
2003
2004             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2005             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2006             F                = _mm_setzero_pd();
2007             GMX_MM_TRANSPOSE2_PD(Y,F);
2008             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2009             H                = _mm_setzero_pd();
2010             GMX_MM_TRANSPOSE2_PD(G,H);
2011             Heps             = _mm_mul_pd(vfeps,H);
2012             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2013             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2014             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2015
2016             fscal            = felec;
2017
2018             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2019
2020             /* Calculate temporary vectorial force */
2021             tx               = _mm_mul_pd(fscal,dx02);
2022             ty               = _mm_mul_pd(fscal,dy02);
2023             tz               = _mm_mul_pd(fscal,dz02);
2024
2025             /* Update vectorial force */
2026             fix0             = _mm_add_pd(fix0,tx);
2027             fiy0             = _mm_add_pd(fiy0,ty);
2028             fiz0             = _mm_add_pd(fiz0,tz);
2029
2030             fjx2             = _mm_add_pd(fjx2,tx);
2031             fjy2             = _mm_add_pd(fjy2,ty);
2032             fjz2             = _mm_add_pd(fjz2,tz);
2033
2034             /**************************
2035              * CALCULATE INTERACTIONS *
2036              **************************/
2037
2038             r10              = _mm_mul_pd(rsq10,rinv10);
2039
2040             /* Calculate table index by multiplying r with table scale and truncate to integer */
2041             rt               = _mm_mul_pd(r10,vftabscale);
2042             vfitab           = _mm_cvttpd_epi32(rt);
2043             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2044             vfitab           = _mm_slli_epi32(vfitab,2);
2045
2046             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2047             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2048             F                = _mm_setzero_pd();
2049             GMX_MM_TRANSPOSE2_PD(Y,F);
2050             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2051             H                = _mm_setzero_pd();
2052             GMX_MM_TRANSPOSE2_PD(G,H);
2053             Heps             = _mm_mul_pd(vfeps,H);
2054             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2055             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2056             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2057
2058             fscal            = felec;
2059
2060             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2061
2062             /* Calculate temporary vectorial force */
2063             tx               = _mm_mul_pd(fscal,dx10);
2064             ty               = _mm_mul_pd(fscal,dy10);
2065             tz               = _mm_mul_pd(fscal,dz10);
2066
2067             /* Update vectorial force */
2068             fix1             = _mm_add_pd(fix1,tx);
2069             fiy1             = _mm_add_pd(fiy1,ty);
2070             fiz1             = _mm_add_pd(fiz1,tz);
2071
2072             fjx0             = _mm_add_pd(fjx0,tx);
2073             fjy0             = _mm_add_pd(fjy0,ty);
2074             fjz0             = _mm_add_pd(fjz0,tz);
2075
2076             /**************************
2077              * CALCULATE INTERACTIONS *
2078              **************************/
2079
2080             r11              = _mm_mul_pd(rsq11,rinv11);
2081
2082             /* Calculate table index by multiplying r with table scale and truncate to integer */
2083             rt               = _mm_mul_pd(r11,vftabscale);
2084             vfitab           = _mm_cvttpd_epi32(rt);
2085             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2086             vfitab           = _mm_slli_epi32(vfitab,2);
2087
2088             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2089             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2090             F                = _mm_setzero_pd();
2091             GMX_MM_TRANSPOSE2_PD(Y,F);
2092             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2093             H                = _mm_setzero_pd();
2094             GMX_MM_TRANSPOSE2_PD(G,H);
2095             Heps             = _mm_mul_pd(vfeps,H);
2096             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2097             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2098             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2099
2100             fscal            = felec;
2101
2102             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2103
2104             /* Calculate temporary vectorial force */
2105             tx               = _mm_mul_pd(fscal,dx11);
2106             ty               = _mm_mul_pd(fscal,dy11);
2107             tz               = _mm_mul_pd(fscal,dz11);
2108
2109             /* Update vectorial force */
2110             fix1             = _mm_add_pd(fix1,tx);
2111             fiy1             = _mm_add_pd(fiy1,ty);
2112             fiz1             = _mm_add_pd(fiz1,tz);
2113
2114             fjx1             = _mm_add_pd(fjx1,tx);
2115             fjy1             = _mm_add_pd(fjy1,ty);
2116             fjz1             = _mm_add_pd(fjz1,tz);
2117
2118             /**************************
2119              * CALCULATE INTERACTIONS *
2120              **************************/
2121
2122             r12              = _mm_mul_pd(rsq12,rinv12);
2123
2124             /* Calculate table index by multiplying r with table scale and truncate to integer */
2125             rt               = _mm_mul_pd(r12,vftabscale);
2126             vfitab           = _mm_cvttpd_epi32(rt);
2127             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2128             vfitab           = _mm_slli_epi32(vfitab,2);
2129
2130             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2131             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2132             F                = _mm_setzero_pd();
2133             GMX_MM_TRANSPOSE2_PD(Y,F);
2134             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2135             H                = _mm_setzero_pd();
2136             GMX_MM_TRANSPOSE2_PD(G,H);
2137             Heps             = _mm_mul_pd(vfeps,H);
2138             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2139             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2140             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2141
2142             fscal            = felec;
2143
2144             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2145
2146             /* Calculate temporary vectorial force */
2147             tx               = _mm_mul_pd(fscal,dx12);
2148             ty               = _mm_mul_pd(fscal,dy12);
2149             tz               = _mm_mul_pd(fscal,dz12);
2150
2151             /* Update vectorial force */
2152             fix1             = _mm_add_pd(fix1,tx);
2153             fiy1             = _mm_add_pd(fiy1,ty);
2154             fiz1             = _mm_add_pd(fiz1,tz);
2155
2156             fjx2             = _mm_add_pd(fjx2,tx);
2157             fjy2             = _mm_add_pd(fjy2,ty);
2158             fjz2             = _mm_add_pd(fjz2,tz);
2159
2160             /**************************
2161              * CALCULATE INTERACTIONS *
2162              **************************/
2163
2164             r20              = _mm_mul_pd(rsq20,rinv20);
2165
2166             /* Calculate table index by multiplying r with table scale and truncate to integer */
2167             rt               = _mm_mul_pd(r20,vftabscale);
2168             vfitab           = _mm_cvttpd_epi32(rt);
2169             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2170             vfitab           = _mm_slli_epi32(vfitab,2);
2171
2172             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2173             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2174             F                = _mm_setzero_pd();
2175             GMX_MM_TRANSPOSE2_PD(Y,F);
2176             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2177             H                = _mm_setzero_pd();
2178             GMX_MM_TRANSPOSE2_PD(G,H);
2179             Heps             = _mm_mul_pd(vfeps,H);
2180             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2181             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2182             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2183
2184             fscal            = felec;
2185
2186             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2187
2188             /* Calculate temporary vectorial force */
2189             tx               = _mm_mul_pd(fscal,dx20);
2190             ty               = _mm_mul_pd(fscal,dy20);
2191             tz               = _mm_mul_pd(fscal,dz20);
2192
2193             /* Update vectorial force */
2194             fix2             = _mm_add_pd(fix2,tx);
2195             fiy2             = _mm_add_pd(fiy2,ty);
2196             fiz2             = _mm_add_pd(fiz2,tz);
2197
2198             fjx0             = _mm_add_pd(fjx0,tx);
2199             fjy0             = _mm_add_pd(fjy0,ty);
2200             fjz0             = _mm_add_pd(fjz0,tz);
2201
2202             /**************************
2203              * CALCULATE INTERACTIONS *
2204              **************************/
2205
2206             r21              = _mm_mul_pd(rsq21,rinv21);
2207
2208             /* Calculate table index by multiplying r with table scale and truncate to integer */
2209             rt               = _mm_mul_pd(r21,vftabscale);
2210             vfitab           = _mm_cvttpd_epi32(rt);
2211             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2212             vfitab           = _mm_slli_epi32(vfitab,2);
2213
2214             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2215             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2216             F                = _mm_setzero_pd();
2217             GMX_MM_TRANSPOSE2_PD(Y,F);
2218             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2219             H                = _mm_setzero_pd();
2220             GMX_MM_TRANSPOSE2_PD(G,H);
2221             Heps             = _mm_mul_pd(vfeps,H);
2222             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2223             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2224             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2225
2226             fscal            = felec;
2227
2228             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2229
2230             /* Calculate temporary vectorial force */
2231             tx               = _mm_mul_pd(fscal,dx21);
2232             ty               = _mm_mul_pd(fscal,dy21);
2233             tz               = _mm_mul_pd(fscal,dz21);
2234
2235             /* Update vectorial force */
2236             fix2             = _mm_add_pd(fix2,tx);
2237             fiy2             = _mm_add_pd(fiy2,ty);
2238             fiz2             = _mm_add_pd(fiz2,tz);
2239
2240             fjx1             = _mm_add_pd(fjx1,tx);
2241             fjy1             = _mm_add_pd(fjy1,ty);
2242             fjz1             = _mm_add_pd(fjz1,tz);
2243
2244             /**************************
2245              * CALCULATE INTERACTIONS *
2246              **************************/
2247
2248             r22              = _mm_mul_pd(rsq22,rinv22);
2249
2250             /* Calculate table index by multiplying r with table scale and truncate to integer */
2251             rt               = _mm_mul_pd(r22,vftabscale);
2252             vfitab           = _mm_cvttpd_epi32(rt);
2253             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2254             vfitab           = _mm_slli_epi32(vfitab,2);
2255
2256             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2257             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2258             F                = _mm_setzero_pd();
2259             GMX_MM_TRANSPOSE2_PD(Y,F);
2260             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2261             H                = _mm_setzero_pd();
2262             GMX_MM_TRANSPOSE2_PD(G,H);
2263             Heps             = _mm_mul_pd(vfeps,H);
2264             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2265             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2266             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2267
2268             fscal            = felec;
2269
2270             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2271
2272             /* Calculate temporary vectorial force */
2273             tx               = _mm_mul_pd(fscal,dx22);
2274             ty               = _mm_mul_pd(fscal,dy22);
2275             tz               = _mm_mul_pd(fscal,dz22);
2276
2277             /* Update vectorial force */
2278             fix2             = _mm_add_pd(fix2,tx);
2279             fiy2             = _mm_add_pd(fiy2,ty);
2280             fiz2             = _mm_add_pd(fiz2,tz);
2281
2282             fjx2             = _mm_add_pd(fjx2,tx);
2283             fjy2             = _mm_add_pd(fjy2,ty);
2284             fjz2             = _mm_add_pd(fjz2,tz);
2285
2286             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2287
2288             /* Inner loop uses 359 flops */
2289         }
2290
2291         /* End of innermost loop */
2292
2293         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2294                                               f+i_coord_offset,fshift+i_shift_offset);
2295
2296         /* Increment number of inner iterations */
2297         inneriter                  += j_index_end - j_index_start;
2298
2299         /* Outer loop uses 18 flops */
2300     }
2301
2302     /* Increment number of outer iterations */
2303     outeriter        += nri;
2304
2305     /* Update outer/inner flops */
2306
2307     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);
2308 }