e875f8931e10e152c243051605576191a4b9472d
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecEw_VdwLJ_GeomW4W4_sse2_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_sse2_double.h"
50 #include "kernelutil_x86_sse2_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_double
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB;
77     int              j_coord_offsetA,j_coord_offsetB;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwioffset3;
89     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90     int              vdwjidx0A,vdwjidx0B;
91     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B;
93     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B;
95     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B;
97     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
115     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
116     __m128i          ewitab;
117     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
118     real             *ewtab;
119     __m128d          dummy_mask,cutoff_mask;
120     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
121     __m128d          one     = _mm_set1_pd(1.0);
122     __m128d          two     = _mm_set1_pd(2.0);
123     x                = xx[0];
124     f                = ff[0];
125
126     nri              = nlist->nri;
127     iinr             = nlist->iinr;
128     jindex           = nlist->jindex;
129     jjnr             = nlist->jjnr;
130     shiftidx         = nlist->shift;
131     gid              = nlist->gid;
132     shiftvec         = fr->shift_vec[0];
133     fshift           = fr->fshift[0];
134     facel            = _mm_set1_pd(fr->epsfac);
135     charge           = mdatoms->chargeA;
136     nvdwtype         = fr->ntype;
137     vdwparam         = fr->nbfp;
138     vdwtype          = mdatoms->typeA;
139
140     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
141     ewtab            = fr->ic->tabq_coul_FDV0;
142     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
143     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
144
145     /* Setup water-specific parameters */
146     inr              = nlist->iinr[0];
147     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
148     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
149     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
150     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
151
152     jq1              = _mm_set1_pd(charge[inr+1]);
153     jq2              = _mm_set1_pd(charge[inr+2]);
154     jq3              = _mm_set1_pd(charge[inr+3]);
155     vdwjidx0A        = 2*vdwtype[inr+0];
156     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
157     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
158     qq11             = _mm_mul_pd(iq1,jq1);
159     qq12             = _mm_mul_pd(iq1,jq2);
160     qq13             = _mm_mul_pd(iq1,jq3);
161     qq21             = _mm_mul_pd(iq2,jq1);
162     qq22             = _mm_mul_pd(iq2,jq2);
163     qq23             = _mm_mul_pd(iq2,jq3);
164     qq31             = _mm_mul_pd(iq3,jq1);
165     qq32             = _mm_mul_pd(iq3,jq2);
166     qq33             = _mm_mul_pd(iq3,jq3);
167
168     /* Avoid stupid compiler warnings */
169     jnrA = jnrB = 0;
170     j_coord_offsetA = 0;
171     j_coord_offsetB = 0;
172
173     outeriter        = 0;
174     inneriter        = 0;
175
176     /* Start outer loop over neighborlists */
177     for(iidx=0; iidx<nri; iidx++)
178     {
179         /* Load shift vector for this list */
180         i_shift_offset   = DIM*shiftidx[iidx];
181
182         /* Load limits for loop over neighbors */
183         j_index_start    = jindex[iidx];
184         j_index_end      = jindex[iidx+1];
185
186         /* Get outer coordinate index */
187         inr              = iinr[iidx];
188         i_coord_offset   = DIM*inr;
189
190         /* Load i particle coords and add shift vector */
191         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
192                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
193
194         fix0             = _mm_setzero_pd();
195         fiy0             = _mm_setzero_pd();
196         fiz0             = _mm_setzero_pd();
197         fix1             = _mm_setzero_pd();
198         fiy1             = _mm_setzero_pd();
199         fiz1             = _mm_setzero_pd();
200         fix2             = _mm_setzero_pd();
201         fiy2             = _mm_setzero_pd();
202         fiz2             = _mm_setzero_pd();
203         fix3             = _mm_setzero_pd();
204         fiy3             = _mm_setzero_pd();
205         fiz3             = _mm_setzero_pd();
206
207         /* Reset potential sums */
208         velecsum         = _mm_setzero_pd();
209         vvdwsum          = _mm_setzero_pd();
210
211         /* Start inner kernel loop */
212         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
213         {
214
215             /* Get j neighbor index, and coordinate index */
216             jnrA             = jjnr[jidx];
217             jnrB             = jjnr[jidx+1];
218             j_coord_offsetA  = DIM*jnrA;
219             j_coord_offsetB  = DIM*jnrB;
220
221             /* load j atom coordinates */
222             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
223                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
224                                               &jy2,&jz2,&jx3,&jy3,&jz3);
225
226             /* Calculate displacement vector */
227             dx00             = _mm_sub_pd(ix0,jx0);
228             dy00             = _mm_sub_pd(iy0,jy0);
229             dz00             = _mm_sub_pd(iz0,jz0);
230             dx11             = _mm_sub_pd(ix1,jx1);
231             dy11             = _mm_sub_pd(iy1,jy1);
232             dz11             = _mm_sub_pd(iz1,jz1);
233             dx12             = _mm_sub_pd(ix1,jx2);
234             dy12             = _mm_sub_pd(iy1,jy2);
235             dz12             = _mm_sub_pd(iz1,jz2);
236             dx13             = _mm_sub_pd(ix1,jx3);
237             dy13             = _mm_sub_pd(iy1,jy3);
238             dz13             = _mm_sub_pd(iz1,jz3);
239             dx21             = _mm_sub_pd(ix2,jx1);
240             dy21             = _mm_sub_pd(iy2,jy1);
241             dz21             = _mm_sub_pd(iz2,jz1);
242             dx22             = _mm_sub_pd(ix2,jx2);
243             dy22             = _mm_sub_pd(iy2,jy2);
244             dz22             = _mm_sub_pd(iz2,jz2);
245             dx23             = _mm_sub_pd(ix2,jx3);
246             dy23             = _mm_sub_pd(iy2,jy3);
247             dz23             = _mm_sub_pd(iz2,jz3);
248             dx31             = _mm_sub_pd(ix3,jx1);
249             dy31             = _mm_sub_pd(iy3,jy1);
250             dz31             = _mm_sub_pd(iz3,jz1);
251             dx32             = _mm_sub_pd(ix3,jx2);
252             dy32             = _mm_sub_pd(iy3,jy2);
253             dz32             = _mm_sub_pd(iz3,jz2);
254             dx33             = _mm_sub_pd(ix3,jx3);
255             dy33             = _mm_sub_pd(iy3,jy3);
256             dz33             = _mm_sub_pd(iz3,jz3);
257
258             /* Calculate squared distance and things based on it */
259             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
260             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
261             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
262             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
263             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
264             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
265             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
266             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
267             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
268             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
269
270             rinv11           = gmx_mm_invsqrt_pd(rsq11);
271             rinv12           = gmx_mm_invsqrt_pd(rsq12);
272             rinv13           = gmx_mm_invsqrt_pd(rsq13);
273             rinv21           = gmx_mm_invsqrt_pd(rsq21);
274             rinv22           = gmx_mm_invsqrt_pd(rsq22);
275             rinv23           = gmx_mm_invsqrt_pd(rsq23);
276             rinv31           = gmx_mm_invsqrt_pd(rsq31);
277             rinv32           = gmx_mm_invsqrt_pd(rsq32);
278             rinv33           = gmx_mm_invsqrt_pd(rsq33);
279
280             rinvsq00         = gmx_mm_inv_pd(rsq00);
281             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
282             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
283             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
284             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
285             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
286             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
287             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
288             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
289             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
290
291             fjx0             = _mm_setzero_pd();
292             fjy0             = _mm_setzero_pd();
293             fjz0             = _mm_setzero_pd();
294             fjx1             = _mm_setzero_pd();
295             fjy1             = _mm_setzero_pd();
296             fjz1             = _mm_setzero_pd();
297             fjx2             = _mm_setzero_pd();
298             fjy2             = _mm_setzero_pd();
299             fjz2             = _mm_setzero_pd();
300             fjx3             = _mm_setzero_pd();
301             fjy3             = _mm_setzero_pd();
302             fjz3             = _mm_setzero_pd();
303
304             /**************************
305              * CALCULATE INTERACTIONS *
306              **************************/
307
308             /* LENNARD-JONES DISPERSION/REPULSION */
309
310             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
311             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
312             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
313             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
314             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
315
316             /* Update potential sum for this i atom from the interaction with this j atom. */
317             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
318
319             fscal            = fvdw;
320
321             /* Calculate temporary vectorial force */
322             tx               = _mm_mul_pd(fscal,dx00);
323             ty               = _mm_mul_pd(fscal,dy00);
324             tz               = _mm_mul_pd(fscal,dz00);
325
326             /* Update vectorial force */
327             fix0             = _mm_add_pd(fix0,tx);
328             fiy0             = _mm_add_pd(fiy0,ty);
329             fiz0             = _mm_add_pd(fiz0,tz);
330
331             fjx0             = _mm_add_pd(fjx0,tx);
332             fjy0             = _mm_add_pd(fjy0,ty);
333             fjz0             = _mm_add_pd(fjz0,tz);
334
335             /**************************
336              * CALCULATE INTERACTIONS *
337              **************************/
338
339             r11              = _mm_mul_pd(rsq11,rinv11);
340
341             /* EWALD ELECTROSTATICS */
342
343             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
344             ewrt             = _mm_mul_pd(r11,ewtabscale);
345             ewitab           = _mm_cvttpd_epi32(ewrt);
346             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
347             ewitab           = _mm_slli_epi32(ewitab,2);
348             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
349             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
350             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
351             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
352             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
353             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
354             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
355             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
356             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
357             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
358
359             /* Update potential sum for this i atom from the interaction with this j atom. */
360             velecsum         = _mm_add_pd(velecsum,velec);
361
362             fscal            = felec;
363
364             /* Calculate temporary vectorial force */
365             tx               = _mm_mul_pd(fscal,dx11);
366             ty               = _mm_mul_pd(fscal,dy11);
367             tz               = _mm_mul_pd(fscal,dz11);
368
369             /* Update vectorial force */
370             fix1             = _mm_add_pd(fix1,tx);
371             fiy1             = _mm_add_pd(fiy1,ty);
372             fiz1             = _mm_add_pd(fiz1,tz);
373
374             fjx1             = _mm_add_pd(fjx1,tx);
375             fjy1             = _mm_add_pd(fjy1,ty);
376             fjz1             = _mm_add_pd(fjz1,tz);
377
378             /**************************
379              * CALCULATE INTERACTIONS *
380              **************************/
381
382             r12              = _mm_mul_pd(rsq12,rinv12);
383
384             /* EWALD ELECTROSTATICS */
385
386             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
387             ewrt             = _mm_mul_pd(r12,ewtabscale);
388             ewitab           = _mm_cvttpd_epi32(ewrt);
389             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
390             ewitab           = _mm_slli_epi32(ewitab,2);
391             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
392             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
393             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
394             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
395             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
396             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
397             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
398             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
399             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
400             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
401
402             /* Update potential sum for this i atom from the interaction with this j atom. */
403             velecsum         = _mm_add_pd(velecsum,velec);
404
405             fscal            = felec;
406
407             /* Calculate temporary vectorial force */
408             tx               = _mm_mul_pd(fscal,dx12);
409             ty               = _mm_mul_pd(fscal,dy12);
410             tz               = _mm_mul_pd(fscal,dz12);
411
412             /* Update vectorial force */
413             fix1             = _mm_add_pd(fix1,tx);
414             fiy1             = _mm_add_pd(fiy1,ty);
415             fiz1             = _mm_add_pd(fiz1,tz);
416
417             fjx2             = _mm_add_pd(fjx2,tx);
418             fjy2             = _mm_add_pd(fjy2,ty);
419             fjz2             = _mm_add_pd(fjz2,tz);
420
421             /**************************
422              * CALCULATE INTERACTIONS *
423              **************************/
424
425             r13              = _mm_mul_pd(rsq13,rinv13);
426
427             /* EWALD ELECTROSTATICS */
428
429             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
430             ewrt             = _mm_mul_pd(r13,ewtabscale);
431             ewitab           = _mm_cvttpd_epi32(ewrt);
432             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
433             ewitab           = _mm_slli_epi32(ewitab,2);
434             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
435             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
436             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
437             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
438             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
439             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
440             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
441             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
442             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
443             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
444
445             /* Update potential sum for this i atom from the interaction with this j atom. */
446             velecsum         = _mm_add_pd(velecsum,velec);
447
448             fscal            = felec;
449
450             /* Calculate temporary vectorial force */
451             tx               = _mm_mul_pd(fscal,dx13);
452             ty               = _mm_mul_pd(fscal,dy13);
453             tz               = _mm_mul_pd(fscal,dz13);
454
455             /* Update vectorial force */
456             fix1             = _mm_add_pd(fix1,tx);
457             fiy1             = _mm_add_pd(fiy1,ty);
458             fiz1             = _mm_add_pd(fiz1,tz);
459
460             fjx3             = _mm_add_pd(fjx3,tx);
461             fjy3             = _mm_add_pd(fjy3,ty);
462             fjz3             = _mm_add_pd(fjz3,tz);
463
464             /**************************
465              * CALCULATE INTERACTIONS *
466              **************************/
467
468             r21              = _mm_mul_pd(rsq21,rinv21);
469
470             /* EWALD ELECTROSTATICS */
471
472             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
473             ewrt             = _mm_mul_pd(r21,ewtabscale);
474             ewitab           = _mm_cvttpd_epi32(ewrt);
475             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
476             ewitab           = _mm_slli_epi32(ewitab,2);
477             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
478             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
479             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
480             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
481             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
482             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
483             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
484             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
485             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
486             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
487
488             /* Update potential sum for this i atom from the interaction with this j atom. */
489             velecsum         = _mm_add_pd(velecsum,velec);
490
491             fscal            = felec;
492
493             /* Calculate temporary vectorial force */
494             tx               = _mm_mul_pd(fscal,dx21);
495             ty               = _mm_mul_pd(fscal,dy21);
496             tz               = _mm_mul_pd(fscal,dz21);
497
498             /* Update vectorial force */
499             fix2             = _mm_add_pd(fix2,tx);
500             fiy2             = _mm_add_pd(fiy2,ty);
501             fiz2             = _mm_add_pd(fiz2,tz);
502
503             fjx1             = _mm_add_pd(fjx1,tx);
504             fjy1             = _mm_add_pd(fjy1,ty);
505             fjz1             = _mm_add_pd(fjz1,tz);
506
507             /**************************
508              * CALCULATE INTERACTIONS *
509              **************************/
510
511             r22              = _mm_mul_pd(rsq22,rinv22);
512
513             /* EWALD ELECTROSTATICS */
514
515             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
516             ewrt             = _mm_mul_pd(r22,ewtabscale);
517             ewitab           = _mm_cvttpd_epi32(ewrt);
518             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
519             ewitab           = _mm_slli_epi32(ewitab,2);
520             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
521             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
522             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
523             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
524             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
525             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
526             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
527             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
528             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
529             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
530
531             /* Update potential sum for this i atom from the interaction with this j atom. */
532             velecsum         = _mm_add_pd(velecsum,velec);
533
534             fscal            = felec;
535
536             /* Calculate temporary vectorial force */
537             tx               = _mm_mul_pd(fscal,dx22);
538             ty               = _mm_mul_pd(fscal,dy22);
539             tz               = _mm_mul_pd(fscal,dz22);
540
541             /* Update vectorial force */
542             fix2             = _mm_add_pd(fix2,tx);
543             fiy2             = _mm_add_pd(fiy2,ty);
544             fiz2             = _mm_add_pd(fiz2,tz);
545
546             fjx2             = _mm_add_pd(fjx2,tx);
547             fjy2             = _mm_add_pd(fjy2,ty);
548             fjz2             = _mm_add_pd(fjz2,tz);
549
550             /**************************
551              * CALCULATE INTERACTIONS *
552              **************************/
553
554             r23              = _mm_mul_pd(rsq23,rinv23);
555
556             /* EWALD ELECTROSTATICS */
557
558             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
559             ewrt             = _mm_mul_pd(r23,ewtabscale);
560             ewitab           = _mm_cvttpd_epi32(ewrt);
561             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
562             ewitab           = _mm_slli_epi32(ewitab,2);
563             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
564             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
565             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
566             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
567             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
568             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
569             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
570             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
571             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
572             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
573
574             /* Update potential sum for this i atom from the interaction with this j atom. */
575             velecsum         = _mm_add_pd(velecsum,velec);
576
577             fscal            = felec;
578
579             /* Calculate temporary vectorial force */
580             tx               = _mm_mul_pd(fscal,dx23);
581             ty               = _mm_mul_pd(fscal,dy23);
582             tz               = _mm_mul_pd(fscal,dz23);
583
584             /* Update vectorial force */
585             fix2             = _mm_add_pd(fix2,tx);
586             fiy2             = _mm_add_pd(fiy2,ty);
587             fiz2             = _mm_add_pd(fiz2,tz);
588
589             fjx3             = _mm_add_pd(fjx3,tx);
590             fjy3             = _mm_add_pd(fjy3,ty);
591             fjz3             = _mm_add_pd(fjz3,tz);
592
593             /**************************
594              * CALCULATE INTERACTIONS *
595              **************************/
596
597             r31              = _mm_mul_pd(rsq31,rinv31);
598
599             /* EWALD ELECTROSTATICS */
600
601             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
602             ewrt             = _mm_mul_pd(r31,ewtabscale);
603             ewitab           = _mm_cvttpd_epi32(ewrt);
604             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
605             ewitab           = _mm_slli_epi32(ewitab,2);
606             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
607             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
608             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
609             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
610             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
611             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
612             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
613             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
614             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
615             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
616
617             /* Update potential sum for this i atom from the interaction with this j atom. */
618             velecsum         = _mm_add_pd(velecsum,velec);
619
620             fscal            = felec;
621
622             /* Calculate temporary vectorial force */
623             tx               = _mm_mul_pd(fscal,dx31);
624             ty               = _mm_mul_pd(fscal,dy31);
625             tz               = _mm_mul_pd(fscal,dz31);
626
627             /* Update vectorial force */
628             fix3             = _mm_add_pd(fix3,tx);
629             fiy3             = _mm_add_pd(fiy3,ty);
630             fiz3             = _mm_add_pd(fiz3,tz);
631
632             fjx1             = _mm_add_pd(fjx1,tx);
633             fjy1             = _mm_add_pd(fjy1,ty);
634             fjz1             = _mm_add_pd(fjz1,tz);
635
636             /**************************
637              * CALCULATE INTERACTIONS *
638              **************************/
639
640             r32              = _mm_mul_pd(rsq32,rinv32);
641
642             /* EWALD ELECTROSTATICS */
643
644             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
645             ewrt             = _mm_mul_pd(r32,ewtabscale);
646             ewitab           = _mm_cvttpd_epi32(ewrt);
647             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
648             ewitab           = _mm_slli_epi32(ewitab,2);
649             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
650             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
651             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
652             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
653             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
654             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
655             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
656             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
657             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
658             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
659
660             /* Update potential sum for this i atom from the interaction with this j atom. */
661             velecsum         = _mm_add_pd(velecsum,velec);
662
663             fscal            = felec;
664
665             /* Calculate temporary vectorial force */
666             tx               = _mm_mul_pd(fscal,dx32);
667             ty               = _mm_mul_pd(fscal,dy32);
668             tz               = _mm_mul_pd(fscal,dz32);
669
670             /* Update vectorial force */
671             fix3             = _mm_add_pd(fix3,tx);
672             fiy3             = _mm_add_pd(fiy3,ty);
673             fiz3             = _mm_add_pd(fiz3,tz);
674
675             fjx2             = _mm_add_pd(fjx2,tx);
676             fjy2             = _mm_add_pd(fjy2,ty);
677             fjz2             = _mm_add_pd(fjz2,tz);
678
679             /**************************
680              * CALCULATE INTERACTIONS *
681              **************************/
682
683             r33              = _mm_mul_pd(rsq33,rinv33);
684
685             /* EWALD ELECTROSTATICS */
686
687             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
688             ewrt             = _mm_mul_pd(r33,ewtabscale);
689             ewitab           = _mm_cvttpd_epi32(ewrt);
690             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
691             ewitab           = _mm_slli_epi32(ewitab,2);
692             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
693             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
694             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
695             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
696             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
697             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
698             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
699             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
700             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
701             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
702
703             /* Update potential sum for this i atom from the interaction with this j atom. */
704             velecsum         = _mm_add_pd(velecsum,velec);
705
706             fscal            = felec;
707
708             /* Calculate temporary vectorial force */
709             tx               = _mm_mul_pd(fscal,dx33);
710             ty               = _mm_mul_pd(fscal,dy33);
711             tz               = _mm_mul_pd(fscal,dz33);
712
713             /* Update vectorial force */
714             fix3             = _mm_add_pd(fix3,tx);
715             fiy3             = _mm_add_pd(fiy3,ty);
716             fiz3             = _mm_add_pd(fiz3,tz);
717
718             fjx3             = _mm_add_pd(fjx3,tx);
719             fjy3             = _mm_add_pd(fjy3,ty);
720             fjz3             = _mm_add_pd(fjz3,tz);
721
722             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
723
724             /* Inner loop uses 404 flops */
725         }
726
727         if(jidx<j_index_end)
728         {
729
730             jnrA             = jjnr[jidx];
731             j_coord_offsetA  = DIM*jnrA;
732
733             /* load j atom coordinates */
734             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
735                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
736                                               &jy2,&jz2,&jx3,&jy3,&jz3);
737
738             /* Calculate displacement vector */
739             dx00             = _mm_sub_pd(ix0,jx0);
740             dy00             = _mm_sub_pd(iy0,jy0);
741             dz00             = _mm_sub_pd(iz0,jz0);
742             dx11             = _mm_sub_pd(ix1,jx1);
743             dy11             = _mm_sub_pd(iy1,jy1);
744             dz11             = _mm_sub_pd(iz1,jz1);
745             dx12             = _mm_sub_pd(ix1,jx2);
746             dy12             = _mm_sub_pd(iy1,jy2);
747             dz12             = _mm_sub_pd(iz1,jz2);
748             dx13             = _mm_sub_pd(ix1,jx3);
749             dy13             = _mm_sub_pd(iy1,jy3);
750             dz13             = _mm_sub_pd(iz1,jz3);
751             dx21             = _mm_sub_pd(ix2,jx1);
752             dy21             = _mm_sub_pd(iy2,jy1);
753             dz21             = _mm_sub_pd(iz2,jz1);
754             dx22             = _mm_sub_pd(ix2,jx2);
755             dy22             = _mm_sub_pd(iy2,jy2);
756             dz22             = _mm_sub_pd(iz2,jz2);
757             dx23             = _mm_sub_pd(ix2,jx3);
758             dy23             = _mm_sub_pd(iy2,jy3);
759             dz23             = _mm_sub_pd(iz2,jz3);
760             dx31             = _mm_sub_pd(ix3,jx1);
761             dy31             = _mm_sub_pd(iy3,jy1);
762             dz31             = _mm_sub_pd(iz3,jz1);
763             dx32             = _mm_sub_pd(ix3,jx2);
764             dy32             = _mm_sub_pd(iy3,jy2);
765             dz32             = _mm_sub_pd(iz3,jz2);
766             dx33             = _mm_sub_pd(ix3,jx3);
767             dy33             = _mm_sub_pd(iy3,jy3);
768             dz33             = _mm_sub_pd(iz3,jz3);
769
770             /* Calculate squared distance and things based on it */
771             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
772             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
773             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
774             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
775             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
776             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
777             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
778             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
779             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
780             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
781
782             rinv11           = gmx_mm_invsqrt_pd(rsq11);
783             rinv12           = gmx_mm_invsqrt_pd(rsq12);
784             rinv13           = gmx_mm_invsqrt_pd(rsq13);
785             rinv21           = gmx_mm_invsqrt_pd(rsq21);
786             rinv22           = gmx_mm_invsqrt_pd(rsq22);
787             rinv23           = gmx_mm_invsqrt_pd(rsq23);
788             rinv31           = gmx_mm_invsqrt_pd(rsq31);
789             rinv32           = gmx_mm_invsqrt_pd(rsq32);
790             rinv33           = gmx_mm_invsqrt_pd(rsq33);
791
792             rinvsq00         = gmx_mm_inv_pd(rsq00);
793             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
794             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
795             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
796             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
797             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
798             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
799             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
800             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
801             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
802
803             fjx0             = _mm_setzero_pd();
804             fjy0             = _mm_setzero_pd();
805             fjz0             = _mm_setzero_pd();
806             fjx1             = _mm_setzero_pd();
807             fjy1             = _mm_setzero_pd();
808             fjz1             = _mm_setzero_pd();
809             fjx2             = _mm_setzero_pd();
810             fjy2             = _mm_setzero_pd();
811             fjz2             = _mm_setzero_pd();
812             fjx3             = _mm_setzero_pd();
813             fjy3             = _mm_setzero_pd();
814             fjz3             = _mm_setzero_pd();
815
816             /**************************
817              * CALCULATE INTERACTIONS *
818              **************************/
819
820             /* LENNARD-JONES DISPERSION/REPULSION */
821
822             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
823             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
824             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
825             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
826             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
827
828             /* Update potential sum for this i atom from the interaction with this j atom. */
829             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
830             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
831
832             fscal            = fvdw;
833
834             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
835
836             /* Calculate temporary vectorial force */
837             tx               = _mm_mul_pd(fscal,dx00);
838             ty               = _mm_mul_pd(fscal,dy00);
839             tz               = _mm_mul_pd(fscal,dz00);
840
841             /* Update vectorial force */
842             fix0             = _mm_add_pd(fix0,tx);
843             fiy0             = _mm_add_pd(fiy0,ty);
844             fiz0             = _mm_add_pd(fiz0,tz);
845
846             fjx0             = _mm_add_pd(fjx0,tx);
847             fjy0             = _mm_add_pd(fjy0,ty);
848             fjz0             = _mm_add_pd(fjz0,tz);
849
850             /**************************
851              * CALCULATE INTERACTIONS *
852              **************************/
853
854             r11              = _mm_mul_pd(rsq11,rinv11);
855
856             /* EWALD ELECTROSTATICS */
857
858             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
859             ewrt             = _mm_mul_pd(r11,ewtabscale);
860             ewitab           = _mm_cvttpd_epi32(ewrt);
861             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
862             ewitab           = _mm_slli_epi32(ewitab,2);
863             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
864             ewtabD           = _mm_setzero_pd();
865             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
866             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
867             ewtabFn          = _mm_setzero_pd();
868             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
869             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
870             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
871             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
872             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
873
874             /* Update potential sum for this i atom from the interaction with this j atom. */
875             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
876             velecsum         = _mm_add_pd(velecsum,velec);
877
878             fscal            = felec;
879
880             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
881
882             /* Calculate temporary vectorial force */
883             tx               = _mm_mul_pd(fscal,dx11);
884             ty               = _mm_mul_pd(fscal,dy11);
885             tz               = _mm_mul_pd(fscal,dz11);
886
887             /* Update vectorial force */
888             fix1             = _mm_add_pd(fix1,tx);
889             fiy1             = _mm_add_pd(fiy1,ty);
890             fiz1             = _mm_add_pd(fiz1,tz);
891
892             fjx1             = _mm_add_pd(fjx1,tx);
893             fjy1             = _mm_add_pd(fjy1,ty);
894             fjz1             = _mm_add_pd(fjz1,tz);
895
896             /**************************
897              * CALCULATE INTERACTIONS *
898              **************************/
899
900             r12              = _mm_mul_pd(rsq12,rinv12);
901
902             /* EWALD ELECTROSTATICS */
903
904             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
905             ewrt             = _mm_mul_pd(r12,ewtabscale);
906             ewitab           = _mm_cvttpd_epi32(ewrt);
907             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
908             ewitab           = _mm_slli_epi32(ewitab,2);
909             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
910             ewtabD           = _mm_setzero_pd();
911             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
912             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
913             ewtabFn          = _mm_setzero_pd();
914             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
915             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
916             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
917             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
918             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
919
920             /* Update potential sum for this i atom from the interaction with this j atom. */
921             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
922             velecsum         = _mm_add_pd(velecsum,velec);
923
924             fscal            = felec;
925
926             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
927
928             /* Calculate temporary vectorial force */
929             tx               = _mm_mul_pd(fscal,dx12);
930             ty               = _mm_mul_pd(fscal,dy12);
931             tz               = _mm_mul_pd(fscal,dz12);
932
933             /* Update vectorial force */
934             fix1             = _mm_add_pd(fix1,tx);
935             fiy1             = _mm_add_pd(fiy1,ty);
936             fiz1             = _mm_add_pd(fiz1,tz);
937
938             fjx2             = _mm_add_pd(fjx2,tx);
939             fjy2             = _mm_add_pd(fjy2,ty);
940             fjz2             = _mm_add_pd(fjz2,tz);
941
942             /**************************
943              * CALCULATE INTERACTIONS *
944              **************************/
945
946             r13              = _mm_mul_pd(rsq13,rinv13);
947
948             /* EWALD ELECTROSTATICS */
949
950             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
951             ewrt             = _mm_mul_pd(r13,ewtabscale);
952             ewitab           = _mm_cvttpd_epi32(ewrt);
953             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
954             ewitab           = _mm_slli_epi32(ewitab,2);
955             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
956             ewtabD           = _mm_setzero_pd();
957             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
958             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
959             ewtabFn          = _mm_setzero_pd();
960             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
961             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
962             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
963             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
964             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
965
966             /* Update potential sum for this i atom from the interaction with this j atom. */
967             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
968             velecsum         = _mm_add_pd(velecsum,velec);
969
970             fscal            = felec;
971
972             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
973
974             /* Calculate temporary vectorial force */
975             tx               = _mm_mul_pd(fscal,dx13);
976             ty               = _mm_mul_pd(fscal,dy13);
977             tz               = _mm_mul_pd(fscal,dz13);
978
979             /* Update vectorial force */
980             fix1             = _mm_add_pd(fix1,tx);
981             fiy1             = _mm_add_pd(fiy1,ty);
982             fiz1             = _mm_add_pd(fiz1,tz);
983
984             fjx3             = _mm_add_pd(fjx3,tx);
985             fjy3             = _mm_add_pd(fjy3,ty);
986             fjz3             = _mm_add_pd(fjz3,tz);
987
988             /**************************
989              * CALCULATE INTERACTIONS *
990              **************************/
991
992             r21              = _mm_mul_pd(rsq21,rinv21);
993
994             /* EWALD ELECTROSTATICS */
995
996             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
997             ewrt             = _mm_mul_pd(r21,ewtabscale);
998             ewitab           = _mm_cvttpd_epi32(ewrt);
999             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1000             ewitab           = _mm_slli_epi32(ewitab,2);
1001             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1002             ewtabD           = _mm_setzero_pd();
1003             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1004             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1005             ewtabFn          = _mm_setzero_pd();
1006             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1007             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1008             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1009             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1010             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1011
1012             /* Update potential sum for this i atom from the interaction with this j atom. */
1013             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1014             velecsum         = _mm_add_pd(velecsum,velec);
1015
1016             fscal            = felec;
1017
1018             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1019
1020             /* Calculate temporary vectorial force */
1021             tx               = _mm_mul_pd(fscal,dx21);
1022             ty               = _mm_mul_pd(fscal,dy21);
1023             tz               = _mm_mul_pd(fscal,dz21);
1024
1025             /* Update vectorial force */
1026             fix2             = _mm_add_pd(fix2,tx);
1027             fiy2             = _mm_add_pd(fiy2,ty);
1028             fiz2             = _mm_add_pd(fiz2,tz);
1029
1030             fjx1             = _mm_add_pd(fjx1,tx);
1031             fjy1             = _mm_add_pd(fjy1,ty);
1032             fjz1             = _mm_add_pd(fjz1,tz);
1033
1034             /**************************
1035              * CALCULATE INTERACTIONS *
1036              **************************/
1037
1038             r22              = _mm_mul_pd(rsq22,rinv22);
1039
1040             /* EWALD ELECTROSTATICS */
1041
1042             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1043             ewrt             = _mm_mul_pd(r22,ewtabscale);
1044             ewitab           = _mm_cvttpd_epi32(ewrt);
1045             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1046             ewitab           = _mm_slli_epi32(ewitab,2);
1047             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1048             ewtabD           = _mm_setzero_pd();
1049             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1050             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1051             ewtabFn          = _mm_setzero_pd();
1052             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1053             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1054             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1055             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1056             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1057
1058             /* Update potential sum for this i atom from the interaction with this j atom. */
1059             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1060             velecsum         = _mm_add_pd(velecsum,velec);
1061
1062             fscal            = felec;
1063
1064             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1065
1066             /* Calculate temporary vectorial force */
1067             tx               = _mm_mul_pd(fscal,dx22);
1068             ty               = _mm_mul_pd(fscal,dy22);
1069             tz               = _mm_mul_pd(fscal,dz22);
1070
1071             /* Update vectorial force */
1072             fix2             = _mm_add_pd(fix2,tx);
1073             fiy2             = _mm_add_pd(fiy2,ty);
1074             fiz2             = _mm_add_pd(fiz2,tz);
1075
1076             fjx2             = _mm_add_pd(fjx2,tx);
1077             fjy2             = _mm_add_pd(fjy2,ty);
1078             fjz2             = _mm_add_pd(fjz2,tz);
1079
1080             /**************************
1081              * CALCULATE INTERACTIONS *
1082              **************************/
1083
1084             r23              = _mm_mul_pd(rsq23,rinv23);
1085
1086             /* EWALD ELECTROSTATICS */
1087
1088             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1089             ewrt             = _mm_mul_pd(r23,ewtabscale);
1090             ewitab           = _mm_cvttpd_epi32(ewrt);
1091             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1092             ewitab           = _mm_slli_epi32(ewitab,2);
1093             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1094             ewtabD           = _mm_setzero_pd();
1095             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1096             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1097             ewtabFn          = _mm_setzero_pd();
1098             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1099             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1100             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1101             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1102             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1103
1104             /* Update potential sum for this i atom from the interaction with this j atom. */
1105             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1106             velecsum         = _mm_add_pd(velecsum,velec);
1107
1108             fscal            = felec;
1109
1110             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1111
1112             /* Calculate temporary vectorial force */
1113             tx               = _mm_mul_pd(fscal,dx23);
1114             ty               = _mm_mul_pd(fscal,dy23);
1115             tz               = _mm_mul_pd(fscal,dz23);
1116
1117             /* Update vectorial force */
1118             fix2             = _mm_add_pd(fix2,tx);
1119             fiy2             = _mm_add_pd(fiy2,ty);
1120             fiz2             = _mm_add_pd(fiz2,tz);
1121
1122             fjx3             = _mm_add_pd(fjx3,tx);
1123             fjy3             = _mm_add_pd(fjy3,ty);
1124             fjz3             = _mm_add_pd(fjz3,tz);
1125
1126             /**************************
1127              * CALCULATE INTERACTIONS *
1128              **************************/
1129
1130             r31              = _mm_mul_pd(rsq31,rinv31);
1131
1132             /* EWALD ELECTROSTATICS */
1133
1134             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1135             ewrt             = _mm_mul_pd(r31,ewtabscale);
1136             ewitab           = _mm_cvttpd_epi32(ewrt);
1137             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1138             ewitab           = _mm_slli_epi32(ewitab,2);
1139             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1140             ewtabD           = _mm_setzero_pd();
1141             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1142             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1143             ewtabFn          = _mm_setzero_pd();
1144             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1145             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1146             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1147             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1148             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1149
1150             /* Update potential sum for this i atom from the interaction with this j atom. */
1151             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1152             velecsum         = _mm_add_pd(velecsum,velec);
1153
1154             fscal            = felec;
1155
1156             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1157
1158             /* Calculate temporary vectorial force */
1159             tx               = _mm_mul_pd(fscal,dx31);
1160             ty               = _mm_mul_pd(fscal,dy31);
1161             tz               = _mm_mul_pd(fscal,dz31);
1162
1163             /* Update vectorial force */
1164             fix3             = _mm_add_pd(fix3,tx);
1165             fiy3             = _mm_add_pd(fiy3,ty);
1166             fiz3             = _mm_add_pd(fiz3,tz);
1167
1168             fjx1             = _mm_add_pd(fjx1,tx);
1169             fjy1             = _mm_add_pd(fjy1,ty);
1170             fjz1             = _mm_add_pd(fjz1,tz);
1171
1172             /**************************
1173              * CALCULATE INTERACTIONS *
1174              **************************/
1175
1176             r32              = _mm_mul_pd(rsq32,rinv32);
1177
1178             /* EWALD ELECTROSTATICS */
1179
1180             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1181             ewrt             = _mm_mul_pd(r32,ewtabscale);
1182             ewitab           = _mm_cvttpd_epi32(ewrt);
1183             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1184             ewitab           = _mm_slli_epi32(ewitab,2);
1185             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1186             ewtabD           = _mm_setzero_pd();
1187             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1188             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1189             ewtabFn          = _mm_setzero_pd();
1190             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1191             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1192             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1193             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1194             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1195
1196             /* Update potential sum for this i atom from the interaction with this j atom. */
1197             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1198             velecsum         = _mm_add_pd(velecsum,velec);
1199
1200             fscal            = felec;
1201
1202             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1203
1204             /* Calculate temporary vectorial force */
1205             tx               = _mm_mul_pd(fscal,dx32);
1206             ty               = _mm_mul_pd(fscal,dy32);
1207             tz               = _mm_mul_pd(fscal,dz32);
1208
1209             /* Update vectorial force */
1210             fix3             = _mm_add_pd(fix3,tx);
1211             fiy3             = _mm_add_pd(fiy3,ty);
1212             fiz3             = _mm_add_pd(fiz3,tz);
1213
1214             fjx2             = _mm_add_pd(fjx2,tx);
1215             fjy2             = _mm_add_pd(fjy2,ty);
1216             fjz2             = _mm_add_pd(fjz2,tz);
1217
1218             /**************************
1219              * CALCULATE INTERACTIONS *
1220              **************************/
1221
1222             r33              = _mm_mul_pd(rsq33,rinv33);
1223
1224             /* EWALD ELECTROSTATICS */
1225
1226             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1227             ewrt             = _mm_mul_pd(r33,ewtabscale);
1228             ewitab           = _mm_cvttpd_epi32(ewrt);
1229             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1230             ewitab           = _mm_slli_epi32(ewitab,2);
1231             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1232             ewtabD           = _mm_setzero_pd();
1233             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1234             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1235             ewtabFn          = _mm_setzero_pd();
1236             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1237             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1238             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1239             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1240             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1241
1242             /* Update potential sum for this i atom from the interaction with this j atom. */
1243             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1244             velecsum         = _mm_add_pd(velecsum,velec);
1245
1246             fscal            = felec;
1247
1248             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1249
1250             /* Calculate temporary vectorial force */
1251             tx               = _mm_mul_pd(fscal,dx33);
1252             ty               = _mm_mul_pd(fscal,dy33);
1253             tz               = _mm_mul_pd(fscal,dz33);
1254
1255             /* Update vectorial force */
1256             fix3             = _mm_add_pd(fix3,tx);
1257             fiy3             = _mm_add_pd(fiy3,ty);
1258             fiz3             = _mm_add_pd(fiz3,tz);
1259
1260             fjx3             = _mm_add_pd(fjx3,tx);
1261             fjy3             = _mm_add_pd(fjy3,ty);
1262             fjz3             = _mm_add_pd(fjz3,tz);
1263
1264             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1265
1266             /* Inner loop uses 404 flops */
1267         }
1268
1269         /* End of innermost loop */
1270
1271         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1272                                               f+i_coord_offset,fshift+i_shift_offset);
1273
1274         ggid                        = gid[iidx];
1275         /* Update potential energies */
1276         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1277         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1278
1279         /* Increment number of inner iterations */
1280         inneriter                  += j_index_end - j_index_start;
1281
1282         /* Outer loop uses 26 flops */
1283     }
1284
1285     /* Increment number of outer iterations */
1286     outeriter        += nri;
1287
1288     /* Update outer/inner flops */
1289
1290     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*404);
1291 }
1292 /*
1293  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_double
1294  * Electrostatics interaction: Ewald
1295  * VdW interaction:            LennardJones
1296  * Geometry:                   Water4-Water4
1297  * Calculate force/pot:        Force
1298  */
1299 void
1300 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_double
1301                     (t_nblist                    * gmx_restrict       nlist,
1302                      rvec                        * gmx_restrict          xx,
1303                      rvec                        * gmx_restrict          ff,
1304                      t_forcerec                  * gmx_restrict          fr,
1305                      t_mdatoms                   * gmx_restrict     mdatoms,
1306                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1307                      t_nrnb                      * gmx_restrict        nrnb)
1308 {
1309     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1310      * just 0 for non-waters.
1311      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1312      * jnr indices corresponding to data put in the four positions in the SIMD register.
1313      */
1314     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1315     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1316     int              jnrA,jnrB;
1317     int              j_coord_offsetA,j_coord_offsetB;
1318     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1319     real             rcutoff_scalar;
1320     real             *shiftvec,*fshift,*x,*f;
1321     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1322     int              vdwioffset0;
1323     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1324     int              vdwioffset1;
1325     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1326     int              vdwioffset2;
1327     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1328     int              vdwioffset3;
1329     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1330     int              vdwjidx0A,vdwjidx0B;
1331     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1332     int              vdwjidx1A,vdwjidx1B;
1333     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1334     int              vdwjidx2A,vdwjidx2B;
1335     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1336     int              vdwjidx3A,vdwjidx3B;
1337     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1338     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1339     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1340     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1341     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1342     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1343     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1344     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1345     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1346     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1347     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1348     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1349     real             *charge;
1350     int              nvdwtype;
1351     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1352     int              *vdwtype;
1353     real             *vdwparam;
1354     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1355     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1356     __m128i          ewitab;
1357     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1358     real             *ewtab;
1359     __m128d          dummy_mask,cutoff_mask;
1360     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1361     __m128d          one     = _mm_set1_pd(1.0);
1362     __m128d          two     = _mm_set1_pd(2.0);
1363     x                = xx[0];
1364     f                = ff[0];
1365
1366     nri              = nlist->nri;
1367     iinr             = nlist->iinr;
1368     jindex           = nlist->jindex;
1369     jjnr             = nlist->jjnr;
1370     shiftidx         = nlist->shift;
1371     gid              = nlist->gid;
1372     shiftvec         = fr->shift_vec[0];
1373     fshift           = fr->fshift[0];
1374     facel            = _mm_set1_pd(fr->epsfac);
1375     charge           = mdatoms->chargeA;
1376     nvdwtype         = fr->ntype;
1377     vdwparam         = fr->nbfp;
1378     vdwtype          = mdatoms->typeA;
1379
1380     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1381     ewtab            = fr->ic->tabq_coul_F;
1382     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1383     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1384
1385     /* Setup water-specific parameters */
1386     inr              = nlist->iinr[0];
1387     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1388     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1389     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1390     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1391
1392     jq1              = _mm_set1_pd(charge[inr+1]);
1393     jq2              = _mm_set1_pd(charge[inr+2]);
1394     jq3              = _mm_set1_pd(charge[inr+3]);
1395     vdwjidx0A        = 2*vdwtype[inr+0];
1396     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1397     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1398     qq11             = _mm_mul_pd(iq1,jq1);
1399     qq12             = _mm_mul_pd(iq1,jq2);
1400     qq13             = _mm_mul_pd(iq1,jq3);
1401     qq21             = _mm_mul_pd(iq2,jq1);
1402     qq22             = _mm_mul_pd(iq2,jq2);
1403     qq23             = _mm_mul_pd(iq2,jq3);
1404     qq31             = _mm_mul_pd(iq3,jq1);
1405     qq32             = _mm_mul_pd(iq3,jq2);
1406     qq33             = _mm_mul_pd(iq3,jq3);
1407
1408     /* Avoid stupid compiler warnings */
1409     jnrA = jnrB = 0;
1410     j_coord_offsetA = 0;
1411     j_coord_offsetB = 0;
1412
1413     outeriter        = 0;
1414     inneriter        = 0;
1415
1416     /* Start outer loop over neighborlists */
1417     for(iidx=0; iidx<nri; iidx++)
1418     {
1419         /* Load shift vector for this list */
1420         i_shift_offset   = DIM*shiftidx[iidx];
1421
1422         /* Load limits for loop over neighbors */
1423         j_index_start    = jindex[iidx];
1424         j_index_end      = jindex[iidx+1];
1425
1426         /* Get outer coordinate index */
1427         inr              = iinr[iidx];
1428         i_coord_offset   = DIM*inr;
1429
1430         /* Load i particle coords and add shift vector */
1431         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1432                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1433
1434         fix0             = _mm_setzero_pd();
1435         fiy0             = _mm_setzero_pd();
1436         fiz0             = _mm_setzero_pd();
1437         fix1             = _mm_setzero_pd();
1438         fiy1             = _mm_setzero_pd();
1439         fiz1             = _mm_setzero_pd();
1440         fix2             = _mm_setzero_pd();
1441         fiy2             = _mm_setzero_pd();
1442         fiz2             = _mm_setzero_pd();
1443         fix3             = _mm_setzero_pd();
1444         fiy3             = _mm_setzero_pd();
1445         fiz3             = _mm_setzero_pd();
1446
1447         /* Start inner kernel loop */
1448         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1449         {
1450
1451             /* Get j neighbor index, and coordinate index */
1452             jnrA             = jjnr[jidx];
1453             jnrB             = jjnr[jidx+1];
1454             j_coord_offsetA  = DIM*jnrA;
1455             j_coord_offsetB  = DIM*jnrB;
1456
1457             /* load j atom coordinates */
1458             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1459                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1460                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1461
1462             /* Calculate displacement vector */
1463             dx00             = _mm_sub_pd(ix0,jx0);
1464             dy00             = _mm_sub_pd(iy0,jy0);
1465             dz00             = _mm_sub_pd(iz0,jz0);
1466             dx11             = _mm_sub_pd(ix1,jx1);
1467             dy11             = _mm_sub_pd(iy1,jy1);
1468             dz11             = _mm_sub_pd(iz1,jz1);
1469             dx12             = _mm_sub_pd(ix1,jx2);
1470             dy12             = _mm_sub_pd(iy1,jy2);
1471             dz12             = _mm_sub_pd(iz1,jz2);
1472             dx13             = _mm_sub_pd(ix1,jx3);
1473             dy13             = _mm_sub_pd(iy1,jy3);
1474             dz13             = _mm_sub_pd(iz1,jz3);
1475             dx21             = _mm_sub_pd(ix2,jx1);
1476             dy21             = _mm_sub_pd(iy2,jy1);
1477             dz21             = _mm_sub_pd(iz2,jz1);
1478             dx22             = _mm_sub_pd(ix2,jx2);
1479             dy22             = _mm_sub_pd(iy2,jy2);
1480             dz22             = _mm_sub_pd(iz2,jz2);
1481             dx23             = _mm_sub_pd(ix2,jx3);
1482             dy23             = _mm_sub_pd(iy2,jy3);
1483             dz23             = _mm_sub_pd(iz2,jz3);
1484             dx31             = _mm_sub_pd(ix3,jx1);
1485             dy31             = _mm_sub_pd(iy3,jy1);
1486             dz31             = _mm_sub_pd(iz3,jz1);
1487             dx32             = _mm_sub_pd(ix3,jx2);
1488             dy32             = _mm_sub_pd(iy3,jy2);
1489             dz32             = _mm_sub_pd(iz3,jz2);
1490             dx33             = _mm_sub_pd(ix3,jx3);
1491             dy33             = _mm_sub_pd(iy3,jy3);
1492             dz33             = _mm_sub_pd(iz3,jz3);
1493
1494             /* Calculate squared distance and things based on it */
1495             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1496             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1497             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1498             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1499             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1500             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1501             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1502             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1503             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1504             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1505
1506             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1507             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1508             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1509             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1510             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1511             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1512             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1513             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1514             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1515
1516             rinvsq00         = gmx_mm_inv_pd(rsq00);
1517             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1518             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1519             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1520             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1521             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1522             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1523             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1524             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1525             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1526
1527             fjx0             = _mm_setzero_pd();
1528             fjy0             = _mm_setzero_pd();
1529             fjz0             = _mm_setzero_pd();
1530             fjx1             = _mm_setzero_pd();
1531             fjy1             = _mm_setzero_pd();
1532             fjz1             = _mm_setzero_pd();
1533             fjx2             = _mm_setzero_pd();
1534             fjy2             = _mm_setzero_pd();
1535             fjz2             = _mm_setzero_pd();
1536             fjx3             = _mm_setzero_pd();
1537             fjy3             = _mm_setzero_pd();
1538             fjz3             = _mm_setzero_pd();
1539
1540             /**************************
1541              * CALCULATE INTERACTIONS *
1542              **************************/
1543
1544             /* LENNARD-JONES DISPERSION/REPULSION */
1545
1546             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1547             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1548
1549             fscal            = fvdw;
1550
1551             /* Calculate temporary vectorial force */
1552             tx               = _mm_mul_pd(fscal,dx00);
1553             ty               = _mm_mul_pd(fscal,dy00);
1554             tz               = _mm_mul_pd(fscal,dz00);
1555
1556             /* Update vectorial force */
1557             fix0             = _mm_add_pd(fix0,tx);
1558             fiy0             = _mm_add_pd(fiy0,ty);
1559             fiz0             = _mm_add_pd(fiz0,tz);
1560
1561             fjx0             = _mm_add_pd(fjx0,tx);
1562             fjy0             = _mm_add_pd(fjy0,ty);
1563             fjz0             = _mm_add_pd(fjz0,tz);
1564
1565             /**************************
1566              * CALCULATE INTERACTIONS *
1567              **************************/
1568
1569             r11              = _mm_mul_pd(rsq11,rinv11);
1570
1571             /* EWALD ELECTROSTATICS */
1572
1573             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1574             ewrt             = _mm_mul_pd(r11,ewtabscale);
1575             ewitab           = _mm_cvttpd_epi32(ewrt);
1576             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1577             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1578                                          &ewtabF,&ewtabFn);
1579             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1580             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1581
1582             fscal            = felec;
1583
1584             /* Calculate temporary vectorial force */
1585             tx               = _mm_mul_pd(fscal,dx11);
1586             ty               = _mm_mul_pd(fscal,dy11);
1587             tz               = _mm_mul_pd(fscal,dz11);
1588
1589             /* Update vectorial force */
1590             fix1             = _mm_add_pd(fix1,tx);
1591             fiy1             = _mm_add_pd(fiy1,ty);
1592             fiz1             = _mm_add_pd(fiz1,tz);
1593
1594             fjx1             = _mm_add_pd(fjx1,tx);
1595             fjy1             = _mm_add_pd(fjy1,ty);
1596             fjz1             = _mm_add_pd(fjz1,tz);
1597
1598             /**************************
1599              * CALCULATE INTERACTIONS *
1600              **************************/
1601
1602             r12              = _mm_mul_pd(rsq12,rinv12);
1603
1604             /* EWALD ELECTROSTATICS */
1605
1606             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1607             ewrt             = _mm_mul_pd(r12,ewtabscale);
1608             ewitab           = _mm_cvttpd_epi32(ewrt);
1609             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1610             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1611                                          &ewtabF,&ewtabFn);
1612             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1613             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1614
1615             fscal            = felec;
1616
1617             /* Calculate temporary vectorial force */
1618             tx               = _mm_mul_pd(fscal,dx12);
1619             ty               = _mm_mul_pd(fscal,dy12);
1620             tz               = _mm_mul_pd(fscal,dz12);
1621
1622             /* Update vectorial force */
1623             fix1             = _mm_add_pd(fix1,tx);
1624             fiy1             = _mm_add_pd(fiy1,ty);
1625             fiz1             = _mm_add_pd(fiz1,tz);
1626
1627             fjx2             = _mm_add_pd(fjx2,tx);
1628             fjy2             = _mm_add_pd(fjy2,ty);
1629             fjz2             = _mm_add_pd(fjz2,tz);
1630
1631             /**************************
1632              * CALCULATE INTERACTIONS *
1633              **************************/
1634
1635             r13              = _mm_mul_pd(rsq13,rinv13);
1636
1637             /* EWALD ELECTROSTATICS */
1638
1639             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1640             ewrt             = _mm_mul_pd(r13,ewtabscale);
1641             ewitab           = _mm_cvttpd_epi32(ewrt);
1642             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1643             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1644                                          &ewtabF,&ewtabFn);
1645             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1646             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1647
1648             fscal            = felec;
1649
1650             /* Calculate temporary vectorial force */
1651             tx               = _mm_mul_pd(fscal,dx13);
1652             ty               = _mm_mul_pd(fscal,dy13);
1653             tz               = _mm_mul_pd(fscal,dz13);
1654
1655             /* Update vectorial force */
1656             fix1             = _mm_add_pd(fix1,tx);
1657             fiy1             = _mm_add_pd(fiy1,ty);
1658             fiz1             = _mm_add_pd(fiz1,tz);
1659
1660             fjx3             = _mm_add_pd(fjx3,tx);
1661             fjy3             = _mm_add_pd(fjy3,ty);
1662             fjz3             = _mm_add_pd(fjz3,tz);
1663
1664             /**************************
1665              * CALCULATE INTERACTIONS *
1666              **************************/
1667
1668             r21              = _mm_mul_pd(rsq21,rinv21);
1669
1670             /* EWALD ELECTROSTATICS */
1671
1672             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1673             ewrt             = _mm_mul_pd(r21,ewtabscale);
1674             ewitab           = _mm_cvttpd_epi32(ewrt);
1675             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1676             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1677                                          &ewtabF,&ewtabFn);
1678             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1679             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1680
1681             fscal            = felec;
1682
1683             /* Calculate temporary vectorial force */
1684             tx               = _mm_mul_pd(fscal,dx21);
1685             ty               = _mm_mul_pd(fscal,dy21);
1686             tz               = _mm_mul_pd(fscal,dz21);
1687
1688             /* Update vectorial force */
1689             fix2             = _mm_add_pd(fix2,tx);
1690             fiy2             = _mm_add_pd(fiy2,ty);
1691             fiz2             = _mm_add_pd(fiz2,tz);
1692
1693             fjx1             = _mm_add_pd(fjx1,tx);
1694             fjy1             = _mm_add_pd(fjy1,ty);
1695             fjz1             = _mm_add_pd(fjz1,tz);
1696
1697             /**************************
1698              * CALCULATE INTERACTIONS *
1699              **************************/
1700
1701             r22              = _mm_mul_pd(rsq22,rinv22);
1702
1703             /* EWALD ELECTROSTATICS */
1704
1705             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1706             ewrt             = _mm_mul_pd(r22,ewtabscale);
1707             ewitab           = _mm_cvttpd_epi32(ewrt);
1708             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1709             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1710                                          &ewtabF,&ewtabFn);
1711             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1712             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1713
1714             fscal            = felec;
1715
1716             /* Calculate temporary vectorial force */
1717             tx               = _mm_mul_pd(fscal,dx22);
1718             ty               = _mm_mul_pd(fscal,dy22);
1719             tz               = _mm_mul_pd(fscal,dz22);
1720
1721             /* Update vectorial force */
1722             fix2             = _mm_add_pd(fix2,tx);
1723             fiy2             = _mm_add_pd(fiy2,ty);
1724             fiz2             = _mm_add_pd(fiz2,tz);
1725
1726             fjx2             = _mm_add_pd(fjx2,tx);
1727             fjy2             = _mm_add_pd(fjy2,ty);
1728             fjz2             = _mm_add_pd(fjz2,tz);
1729
1730             /**************************
1731              * CALCULATE INTERACTIONS *
1732              **************************/
1733
1734             r23              = _mm_mul_pd(rsq23,rinv23);
1735
1736             /* EWALD ELECTROSTATICS */
1737
1738             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1739             ewrt             = _mm_mul_pd(r23,ewtabscale);
1740             ewitab           = _mm_cvttpd_epi32(ewrt);
1741             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1742             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1743                                          &ewtabF,&ewtabFn);
1744             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1745             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1746
1747             fscal            = felec;
1748
1749             /* Calculate temporary vectorial force */
1750             tx               = _mm_mul_pd(fscal,dx23);
1751             ty               = _mm_mul_pd(fscal,dy23);
1752             tz               = _mm_mul_pd(fscal,dz23);
1753
1754             /* Update vectorial force */
1755             fix2             = _mm_add_pd(fix2,tx);
1756             fiy2             = _mm_add_pd(fiy2,ty);
1757             fiz2             = _mm_add_pd(fiz2,tz);
1758
1759             fjx3             = _mm_add_pd(fjx3,tx);
1760             fjy3             = _mm_add_pd(fjy3,ty);
1761             fjz3             = _mm_add_pd(fjz3,tz);
1762
1763             /**************************
1764              * CALCULATE INTERACTIONS *
1765              **************************/
1766
1767             r31              = _mm_mul_pd(rsq31,rinv31);
1768
1769             /* EWALD ELECTROSTATICS */
1770
1771             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1772             ewrt             = _mm_mul_pd(r31,ewtabscale);
1773             ewitab           = _mm_cvttpd_epi32(ewrt);
1774             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1775             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1776                                          &ewtabF,&ewtabFn);
1777             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1778             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1779
1780             fscal            = felec;
1781
1782             /* Calculate temporary vectorial force */
1783             tx               = _mm_mul_pd(fscal,dx31);
1784             ty               = _mm_mul_pd(fscal,dy31);
1785             tz               = _mm_mul_pd(fscal,dz31);
1786
1787             /* Update vectorial force */
1788             fix3             = _mm_add_pd(fix3,tx);
1789             fiy3             = _mm_add_pd(fiy3,ty);
1790             fiz3             = _mm_add_pd(fiz3,tz);
1791
1792             fjx1             = _mm_add_pd(fjx1,tx);
1793             fjy1             = _mm_add_pd(fjy1,ty);
1794             fjz1             = _mm_add_pd(fjz1,tz);
1795
1796             /**************************
1797              * CALCULATE INTERACTIONS *
1798              **************************/
1799
1800             r32              = _mm_mul_pd(rsq32,rinv32);
1801
1802             /* EWALD ELECTROSTATICS */
1803
1804             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1805             ewrt             = _mm_mul_pd(r32,ewtabscale);
1806             ewitab           = _mm_cvttpd_epi32(ewrt);
1807             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1808             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1809                                          &ewtabF,&ewtabFn);
1810             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1811             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1812
1813             fscal            = felec;
1814
1815             /* Calculate temporary vectorial force */
1816             tx               = _mm_mul_pd(fscal,dx32);
1817             ty               = _mm_mul_pd(fscal,dy32);
1818             tz               = _mm_mul_pd(fscal,dz32);
1819
1820             /* Update vectorial force */
1821             fix3             = _mm_add_pd(fix3,tx);
1822             fiy3             = _mm_add_pd(fiy3,ty);
1823             fiz3             = _mm_add_pd(fiz3,tz);
1824
1825             fjx2             = _mm_add_pd(fjx2,tx);
1826             fjy2             = _mm_add_pd(fjy2,ty);
1827             fjz2             = _mm_add_pd(fjz2,tz);
1828
1829             /**************************
1830              * CALCULATE INTERACTIONS *
1831              **************************/
1832
1833             r33              = _mm_mul_pd(rsq33,rinv33);
1834
1835             /* EWALD ELECTROSTATICS */
1836
1837             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1838             ewrt             = _mm_mul_pd(r33,ewtabscale);
1839             ewitab           = _mm_cvttpd_epi32(ewrt);
1840             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1841             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1842                                          &ewtabF,&ewtabFn);
1843             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1844             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1845
1846             fscal            = felec;
1847
1848             /* Calculate temporary vectorial force */
1849             tx               = _mm_mul_pd(fscal,dx33);
1850             ty               = _mm_mul_pd(fscal,dy33);
1851             tz               = _mm_mul_pd(fscal,dz33);
1852
1853             /* Update vectorial force */
1854             fix3             = _mm_add_pd(fix3,tx);
1855             fiy3             = _mm_add_pd(fiy3,ty);
1856             fiz3             = _mm_add_pd(fiz3,tz);
1857
1858             fjx3             = _mm_add_pd(fjx3,tx);
1859             fjy3             = _mm_add_pd(fjy3,ty);
1860             fjz3             = _mm_add_pd(fjz3,tz);
1861
1862             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1863
1864             /* Inner loop uses 354 flops */
1865         }
1866
1867         if(jidx<j_index_end)
1868         {
1869
1870             jnrA             = jjnr[jidx];
1871             j_coord_offsetA  = DIM*jnrA;
1872
1873             /* load j atom coordinates */
1874             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1875                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1876                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1877
1878             /* Calculate displacement vector */
1879             dx00             = _mm_sub_pd(ix0,jx0);
1880             dy00             = _mm_sub_pd(iy0,jy0);
1881             dz00             = _mm_sub_pd(iz0,jz0);
1882             dx11             = _mm_sub_pd(ix1,jx1);
1883             dy11             = _mm_sub_pd(iy1,jy1);
1884             dz11             = _mm_sub_pd(iz1,jz1);
1885             dx12             = _mm_sub_pd(ix1,jx2);
1886             dy12             = _mm_sub_pd(iy1,jy2);
1887             dz12             = _mm_sub_pd(iz1,jz2);
1888             dx13             = _mm_sub_pd(ix1,jx3);
1889             dy13             = _mm_sub_pd(iy1,jy3);
1890             dz13             = _mm_sub_pd(iz1,jz3);
1891             dx21             = _mm_sub_pd(ix2,jx1);
1892             dy21             = _mm_sub_pd(iy2,jy1);
1893             dz21             = _mm_sub_pd(iz2,jz1);
1894             dx22             = _mm_sub_pd(ix2,jx2);
1895             dy22             = _mm_sub_pd(iy2,jy2);
1896             dz22             = _mm_sub_pd(iz2,jz2);
1897             dx23             = _mm_sub_pd(ix2,jx3);
1898             dy23             = _mm_sub_pd(iy2,jy3);
1899             dz23             = _mm_sub_pd(iz2,jz3);
1900             dx31             = _mm_sub_pd(ix3,jx1);
1901             dy31             = _mm_sub_pd(iy3,jy1);
1902             dz31             = _mm_sub_pd(iz3,jz1);
1903             dx32             = _mm_sub_pd(ix3,jx2);
1904             dy32             = _mm_sub_pd(iy3,jy2);
1905             dz32             = _mm_sub_pd(iz3,jz2);
1906             dx33             = _mm_sub_pd(ix3,jx3);
1907             dy33             = _mm_sub_pd(iy3,jy3);
1908             dz33             = _mm_sub_pd(iz3,jz3);
1909
1910             /* Calculate squared distance and things based on it */
1911             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1912             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1913             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1914             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1915             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1916             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1917             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1918             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1919             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1920             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1921
1922             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1923             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1924             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1925             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1926             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1927             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1928             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1929             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1930             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1931
1932             rinvsq00         = gmx_mm_inv_pd(rsq00);
1933             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1934             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1935             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1936             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1937             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1938             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1939             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1940             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1941             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1942
1943             fjx0             = _mm_setzero_pd();
1944             fjy0             = _mm_setzero_pd();
1945             fjz0             = _mm_setzero_pd();
1946             fjx1             = _mm_setzero_pd();
1947             fjy1             = _mm_setzero_pd();
1948             fjz1             = _mm_setzero_pd();
1949             fjx2             = _mm_setzero_pd();
1950             fjy2             = _mm_setzero_pd();
1951             fjz2             = _mm_setzero_pd();
1952             fjx3             = _mm_setzero_pd();
1953             fjy3             = _mm_setzero_pd();
1954             fjz3             = _mm_setzero_pd();
1955
1956             /**************************
1957              * CALCULATE INTERACTIONS *
1958              **************************/
1959
1960             /* LENNARD-JONES DISPERSION/REPULSION */
1961
1962             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1963             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1964
1965             fscal            = fvdw;
1966
1967             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1968
1969             /* Calculate temporary vectorial force */
1970             tx               = _mm_mul_pd(fscal,dx00);
1971             ty               = _mm_mul_pd(fscal,dy00);
1972             tz               = _mm_mul_pd(fscal,dz00);
1973
1974             /* Update vectorial force */
1975             fix0             = _mm_add_pd(fix0,tx);
1976             fiy0             = _mm_add_pd(fiy0,ty);
1977             fiz0             = _mm_add_pd(fiz0,tz);
1978
1979             fjx0             = _mm_add_pd(fjx0,tx);
1980             fjy0             = _mm_add_pd(fjy0,ty);
1981             fjz0             = _mm_add_pd(fjz0,tz);
1982
1983             /**************************
1984              * CALCULATE INTERACTIONS *
1985              **************************/
1986
1987             r11              = _mm_mul_pd(rsq11,rinv11);
1988
1989             /* EWALD ELECTROSTATICS */
1990
1991             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1992             ewrt             = _mm_mul_pd(r11,ewtabscale);
1993             ewitab           = _mm_cvttpd_epi32(ewrt);
1994             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1995             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1996             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1997             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1998
1999             fscal            = felec;
2000
2001             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2002
2003             /* Calculate temporary vectorial force */
2004             tx               = _mm_mul_pd(fscal,dx11);
2005             ty               = _mm_mul_pd(fscal,dy11);
2006             tz               = _mm_mul_pd(fscal,dz11);
2007
2008             /* Update vectorial force */
2009             fix1             = _mm_add_pd(fix1,tx);
2010             fiy1             = _mm_add_pd(fiy1,ty);
2011             fiz1             = _mm_add_pd(fiz1,tz);
2012
2013             fjx1             = _mm_add_pd(fjx1,tx);
2014             fjy1             = _mm_add_pd(fjy1,ty);
2015             fjz1             = _mm_add_pd(fjz1,tz);
2016
2017             /**************************
2018              * CALCULATE INTERACTIONS *
2019              **************************/
2020
2021             r12              = _mm_mul_pd(rsq12,rinv12);
2022
2023             /* EWALD ELECTROSTATICS */
2024
2025             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2026             ewrt             = _mm_mul_pd(r12,ewtabscale);
2027             ewitab           = _mm_cvttpd_epi32(ewrt);
2028             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2029             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2030             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2031             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2032
2033             fscal            = felec;
2034
2035             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2036
2037             /* Calculate temporary vectorial force */
2038             tx               = _mm_mul_pd(fscal,dx12);
2039             ty               = _mm_mul_pd(fscal,dy12);
2040             tz               = _mm_mul_pd(fscal,dz12);
2041
2042             /* Update vectorial force */
2043             fix1             = _mm_add_pd(fix1,tx);
2044             fiy1             = _mm_add_pd(fiy1,ty);
2045             fiz1             = _mm_add_pd(fiz1,tz);
2046
2047             fjx2             = _mm_add_pd(fjx2,tx);
2048             fjy2             = _mm_add_pd(fjy2,ty);
2049             fjz2             = _mm_add_pd(fjz2,tz);
2050
2051             /**************************
2052              * CALCULATE INTERACTIONS *
2053              **************************/
2054
2055             r13              = _mm_mul_pd(rsq13,rinv13);
2056
2057             /* EWALD ELECTROSTATICS */
2058
2059             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2060             ewrt             = _mm_mul_pd(r13,ewtabscale);
2061             ewitab           = _mm_cvttpd_epi32(ewrt);
2062             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2063             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2064             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2065             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2066
2067             fscal            = felec;
2068
2069             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2070
2071             /* Calculate temporary vectorial force */
2072             tx               = _mm_mul_pd(fscal,dx13);
2073             ty               = _mm_mul_pd(fscal,dy13);
2074             tz               = _mm_mul_pd(fscal,dz13);
2075
2076             /* Update vectorial force */
2077             fix1             = _mm_add_pd(fix1,tx);
2078             fiy1             = _mm_add_pd(fiy1,ty);
2079             fiz1             = _mm_add_pd(fiz1,tz);
2080
2081             fjx3             = _mm_add_pd(fjx3,tx);
2082             fjy3             = _mm_add_pd(fjy3,ty);
2083             fjz3             = _mm_add_pd(fjz3,tz);
2084
2085             /**************************
2086              * CALCULATE INTERACTIONS *
2087              **************************/
2088
2089             r21              = _mm_mul_pd(rsq21,rinv21);
2090
2091             /* EWALD ELECTROSTATICS */
2092
2093             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2094             ewrt             = _mm_mul_pd(r21,ewtabscale);
2095             ewitab           = _mm_cvttpd_epi32(ewrt);
2096             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2097             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2098             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2099             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2100
2101             fscal            = felec;
2102
2103             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2104
2105             /* Calculate temporary vectorial force */
2106             tx               = _mm_mul_pd(fscal,dx21);
2107             ty               = _mm_mul_pd(fscal,dy21);
2108             tz               = _mm_mul_pd(fscal,dz21);
2109
2110             /* Update vectorial force */
2111             fix2             = _mm_add_pd(fix2,tx);
2112             fiy2             = _mm_add_pd(fiy2,ty);
2113             fiz2             = _mm_add_pd(fiz2,tz);
2114
2115             fjx1             = _mm_add_pd(fjx1,tx);
2116             fjy1             = _mm_add_pd(fjy1,ty);
2117             fjz1             = _mm_add_pd(fjz1,tz);
2118
2119             /**************************
2120              * CALCULATE INTERACTIONS *
2121              **************************/
2122
2123             r22              = _mm_mul_pd(rsq22,rinv22);
2124
2125             /* EWALD ELECTROSTATICS */
2126
2127             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2128             ewrt             = _mm_mul_pd(r22,ewtabscale);
2129             ewitab           = _mm_cvttpd_epi32(ewrt);
2130             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2131             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2132             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2133             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2134
2135             fscal            = felec;
2136
2137             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2138
2139             /* Calculate temporary vectorial force */
2140             tx               = _mm_mul_pd(fscal,dx22);
2141             ty               = _mm_mul_pd(fscal,dy22);
2142             tz               = _mm_mul_pd(fscal,dz22);
2143
2144             /* Update vectorial force */
2145             fix2             = _mm_add_pd(fix2,tx);
2146             fiy2             = _mm_add_pd(fiy2,ty);
2147             fiz2             = _mm_add_pd(fiz2,tz);
2148
2149             fjx2             = _mm_add_pd(fjx2,tx);
2150             fjy2             = _mm_add_pd(fjy2,ty);
2151             fjz2             = _mm_add_pd(fjz2,tz);
2152
2153             /**************************
2154              * CALCULATE INTERACTIONS *
2155              **************************/
2156
2157             r23              = _mm_mul_pd(rsq23,rinv23);
2158
2159             /* EWALD ELECTROSTATICS */
2160
2161             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2162             ewrt             = _mm_mul_pd(r23,ewtabscale);
2163             ewitab           = _mm_cvttpd_epi32(ewrt);
2164             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2165             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2166             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2167             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2168
2169             fscal            = felec;
2170
2171             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2172
2173             /* Calculate temporary vectorial force */
2174             tx               = _mm_mul_pd(fscal,dx23);
2175             ty               = _mm_mul_pd(fscal,dy23);
2176             tz               = _mm_mul_pd(fscal,dz23);
2177
2178             /* Update vectorial force */
2179             fix2             = _mm_add_pd(fix2,tx);
2180             fiy2             = _mm_add_pd(fiy2,ty);
2181             fiz2             = _mm_add_pd(fiz2,tz);
2182
2183             fjx3             = _mm_add_pd(fjx3,tx);
2184             fjy3             = _mm_add_pd(fjy3,ty);
2185             fjz3             = _mm_add_pd(fjz3,tz);
2186
2187             /**************************
2188              * CALCULATE INTERACTIONS *
2189              **************************/
2190
2191             r31              = _mm_mul_pd(rsq31,rinv31);
2192
2193             /* EWALD ELECTROSTATICS */
2194
2195             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2196             ewrt             = _mm_mul_pd(r31,ewtabscale);
2197             ewitab           = _mm_cvttpd_epi32(ewrt);
2198             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2199             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2200             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2201             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2202
2203             fscal            = felec;
2204
2205             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2206
2207             /* Calculate temporary vectorial force */
2208             tx               = _mm_mul_pd(fscal,dx31);
2209             ty               = _mm_mul_pd(fscal,dy31);
2210             tz               = _mm_mul_pd(fscal,dz31);
2211
2212             /* Update vectorial force */
2213             fix3             = _mm_add_pd(fix3,tx);
2214             fiy3             = _mm_add_pd(fiy3,ty);
2215             fiz3             = _mm_add_pd(fiz3,tz);
2216
2217             fjx1             = _mm_add_pd(fjx1,tx);
2218             fjy1             = _mm_add_pd(fjy1,ty);
2219             fjz1             = _mm_add_pd(fjz1,tz);
2220
2221             /**************************
2222              * CALCULATE INTERACTIONS *
2223              **************************/
2224
2225             r32              = _mm_mul_pd(rsq32,rinv32);
2226
2227             /* EWALD ELECTROSTATICS */
2228
2229             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2230             ewrt             = _mm_mul_pd(r32,ewtabscale);
2231             ewitab           = _mm_cvttpd_epi32(ewrt);
2232             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2233             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2234             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2235             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2236
2237             fscal            = felec;
2238
2239             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2240
2241             /* Calculate temporary vectorial force */
2242             tx               = _mm_mul_pd(fscal,dx32);
2243             ty               = _mm_mul_pd(fscal,dy32);
2244             tz               = _mm_mul_pd(fscal,dz32);
2245
2246             /* Update vectorial force */
2247             fix3             = _mm_add_pd(fix3,tx);
2248             fiy3             = _mm_add_pd(fiy3,ty);
2249             fiz3             = _mm_add_pd(fiz3,tz);
2250
2251             fjx2             = _mm_add_pd(fjx2,tx);
2252             fjy2             = _mm_add_pd(fjy2,ty);
2253             fjz2             = _mm_add_pd(fjz2,tz);
2254
2255             /**************************
2256              * CALCULATE INTERACTIONS *
2257              **************************/
2258
2259             r33              = _mm_mul_pd(rsq33,rinv33);
2260
2261             /* EWALD ELECTROSTATICS */
2262
2263             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2264             ewrt             = _mm_mul_pd(r33,ewtabscale);
2265             ewitab           = _mm_cvttpd_epi32(ewrt);
2266             eweps            = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2267             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2268             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2269             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2270
2271             fscal            = felec;
2272
2273             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2274
2275             /* Calculate temporary vectorial force */
2276             tx               = _mm_mul_pd(fscal,dx33);
2277             ty               = _mm_mul_pd(fscal,dy33);
2278             tz               = _mm_mul_pd(fscal,dz33);
2279
2280             /* Update vectorial force */
2281             fix3             = _mm_add_pd(fix3,tx);
2282             fiy3             = _mm_add_pd(fiy3,ty);
2283             fiz3             = _mm_add_pd(fiz3,tz);
2284
2285             fjx3             = _mm_add_pd(fjx3,tx);
2286             fjy3             = _mm_add_pd(fjy3,ty);
2287             fjz3             = _mm_add_pd(fjz3,tz);
2288
2289             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2290
2291             /* Inner loop uses 354 flops */
2292         }
2293
2294         /* End of innermost loop */
2295
2296         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2297                                               f+i_coord_offset,fshift+i_shift_offset);
2298
2299         /* Increment number of inner iterations */
2300         inneriter                  += j_index_end - j_index_start;
2301
2302         /* Outer loop uses 24 flops */
2303     }
2304
2305     /* Increment number of outer iterations */
2306     outeriter        += nri;
2307
2308     /* Update outer/inner flops */
2309
2310     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*354);
2311 }