made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse4_1_double / nb_kernel_ElecEw_VdwLJ_GeomW4W4_sse4_1_double.c
1 /*
2  * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse4_1_double.h"
34 #include "kernelutil_x86_sse4_1_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_double
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwioffset3;
73     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74     int              vdwjidx0A,vdwjidx0B;
75     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76     int              vdwjidx1A,vdwjidx1B;
77     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78     int              vdwjidx2A,vdwjidx2B;
79     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80     int              vdwjidx3A,vdwjidx3B;
81     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
93     real             *charge;
94     int              nvdwtype;
95     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96     int              *vdwtype;
97     real             *vdwparam;
98     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
99     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
100     __m128i          ewitab;
101     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
102     real             *ewtab;
103     __m128d          dummy_mask,cutoff_mask;
104     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
105     __m128d          one     = _mm_set1_pd(1.0);
106     __m128d          two     = _mm_set1_pd(2.0);
107     x                = xx[0];
108     f                = ff[0];
109
110     nri              = nlist->nri;
111     iinr             = nlist->iinr;
112     jindex           = nlist->jindex;
113     jjnr             = nlist->jjnr;
114     shiftidx         = nlist->shift;
115     gid              = nlist->gid;
116     shiftvec         = fr->shift_vec[0];
117     fshift           = fr->fshift[0];
118     facel            = _mm_set1_pd(fr->epsfac);
119     charge           = mdatoms->chargeA;
120     nvdwtype         = fr->ntype;
121     vdwparam         = fr->nbfp;
122     vdwtype          = mdatoms->typeA;
123
124     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
125     ewtab            = fr->ic->tabq_coul_FDV0;
126     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
127     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
128
129     /* Setup water-specific parameters */
130     inr              = nlist->iinr[0];
131     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
132     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
133     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
134     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
135
136     jq1              = _mm_set1_pd(charge[inr+1]);
137     jq2              = _mm_set1_pd(charge[inr+2]);
138     jq3              = _mm_set1_pd(charge[inr+3]);
139     vdwjidx0A        = 2*vdwtype[inr+0];
140     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
141     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
142     qq11             = _mm_mul_pd(iq1,jq1);
143     qq12             = _mm_mul_pd(iq1,jq2);
144     qq13             = _mm_mul_pd(iq1,jq3);
145     qq21             = _mm_mul_pd(iq2,jq1);
146     qq22             = _mm_mul_pd(iq2,jq2);
147     qq23             = _mm_mul_pd(iq2,jq3);
148     qq31             = _mm_mul_pd(iq3,jq1);
149     qq32             = _mm_mul_pd(iq3,jq2);
150     qq33             = _mm_mul_pd(iq3,jq3);
151
152     /* Avoid stupid compiler warnings */
153     jnrA = jnrB = 0;
154     j_coord_offsetA = 0;
155     j_coord_offsetB = 0;
156
157     outeriter        = 0;
158     inneriter        = 0;
159
160     /* Start outer loop over neighborlists */
161     for(iidx=0; iidx<nri; iidx++)
162     {
163         /* Load shift vector for this list */
164         i_shift_offset   = DIM*shiftidx[iidx];
165
166         /* Load limits for loop over neighbors */
167         j_index_start    = jindex[iidx];
168         j_index_end      = jindex[iidx+1];
169
170         /* Get outer coordinate index */
171         inr              = iinr[iidx];
172         i_coord_offset   = DIM*inr;
173
174         /* Load i particle coords and add shift vector */
175         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
176                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
177
178         fix0             = _mm_setzero_pd();
179         fiy0             = _mm_setzero_pd();
180         fiz0             = _mm_setzero_pd();
181         fix1             = _mm_setzero_pd();
182         fiy1             = _mm_setzero_pd();
183         fiz1             = _mm_setzero_pd();
184         fix2             = _mm_setzero_pd();
185         fiy2             = _mm_setzero_pd();
186         fiz2             = _mm_setzero_pd();
187         fix3             = _mm_setzero_pd();
188         fiy3             = _mm_setzero_pd();
189         fiz3             = _mm_setzero_pd();
190
191         /* Reset potential sums */
192         velecsum         = _mm_setzero_pd();
193         vvdwsum          = _mm_setzero_pd();
194
195         /* Start inner kernel loop */
196         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
197         {
198
199             /* Get j neighbor index, and coordinate index */
200             jnrA             = jjnr[jidx];
201             jnrB             = jjnr[jidx+1];
202             j_coord_offsetA  = DIM*jnrA;
203             j_coord_offsetB  = DIM*jnrB;
204
205             /* load j atom coordinates */
206             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
207                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
208                                               &jy2,&jz2,&jx3,&jy3,&jz3);
209
210             /* Calculate displacement vector */
211             dx00             = _mm_sub_pd(ix0,jx0);
212             dy00             = _mm_sub_pd(iy0,jy0);
213             dz00             = _mm_sub_pd(iz0,jz0);
214             dx11             = _mm_sub_pd(ix1,jx1);
215             dy11             = _mm_sub_pd(iy1,jy1);
216             dz11             = _mm_sub_pd(iz1,jz1);
217             dx12             = _mm_sub_pd(ix1,jx2);
218             dy12             = _mm_sub_pd(iy1,jy2);
219             dz12             = _mm_sub_pd(iz1,jz2);
220             dx13             = _mm_sub_pd(ix1,jx3);
221             dy13             = _mm_sub_pd(iy1,jy3);
222             dz13             = _mm_sub_pd(iz1,jz3);
223             dx21             = _mm_sub_pd(ix2,jx1);
224             dy21             = _mm_sub_pd(iy2,jy1);
225             dz21             = _mm_sub_pd(iz2,jz1);
226             dx22             = _mm_sub_pd(ix2,jx2);
227             dy22             = _mm_sub_pd(iy2,jy2);
228             dz22             = _mm_sub_pd(iz2,jz2);
229             dx23             = _mm_sub_pd(ix2,jx3);
230             dy23             = _mm_sub_pd(iy2,jy3);
231             dz23             = _mm_sub_pd(iz2,jz3);
232             dx31             = _mm_sub_pd(ix3,jx1);
233             dy31             = _mm_sub_pd(iy3,jy1);
234             dz31             = _mm_sub_pd(iz3,jz1);
235             dx32             = _mm_sub_pd(ix3,jx2);
236             dy32             = _mm_sub_pd(iy3,jy2);
237             dz32             = _mm_sub_pd(iz3,jz2);
238             dx33             = _mm_sub_pd(ix3,jx3);
239             dy33             = _mm_sub_pd(iy3,jy3);
240             dz33             = _mm_sub_pd(iz3,jz3);
241
242             /* Calculate squared distance and things based on it */
243             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
244             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
245             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
246             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
247             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
248             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
249             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
250             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
251             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
252             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
253
254             rinv11           = gmx_mm_invsqrt_pd(rsq11);
255             rinv12           = gmx_mm_invsqrt_pd(rsq12);
256             rinv13           = gmx_mm_invsqrt_pd(rsq13);
257             rinv21           = gmx_mm_invsqrt_pd(rsq21);
258             rinv22           = gmx_mm_invsqrt_pd(rsq22);
259             rinv23           = gmx_mm_invsqrt_pd(rsq23);
260             rinv31           = gmx_mm_invsqrt_pd(rsq31);
261             rinv32           = gmx_mm_invsqrt_pd(rsq32);
262             rinv33           = gmx_mm_invsqrt_pd(rsq33);
263
264             rinvsq00         = gmx_mm_inv_pd(rsq00);
265             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
266             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
267             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
268             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
269             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
270             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
271             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
272             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
273             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
274
275             fjx0             = _mm_setzero_pd();
276             fjy0             = _mm_setzero_pd();
277             fjz0             = _mm_setzero_pd();
278             fjx1             = _mm_setzero_pd();
279             fjy1             = _mm_setzero_pd();
280             fjz1             = _mm_setzero_pd();
281             fjx2             = _mm_setzero_pd();
282             fjy2             = _mm_setzero_pd();
283             fjz2             = _mm_setzero_pd();
284             fjx3             = _mm_setzero_pd();
285             fjy3             = _mm_setzero_pd();
286             fjz3             = _mm_setzero_pd();
287
288             /**************************
289              * CALCULATE INTERACTIONS *
290              **************************/
291
292             /* LENNARD-JONES DISPERSION/REPULSION */
293
294             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
295             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
296             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
297             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
298             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
299
300             /* Update potential sum for this i atom from the interaction with this j atom. */
301             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
302
303             fscal            = fvdw;
304
305             /* Calculate temporary vectorial force */
306             tx               = _mm_mul_pd(fscal,dx00);
307             ty               = _mm_mul_pd(fscal,dy00);
308             tz               = _mm_mul_pd(fscal,dz00);
309
310             /* Update vectorial force */
311             fix0             = _mm_add_pd(fix0,tx);
312             fiy0             = _mm_add_pd(fiy0,ty);
313             fiz0             = _mm_add_pd(fiz0,tz);
314
315             fjx0             = _mm_add_pd(fjx0,tx);
316             fjy0             = _mm_add_pd(fjy0,ty);
317             fjz0             = _mm_add_pd(fjz0,tz);
318
319             /**************************
320              * CALCULATE INTERACTIONS *
321              **************************/
322
323             r11              = _mm_mul_pd(rsq11,rinv11);
324
325             /* EWALD ELECTROSTATICS */
326
327             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
328             ewrt             = _mm_mul_pd(r11,ewtabscale);
329             ewitab           = _mm_cvttpd_epi32(ewrt);
330             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
331             ewitab           = _mm_slli_epi32(ewitab,2);
332             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
333             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
334             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
335             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
336             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
337             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
338             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
339             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
340             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
341             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
342
343             /* Update potential sum for this i atom from the interaction with this j atom. */
344             velecsum         = _mm_add_pd(velecsum,velec);
345
346             fscal            = felec;
347
348             /* Calculate temporary vectorial force */
349             tx               = _mm_mul_pd(fscal,dx11);
350             ty               = _mm_mul_pd(fscal,dy11);
351             tz               = _mm_mul_pd(fscal,dz11);
352
353             /* Update vectorial force */
354             fix1             = _mm_add_pd(fix1,tx);
355             fiy1             = _mm_add_pd(fiy1,ty);
356             fiz1             = _mm_add_pd(fiz1,tz);
357
358             fjx1             = _mm_add_pd(fjx1,tx);
359             fjy1             = _mm_add_pd(fjy1,ty);
360             fjz1             = _mm_add_pd(fjz1,tz);
361
362             /**************************
363              * CALCULATE INTERACTIONS *
364              **************************/
365
366             r12              = _mm_mul_pd(rsq12,rinv12);
367
368             /* EWALD ELECTROSTATICS */
369
370             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
371             ewrt             = _mm_mul_pd(r12,ewtabscale);
372             ewitab           = _mm_cvttpd_epi32(ewrt);
373             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
374             ewitab           = _mm_slli_epi32(ewitab,2);
375             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
376             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
377             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
378             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
379             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
380             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
381             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
382             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
383             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
384             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
385
386             /* Update potential sum for this i atom from the interaction with this j atom. */
387             velecsum         = _mm_add_pd(velecsum,velec);
388
389             fscal            = felec;
390
391             /* Calculate temporary vectorial force */
392             tx               = _mm_mul_pd(fscal,dx12);
393             ty               = _mm_mul_pd(fscal,dy12);
394             tz               = _mm_mul_pd(fscal,dz12);
395
396             /* Update vectorial force */
397             fix1             = _mm_add_pd(fix1,tx);
398             fiy1             = _mm_add_pd(fiy1,ty);
399             fiz1             = _mm_add_pd(fiz1,tz);
400
401             fjx2             = _mm_add_pd(fjx2,tx);
402             fjy2             = _mm_add_pd(fjy2,ty);
403             fjz2             = _mm_add_pd(fjz2,tz);
404
405             /**************************
406              * CALCULATE INTERACTIONS *
407              **************************/
408
409             r13              = _mm_mul_pd(rsq13,rinv13);
410
411             /* EWALD ELECTROSTATICS */
412
413             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
414             ewrt             = _mm_mul_pd(r13,ewtabscale);
415             ewitab           = _mm_cvttpd_epi32(ewrt);
416             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
417             ewitab           = _mm_slli_epi32(ewitab,2);
418             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
419             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
420             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
421             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
422             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
423             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
424             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
425             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
426             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
427             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
428
429             /* Update potential sum for this i atom from the interaction with this j atom. */
430             velecsum         = _mm_add_pd(velecsum,velec);
431
432             fscal            = felec;
433
434             /* Calculate temporary vectorial force */
435             tx               = _mm_mul_pd(fscal,dx13);
436             ty               = _mm_mul_pd(fscal,dy13);
437             tz               = _mm_mul_pd(fscal,dz13);
438
439             /* Update vectorial force */
440             fix1             = _mm_add_pd(fix1,tx);
441             fiy1             = _mm_add_pd(fiy1,ty);
442             fiz1             = _mm_add_pd(fiz1,tz);
443
444             fjx3             = _mm_add_pd(fjx3,tx);
445             fjy3             = _mm_add_pd(fjy3,ty);
446             fjz3             = _mm_add_pd(fjz3,tz);
447
448             /**************************
449              * CALCULATE INTERACTIONS *
450              **************************/
451
452             r21              = _mm_mul_pd(rsq21,rinv21);
453
454             /* EWALD ELECTROSTATICS */
455
456             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
457             ewrt             = _mm_mul_pd(r21,ewtabscale);
458             ewitab           = _mm_cvttpd_epi32(ewrt);
459             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
460             ewitab           = _mm_slli_epi32(ewitab,2);
461             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
462             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
463             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
464             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
465             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
466             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
467             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
468             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
469             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
470             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
471
472             /* Update potential sum for this i atom from the interaction with this j atom. */
473             velecsum         = _mm_add_pd(velecsum,velec);
474
475             fscal            = felec;
476
477             /* Calculate temporary vectorial force */
478             tx               = _mm_mul_pd(fscal,dx21);
479             ty               = _mm_mul_pd(fscal,dy21);
480             tz               = _mm_mul_pd(fscal,dz21);
481
482             /* Update vectorial force */
483             fix2             = _mm_add_pd(fix2,tx);
484             fiy2             = _mm_add_pd(fiy2,ty);
485             fiz2             = _mm_add_pd(fiz2,tz);
486
487             fjx1             = _mm_add_pd(fjx1,tx);
488             fjy1             = _mm_add_pd(fjy1,ty);
489             fjz1             = _mm_add_pd(fjz1,tz);
490
491             /**************************
492              * CALCULATE INTERACTIONS *
493              **************************/
494
495             r22              = _mm_mul_pd(rsq22,rinv22);
496
497             /* EWALD ELECTROSTATICS */
498
499             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
500             ewrt             = _mm_mul_pd(r22,ewtabscale);
501             ewitab           = _mm_cvttpd_epi32(ewrt);
502             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
503             ewitab           = _mm_slli_epi32(ewitab,2);
504             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
505             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
506             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
507             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
508             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
509             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
510             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
511             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
512             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
513             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
514
515             /* Update potential sum for this i atom from the interaction with this j atom. */
516             velecsum         = _mm_add_pd(velecsum,velec);
517
518             fscal            = felec;
519
520             /* Calculate temporary vectorial force */
521             tx               = _mm_mul_pd(fscal,dx22);
522             ty               = _mm_mul_pd(fscal,dy22);
523             tz               = _mm_mul_pd(fscal,dz22);
524
525             /* Update vectorial force */
526             fix2             = _mm_add_pd(fix2,tx);
527             fiy2             = _mm_add_pd(fiy2,ty);
528             fiz2             = _mm_add_pd(fiz2,tz);
529
530             fjx2             = _mm_add_pd(fjx2,tx);
531             fjy2             = _mm_add_pd(fjy2,ty);
532             fjz2             = _mm_add_pd(fjz2,tz);
533
534             /**************************
535              * CALCULATE INTERACTIONS *
536              **************************/
537
538             r23              = _mm_mul_pd(rsq23,rinv23);
539
540             /* EWALD ELECTROSTATICS */
541
542             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
543             ewrt             = _mm_mul_pd(r23,ewtabscale);
544             ewitab           = _mm_cvttpd_epi32(ewrt);
545             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
546             ewitab           = _mm_slli_epi32(ewitab,2);
547             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
548             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
549             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
550             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
551             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
552             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
553             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
554             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
555             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
556             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
557
558             /* Update potential sum for this i atom from the interaction with this j atom. */
559             velecsum         = _mm_add_pd(velecsum,velec);
560
561             fscal            = felec;
562
563             /* Calculate temporary vectorial force */
564             tx               = _mm_mul_pd(fscal,dx23);
565             ty               = _mm_mul_pd(fscal,dy23);
566             tz               = _mm_mul_pd(fscal,dz23);
567
568             /* Update vectorial force */
569             fix2             = _mm_add_pd(fix2,tx);
570             fiy2             = _mm_add_pd(fiy2,ty);
571             fiz2             = _mm_add_pd(fiz2,tz);
572
573             fjx3             = _mm_add_pd(fjx3,tx);
574             fjy3             = _mm_add_pd(fjy3,ty);
575             fjz3             = _mm_add_pd(fjz3,tz);
576
577             /**************************
578              * CALCULATE INTERACTIONS *
579              **************************/
580
581             r31              = _mm_mul_pd(rsq31,rinv31);
582
583             /* EWALD ELECTROSTATICS */
584
585             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
586             ewrt             = _mm_mul_pd(r31,ewtabscale);
587             ewitab           = _mm_cvttpd_epi32(ewrt);
588             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
589             ewitab           = _mm_slli_epi32(ewitab,2);
590             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
591             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
592             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
593             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
594             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
595             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
596             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
597             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
598             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
599             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
600
601             /* Update potential sum for this i atom from the interaction with this j atom. */
602             velecsum         = _mm_add_pd(velecsum,velec);
603
604             fscal            = felec;
605
606             /* Calculate temporary vectorial force */
607             tx               = _mm_mul_pd(fscal,dx31);
608             ty               = _mm_mul_pd(fscal,dy31);
609             tz               = _mm_mul_pd(fscal,dz31);
610
611             /* Update vectorial force */
612             fix3             = _mm_add_pd(fix3,tx);
613             fiy3             = _mm_add_pd(fiy3,ty);
614             fiz3             = _mm_add_pd(fiz3,tz);
615
616             fjx1             = _mm_add_pd(fjx1,tx);
617             fjy1             = _mm_add_pd(fjy1,ty);
618             fjz1             = _mm_add_pd(fjz1,tz);
619
620             /**************************
621              * CALCULATE INTERACTIONS *
622              **************************/
623
624             r32              = _mm_mul_pd(rsq32,rinv32);
625
626             /* EWALD ELECTROSTATICS */
627
628             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
629             ewrt             = _mm_mul_pd(r32,ewtabscale);
630             ewitab           = _mm_cvttpd_epi32(ewrt);
631             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
632             ewitab           = _mm_slli_epi32(ewitab,2);
633             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
634             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
635             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
636             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
637             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
638             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
639             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
640             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
641             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
642             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
643
644             /* Update potential sum for this i atom from the interaction with this j atom. */
645             velecsum         = _mm_add_pd(velecsum,velec);
646
647             fscal            = felec;
648
649             /* Calculate temporary vectorial force */
650             tx               = _mm_mul_pd(fscal,dx32);
651             ty               = _mm_mul_pd(fscal,dy32);
652             tz               = _mm_mul_pd(fscal,dz32);
653
654             /* Update vectorial force */
655             fix3             = _mm_add_pd(fix3,tx);
656             fiy3             = _mm_add_pd(fiy3,ty);
657             fiz3             = _mm_add_pd(fiz3,tz);
658
659             fjx2             = _mm_add_pd(fjx2,tx);
660             fjy2             = _mm_add_pd(fjy2,ty);
661             fjz2             = _mm_add_pd(fjz2,tz);
662
663             /**************************
664              * CALCULATE INTERACTIONS *
665              **************************/
666
667             r33              = _mm_mul_pd(rsq33,rinv33);
668
669             /* EWALD ELECTROSTATICS */
670
671             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
672             ewrt             = _mm_mul_pd(r33,ewtabscale);
673             ewitab           = _mm_cvttpd_epi32(ewrt);
674             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
675             ewitab           = _mm_slli_epi32(ewitab,2);
676             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
677             ewtabD           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
678             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
679             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
680             ewtabFn          = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
681             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
682             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
683             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
684             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
685             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
686
687             /* Update potential sum for this i atom from the interaction with this j atom. */
688             velecsum         = _mm_add_pd(velecsum,velec);
689
690             fscal            = felec;
691
692             /* Calculate temporary vectorial force */
693             tx               = _mm_mul_pd(fscal,dx33);
694             ty               = _mm_mul_pd(fscal,dy33);
695             tz               = _mm_mul_pd(fscal,dz33);
696
697             /* Update vectorial force */
698             fix3             = _mm_add_pd(fix3,tx);
699             fiy3             = _mm_add_pd(fiy3,ty);
700             fiz3             = _mm_add_pd(fiz3,tz);
701
702             fjx3             = _mm_add_pd(fjx3,tx);
703             fjy3             = _mm_add_pd(fjy3,ty);
704             fjz3             = _mm_add_pd(fjz3,tz);
705
706             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
707
708             /* Inner loop uses 404 flops */
709         }
710
711         if(jidx<j_index_end)
712         {
713
714             jnrA             = jjnr[jidx];
715             j_coord_offsetA  = DIM*jnrA;
716
717             /* load j atom coordinates */
718             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
719                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
720                                               &jy2,&jz2,&jx3,&jy3,&jz3);
721
722             /* Calculate displacement vector */
723             dx00             = _mm_sub_pd(ix0,jx0);
724             dy00             = _mm_sub_pd(iy0,jy0);
725             dz00             = _mm_sub_pd(iz0,jz0);
726             dx11             = _mm_sub_pd(ix1,jx1);
727             dy11             = _mm_sub_pd(iy1,jy1);
728             dz11             = _mm_sub_pd(iz1,jz1);
729             dx12             = _mm_sub_pd(ix1,jx2);
730             dy12             = _mm_sub_pd(iy1,jy2);
731             dz12             = _mm_sub_pd(iz1,jz2);
732             dx13             = _mm_sub_pd(ix1,jx3);
733             dy13             = _mm_sub_pd(iy1,jy3);
734             dz13             = _mm_sub_pd(iz1,jz3);
735             dx21             = _mm_sub_pd(ix2,jx1);
736             dy21             = _mm_sub_pd(iy2,jy1);
737             dz21             = _mm_sub_pd(iz2,jz1);
738             dx22             = _mm_sub_pd(ix2,jx2);
739             dy22             = _mm_sub_pd(iy2,jy2);
740             dz22             = _mm_sub_pd(iz2,jz2);
741             dx23             = _mm_sub_pd(ix2,jx3);
742             dy23             = _mm_sub_pd(iy2,jy3);
743             dz23             = _mm_sub_pd(iz2,jz3);
744             dx31             = _mm_sub_pd(ix3,jx1);
745             dy31             = _mm_sub_pd(iy3,jy1);
746             dz31             = _mm_sub_pd(iz3,jz1);
747             dx32             = _mm_sub_pd(ix3,jx2);
748             dy32             = _mm_sub_pd(iy3,jy2);
749             dz32             = _mm_sub_pd(iz3,jz2);
750             dx33             = _mm_sub_pd(ix3,jx3);
751             dy33             = _mm_sub_pd(iy3,jy3);
752             dz33             = _mm_sub_pd(iz3,jz3);
753
754             /* Calculate squared distance and things based on it */
755             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
756             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
757             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
758             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
759             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
760             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
761             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
762             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
763             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
764             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
765
766             rinv11           = gmx_mm_invsqrt_pd(rsq11);
767             rinv12           = gmx_mm_invsqrt_pd(rsq12);
768             rinv13           = gmx_mm_invsqrt_pd(rsq13);
769             rinv21           = gmx_mm_invsqrt_pd(rsq21);
770             rinv22           = gmx_mm_invsqrt_pd(rsq22);
771             rinv23           = gmx_mm_invsqrt_pd(rsq23);
772             rinv31           = gmx_mm_invsqrt_pd(rsq31);
773             rinv32           = gmx_mm_invsqrt_pd(rsq32);
774             rinv33           = gmx_mm_invsqrt_pd(rsq33);
775
776             rinvsq00         = gmx_mm_inv_pd(rsq00);
777             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
778             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
779             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
780             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
781             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
782             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
783             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
784             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
785             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
786
787             fjx0             = _mm_setzero_pd();
788             fjy0             = _mm_setzero_pd();
789             fjz0             = _mm_setzero_pd();
790             fjx1             = _mm_setzero_pd();
791             fjy1             = _mm_setzero_pd();
792             fjz1             = _mm_setzero_pd();
793             fjx2             = _mm_setzero_pd();
794             fjy2             = _mm_setzero_pd();
795             fjz2             = _mm_setzero_pd();
796             fjx3             = _mm_setzero_pd();
797             fjy3             = _mm_setzero_pd();
798             fjz3             = _mm_setzero_pd();
799
800             /**************************
801              * CALCULATE INTERACTIONS *
802              **************************/
803
804             /* LENNARD-JONES DISPERSION/REPULSION */
805
806             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
807             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
808             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
809             vvdw             = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
810             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
811
812             /* Update potential sum for this i atom from the interaction with this j atom. */
813             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
814             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
815
816             fscal            = fvdw;
817
818             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
819
820             /* Calculate temporary vectorial force */
821             tx               = _mm_mul_pd(fscal,dx00);
822             ty               = _mm_mul_pd(fscal,dy00);
823             tz               = _mm_mul_pd(fscal,dz00);
824
825             /* Update vectorial force */
826             fix0             = _mm_add_pd(fix0,tx);
827             fiy0             = _mm_add_pd(fiy0,ty);
828             fiz0             = _mm_add_pd(fiz0,tz);
829
830             fjx0             = _mm_add_pd(fjx0,tx);
831             fjy0             = _mm_add_pd(fjy0,ty);
832             fjz0             = _mm_add_pd(fjz0,tz);
833
834             /**************************
835              * CALCULATE INTERACTIONS *
836              **************************/
837
838             r11              = _mm_mul_pd(rsq11,rinv11);
839
840             /* EWALD ELECTROSTATICS */
841
842             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
843             ewrt             = _mm_mul_pd(r11,ewtabscale);
844             ewitab           = _mm_cvttpd_epi32(ewrt);
845             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
846             ewitab           = _mm_slli_epi32(ewitab,2);
847             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
848             ewtabD           = _mm_setzero_pd();
849             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
850             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
851             ewtabFn          = _mm_setzero_pd();
852             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
853             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
854             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
855             velec            = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
856             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
857
858             /* Update potential sum for this i atom from the interaction with this j atom. */
859             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
860             velecsum         = _mm_add_pd(velecsum,velec);
861
862             fscal            = felec;
863
864             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
865
866             /* Calculate temporary vectorial force */
867             tx               = _mm_mul_pd(fscal,dx11);
868             ty               = _mm_mul_pd(fscal,dy11);
869             tz               = _mm_mul_pd(fscal,dz11);
870
871             /* Update vectorial force */
872             fix1             = _mm_add_pd(fix1,tx);
873             fiy1             = _mm_add_pd(fiy1,ty);
874             fiz1             = _mm_add_pd(fiz1,tz);
875
876             fjx1             = _mm_add_pd(fjx1,tx);
877             fjy1             = _mm_add_pd(fjy1,ty);
878             fjz1             = _mm_add_pd(fjz1,tz);
879
880             /**************************
881              * CALCULATE INTERACTIONS *
882              **************************/
883
884             r12              = _mm_mul_pd(rsq12,rinv12);
885
886             /* EWALD ELECTROSTATICS */
887
888             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
889             ewrt             = _mm_mul_pd(r12,ewtabscale);
890             ewitab           = _mm_cvttpd_epi32(ewrt);
891             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
892             ewitab           = _mm_slli_epi32(ewitab,2);
893             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
894             ewtabD           = _mm_setzero_pd();
895             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
896             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
897             ewtabFn          = _mm_setzero_pd();
898             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
899             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
900             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
901             velec            = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
902             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
903
904             /* Update potential sum for this i atom from the interaction with this j atom. */
905             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
906             velecsum         = _mm_add_pd(velecsum,velec);
907
908             fscal            = felec;
909
910             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
911
912             /* Calculate temporary vectorial force */
913             tx               = _mm_mul_pd(fscal,dx12);
914             ty               = _mm_mul_pd(fscal,dy12);
915             tz               = _mm_mul_pd(fscal,dz12);
916
917             /* Update vectorial force */
918             fix1             = _mm_add_pd(fix1,tx);
919             fiy1             = _mm_add_pd(fiy1,ty);
920             fiz1             = _mm_add_pd(fiz1,tz);
921
922             fjx2             = _mm_add_pd(fjx2,tx);
923             fjy2             = _mm_add_pd(fjy2,ty);
924             fjz2             = _mm_add_pd(fjz2,tz);
925
926             /**************************
927              * CALCULATE INTERACTIONS *
928              **************************/
929
930             r13              = _mm_mul_pd(rsq13,rinv13);
931
932             /* EWALD ELECTROSTATICS */
933
934             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
935             ewrt             = _mm_mul_pd(r13,ewtabscale);
936             ewitab           = _mm_cvttpd_epi32(ewrt);
937             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
938             ewitab           = _mm_slli_epi32(ewitab,2);
939             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
940             ewtabD           = _mm_setzero_pd();
941             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
942             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
943             ewtabFn          = _mm_setzero_pd();
944             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
945             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
946             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
947             velec            = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
948             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
949
950             /* Update potential sum for this i atom from the interaction with this j atom. */
951             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
952             velecsum         = _mm_add_pd(velecsum,velec);
953
954             fscal            = felec;
955
956             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
957
958             /* Calculate temporary vectorial force */
959             tx               = _mm_mul_pd(fscal,dx13);
960             ty               = _mm_mul_pd(fscal,dy13);
961             tz               = _mm_mul_pd(fscal,dz13);
962
963             /* Update vectorial force */
964             fix1             = _mm_add_pd(fix1,tx);
965             fiy1             = _mm_add_pd(fiy1,ty);
966             fiz1             = _mm_add_pd(fiz1,tz);
967
968             fjx3             = _mm_add_pd(fjx3,tx);
969             fjy3             = _mm_add_pd(fjy3,ty);
970             fjz3             = _mm_add_pd(fjz3,tz);
971
972             /**************************
973              * CALCULATE INTERACTIONS *
974              **************************/
975
976             r21              = _mm_mul_pd(rsq21,rinv21);
977
978             /* EWALD ELECTROSTATICS */
979
980             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
981             ewrt             = _mm_mul_pd(r21,ewtabscale);
982             ewitab           = _mm_cvttpd_epi32(ewrt);
983             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
984             ewitab           = _mm_slli_epi32(ewitab,2);
985             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
986             ewtabD           = _mm_setzero_pd();
987             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
988             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
989             ewtabFn          = _mm_setzero_pd();
990             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
991             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
992             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
993             velec            = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
994             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
995
996             /* Update potential sum for this i atom from the interaction with this j atom. */
997             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
998             velecsum         = _mm_add_pd(velecsum,velec);
999
1000             fscal            = felec;
1001
1002             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1003
1004             /* Calculate temporary vectorial force */
1005             tx               = _mm_mul_pd(fscal,dx21);
1006             ty               = _mm_mul_pd(fscal,dy21);
1007             tz               = _mm_mul_pd(fscal,dz21);
1008
1009             /* Update vectorial force */
1010             fix2             = _mm_add_pd(fix2,tx);
1011             fiy2             = _mm_add_pd(fiy2,ty);
1012             fiz2             = _mm_add_pd(fiz2,tz);
1013
1014             fjx1             = _mm_add_pd(fjx1,tx);
1015             fjy1             = _mm_add_pd(fjy1,ty);
1016             fjz1             = _mm_add_pd(fjz1,tz);
1017
1018             /**************************
1019              * CALCULATE INTERACTIONS *
1020              **************************/
1021
1022             r22              = _mm_mul_pd(rsq22,rinv22);
1023
1024             /* EWALD ELECTROSTATICS */
1025
1026             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1027             ewrt             = _mm_mul_pd(r22,ewtabscale);
1028             ewitab           = _mm_cvttpd_epi32(ewrt);
1029             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1030             ewitab           = _mm_slli_epi32(ewitab,2);
1031             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1032             ewtabD           = _mm_setzero_pd();
1033             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1034             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1035             ewtabFn          = _mm_setzero_pd();
1036             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1037             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1038             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1039             velec            = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1040             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1041
1042             /* Update potential sum for this i atom from the interaction with this j atom. */
1043             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1044             velecsum         = _mm_add_pd(velecsum,velec);
1045
1046             fscal            = felec;
1047
1048             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1049
1050             /* Calculate temporary vectorial force */
1051             tx               = _mm_mul_pd(fscal,dx22);
1052             ty               = _mm_mul_pd(fscal,dy22);
1053             tz               = _mm_mul_pd(fscal,dz22);
1054
1055             /* Update vectorial force */
1056             fix2             = _mm_add_pd(fix2,tx);
1057             fiy2             = _mm_add_pd(fiy2,ty);
1058             fiz2             = _mm_add_pd(fiz2,tz);
1059
1060             fjx2             = _mm_add_pd(fjx2,tx);
1061             fjy2             = _mm_add_pd(fjy2,ty);
1062             fjz2             = _mm_add_pd(fjz2,tz);
1063
1064             /**************************
1065              * CALCULATE INTERACTIONS *
1066              **************************/
1067
1068             r23              = _mm_mul_pd(rsq23,rinv23);
1069
1070             /* EWALD ELECTROSTATICS */
1071
1072             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1073             ewrt             = _mm_mul_pd(r23,ewtabscale);
1074             ewitab           = _mm_cvttpd_epi32(ewrt);
1075             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1076             ewitab           = _mm_slli_epi32(ewitab,2);
1077             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1078             ewtabD           = _mm_setzero_pd();
1079             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1080             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1081             ewtabFn          = _mm_setzero_pd();
1082             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1083             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1084             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1085             velec            = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1086             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1087
1088             /* Update potential sum for this i atom from the interaction with this j atom. */
1089             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1090             velecsum         = _mm_add_pd(velecsum,velec);
1091
1092             fscal            = felec;
1093
1094             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1095
1096             /* Calculate temporary vectorial force */
1097             tx               = _mm_mul_pd(fscal,dx23);
1098             ty               = _mm_mul_pd(fscal,dy23);
1099             tz               = _mm_mul_pd(fscal,dz23);
1100
1101             /* Update vectorial force */
1102             fix2             = _mm_add_pd(fix2,tx);
1103             fiy2             = _mm_add_pd(fiy2,ty);
1104             fiz2             = _mm_add_pd(fiz2,tz);
1105
1106             fjx3             = _mm_add_pd(fjx3,tx);
1107             fjy3             = _mm_add_pd(fjy3,ty);
1108             fjz3             = _mm_add_pd(fjz3,tz);
1109
1110             /**************************
1111              * CALCULATE INTERACTIONS *
1112              **************************/
1113
1114             r31              = _mm_mul_pd(rsq31,rinv31);
1115
1116             /* EWALD ELECTROSTATICS */
1117
1118             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1119             ewrt             = _mm_mul_pd(r31,ewtabscale);
1120             ewitab           = _mm_cvttpd_epi32(ewrt);
1121             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1122             ewitab           = _mm_slli_epi32(ewitab,2);
1123             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1124             ewtabD           = _mm_setzero_pd();
1125             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1126             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1127             ewtabFn          = _mm_setzero_pd();
1128             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1129             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1130             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1131             velec            = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1132             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1133
1134             /* Update potential sum for this i atom from the interaction with this j atom. */
1135             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1136             velecsum         = _mm_add_pd(velecsum,velec);
1137
1138             fscal            = felec;
1139
1140             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1141
1142             /* Calculate temporary vectorial force */
1143             tx               = _mm_mul_pd(fscal,dx31);
1144             ty               = _mm_mul_pd(fscal,dy31);
1145             tz               = _mm_mul_pd(fscal,dz31);
1146
1147             /* Update vectorial force */
1148             fix3             = _mm_add_pd(fix3,tx);
1149             fiy3             = _mm_add_pd(fiy3,ty);
1150             fiz3             = _mm_add_pd(fiz3,tz);
1151
1152             fjx1             = _mm_add_pd(fjx1,tx);
1153             fjy1             = _mm_add_pd(fjy1,ty);
1154             fjz1             = _mm_add_pd(fjz1,tz);
1155
1156             /**************************
1157              * CALCULATE INTERACTIONS *
1158              **************************/
1159
1160             r32              = _mm_mul_pd(rsq32,rinv32);
1161
1162             /* EWALD ELECTROSTATICS */
1163
1164             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1165             ewrt             = _mm_mul_pd(r32,ewtabscale);
1166             ewitab           = _mm_cvttpd_epi32(ewrt);
1167             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1168             ewitab           = _mm_slli_epi32(ewitab,2);
1169             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1170             ewtabD           = _mm_setzero_pd();
1171             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1172             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1173             ewtabFn          = _mm_setzero_pd();
1174             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1175             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1176             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1177             velec            = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1178             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1179
1180             /* Update potential sum for this i atom from the interaction with this j atom. */
1181             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1182             velecsum         = _mm_add_pd(velecsum,velec);
1183
1184             fscal            = felec;
1185
1186             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1187
1188             /* Calculate temporary vectorial force */
1189             tx               = _mm_mul_pd(fscal,dx32);
1190             ty               = _mm_mul_pd(fscal,dy32);
1191             tz               = _mm_mul_pd(fscal,dz32);
1192
1193             /* Update vectorial force */
1194             fix3             = _mm_add_pd(fix3,tx);
1195             fiy3             = _mm_add_pd(fiy3,ty);
1196             fiz3             = _mm_add_pd(fiz3,tz);
1197
1198             fjx2             = _mm_add_pd(fjx2,tx);
1199             fjy2             = _mm_add_pd(fjy2,ty);
1200             fjz2             = _mm_add_pd(fjz2,tz);
1201
1202             /**************************
1203              * CALCULATE INTERACTIONS *
1204              **************************/
1205
1206             r33              = _mm_mul_pd(rsq33,rinv33);
1207
1208             /* EWALD ELECTROSTATICS */
1209
1210             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1211             ewrt             = _mm_mul_pd(r33,ewtabscale);
1212             ewitab           = _mm_cvttpd_epi32(ewrt);
1213             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1214             ewitab           = _mm_slli_epi32(ewitab,2);
1215             ewtabF           = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1216             ewtabD           = _mm_setzero_pd();
1217             GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1218             ewtabV           = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1219             ewtabFn          = _mm_setzero_pd();
1220             GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1221             felec            = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1222             velec            = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1223             velec            = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1224             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1225
1226             /* Update potential sum for this i atom from the interaction with this j atom. */
1227             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1228             velecsum         = _mm_add_pd(velecsum,velec);
1229
1230             fscal            = felec;
1231
1232             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1233
1234             /* Calculate temporary vectorial force */
1235             tx               = _mm_mul_pd(fscal,dx33);
1236             ty               = _mm_mul_pd(fscal,dy33);
1237             tz               = _mm_mul_pd(fscal,dz33);
1238
1239             /* Update vectorial force */
1240             fix3             = _mm_add_pd(fix3,tx);
1241             fiy3             = _mm_add_pd(fiy3,ty);
1242             fiz3             = _mm_add_pd(fiz3,tz);
1243
1244             fjx3             = _mm_add_pd(fjx3,tx);
1245             fjy3             = _mm_add_pd(fjy3,ty);
1246             fjz3             = _mm_add_pd(fjz3,tz);
1247
1248             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1249
1250             /* Inner loop uses 404 flops */
1251         }
1252
1253         /* End of innermost loop */
1254
1255         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1256                                               f+i_coord_offset,fshift+i_shift_offset);
1257
1258         ggid                        = gid[iidx];
1259         /* Update potential energies */
1260         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1261         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1262
1263         /* Increment number of inner iterations */
1264         inneriter                  += j_index_end - j_index_start;
1265
1266         /* Outer loop uses 26 flops */
1267     }
1268
1269     /* Increment number of outer iterations */
1270     outeriter        += nri;
1271
1272     /* Update outer/inner flops */
1273
1274     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*404);
1275 }
1276 /*
1277  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_double
1278  * Electrostatics interaction: Ewald
1279  * VdW interaction:            LennardJones
1280  * Geometry:                   Water4-Water4
1281  * Calculate force/pot:        Force
1282  */
1283 void
1284 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_double
1285                     (t_nblist * gmx_restrict                nlist,
1286                      rvec * gmx_restrict                    xx,
1287                      rvec * gmx_restrict                    ff,
1288                      t_forcerec * gmx_restrict              fr,
1289                      t_mdatoms * gmx_restrict               mdatoms,
1290                      nb_kernel_data_t * gmx_restrict        kernel_data,
1291                      t_nrnb * gmx_restrict                  nrnb)
1292 {
1293     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1294      * just 0 for non-waters.
1295      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1296      * jnr indices corresponding to data put in the four positions in the SIMD register.
1297      */
1298     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1299     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1300     int              jnrA,jnrB;
1301     int              j_coord_offsetA,j_coord_offsetB;
1302     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1303     real             rcutoff_scalar;
1304     real             *shiftvec,*fshift,*x,*f;
1305     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1306     int              vdwioffset0;
1307     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1308     int              vdwioffset1;
1309     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1310     int              vdwioffset2;
1311     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1312     int              vdwioffset3;
1313     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1314     int              vdwjidx0A,vdwjidx0B;
1315     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1316     int              vdwjidx1A,vdwjidx1B;
1317     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1318     int              vdwjidx2A,vdwjidx2B;
1319     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1320     int              vdwjidx3A,vdwjidx3B;
1321     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1322     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1323     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1324     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1325     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1326     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1327     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1328     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1329     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1330     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1331     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1332     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1333     real             *charge;
1334     int              nvdwtype;
1335     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1336     int              *vdwtype;
1337     real             *vdwparam;
1338     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1339     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1340     __m128i          ewitab;
1341     __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1342     real             *ewtab;
1343     __m128d          dummy_mask,cutoff_mask;
1344     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1345     __m128d          one     = _mm_set1_pd(1.0);
1346     __m128d          two     = _mm_set1_pd(2.0);
1347     x                = xx[0];
1348     f                = ff[0];
1349
1350     nri              = nlist->nri;
1351     iinr             = nlist->iinr;
1352     jindex           = nlist->jindex;
1353     jjnr             = nlist->jjnr;
1354     shiftidx         = nlist->shift;
1355     gid              = nlist->gid;
1356     shiftvec         = fr->shift_vec[0];
1357     fshift           = fr->fshift[0];
1358     facel            = _mm_set1_pd(fr->epsfac);
1359     charge           = mdatoms->chargeA;
1360     nvdwtype         = fr->ntype;
1361     vdwparam         = fr->nbfp;
1362     vdwtype          = mdatoms->typeA;
1363
1364     sh_ewald         = _mm_set1_pd(fr->ic->sh_ewald);
1365     ewtab            = fr->ic->tabq_coul_F;
1366     ewtabscale       = _mm_set1_pd(fr->ic->tabq_scale);
1367     ewtabhalfspace   = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1368
1369     /* Setup water-specific parameters */
1370     inr              = nlist->iinr[0];
1371     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1372     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1373     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1374     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1375
1376     jq1              = _mm_set1_pd(charge[inr+1]);
1377     jq2              = _mm_set1_pd(charge[inr+2]);
1378     jq3              = _mm_set1_pd(charge[inr+3]);
1379     vdwjidx0A        = 2*vdwtype[inr+0];
1380     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1381     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1382     qq11             = _mm_mul_pd(iq1,jq1);
1383     qq12             = _mm_mul_pd(iq1,jq2);
1384     qq13             = _mm_mul_pd(iq1,jq3);
1385     qq21             = _mm_mul_pd(iq2,jq1);
1386     qq22             = _mm_mul_pd(iq2,jq2);
1387     qq23             = _mm_mul_pd(iq2,jq3);
1388     qq31             = _mm_mul_pd(iq3,jq1);
1389     qq32             = _mm_mul_pd(iq3,jq2);
1390     qq33             = _mm_mul_pd(iq3,jq3);
1391
1392     /* Avoid stupid compiler warnings */
1393     jnrA = jnrB = 0;
1394     j_coord_offsetA = 0;
1395     j_coord_offsetB = 0;
1396
1397     outeriter        = 0;
1398     inneriter        = 0;
1399
1400     /* Start outer loop over neighborlists */
1401     for(iidx=0; iidx<nri; iidx++)
1402     {
1403         /* Load shift vector for this list */
1404         i_shift_offset   = DIM*shiftidx[iidx];
1405
1406         /* Load limits for loop over neighbors */
1407         j_index_start    = jindex[iidx];
1408         j_index_end      = jindex[iidx+1];
1409
1410         /* Get outer coordinate index */
1411         inr              = iinr[iidx];
1412         i_coord_offset   = DIM*inr;
1413
1414         /* Load i particle coords and add shift vector */
1415         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1416                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1417
1418         fix0             = _mm_setzero_pd();
1419         fiy0             = _mm_setzero_pd();
1420         fiz0             = _mm_setzero_pd();
1421         fix1             = _mm_setzero_pd();
1422         fiy1             = _mm_setzero_pd();
1423         fiz1             = _mm_setzero_pd();
1424         fix2             = _mm_setzero_pd();
1425         fiy2             = _mm_setzero_pd();
1426         fiz2             = _mm_setzero_pd();
1427         fix3             = _mm_setzero_pd();
1428         fiy3             = _mm_setzero_pd();
1429         fiz3             = _mm_setzero_pd();
1430
1431         /* Start inner kernel loop */
1432         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1433         {
1434
1435             /* Get j neighbor index, and coordinate index */
1436             jnrA             = jjnr[jidx];
1437             jnrB             = jjnr[jidx+1];
1438             j_coord_offsetA  = DIM*jnrA;
1439             j_coord_offsetB  = DIM*jnrB;
1440
1441             /* load j atom coordinates */
1442             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1443                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1444                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1445
1446             /* Calculate displacement vector */
1447             dx00             = _mm_sub_pd(ix0,jx0);
1448             dy00             = _mm_sub_pd(iy0,jy0);
1449             dz00             = _mm_sub_pd(iz0,jz0);
1450             dx11             = _mm_sub_pd(ix1,jx1);
1451             dy11             = _mm_sub_pd(iy1,jy1);
1452             dz11             = _mm_sub_pd(iz1,jz1);
1453             dx12             = _mm_sub_pd(ix1,jx2);
1454             dy12             = _mm_sub_pd(iy1,jy2);
1455             dz12             = _mm_sub_pd(iz1,jz2);
1456             dx13             = _mm_sub_pd(ix1,jx3);
1457             dy13             = _mm_sub_pd(iy1,jy3);
1458             dz13             = _mm_sub_pd(iz1,jz3);
1459             dx21             = _mm_sub_pd(ix2,jx1);
1460             dy21             = _mm_sub_pd(iy2,jy1);
1461             dz21             = _mm_sub_pd(iz2,jz1);
1462             dx22             = _mm_sub_pd(ix2,jx2);
1463             dy22             = _mm_sub_pd(iy2,jy2);
1464             dz22             = _mm_sub_pd(iz2,jz2);
1465             dx23             = _mm_sub_pd(ix2,jx3);
1466             dy23             = _mm_sub_pd(iy2,jy3);
1467             dz23             = _mm_sub_pd(iz2,jz3);
1468             dx31             = _mm_sub_pd(ix3,jx1);
1469             dy31             = _mm_sub_pd(iy3,jy1);
1470             dz31             = _mm_sub_pd(iz3,jz1);
1471             dx32             = _mm_sub_pd(ix3,jx2);
1472             dy32             = _mm_sub_pd(iy3,jy2);
1473             dz32             = _mm_sub_pd(iz3,jz2);
1474             dx33             = _mm_sub_pd(ix3,jx3);
1475             dy33             = _mm_sub_pd(iy3,jy3);
1476             dz33             = _mm_sub_pd(iz3,jz3);
1477
1478             /* Calculate squared distance and things based on it */
1479             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1480             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1481             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1482             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1483             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1484             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1485             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1486             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1487             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1488             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1489
1490             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1491             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1492             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1493             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1494             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1495             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1496             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1497             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1498             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1499
1500             rinvsq00         = gmx_mm_inv_pd(rsq00);
1501             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1502             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1503             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1504             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1505             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1506             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1507             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1508             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1509             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1510
1511             fjx0             = _mm_setzero_pd();
1512             fjy0             = _mm_setzero_pd();
1513             fjz0             = _mm_setzero_pd();
1514             fjx1             = _mm_setzero_pd();
1515             fjy1             = _mm_setzero_pd();
1516             fjz1             = _mm_setzero_pd();
1517             fjx2             = _mm_setzero_pd();
1518             fjy2             = _mm_setzero_pd();
1519             fjz2             = _mm_setzero_pd();
1520             fjx3             = _mm_setzero_pd();
1521             fjy3             = _mm_setzero_pd();
1522             fjz3             = _mm_setzero_pd();
1523
1524             /**************************
1525              * CALCULATE INTERACTIONS *
1526              **************************/
1527
1528             /* LENNARD-JONES DISPERSION/REPULSION */
1529
1530             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1531             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1532
1533             fscal            = fvdw;
1534
1535             /* Calculate temporary vectorial force */
1536             tx               = _mm_mul_pd(fscal,dx00);
1537             ty               = _mm_mul_pd(fscal,dy00);
1538             tz               = _mm_mul_pd(fscal,dz00);
1539
1540             /* Update vectorial force */
1541             fix0             = _mm_add_pd(fix0,tx);
1542             fiy0             = _mm_add_pd(fiy0,ty);
1543             fiz0             = _mm_add_pd(fiz0,tz);
1544
1545             fjx0             = _mm_add_pd(fjx0,tx);
1546             fjy0             = _mm_add_pd(fjy0,ty);
1547             fjz0             = _mm_add_pd(fjz0,tz);
1548
1549             /**************************
1550              * CALCULATE INTERACTIONS *
1551              **************************/
1552
1553             r11              = _mm_mul_pd(rsq11,rinv11);
1554
1555             /* EWALD ELECTROSTATICS */
1556
1557             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1558             ewrt             = _mm_mul_pd(r11,ewtabscale);
1559             ewitab           = _mm_cvttpd_epi32(ewrt);
1560             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1561             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1562                                          &ewtabF,&ewtabFn);
1563             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1564             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1565
1566             fscal            = felec;
1567
1568             /* Calculate temporary vectorial force */
1569             tx               = _mm_mul_pd(fscal,dx11);
1570             ty               = _mm_mul_pd(fscal,dy11);
1571             tz               = _mm_mul_pd(fscal,dz11);
1572
1573             /* Update vectorial force */
1574             fix1             = _mm_add_pd(fix1,tx);
1575             fiy1             = _mm_add_pd(fiy1,ty);
1576             fiz1             = _mm_add_pd(fiz1,tz);
1577
1578             fjx1             = _mm_add_pd(fjx1,tx);
1579             fjy1             = _mm_add_pd(fjy1,ty);
1580             fjz1             = _mm_add_pd(fjz1,tz);
1581
1582             /**************************
1583              * CALCULATE INTERACTIONS *
1584              **************************/
1585
1586             r12              = _mm_mul_pd(rsq12,rinv12);
1587
1588             /* EWALD ELECTROSTATICS */
1589
1590             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1591             ewrt             = _mm_mul_pd(r12,ewtabscale);
1592             ewitab           = _mm_cvttpd_epi32(ewrt);
1593             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1594             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1595                                          &ewtabF,&ewtabFn);
1596             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1597             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1598
1599             fscal            = felec;
1600
1601             /* Calculate temporary vectorial force */
1602             tx               = _mm_mul_pd(fscal,dx12);
1603             ty               = _mm_mul_pd(fscal,dy12);
1604             tz               = _mm_mul_pd(fscal,dz12);
1605
1606             /* Update vectorial force */
1607             fix1             = _mm_add_pd(fix1,tx);
1608             fiy1             = _mm_add_pd(fiy1,ty);
1609             fiz1             = _mm_add_pd(fiz1,tz);
1610
1611             fjx2             = _mm_add_pd(fjx2,tx);
1612             fjy2             = _mm_add_pd(fjy2,ty);
1613             fjz2             = _mm_add_pd(fjz2,tz);
1614
1615             /**************************
1616              * CALCULATE INTERACTIONS *
1617              **************************/
1618
1619             r13              = _mm_mul_pd(rsq13,rinv13);
1620
1621             /* EWALD ELECTROSTATICS */
1622
1623             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1624             ewrt             = _mm_mul_pd(r13,ewtabscale);
1625             ewitab           = _mm_cvttpd_epi32(ewrt);
1626             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1627             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1628                                          &ewtabF,&ewtabFn);
1629             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1630             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1631
1632             fscal            = felec;
1633
1634             /* Calculate temporary vectorial force */
1635             tx               = _mm_mul_pd(fscal,dx13);
1636             ty               = _mm_mul_pd(fscal,dy13);
1637             tz               = _mm_mul_pd(fscal,dz13);
1638
1639             /* Update vectorial force */
1640             fix1             = _mm_add_pd(fix1,tx);
1641             fiy1             = _mm_add_pd(fiy1,ty);
1642             fiz1             = _mm_add_pd(fiz1,tz);
1643
1644             fjx3             = _mm_add_pd(fjx3,tx);
1645             fjy3             = _mm_add_pd(fjy3,ty);
1646             fjz3             = _mm_add_pd(fjz3,tz);
1647
1648             /**************************
1649              * CALCULATE INTERACTIONS *
1650              **************************/
1651
1652             r21              = _mm_mul_pd(rsq21,rinv21);
1653
1654             /* EWALD ELECTROSTATICS */
1655
1656             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1657             ewrt             = _mm_mul_pd(r21,ewtabscale);
1658             ewitab           = _mm_cvttpd_epi32(ewrt);
1659             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1660             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1661                                          &ewtabF,&ewtabFn);
1662             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1663             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1664
1665             fscal            = felec;
1666
1667             /* Calculate temporary vectorial force */
1668             tx               = _mm_mul_pd(fscal,dx21);
1669             ty               = _mm_mul_pd(fscal,dy21);
1670             tz               = _mm_mul_pd(fscal,dz21);
1671
1672             /* Update vectorial force */
1673             fix2             = _mm_add_pd(fix2,tx);
1674             fiy2             = _mm_add_pd(fiy2,ty);
1675             fiz2             = _mm_add_pd(fiz2,tz);
1676
1677             fjx1             = _mm_add_pd(fjx1,tx);
1678             fjy1             = _mm_add_pd(fjy1,ty);
1679             fjz1             = _mm_add_pd(fjz1,tz);
1680
1681             /**************************
1682              * CALCULATE INTERACTIONS *
1683              **************************/
1684
1685             r22              = _mm_mul_pd(rsq22,rinv22);
1686
1687             /* EWALD ELECTROSTATICS */
1688
1689             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1690             ewrt             = _mm_mul_pd(r22,ewtabscale);
1691             ewitab           = _mm_cvttpd_epi32(ewrt);
1692             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1693             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1694                                          &ewtabF,&ewtabFn);
1695             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1696             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1697
1698             fscal            = felec;
1699
1700             /* Calculate temporary vectorial force */
1701             tx               = _mm_mul_pd(fscal,dx22);
1702             ty               = _mm_mul_pd(fscal,dy22);
1703             tz               = _mm_mul_pd(fscal,dz22);
1704
1705             /* Update vectorial force */
1706             fix2             = _mm_add_pd(fix2,tx);
1707             fiy2             = _mm_add_pd(fiy2,ty);
1708             fiz2             = _mm_add_pd(fiz2,tz);
1709
1710             fjx2             = _mm_add_pd(fjx2,tx);
1711             fjy2             = _mm_add_pd(fjy2,ty);
1712             fjz2             = _mm_add_pd(fjz2,tz);
1713
1714             /**************************
1715              * CALCULATE INTERACTIONS *
1716              **************************/
1717
1718             r23              = _mm_mul_pd(rsq23,rinv23);
1719
1720             /* EWALD ELECTROSTATICS */
1721
1722             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1723             ewrt             = _mm_mul_pd(r23,ewtabscale);
1724             ewitab           = _mm_cvttpd_epi32(ewrt);
1725             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1726             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1727                                          &ewtabF,&ewtabFn);
1728             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1729             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1730
1731             fscal            = felec;
1732
1733             /* Calculate temporary vectorial force */
1734             tx               = _mm_mul_pd(fscal,dx23);
1735             ty               = _mm_mul_pd(fscal,dy23);
1736             tz               = _mm_mul_pd(fscal,dz23);
1737
1738             /* Update vectorial force */
1739             fix2             = _mm_add_pd(fix2,tx);
1740             fiy2             = _mm_add_pd(fiy2,ty);
1741             fiz2             = _mm_add_pd(fiz2,tz);
1742
1743             fjx3             = _mm_add_pd(fjx3,tx);
1744             fjy3             = _mm_add_pd(fjy3,ty);
1745             fjz3             = _mm_add_pd(fjz3,tz);
1746
1747             /**************************
1748              * CALCULATE INTERACTIONS *
1749              **************************/
1750
1751             r31              = _mm_mul_pd(rsq31,rinv31);
1752
1753             /* EWALD ELECTROSTATICS */
1754
1755             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1756             ewrt             = _mm_mul_pd(r31,ewtabscale);
1757             ewitab           = _mm_cvttpd_epi32(ewrt);
1758             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1759             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1760                                          &ewtabF,&ewtabFn);
1761             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1762             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1763
1764             fscal            = felec;
1765
1766             /* Calculate temporary vectorial force */
1767             tx               = _mm_mul_pd(fscal,dx31);
1768             ty               = _mm_mul_pd(fscal,dy31);
1769             tz               = _mm_mul_pd(fscal,dz31);
1770
1771             /* Update vectorial force */
1772             fix3             = _mm_add_pd(fix3,tx);
1773             fiy3             = _mm_add_pd(fiy3,ty);
1774             fiz3             = _mm_add_pd(fiz3,tz);
1775
1776             fjx1             = _mm_add_pd(fjx1,tx);
1777             fjy1             = _mm_add_pd(fjy1,ty);
1778             fjz1             = _mm_add_pd(fjz1,tz);
1779
1780             /**************************
1781              * CALCULATE INTERACTIONS *
1782              **************************/
1783
1784             r32              = _mm_mul_pd(rsq32,rinv32);
1785
1786             /* EWALD ELECTROSTATICS */
1787
1788             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1789             ewrt             = _mm_mul_pd(r32,ewtabscale);
1790             ewitab           = _mm_cvttpd_epi32(ewrt);
1791             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1792             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1793                                          &ewtabF,&ewtabFn);
1794             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1795             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1796
1797             fscal            = felec;
1798
1799             /* Calculate temporary vectorial force */
1800             tx               = _mm_mul_pd(fscal,dx32);
1801             ty               = _mm_mul_pd(fscal,dy32);
1802             tz               = _mm_mul_pd(fscal,dz32);
1803
1804             /* Update vectorial force */
1805             fix3             = _mm_add_pd(fix3,tx);
1806             fiy3             = _mm_add_pd(fiy3,ty);
1807             fiz3             = _mm_add_pd(fiz3,tz);
1808
1809             fjx2             = _mm_add_pd(fjx2,tx);
1810             fjy2             = _mm_add_pd(fjy2,ty);
1811             fjz2             = _mm_add_pd(fjz2,tz);
1812
1813             /**************************
1814              * CALCULATE INTERACTIONS *
1815              **************************/
1816
1817             r33              = _mm_mul_pd(rsq33,rinv33);
1818
1819             /* EWALD ELECTROSTATICS */
1820
1821             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1822             ewrt             = _mm_mul_pd(r33,ewtabscale);
1823             ewitab           = _mm_cvttpd_epi32(ewrt);
1824             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1825             gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1826                                          &ewtabF,&ewtabFn);
1827             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1828             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1829
1830             fscal            = felec;
1831
1832             /* Calculate temporary vectorial force */
1833             tx               = _mm_mul_pd(fscal,dx33);
1834             ty               = _mm_mul_pd(fscal,dy33);
1835             tz               = _mm_mul_pd(fscal,dz33);
1836
1837             /* Update vectorial force */
1838             fix3             = _mm_add_pd(fix3,tx);
1839             fiy3             = _mm_add_pd(fiy3,ty);
1840             fiz3             = _mm_add_pd(fiz3,tz);
1841
1842             fjx3             = _mm_add_pd(fjx3,tx);
1843             fjy3             = _mm_add_pd(fjy3,ty);
1844             fjz3             = _mm_add_pd(fjz3,tz);
1845
1846             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1847
1848             /* Inner loop uses 354 flops */
1849         }
1850
1851         if(jidx<j_index_end)
1852         {
1853
1854             jnrA             = jjnr[jidx];
1855             j_coord_offsetA  = DIM*jnrA;
1856
1857             /* load j atom coordinates */
1858             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1859                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1860                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1861
1862             /* Calculate displacement vector */
1863             dx00             = _mm_sub_pd(ix0,jx0);
1864             dy00             = _mm_sub_pd(iy0,jy0);
1865             dz00             = _mm_sub_pd(iz0,jz0);
1866             dx11             = _mm_sub_pd(ix1,jx1);
1867             dy11             = _mm_sub_pd(iy1,jy1);
1868             dz11             = _mm_sub_pd(iz1,jz1);
1869             dx12             = _mm_sub_pd(ix1,jx2);
1870             dy12             = _mm_sub_pd(iy1,jy2);
1871             dz12             = _mm_sub_pd(iz1,jz2);
1872             dx13             = _mm_sub_pd(ix1,jx3);
1873             dy13             = _mm_sub_pd(iy1,jy3);
1874             dz13             = _mm_sub_pd(iz1,jz3);
1875             dx21             = _mm_sub_pd(ix2,jx1);
1876             dy21             = _mm_sub_pd(iy2,jy1);
1877             dz21             = _mm_sub_pd(iz2,jz1);
1878             dx22             = _mm_sub_pd(ix2,jx2);
1879             dy22             = _mm_sub_pd(iy2,jy2);
1880             dz22             = _mm_sub_pd(iz2,jz2);
1881             dx23             = _mm_sub_pd(ix2,jx3);
1882             dy23             = _mm_sub_pd(iy2,jy3);
1883             dz23             = _mm_sub_pd(iz2,jz3);
1884             dx31             = _mm_sub_pd(ix3,jx1);
1885             dy31             = _mm_sub_pd(iy3,jy1);
1886             dz31             = _mm_sub_pd(iz3,jz1);
1887             dx32             = _mm_sub_pd(ix3,jx2);
1888             dy32             = _mm_sub_pd(iy3,jy2);
1889             dz32             = _mm_sub_pd(iz3,jz2);
1890             dx33             = _mm_sub_pd(ix3,jx3);
1891             dy33             = _mm_sub_pd(iy3,jy3);
1892             dz33             = _mm_sub_pd(iz3,jz3);
1893
1894             /* Calculate squared distance and things based on it */
1895             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1896             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1897             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1898             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1899             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1900             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1901             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1902             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1903             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1904             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1905
1906             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1907             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1908             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1909             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1910             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1911             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1912             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1913             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1914             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1915
1916             rinvsq00         = gmx_mm_inv_pd(rsq00);
1917             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1918             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1919             rinvsq13         = _mm_mul_pd(rinv13,rinv13);
1920             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1921             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1922             rinvsq23         = _mm_mul_pd(rinv23,rinv23);
1923             rinvsq31         = _mm_mul_pd(rinv31,rinv31);
1924             rinvsq32         = _mm_mul_pd(rinv32,rinv32);
1925             rinvsq33         = _mm_mul_pd(rinv33,rinv33);
1926
1927             fjx0             = _mm_setzero_pd();
1928             fjy0             = _mm_setzero_pd();
1929             fjz0             = _mm_setzero_pd();
1930             fjx1             = _mm_setzero_pd();
1931             fjy1             = _mm_setzero_pd();
1932             fjz1             = _mm_setzero_pd();
1933             fjx2             = _mm_setzero_pd();
1934             fjy2             = _mm_setzero_pd();
1935             fjz2             = _mm_setzero_pd();
1936             fjx3             = _mm_setzero_pd();
1937             fjy3             = _mm_setzero_pd();
1938             fjz3             = _mm_setzero_pd();
1939
1940             /**************************
1941              * CALCULATE INTERACTIONS *
1942              **************************/
1943
1944             /* LENNARD-JONES DISPERSION/REPULSION */
1945
1946             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1947             fvdw             = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1948
1949             fscal            = fvdw;
1950
1951             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1952
1953             /* Calculate temporary vectorial force */
1954             tx               = _mm_mul_pd(fscal,dx00);
1955             ty               = _mm_mul_pd(fscal,dy00);
1956             tz               = _mm_mul_pd(fscal,dz00);
1957
1958             /* Update vectorial force */
1959             fix0             = _mm_add_pd(fix0,tx);
1960             fiy0             = _mm_add_pd(fiy0,ty);
1961             fiz0             = _mm_add_pd(fiz0,tz);
1962
1963             fjx0             = _mm_add_pd(fjx0,tx);
1964             fjy0             = _mm_add_pd(fjy0,ty);
1965             fjz0             = _mm_add_pd(fjz0,tz);
1966
1967             /**************************
1968              * CALCULATE INTERACTIONS *
1969              **************************/
1970
1971             r11              = _mm_mul_pd(rsq11,rinv11);
1972
1973             /* EWALD ELECTROSTATICS */
1974
1975             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1976             ewrt             = _mm_mul_pd(r11,ewtabscale);
1977             ewitab           = _mm_cvttpd_epi32(ewrt);
1978             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1979             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1980             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1981             felec            = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1982
1983             fscal            = felec;
1984
1985             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1986
1987             /* Calculate temporary vectorial force */
1988             tx               = _mm_mul_pd(fscal,dx11);
1989             ty               = _mm_mul_pd(fscal,dy11);
1990             tz               = _mm_mul_pd(fscal,dz11);
1991
1992             /* Update vectorial force */
1993             fix1             = _mm_add_pd(fix1,tx);
1994             fiy1             = _mm_add_pd(fiy1,ty);
1995             fiz1             = _mm_add_pd(fiz1,tz);
1996
1997             fjx1             = _mm_add_pd(fjx1,tx);
1998             fjy1             = _mm_add_pd(fjy1,ty);
1999             fjz1             = _mm_add_pd(fjz1,tz);
2000
2001             /**************************
2002              * CALCULATE INTERACTIONS *
2003              **************************/
2004
2005             r12              = _mm_mul_pd(rsq12,rinv12);
2006
2007             /* EWALD ELECTROSTATICS */
2008
2009             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2010             ewrt             = _mm_mul_pd(r12,ewtabscale);
2011             ewitab           = _mm_cvttpd_epi32(ewrt);
2012             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2013             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2014             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2015             felec            = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2016
2017             fscal            = felec;
2018
2019             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2020
2021             /* Calculate temporary vectorial force */
2022             tx               = _mm_mul_pd(fscal,dx12);
2023             ty               = _mm_mul_pd(fscal,dy12);
2024             tz               = _mm_mul_pd(fscal,dz12);
2025
2026             /* Update vectorial force */
2027             fix1             = _mm_add_pd(fix1,tx);
2028             fiy1             = _mm_add_pd(fiy1,ty);
2029             fiz1             = _mm_add_pd(fiz1,tz);
2030
2031             fjx2             = _mm_add_pd(fjx2,tx);
2032             fjy2             = _mm_add_pd(fjy2,ty);
2033             fjz2             = _mm_add_pd(fjz2,tz);
2034
2035             /**************************
2036              * CALCULATE INTERACTIONS *
2037              **************************/
2038
2039             r13              = _mm_mul_pd(rsq13,rinv13);
2040
2041             /* EWALD ELECTROSTATICS */
2042
2043             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2044             ewrt             = _mm_mul_pd(r13,ewtabscale);
2045             ewitab           = _mm_cvttpd_epi32(ewrt);
2046             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2047             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2048             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2049             felec            = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2050
2051             fscal            = felec;
2052
2053             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2054
2055             /* Calculate temporary vectorial force */
2056             tx               = _mm_mul_pd(fscal,dx13);
2057             ty               = _mm_mul_pd(fscal,dy13);
2058             tz               = _mm_mul_pd(fscal,dz13);
2059
2060             /* Update vectorial force */
2061             fix1             = _mm_add_pd(fix1,tx);
2062             fiy1             = _mm_add_pd(fiy1,ty);
2063             fiz1             = _mm_add_pd(fiz1,tz);
2064
2065             fjx3             = _mm_add_pd(fjx3,tx);
2066             fjy3             = _mm_add_pd(fjy3,ty);
2067             fjz3             = _mm_add_pd(fjz3,tz);
2068
2069             /**************************
2070              * CALCULATE INTERACTIONS *
2071              **************************/
2072
2073             r21              = _mm_mul_pd(rsq21,rinv21);
2074
2075             /* EWALD ELECTROSTATICS */
2076
2077             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2078             ewrt             = _mm_mul_pd(r21,ewtabscale);
2079             ewitab           = _mm_cvttpd_epi32(ewrt);
2080             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2081             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2082             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2083             felec            = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2084
2085             fscal            = felec;
2086
2087             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2088
2089             /* Calculate temporary vectorial force */
2090             tx               = _mm_mul_pd(fscal,dx21);
2091             ty               = _mm_mul_pd(fscal,dy21);
2092             tz               = _mm_mul_pd(fscal,dz21);
2093
2094             /* Update vectorial force */
2095             fix2             = _mm_add_pd(fix2,tx);
2096             fiy2             = _mm_add_pd(fiy2,ty);
2097             fiz2             = _mm_add_pd(fiz2,tz);
2098
2099             fjx1             = _mm_add_pd(fjx1,tx);
2100             fjy1             = _mm_add_pd(fjy1,ty);
2101             fjz1             = _mm_add_pd(fjz1,tz);
2102
2103             /**************************
2104              * CALCULATE INTERACTIONS *
2105              **************************/
2106
2107             r22              = _mm_mul_pd(rsq22,rinv22);
2108
2109             /* EWALD ELECTROSTATICS */
2110
2111             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2112             ewrt             = _mm_mul_pd(r22,ewtabscale);
2113             ewitab           = _mm_cvttpd_epi32(ewrt);
2114             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2115             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2116             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2117             felec            = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2118
2119             fscal            = felec;
2120
2121             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2122
2123             /* Calculate temporary vectorial force */
2124             tx               = _mm_mul_pd(fscal,dx22);
2125             ty               = _mm_mul_pd(fscal,dy22);
2126             tz               = _mm_mul_pd(fscal,dz22);
2127
2128             /* Update vectorial force */
2129             fix2             = _mm_add_pd(fix2,tx);
2130             fiy2             = _mm_add_pd(fiy2,ty);
2131             fiz2             = _mm_add_pd(fiz2,tz);
2132
2133             fjx2             = _mm_add_pd(fjx2,tx);
2134             fjy2             = _mm_add_pd(fjy2,ty);
2135             fjz2             = _mm_add_pd(fjz2,tz);
2136
2137             /**************************
2138              * CALCULATE INTERACTIONS *
2139              **************************/
2140
2141             r23              = _mm_mul_pd(rsq23,rinv23);
2142
2143             /* EWALD ELECTROSTATICS */
2144
2145             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2146             ewrt             = _mm_mul_pd(r23,ewtabscale);
2147             ewitab           = _mm_cvttpd_epi32(ewrt);
2148             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2149             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2150             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2151             felec            = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2152
2153             fscal            = felec;
2154
2155             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2156
2157             /* Calculate temporary vectorial force */
2158             tx               = _mm_mul_pd(fscal,dx23);
2159             ty               = _mm_mul_pd(fscal,dy23);
2160             tz               = _mm_mul_pd(fscal,dz23);
2161
2162             /* Update vectorial force */
2163             fix2             = _mm_add_pd(fix2,tx);
2164             fiy2             = _mm_add_pd(fiy2,ty);
2165             fiz2             = _mm_add_pd(fiz2,tz);
2166
2167             fjx3             = _mm_add_pd(fjx3,tx);
2168             fjy3             = _mm_add_pd(fjy3,ty);
2169             fjz3             = _mm_add_pd(fjz3,tz);
2170
2171             /**************************
2172              * CALCULATE INTERACTIONS *
2173              **************************/
2174
2175             r31              = _mm_mul_pd(rsq31,rinv31);
2176
2177             /* EWALD ELECTROSTATICS */
2178
2179             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2180             ewrt             = _mm_mul_pd(r31,ewtabscale);
2181             ewitab           = _mm_cvttpd_epi32(ewrt);
2182             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2183             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2184             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2185             felec            = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2186
2187             fscal            = felec;
2188
2189             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2190
2191             /* Calculate temporary vectorial force */
2192             tx               = _mm_mul_pd(fscal,dx31);
2193             ty               = _mm_mul_pd(fscal,dy31);
2194             tz               = _mm_mul_pd(fscal,dz31);
2195
2196             /* Update vectorial force */
2197             fix3             = _mm_add_pd(fix3,tx);
2198             fiy3             = _mm_add_pd(fiy3,ty);
2199             fiz3             = _mm_add_pd(fiz3,tz);
2200
2201             fjx1             = _mm_add_pd(fjx1,tx);
2202             fjy1             = _mm_add_pd(fjy1,ty);
2203             fjz1             = _mm_add_pd(fjz1,tz);
2204
2205             /**************************
2206              * CALCULATE INTERACTIONS *
2207              **************************/
2208
2209             r32              = _mm_mul_pd(rsq32,rinv32);
2210
2211             /* EWALD ELECTROSTATICS */
2212
2213             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2214             ewrt             = _mm_mul_pd(r32,ewtabscale);
2215             ewitab           = _mm_cvttpd_epi32(ewrt);
2216             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2217             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2218             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2219             felec            = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2220
2221             fscal            = felec;
2222
2223             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2224
2225             /* Calculate temporary vectorial force */
2226             tx               = _mm_mul_pd(fscal,dx32);
2227             ty               = _mm_mul_pd(fscal,dy32);
2228             tz               = _mm_mul_pd(fscal,dz32);
2229
2230             /* Update vectorial force */
2231             fix3             = _mm_add_pd(fix3,tx);
2232             fiy3             = _mm_add_pd(fiy3,ty);
2233             fiz3             = _mm_add_pd(fiz3,tz);
2234
2235             fjx2             = _mm_add_pd(fjx2,tx);
2236             fjy2             = _mm_add_pd(fjy2,ty);
2237             fjz2             = _mm_add_pd(fjz2,tz);
2238
2239             /**************************
2240              * CALCULATE INTERACTIONS *
2241              **************************/
2242
2243             r33              = _mm_mul_pd(rsq33,rinv33);
2244
2245             /* EWALD ELECTROSTATICS */
2246
2247             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2248             ewrt             = _mm_mul_pd(r33,ewtabscale);
2249             ewitab           = _mm_cvttpd_epi32(ewrt);
2250             eweps            = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2251             gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2252             felec            = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2253             felec            = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2254
2255             fscal            = felec;
2256
2257             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2258
2259             /* Calculate temporary vectorial force */
2260             tx               = _mm_mul_pd(fscal,dx33);
2261             ty               = _mm_mul_pd(fscal,dy33);
2262             tz               = _mm_mul_pd(fscal,dz33);
2263
2264             /* Update vectorial force */
2265             fix3             = _mm_add_pd(fix3,tx);
2266             fiy3             = _mm_add_pd(fiy3,ty);
2267             fiz3             = _mm_add_pd(fiz3,tz);
2268
2269             fjx3             = _mm_add_pd(fjx3,tx);
2270             fjy3             = _mm_add_pd(fjy3,ty);
2271             fjz3             = _mm_add_pd(fjz3,tz);
2272
2273             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2274
2275             /* Inner loop uses 354 flops */
2276         }
2277
2278         /* End of innermost loop */
2279
2280         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2281                                               f+i_coord_offset,fshift+i_shift_offset);
2282
2283         /* Increment number of inner iterations */
2284         inneriter                  += j_index_end - j_index_start;
2285
2286         /* Outer loop uses 24 flops */
2287     }
2288
2289     /* Increment number of outer iterations */
2290     outeriter        += nri;
2291
2292     /* Update outer/inner flops */
2293
2294     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*354);
2295 }