Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecCSTab_VdwNone_GeomW3W3_avx_128_fma_double.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_128_fma_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_double
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            None
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB;
74     int              j_coord_offsetA,j_coord_offsetB;
75     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
76     real             rcutoff_scalar;
77     real             *shiftvec,*fshift,*x,*f;
78     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79     int              vdwioffset0;
80     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
81     int              vdwioffset1;
82     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwjidx0A,vdwjidx0B;
86     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
87     int              vdwjidx1A,vdwjidx1B;
88     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
89     int              vdwjidx2A,vdwjidx2B;
90     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
91     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
93     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
94     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
101     real             *charge;
102     __m128i          vfitab;
103     __m128i          ifour       = _mm_set1_epi32(4);
104     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
105     real             *vftab;
106     __m128d          dummy_mask,cutoff_mask;
107     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
108     __m128d          one     = _mm_set1_pd(1.0);
109     __m128d          two     = _mm_set1_pd(2.0);
110     x                = xx[0];
111     f                = ff[0];
112
113     nri              = nlist->nri;
114     iinr             = nlist->iinr;
115     jindex           = nlist->jindex;
116     jjnr             = nlist->jjnr;
117     shiftidx         = nlist->shift;
118     gid              = nlist->gid;
119     shiftvec         = fr->shift_vec[0];
120     fshift           = fr->fshift[0];
121     facel            = _mm_set1_pd(fr->ic->epsfac);
122     charge           = mdatoms->chargeA;
123
124     vftab            = kernel_data->table_elec->data;
125     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
126
127     /* Setup water-specific parameters */
128     inr              = nlist->iinr[0];
129     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
130     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
131     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
132
133     jq0              = _mm_set1_pd(charge[inr+0]);
134     jq1              = _mm_set1_pd(charge[inr+1]);
135     jq2              = _mm_set1_pd(charge[inr+2]);
136     qq00             = _mm_mul_pd(iq0,jq0);
137     qq01             = _mm_mul_pd(iq0,jq1);
138     qq02             = _mm_mul_pd(iq0,jq2);
139     qq10             = _mm_mul_pd(iq1,jq0);
140     qq11             = _mm_mul_pd(iq1,jq1);
141     qq12             = _mm_mul_pd(iq1,jq2);
142     qq20             = _mm_mul_pd(iq2,jq0);
143     qq21             = _mm_mul_pd(iq2,jq1);
144     qq22             = _mm_mul_pd(iq2,jq2);
145
146     /* Avoid stupid compiler warnings */
147     jnrA = jnrB = 0;
148     j_coord_offsetA = 0;
149     j_coord_offsetB = 0;
150
151     outeriter        = 0;
152     inneriter        = 0;
153
154     /* Start outer loop over neighborlists */
155     for(iidx=0; iidx<nri; iidx++)
156     {
157         /* Load shift vector for this list */
158         i_shift_offset   = DIM*shiftidx[iidx];
159
160         /* Load limits for loop over neighbors */
161         j_index_start    = jindex[iidx];
162         j_index_end      = jindex[iidx+1];
163
164         /* Get outer coordinate index */
165         inr              = iinr[iidx];
166         i_coord_offset   = DIM*inr;
167
168         /* Load i particle coords and add shift vector */
169         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
170                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
171
172         fix0             = _mm_setzero_pd();
173         fiy0             = _mm_setzero_pd();
174         fiz0             = _mm_setzero_pd();
175         fix1             = _mm_setzero_pd();
176         fiy1             = _mm_setzero_pd();
177         fiz1             = _mm_setzero_pd();
178         fix2             = _mm_setzero_pd();
179         fiy2             = _mm_setzero_pd();
180         fiz2             = _mm_setzero_pd();
181
182         /* Reset potential sums */
183         velecsum         = _mm_setzero_pd();
184
185         /* Start inner kernel loop */
186         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
187         {
188
189             /* Get j neighbor index, and coordinate index */
190             jnrA             = jjnr[jidx];
191             jnrB             = jjnr[jidx+1];
192             j_coord_offsetA  = DIM*jnrA;
193             j_coord_offsetB  = DIM*jnrB;
194
195             /* load j atom coordinates */
196             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
197                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
198
199             /* Calculate displacement vector */
200             dx00             = _mm_sub_pd(ix0,jx0);
201             dy00             = _mm_sub_pd(iy0,jy0);
202             dz00             = _mm_sub_pd(iz0,jz0);
203             dx01             = _mm_sub_pd(ix0,jx1);
204             dy01             = _mm_sub_pd(iy0,jy1);
205             dz01             = _mm_sub_pd(iz0,jz1);
206             dx02             = _mm_sub_pd(ix0,jx2);
207             dy02             = _mm_sub_pd(iy0,jy2);
208             dz02             = _mm_sub_pd(iz0,jz2);
209             dx10             = _mm_sub_pd(ix1,jx0);
210             dy10             = _mm_sub_pd(iy1,jy0);
211             dz10             = _mm_sub_pd(iz1,jz0);
212             dx11             = _mm_sub_pd(ix1,jx1);
213             dy11             = _mm_sub_pd(iy1,jy1);
214             dz11             = _mm_sub_pd(iz1,jz1);
215             dx12             = _mm_sub_pd(ix1,jx2);
216             dy12             = _mm_sub_pd(iy1,jy2);
217             dz12             = _mm_sub_pd(iz1,jz2);
218             dx20             = _mm_sub_pd(ix2,jx0);
219             dy20             = _mm_sub_pd(iy2,jy0);
220             dz20             = _mm_sub_pd(iz2,jz0);
221             dx21             = _mm_sub_pd(ix2,jx1);
222             dy21             = _mm_sub_pd(iy2,jy1);
223             dz21             = _mm_sub_pd(iz2,jz1);
224             dx22             = _mm_sub_pd(ix2,jx2);
225             dy22             = _mm_sub_pd(iy2,jy2);
226             dz22             = _mm_sub_pd(iz2,jz2);
227
228             /* Calculate squared distance and things based on it */
229             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
230             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
231             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
232             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
233             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
234             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
235             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
236             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
237             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
238
239             rinv00           = avx128fma_invsqrt_d(rsq00);
240             rinv01           = avx128fma_invsqrt_d(rsq01);
241             rinv02           = avx128fma_invsqrt_d(rsq02);
242             rinv10           = avx128fma_invsqrt_d(rsq10);
243             rinv11           = avx128fma_invsqrt_d(rsq11);
244             rinv12           = avx128fma_invsqrt_d(rsq12);
245             rinv20           = avx128fma_invsqrt_d(rsq20);
246             rinv21           = avx128fma_invsqrt_d(rsq21);
247             rinv22           = avx128fma_invsqrt_d(rsq22);
248
249             fjx0             = _mm_setzero_pd();
250             fjy0             = _mm_setzero_pd();
251             fjz0             = _mm_setzero_pd();
252             fjx1             = _mm_setzero_pd();
253             fjy1             = _mm_setzero_pd();
254             fjz1             = _mm_setzero_pd();
255             fjx2             = _mm_setzero_pd();
256             fjy2             = _mm_setzero_pd();
257             fjz2             = _mm_setzero_pd();
258
259             /**************************
260              * CALCULATE INTERACTIONS *
261              **************************/
262
263             r00              = _mm_mul_pd(rsq00,rinv00);
264
265             /* Calculate table index by multiplying r with table scale and truncate to integer */
266             rt               = _mm_mul_pd(r00,vftabscale);
267             vfitab           = _mm_cvttpd_epi32(rt);
268 #ifdef __XOP__
269             vfeps            = _mm_frcz_pd(rt);
270 #else
271             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
272 #endif
273             twovfeps         = _mm_add_pd(vfeps,vfeps);
274             vfitab           = _mm_slli_epi32(vfitab,2);
275
276             /* CUBIC SPLINE TABLE ELECTROSTATICS */
277             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
278             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
279             GMX_MM_TRANSPOSE2_PD(Y,F);
280             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
281             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
282             GMX_MM_TRANSPOSE2_PD(G,H);
283             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
284             VV               = _mm_macc_pd(vfeps,Fp,Y);
285             velec            = _mm_mul_pd(qq00,VV);
286             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
287             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
288
289             /* Update potential sum for this i atom from the interaction with this j atom. */
290             velecsum         = _mm_add_pd(velecsum,velec);
291
292             fscal            = felec;
293
294             /* Update vectorial force */
295             fix0             = _mm_macc_pd(dx00,fscal,fix0);
296             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
297             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
298             
299             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
300             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
301             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
302
303             /**************************
304              * CALCULATE INTERACTIONS *
305              **************************/
306
307             r01              = _mm_mul_pd(rsq01,rinv01);
308
309             /* Calculate table index by multiplying r with table scale and truncate to integer */
310             rt               = _mm_mul_pd(r01,vftabscale);
311             vfitab           = _mm_cvttpd_epi32(rt);
312 #ifdef __XOP__
313             vfeps            = _mm_frcz_pd(rt);
314 #else
315             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
316 #endif
317             twovfeps         = _mm_add_pd(vfeps,vfeps);
318             vfitab           = _mm_slli_epi32(vfitab,2);
319
320             /* CUBIC SPLINE TABLE ELECTROSTATICS */
321             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
322             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
323             GMX_MM_TRANSPOSE2_PD(Y,F);
324             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
325             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
326             GMX_MM_TRANSPOSE2_PD(G,H);
327             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
328             VV               = _mm_macc_pd(vfeps,Fp,Y);
329             velec            = _mm_mul_pd(qq01,VV);
330             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
331             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
332
333             /* Update potential sum for this i atom from the interaction with this j atom. */
334             velecsum         = _mm_add_pd(velecsum,velec);
335
336             fscal            = felec;
337
338             /* Update vectorial force */
339             fix0             = _mm_macc_pd(dx01,fscal,fix0);
340             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
341             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
342             
343             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
344             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
345             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
346
347             /**************************
348              * CALCULATE INTERACTIONS *
349              **************************/
350
351             r02              = _mm_mul_pd(rsq02,rinv02);
352
353             /* Calculate table index by multiplying r with table scale and truncate to integer */
354             rt               = _mm_mul_pd(r02,vftabscale);
355             vfitab           = _mm_cvttpd_epi32(rt);
356 #ifdef __XOP__
357             vfeps            = _mm_frcz_pd(rt);
358 #else
359             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
360 #endif
361             twovfeps         = _mm_add_pd(vfeps,vfeps);
362             vfitab           = _mm_slli_epi32(vfitab,2);
363
364             /* CUBIC SPLINE TABLE ELECTROSTATICS */
365             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
366             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
367             GMX_MM_TRANSPOSE2_PD(Y,F);
368             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
369             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
370             GMX_MM_TRANSPOSE2_PD(G,H);
371             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
372             VV               = _mm_macc_pd(vfeps,Fp,Y);
373             velec            = _mm_mul_pd(qq02,VV);
374             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
375             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
376
377             /* Update potential sum for this i atom from the interaction with this j atom. */
378             velecsum         = _mm_add_pd(velecsum,velec);
379
380             fscal            = felec;
381
382             /* Update vectorial force */
383             fix0             = _mm_macc_pd(dx02,fscal,fix0);
384             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
385             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
386             
387             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
388             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
389             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
390
391             /**************************
392              * CALCULATE INTERACTIONS *
393              **************************/
394
395             r10              = _mm_mul_pd(rsq10,rinv10);
396
397             /* Calculate table index by multiplying r with table scale and truncate to integer */
398             rt               = _mm_mul_pd(r10,vftabscale);
399             vfitab           = _mm_cvttpd_epi32(rt);
400 #ifdef __XOP__
401             vfeps            = _mm_frcz_pd(rt);
402 #else
403             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
404 #endif
405             twovfeps         = _mm_add_pd(vfeps,vfeps);
406             vfitab           = _mm_slli_epi32(vfitab,2);
407
408             /* CUBIC SPLINE TABLE ELECTROSTATICS */
409             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
410             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
411             GMX_MM_TRANSPOSE2_PD(Y,F);
412             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
413             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
414             GMX_MM_TRANSPOSE2_PD(G,H);
415             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
416             VV               = _mm_macc_pd(vfeps,Fp,Y);
417             velec            = _mm_mul_pd(qq10,VV);
418             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
419             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
420
421             /* Update potential sum for this i atom from the interaction with this j atom. */
422             velecsum         = _mm_add_pd(velecsum,velec);
423
424             fscal            = felec;
425
426             /* Update vectorial force */
427             fix1             = _mm_macc_pd(dx10,fscal,fix1);
428             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
429             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
430             
431             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
432             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
433             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
434
435             /**************************
436              * CALCULATE INTERACTIONS *
437              **************************/
438
439             r11              = _mm_mul_pd(rsq11,rinv11);
440
441             /* Calculate table index by multiplying r with table scale and truncate to integer */
442             rt               = _mm_mul_pd(r11,vftabscale);
443             vfitab           = _mm_cvttpd_epi32(rt);
444 #ifdef __XOP__
445             vfeps            = _mm_frcz_pd(rt);
446 #else
447             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
448 #endif
449             twovfeps         = _mm_add_pd(vfeps,vfeps);
450             vfitab           = _mm_slli_epi32(vfitab,2);
451
452             /* CUBIC SPLINE TABLE ELECTROSTATICS */
453             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
454             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
455             GMX_MM_TRANSPOSE2_PD(Y,F);
456             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
457             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
458             GMX_MM_TRANSPOSE2_PD(G,H);
459             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
460             VV               = _mm_macc_pd(vfeps,Fp,Y);
461             velec            = _mm_mul_pd(qq11,VV);
462             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
463             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
464
465             /* Update potential sum for this i atom from the interaction with this j atom. */
466             velecsum         = _mm_add_pd(velecsum,velec);
467
468             fscal            = felec;
469
470             /* Update vectorial force */
471             fix1             = _mm_macc_pd(dx11,fscal,fix1);
472             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
473             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
474             
475             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
476             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
477             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
478
479             /**************************
480              * CALCULATE INTERACTIONS *
481              **************************/
482
483             r12              = _mm_mul_pd(rsq12,rinv12);
484
485             /* Calculate table index by multiplying r with table scale and truncate to integer */
486             rt               = _mm_mul_pd(r12,vftabscale);
487             vfitab           = _mm_cvttpd_epi32(rt);
488 #ifdef __XOP__
489             vfeps            = _mm_frcz_pd(rt);
490 #else
491             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
492 #endif
493             twovfeps         = _mm_add_pd(vfeps,vfeps);
494             vfitab           = _mm_slli_epi32(vfitab,2);
495
496             /* CUBIC SPLINE TABLE ELECTROSTATICS */
497             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
498             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
499             GMX_MM_TRANSPOSE2_PD(Y,F);
500             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
501             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
502             GMX_MM_TRANSPOSE2_PD(G,H);
503             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
504             VV               = _mm_macc_pd(vfeps,Fp,Y);
505             velec            = _mm_mul_pd(qq12,VV);
506             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
507             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
508
509             /* Update potential sum for this i atom from the interaction with this j atom. */
510             velecsum         = _mm_add_pd(velecsum,velec);
511
512             fscal            = felec;
513
514             /* Update vectorial force */
515             fix1             = _mm_macc_pd(dx12,fscal,fix1);
516             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
517             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
518             
519             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
520             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
521             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
522
523             /**************************
524              * CALCULATE INTERACTIONS *
525              **************************/
526
527             r20              = _mm_mul_pd(rsq20,rinv20);
528
529             /* Calculate table index by multiplying r with table scale and truncate to integer */
530             rt               = _mm_mul_pd(r20,vftabscale);
531             vfitab           = _mm_cvttpd_epi32(rt);
532 #ifdef __XOP__
533             vfeps            = _mm_frcz_pd(rt);
534 #else
535             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
536 #endif
537             twovfeps         = _mm_add_pd(vfeps,vfeps);
538             vfitab           = _mm_slli_epi32(vfitab,2);
539
540             /* CUBIC SPLINE TABLE ELECTROSTATICS */
541             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
542             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
543             GMX_MM_TRANSPOSE2_PD(Y,F);
544             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
545             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
546             GMX_MM_TRANSPOSE2_PD(G,H);
547             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
548             VV               = _mm_macc_pd(vfeps,Fp,Y);
549             velec            = _mm_mul_pd(qq20,VV);
550             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
551             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
552
553             /* Update potential sum for this i atom from the interaction with this j atom. */
554             velecsum         = _mm_add_pd(velecsum,velec);
555
556             fscal            = felec;
557
558             /* Update vectorial force */
559             fix2             = _mm_macc_pd(dx20,fscal,fix2);
560             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
561             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
562             
563             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
564             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
565             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
566
567             /**************************
568              * CALCULATE INTERACTIONS *
569              **************************/
570
571             r21              = _mm_mul_pd(rsq21,rinv21);
572
573             /* Calculate table index by multiplying r with table scale and truncate to integer */
574             rt               = _mm_mul_pd(r21,vftabscale);
575             vfitab           = _mm_cvttpd_epi32(rt);
576 #ifdef __XOP__
577             vfeps            = _mm_frcz_pd(rt);
578 #else
579             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
580 #endif
581             twovfeps         = _mm_add_pd(vfeps,vfeps);
582             vfitab           = _mm_slli_epi32(vfitab,2);
583
584             /* CUBIC SPLINE TABLE ELECTROSTATICS */
585             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
586             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
587             GMX_MM_TRANSPOSE2_PD(Y,F);
588             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
589             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
590             GMX_MM_TRANSPOSE2_PD(G,H);
591             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
592             VV               = _mm_macc_pd(vfeps,Fp,Y);
593             velec            = _mm_mul_pd(qq21,VV);
594             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
595             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
596
597             /* Update potential sum for this i atom from the interaction with this j atom. */
598             velecsum         = _mm_add_pd(velecsum,velec);
599
600             fscal            = felec;
601
602             /* Update vectorial force */
603             fix2             = _mm_macc_pd(dx21,fscal,fix2);
604             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
605             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
606             
607             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
608             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
609             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
610
611             /**************************
612              * CALCULATE INTERACTIONS *
613              **************************/
614
615             r22              = _mm_mul_pd(rsq22,rinv22);
616
617             /* Calculate table index by multiplying r with table scale and truncate to integer */
618             rt               = _mm_mul_pd(r22,vftabscale);
619             vfitab           = _mm_cvttpd_epi32(rt);
620 #ifdef __XOP__
621             vfeps            = _mm_frcz_pd(rt);
622 #else
623             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
624 #endif
625             twovfeps         = _mm_add_pd(vfeps,vfeps);
626             vfitab           = _mm_slli_epi32(vfitab,2);
627
628             /* CUBIC SPLINE TABLE ELECTROSTATICS */
629             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
630             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
631             GMX_MM_TRANSPOSE2_PD(Y,F);
632             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
633             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
634             GMX_MM_TRANSPOSE2_PD(G,H);
635             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
636             VV               = _mm_macc_pd(vfeps,Fp,Y);
637             velec            = _mm_mul_pd(qq22,VV);
638             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
639             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
640
641             /* Update potential sum for this i atom from the interaction with this j atom. */
642             velecsum         = _mm_add_pd(velecsum,velec);
643
644             fscal            = felec;
645
646             /* Update vectorial force */
647             fix2             = _mm_macc_pd(dx22,fscal,fix2);
648             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
649             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
650             
651             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
652             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
653             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
654
655             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
656
657             /* Inner loop uses 414 flops */
658         }
659
660         if(jidx<j_index_end)
661         {
662
663             jnrA             = jjnr[jidx];
664             j_coord_offsetA  = DIM*jnrA;
665
666             /* load j atom coordinates */
667             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
668                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
669
670             /* Calculate displacement vector */
671             dx00             = _mm_sub_pd(ix0,jx0);
672             dy00             = _mm_sub_pd(iy0,jy0);
673             dz00             = _mm_sub_pd(iz0,jz0);
674             dx01             = _mm_sub_pd(ix0,jx1);
675             dy01             = _mm_sub_pd(iy0,jy1);
676             dz01             = _mm_sub_pd(iz0,jz1);
677             dx02             = _mm_sub_pd(ix0,jx2);
678             dy02             = _mm_sub_pd(iy0,jy2);
679             dz02             = _mm_sub_pd(iz0,jz2);
680             dx10             = _mm_sub_pd(ix1,jx0);
681             dy10             = _mm_sub_pd(iy1,jy0);
682             dz10             = _mm_sub_pd(iz1,jz0);
683             dx11             = _mm_sub_pd(ix1,jx1);
684             dy11             = _mm_sub_pd(iy1,jy1);
685             dz11             = _mm_sub_pd(iz1,jz1);
686             dx12             = _mm_sub_pd(ix1,jx2);
687             dy12             = _mm_sub_pd(iy1,jy2);
688             dz12             = _mm_sub_pd(iz1,jz2);
689             dx20             = _mm_sub_pd(ix2,jx0);
690             dy20             = _mm_sub_pd(iy2,jy0);
691             dz20             = _mm_sub_pd(iz2,jz0);
692             dx21             = _mm_sub_pd(ix2,jx1);
693             dy21             = _mm_sub_pd(iy2,jy1);
694             dz21             = _mm_sub_pd(iz2,jz1);
695             dx22             = _mm_sub_pd(ix2,jx2);
696             dy22             = _mm_sub_pd(iy2,jy2);
697             dz22             = _mm_sub_pd(iz2,jz2);
698
699             /* Calculate squared distance and things based on it */
700             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
701             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
702             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
703             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
704             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
705             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
706             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
707             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
708             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
709
710             rinv00           = avx128fma_invsqrt_d(rsq00);
711             rinv01           = avx128fma_invsqrt_d(rsq01);
712             rinv02           = avx128fma_invsqrt_d(rsq02);
713             rinv10           = avx128fma_invsqrt_d(rsq10);
714             rinv11           = avx128fma_invsqrt_d(rsq11);
715             rinv12           = avx128fma_invsqrt_d(rsq12);
716             rinv20           = avx128fma_invsqrt_d(rsq20);
717             rinv21           = avx128fma_invsqrt_d(rsq21);
718             rinv22           = avx128fma_invsqrt_d(rsq22);
719
720             fjx0             = _mm_setzero_pd();
721             fjy0             = _mm_setzero_pd();
722             fjz0             = _mm_setzero_pd();
723             fjx1             = _mm_setzero_pd();
724             fjy1             = _mm_setzero_pd();
725             fjz1             = _mm_setzero_pd();
726             fjx2             = _mm_setzero_pd();
727             fjy2             = _mm_setzero_pd();
728             fjz2             = _mm_setzero_pd();
729
730             /**************************
731              * CALCULATE INTERACTIONS *
732              **************************/
733
734             r00              = _mm_mul_pd(rsq00,rinv00);
735
736             /* Calculate table index by multiplying r with table scale and truncate to integer */
737             rt               = _mm_mul_pd(r00,vftabscale);
738             vfitab           = _mm_cvttpd_epi32(rt);
739 #ifdef __XOP__
740             vfeps            = _mm_frcz_pd(rt);
741 #else
742             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
743 #endif
744             twovfeps         = _mm_add_pd(vfeps,vfeps);
745             vfitab           = _mm_slli_epi32(vfitab,2);
746
747             /* CUBIC SPLINE TABLE ELECTROSTATICS */
748             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
749             F                = _mm_setzero_pd();
750             GMX_MM_TRANSPOSE2_PD(Y,F);
751             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
752             H                = _mm_setzero_pd();
753             GMX_MM_TRANSPOSE2_PD(G,H);
754             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
755             VV               = _mm_macc_pd(vfeps,Fp,Y);
756             velec            = _mm_mul_pd(qq00,VV);
757             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
758             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
759
760             /* Update potential sum for this i atom from the interaction with this j atom. */
761             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
762             velecsum         = _mm_add_pd(velecsum,velec);
763
764             fscal            = felec;
765
766             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
767
768             /* Update vectorial force */
769             fix0             = _mm_macc_pd(dx00,fscal,fix0);
770             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
771             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
772             
773             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
774             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
775             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
776
777             /**************************
778              * CALCULATE INTERACTIONS *
779              **************************/
780
781             r01              = _mm_mul_pd(rsq01,rinv01);
782
783             /* Calculate table index by multiplying r with table scale and truncate to integer */
784             rt               = _mm_mul_pd(r01,vftabscale);
785             vfitab           = _mm_cvttpd_epi32(rt);
786 #ifdef __XOP__
787             vfeps            = _mm_frcz_pd(rt);
788 #else
789             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
790 #endif
791             twovfeps         = _mm_add_pd(vfeps,vfeps);
792             vfitab           = _mm_slli_epi32(vfitab,2);
793
794             /* CUBIC SPLINE TABLE ELECTROSTATICS */
795             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
796             F                = _mm_setzero_pd();
797             GMX_MM_TRANSPOSE2_PD(Y,F);
798             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
799             H                = _mm_setzero_pd();
800             GMX_MM_TRANSPOSE2_PD(G,H);
801             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
802             VV               = _mm_macc_pd(vfeps,Fp,Y);
803             velec            = _mm_mul_pd(qq01,VV);
804             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
805             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
806
807             /* Update potential sum for this i atom from the interaction with this j atom. */
808             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
809             velecsum         = _mm_add_pd(velecsum,velec);
810
811             fscal            = felec;
812
813             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
814
815             /* Update vectorial force */
816             fix0             = _mm_macc_pd(dx01,fscal,fix0);
817             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
818             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
819             
820             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
821             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
822             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
823
824             /**************************
825              * CALCULATE INTERACTIONS *
826              **************************/
827
828             r02              = _mm_mul_pd(rsq02,rinv02);
829
830             /* Calculate table index by multiplying r with table scale and truncate to integer */
831             rt               = _mm_mul_pd(r02,vftabscale);
832             vfitab           = _mm_cvttpd_epi32(rt);
833 #ifdef __XOP__
834             vfeps            = _mm_frcz_pd(rt);
835 #else
836             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
837 #endif
838             twovfeps         = _mm_add_pd(vfeps,vfeps);
839             vfitab           = _mm_slli_epi32(vfitab,2);
840
841             /* CUBIC SPLINE TABLE ELECTROSTATICS */
842             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
843             F                = _mm_setzero_pd();
844             GMX_MM_TRANSPOSE2_PD(Y,F);
845             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
846             H                = _mm_setzero_pd();
847             GMX_MM_TRANSPOSE2_PD(G,H);
848             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
849             VV               = _mm_macc_pd(vfeps,Fp,Y);
850             velec            = _mm_mul_pd(qq02,VV);
851             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
852             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
853
854             /* Update potential sum for this i atom from the interaction with this j atom. */
855             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
856             velecsum         = _mm_add_pd(velecsum,velec);
857
858             fscal            = felec;
859
860             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
861
862             /* Update vectorial force */
863             fix0             = _mm_macc_pd(dx02,fscal,fix0);
864             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
865             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
866             
867             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
868             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
869             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
870
871             /**************************
872              * CALCULATE INTERACTIONS *
873              **************************/
874
875             r10              = _mm_mul_pd(rsq10,rinv10);
876
877             /* Calculate table index by multiplying r with table scale and truncate to integer */
878             rt               = _mm_mul_pd(r10,vftabscale);
879             vfitab           = _mm_cvttpd_epi32(rt);
880 #ifdef __XOP__
881             vfeps            = _mm_frcz_pd(rt);
882 #else
883             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
884 #endif
885             twovfeps         = _mm_add_pd(vfeps,vfeps);
886             vfitab           = _mm_slli_epi32(vfitab,2);
887
888             /* CUBIC SPLINE TABLE ELECTROSTATICS */
889             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
890             F                = _mm_setzero_pd();
891             GMX_MM_TRANSPOSE2_PD(Y,F);
892             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
893             H                = _mm_setzero_pd();
894             GMX_MM_TRANSPOSE2_PD(G,H);
895             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
896             VV               = _mm_macc_pd(vfeps,Fp,Y);
897             velec            = _mm_mul_pd(qq10,VV);
898             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
899             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
900
901             /* Update potential sum for this i atom from the interaction with this j atom. */
902             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
903             velecsum         = _mm_add_pd(velecsum,velec);
904
905             fscal            = felec;
906
907             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
908
909             /* Update vectorial force */
910             fix1             = _mm_macc_pd(dx10,fscal,fix1);
911             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
912             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
913             
914             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
915             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
916             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
917
918             /**************************
919              * CALCULATE INTERACTIONS *
920              **************************/
921
922             r11              = _mm_mul_pd(rsq11,rinv11);
923
924             /* Calculate table index by multiplying r with table scale and truncate to integer */
925             rt               = _mm_mul_pd(r11,vftabscale);
926             vfitab           = _mm_cvttpd_epi32(rt);
927 #ifdef __XOP__
928             vfeps            = _mm_frcz_pd(rt);
929 #else
930             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
931 #endif
932             twovfeps         = _mm_add_pd(vfeps,vfeps);
933             vfitab           = _mm_slli_epi32(vfitab,2);
934
935             /* CUBIC SPLINE TABLE ELECTROSTATICS */
936             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
937             F                = _mm_setzero_pd();
938             GMX_MM_TRANSPOSE2_PD(Y,F);
939             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
940             H                = _mm_setzero_pd();
941             GMX_MM_TRANSPOSE2_PD(G,H);
942             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
943             VV               = _mm_macc_pd(vfeps,Fp,Y);
944             velec            = _mm_mul_pd(qq11,VV);
945             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
946             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
947
948             /* Update potential sum for this i atom from the interaction with this j atom. */
949             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
950             velecsum         = _mm_add_pd(velecsum,velec);
951
952             fscal            = felec;
953
954             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
955
956             /* Update vectorial force */
957             fix1             = _mm_macc_pd(dx11,fscal,fix1);
958             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
959             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
960             
961             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
962             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
963             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
964
965             /**************************
966              * CALCULATE INTERACTIONS *
967              **************************/
968
969             r12              = _mm_mul_pd(rsq12,rinv12);
970
971             /* Calculate table index by multiplying r with table scale and truncate to integer */
972             rt               = _mm_mul_pd(r12,vftabscale);
973             vfitab           = _mm_cvttpd_epi32(rt);
974 #ifdef __XOP__
975             vfeps            = _mm_frcz_pd(rt);
976 #else
977             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
978 #endif
979             twovfeps         = _mm_add_pd(vfeps,vfeps);
980             vfitab           = _mm_slli_epi32(vfitab,2);
981
982             /* CUBIC SPLINE TABLE ELECTROSTATICS */
983             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
984             F                = _mm_setzero_pd();
985             GMX_MM_TRANSPOSE2_PD(Y,F);
986             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
987             H                = _mm_setzero_pd();
988             GMX_MM_TRANSPOSE2_PD(G,H);
989             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
990             VV               = _mm_macc_pd(vfeps,Fp,Y);
991             velec            = _mm_mul_pd(qq12,VV);
992             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
993             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
994
995             /* Update potential sum for this i atom from the interaction with this j atom. */
996             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
997             velecsum         = _mm_add_pd(velecsum,velec);
998
999             fscal            = felec;
1000
1001             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1002
1003             /* Update vectorial force */
1004             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1005             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1006             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1007             
1008             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1009             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1010             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1011
1012             /**************************
1013              * CALCULATE INTERACTIONS *
1014              **************************/
1015
1016             r20              = _mm_mul_pd(rsq20,rinv20);
1017
1018             /* Calculate table index by multiplying r with table scale and truncate to integer */
1019             rt               = _mm_mul_pd(r20,vftabscale);
1020             vfitab           = _mm_cvttpd_epi32(rt);
1021 #ifdef __XOP__
1022             vfeps            = _mm_frcz_pd(rt);
1023 #else
1024             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1025 #endif
1026             twovfeps         = _mm_add_pd(vfeps,vfeps);
1027             vfitab           = _mm_slli_epi32(vfitab,2);
1028
1029             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1030             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1031             F                = _mm_setzero_pd();
1032             GMX_MM_TRANSPOSE2_PD(Y,F);
1033             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1034             H                = _mm_setzero_pd();
1035             GMX_MM_TRANSPOSE2_PD(G,H);
1036             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1037             VV               = _mm_macc_pd(vfeps,Fp,Y);
1038             velec            = _mm_mul_pd(qq20,VV);
1039             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1040             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1041
1042             /* Update potential sum for this i atom from the interaction with this j atom. */
1043             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1044             velecsum         = _mm_add_pd(velecsum,velec);
1045
1046             fscal            = felec;
1047
1048             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1049
1050             /* Update vectorial force */
1051             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1052             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1053             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1054             
1055             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1056             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1057             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1058
1059             /**************************
1060              * CALCULATE INTERACTIONS *
1061              **************************/
1062
1063             r21              = _mm_mul_pd(rsq21,rinv21);
1064
1065             /* Calculate table index by multiplying r with table scale and truncate to integer */
1066             rt               = _mm_mul_pd(r21,vftabscale);
1067             vfitab           = _mm_cvttpd_epi32(rt);
1068 #ifdef __XOP__
1069             vfeps            = _mm_frcz_pd(rt);
1070 #else
1071             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1072 #endif
1073             twovfeps         = _mm_add_pd(vfeps,vfeps);
1074             vfitab           = _mm_slli_epi32(vfitab,2);
1075
1076             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1077             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1078             F                = _mm_setzero_pd();
1079             GMX_MM_TRANSPOSE2_PD(Y,F);
1080             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1081             H                = _mm_setzero_pd();
1082             GMX_MM_TRANSPOSE2_PD(G,H);
1083             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1084             VV               = _mm_macc_pd(vfeps,Fp,Y);
1085             velec            = _mm_mul_pd(qq21,VV);
1086             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1087             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1088
1089             /* Update potential sum for this i atom from the interaction with this j atom. */
1090             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1091             velecsum         = _mm_add_pd(velecsum,velec);
1092
1093             fscal            = felec;
1094
1095             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1096
1097             /* Update vectorial force */
1098             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1099             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1100             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1101             
1102             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1103             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1104             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1105
1106             /**************************
1107              * CALCULATE INTERACTIONS *
1108              **************************/
1109
1110             r22              = _mm_mul_pd(rsq22,rinv22);
1111
1112             /* Calculate table index by multiplying r with table scale and truncate to integer */
1113             rt               = _mm_mul_pd(r22,vftabscale);
1114             vfitab           = _mm_cvttpd_epi32(rt);
1115 #ifdef __XOP__
1116             vfeps            = _mm_frcz_pd(rt);
1117 #else
1118             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1119 #endif
1120             twovfeps         = _mm_add_pd(vfeps,vfeps);
1121             vfitab           = _mm_slli_epi32(vfitab,2);
1122
1123             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1124             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1125             F                = _mm_setzero_pd();
1126             GMX_MM_TRANSPOSE2_PD(Y,F);
1127             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1128             H                = _mm_setzero_pd();
1129             GMX_MM_TRANSPOSE2_PD(G,H);
1130             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1131             VV               = _mm_macc_pd(vfeps,Fp,Y);
1132             velec            = _mm_mul_pd(qq22,VV);
1133             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1134             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1135
1136             /* Update potential sum for this i atom from the interaction with this j atom. */
1137             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1138             velecsum         = _mm_add_pd(velecsum,velec);
1139
1140             fscal            = felec;
1141
1142             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1143
1144             /* Update vectorial force */
1145             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1146             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1147             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1148             
1149             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1150             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1151             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1152
1153             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1154
1155             /* Inner loop uses 414 flops */
1156         }
1157
1158         /* End of innermost loop */
1159
1160         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1161                                               f+i_coord_offset,fshift+i_shift_offset);
1162
1163         ggid                        = gid[iidx];
1164         /* Update potential energies */
1165         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1166
1167         /* Increment number of inner iterations */
1168         inneriter                  += j_index_end - j_index_start;
1169
1170         /* Outer loop uses 19 flops */
1171     }
1172
1173     /* Increment number of outer iterations */
1174     outeriter        += nri;
1175
1176     /* Update outer/inner flops */
1177
1178     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*414);
1179 }
1180 /*
1181  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_double
1182  * Electrostatics interaction: CubicSplineTable
1183  * VdW interaction:            None
1184  * Geometry:                   Water3-Water3
1185  * Calculate force/pot:        Force
1186  */
1187 void
1188 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_double
1189                     (t_nblist                    * gmx_restrict       nlist,
1190                      rvec                        * gmx_restrict          xx,
1191                      rvec                        * gmx_restrict          ff,
1192                      struct t_forcerec           * gmx_restrict          fr,
1193                      t_mdatoms                   * gmx_restrict     mdatoms,
1194                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1195                      t_nrnb                      * gmx_restrict        nrnb)
1196 {
1197     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1198      * just 0 for non-waters.
1199      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1200      * jnr indices corresponding to data put in the four positions in the SIMD register.
1201      */
1202     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1203     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1204     int              jnrA,jnrB;
1205     int              j_coord_offsetA,j_coord_offsetB;
1206     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1207     real             rcutoff_scalar;
1208     real             *shiftvec,*fshift,*x,*f;
1209     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1210     int              vdwioffset0;
1211     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1212     int              vdwioffset1;
1213     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1214     int              vdwioffset2;
1215     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1216     int              vdwjidx0A,vdwjidx0B;
1217     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1218     int              vdwjidx1A,vdwjidx1B;
1219     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1220     int              vdwjidx2A,vdwjidx2B;
1221     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1222     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1223     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1224     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1225     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1226     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1227     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1228     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1229     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1230     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1231     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1232     real             *charge;
1233     __m128i          vfitab;
1234     __m128i          ifour       = _mm_set1_epi32(4);
1235     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1236     real             *vftab;
1237     __m128d          dummy_mask,cutoff_mask;
1238     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1239     __m128d          one     = _mm_set1_pd(1.0);
1240     __m128d          two     = _mm_set1_pd(2.0);
1241     x                = xx[0];
1242     f                = ff[0];
1243
1244     nri              = nlist->nri;
1245     iinr             = nlist->iinr;
1246     jindex           = nlist->jindex;
1247     jjnr             = nlist->jjnr;
1248     shiftidx         = nlist->shift;
1249     gid              = nlist->gid;
1250     shiftvec         = fr->shift_vec[0];
1251     fshift           = fr->fshift[0];
1252     facel            = _mm_set1_pd(fr->ic->epsfac);
1253     charge           = mdatoms->chargeA;
1254
1255     vftab            = kernel_data->table_elec->data;
1256     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
1257
1258     /* Setup water-specific parameters */
1259     inr              = nlist->iinr[0];
1260     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1261     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1262     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1263
1264     jq0              = _mm_set1_pd(charge[inr+0]);
1265     jq1              = _mm_set1_pd(charge[inr+1]);
1266     jq2              = _mm_set1_pd(charge[inr+2]);
1267     qq00             = _mm_mul_pd(iq0,jq0);
1268     qq01             = _mm_mul_pd(iq0,jq1);
1269     qq02             = _mm_mul_pd(iq0,jq2);
1270     qq10             = _mm_mul_pd(iq1,jq0);
1271     qq11             = _mm_mul_pd(iq1,jq1);
1272     qq12             = _mm_mul_pd(iq1,jq2);
1273     qq20             = _mm_mul_pd(iq2,jq0);
1274     qq21             = _mm_mul_pd(iq2,jq1);
1275     qq22             = _mm_mul_pd(iq2,jq2);
1276
1277     /* Avoid stupid compiler warnings */
1278     jnrA = jnrB = 0;
1279     j_coord_offsetA = 0;
1280     j_coord_offsetB = 0;
1281
1282     outeriter        = 0;
1283     inneriter        = 0;
1284
1285     /* Start outer loop over neighborlists */
1286     for(iidx=0; iidx<nri; iidx++)
1287     {
1288         /* Load shift vector for this list */
1289         i_shift_offset   = DIM*shiftidx[iidx];
1290
1291         /* Load limits for loop over neighbors */
1292         j_index_start    = jindex[iidx];
1293         j_index_end      = jindex[iidx+1];
1294
1295         /* Get outer coordinate index */
1296         inr              = iinr[iidx];
1297         i_coord_offset   = DIM*inr;
1298
1299         /* Load i particle coords and add shift vector */
1300         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1301                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1302
1303         fix0             = _mm_setzero_pd();
1304         fiy0             = _mm_setzero_pd();
1305         fiz0             = _mm_setzero_pd();
1306         fix1             = _mm_setzero_pd();
1307         fiy1             = _mm_setzero_pd();
1308         fiz1             = _mm_setzero_pd();
1309         fix2             = _mm_setzero_pd();
1310         fiy2             = _mm_setzero_pd();
1311         fiz2             = _mm_setzero_pd();
1312
1313         /* Start inner kernel loop */
1314         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1315         {
1316
1317             /* Get j neighbor index, and coordinate index */
1318             jnrA             = jjnr[jidx];
1319             jnrB             = jjnr[jidx+1];
1320             j_coord_offsetA  = DIM*jnrA;
1321             j_coord_offsetB  = DIM*jnrB;
1322
1323             /* load j atom coordinates */
1324             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1325                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1326
1327             /* Calculate displacement vector */
1328             dx00             = _mm_sub_pd(ix0,jx0);
1329             dy00             = _mm_sub_pd(iy0,jy0);
1330             dz00             = _mm_sub_pd(iz0,jz0);
1331             dx01             = _mm_sub_pd(ix0,jx1);
1332             dy01             = _mm_sub_pd(iy0,jy1);
1333             dz01             = _mm_sub_pd(iz0,jz1);
1334             dx02             = _mm_sub_pd(ix0,jx2);
1335             dy02             = _mm_sub_pd(iy0,jy2);
1336             dz02             = _mm_sub_pd(iz0,jz2);
1337             dx10             = _mm_sub_pd(ix1,jx0);
1338             dy10             = _mm_sub_pd(iy1,jy0);
1339             dz10             = _mm_sub_pd(iz1,jz0);
1340             dx11             = _mm_sub_pd(ix1,jx1);
1341             dy11             = _mm_sub_pd(iy1,jy1);
1342             dz11             = _mm_sub_pd(iz1,jz1);
1343             dx12             = _mm_sub_pd(ix1,jx2);
1344             dy12             = _mm_sub_pd(iy1,jy2);
1345             dz12             = _mm_sub_pd(iz1,jz2);
1346             dx20             = _mm_sub_pd(ix2,jx0);
1347             dy20             = _mm_sub_pd(iy2,jy0);
1348             dz20             = _mm_sub_pd(iz2,jz0);
1349             dx21             = _mm_sub_pd(ix2,jx1);
1350             dy21             = _mm_sub_pd(iy2,jy1);
1351             dz21             = _mm_sub_pd(iz2,jz1);
1352             dx22             = _mm_sub_pd(ix2,jx2);
1353             dy22             = _mm_sub_pd(iy2,jy2);
1354             dz22             = _mm_sub_pd(iz2,jz2);
1355
1356             /* Calculate squared distance and things based on it */
1357             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1358             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1359             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1360             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1361             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1362             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1363             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1364             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1365             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1366
1367             rinv00           = avx128fma_invsqrt_d(rsq00);
1368             rinv01           = avx128fma_invsqrt_d(rsq01);
1369             rinv02           = avx128fma_invsqrt_d(rsq02);
1370             rinv10           = avx128fma_invsqrt_d(rsq10);
1371             rinv11           = avx128fma_invsqrt_d(rsq11);
1372             rinv12           = avx128fma_invsqrt_d(rsq12);
1373             rinv20           = avx128fma_invsqrt_d(rsq20);
1374             rinv21           = avx128fma_invsqrt_d(rsq21);
1375             rinv22           = avx128fma_invsqrt_d(rsq22);
1376
1377             fjx0             = _mm_setzero_pd();
1378             fjy0             = _mm_setzero_pd();
1379             fjz0             = _mm_setzero_pd();
1380             fjx1             = _mm_setzero_pd();
1381             fjy1             = _mm_setzero_pd();
1382             fjz1             = _mm_setzero_pd();
1383             fjx2             = _mm_setzero_pd();
1384             fjy2             = _mm_setzero_pd();
1385             fjz2             = _mm_setzero_pd();
1386
1387             /**************************
1388              * CALCULATE INTERACTIONS *
1389              **************************/
1390
1391             r00              = _mm_mul_pd(rsq00,rinv00);
1392
1393             /* Calculate table index by multiplying r with table scale and truncate to integer */
1394             rt               = _mm_mul_pd(r00,vftabscale);
1395             vfitab           = _mm_cvttpd_epi32(rt);
1396 #ifdef __XOP__
1397             vfeps            = _mm_frcz_pd(rt);
1398 #else
1399             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1400 #endif
1401             twovfeps         = _mm_add_pd(vfeps,vfeps);
1402             vfitab           = _mm_slli_epi32(vfitab,2);
1403
1404             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1405             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1406             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1407             GMX_MM_TRANSPOSE2_PD(Y,F);
1408             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1409             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1410             GMX_MM_TRANSPOSE2_PD(G,H);
1411             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1412             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1413             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1414
1415             fscal            = felec;
1416
1417             /* Update vectorial force */
1418             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1419             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1420             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1421             
1422             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1423             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1424             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1425
1426             /**************************
1427              * CALCULATE INTERACTIONS *
1428              **************************/
1429
1430             r01              = _mm_mul_pd(rsq01,rinv01);
1431
1432             /* Calculate table index by multiplying r with table scale and truncate to integer */
1433             rt               = _mm_mul_pd(r01,vftabscale);
1434             vfitab           = _mm_cvttpd_epi32(rt);
1435 #ifdef __XOP__
1436             vfeps            = _mm_frcz_pd(rt);
1437 #else
1438             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1439 #endif
1440             twovfeps         = _mm_add_pd(vfeps,vfeps);
1441             vfitab           = _mm_slli_epi32(vfitab,2);
1442
1443             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1444             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1445             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1446             GMX_MM_TRANSPOSE2_PD(Y,F);
1447             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1448             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1449             GMX_MM_TRANSPOSE2_PD(G,H);
1450             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1451             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1452             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1453
1454             fscal            = felec;
1455
1456             /* Update vectorial force */
1457             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1458             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1459             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1460             
1461             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1462             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1463             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1464
1465             /**************************
1466              * CALCULATE INTERACTIONS *
1467              **************************/
1468
1469             r02              = _mm_mul_pd(rsq02,rinv02);
1470
1471             /* Calculate table index by multiplying r with table scale and truncate to integer */
1472             rt               = _mm_mul_pd(r02,vftabscale);
1473             vfitab           = _mm_cvttpd_epi32(rt);
1474 #ifdef __XOP__
1475             vfeps            = _mm_frcz_pd(rt);
1476 #else
1477             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1478 #endif
1479             twovfeps         = _mm_add_pd(vfeps,vfeps);
1480             vfitab           = _mm_slli_epi32(vfitab,2);
1481
1482             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1483             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1484             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1485             GMX_MM_TRANSPOSE2_PD(Y,F);
1486             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1487             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1488             GMX_MM_TRANSPOSE2_PD(G,H);
1489             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1490             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1491             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1492
1493             fscal            = felec;
1494
1495             /* Update vectorial force */
1496             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1497             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1498             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1499             
1500             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1501             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1502             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1503
1504             /**************************
1505              * CALCULATE INTERACTIONS *
1506              **************************/
1507
1508             r10              = _mm_mul_pd(rsq10,rinv10);
1509
1510             /* Calculate table index by multiplying r with table scale and truncate to integer */
1511             rt               = _mm_mul_pd(r10,vftabscale);
1512             vfitab           = _mm_cvttpd_epi32(rt);
1513 #ifdef __XOP__
1514             vfeps            = _mm_frcz_pd(rt);
1515 #else
1516             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1517 #endif
1518             twovfeps         = _mm_add_pd(vfeps,vfeps);
1519             vfitab           = _mm_slli_epi32(vfitab,2);
1520
1521             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1522             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1523             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1524             GMX_MM_TRANSPOSE2_PD(Y,F);
1525             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1526             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1527             GMX_MM_TRANSPOSE2_PD(G,H);
1528             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1529             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1530             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1531
1532             fscal            = felec;
1533
1534             /* Update vectorial force */
1535             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1536             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1537             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1538             
1539             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1540             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1541             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1542
1543             /**************************
1544              * CALCULATE INTERACTIONS *
1545              **************************/
1546
1547             r11              = _mm_mul_pd(rsq11,rinv11);
1548
1549             /* Calculate table index by multiplying r with table scale and truncate to integer */
1550             rt               = _mm_mul_pd(r11,vftabscale);
1551             vfitab           = _mm_cvttpd_epi32(rt);
1552 #ifdef __XOP__
1553             vfeps            = _mm_frcz_pd(rt);
1554 #else
1555             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1556 #endif
1557             twovfeps         = _mm_add_pd(vfeps,vfeps);
1558             vfitab           = _mm_slli_epi32(vfitab,2);
1559
1560             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1561             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1562             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1563             GMX_MM_TRANSPOSE2_PD(Y,F);
1564             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1565             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1566             GMX_MM_TRANSPOSE2_PD(G,H);
1567             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1568             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1569             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1570
1571             fscal            = felec;
1572
1573             /* Update vectorial force */
1574             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1575             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1576             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1577             
1578             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1579             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1580             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1581
1582             /**************************
1583              * CALCULATE INTERACTIONS *
1584              **************************/
1585
1586             r12              = _mm_mul_pd(rsq12,rinv12);
1587
1588             /* Calculate table index by multiplying r with table scale and truncate to integer */
1589             rt               = _mm_mul_pd(r12,vftabscale);
1590             vfitab           = _mm_cvttpd_epi32(rt);
1591 #ifdef __XOP__
1592             vfeps            = _mm_frcz_pd(rt);
1593 #else
1594             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1595 #endif
1596             twovfeps         = _mm_add_pd(vfeps,vfeps);
1597             vfitab           = _mm_slli_epi32(vfitab,2);
1598
1599             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1600             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1601             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1602             GMX_MM_TRANSPOSE2_PD(Y,F);
1603             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1604             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1605             GMX_MM_TRANSPOSE2_PD(G,H);
1606             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1607             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1608             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1609
1610             fscal            = felec;
1611
1612             /* Update vectorial force */
1613             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1614             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1615             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1616             
1617             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1618             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1619             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1620
1621             /**************************
1622              * CALCULATE INTERACTIONS *
1623              **************************/
1624
1625             r20              = _mm_mul_pd(rsq20,rinv20);
1626
1627             /* Calculate table index by multiplying r with table scale and truncate to integer */
1628             rt               = _mm_mul_pd(r20,vftabscale);
1629             vfitab           = _mm_cvttpd_epi32(rt);
1630 #ifdef __XOP__
1631             vfeps            = _mm_frcz_pd(rt);
1632 #else
1633             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1634 #endif
1635             twovfeps         = _mm_add_pd(vfeps,vfeps);
1636             vfitab           = _mm_slli_epi32(vfitab,2);
1637
1638             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1639             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1640             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1641             GMX_MM_TRANSPOSE2_PD(Y,F);
1642             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1643             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1644             GMX_MM_TRANSPOSE2_PD(G,H);
1645             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1646             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1647             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1648
1649             fscal            = felec;
1650
1651             /* Update vectorial force */
1652             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1653             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1654             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1655             
1656             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1657             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1658             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1659
1660             /**************************
1661              * CALCULATE INTERACTIONS *
1662              **************************/
1663
1664             r21              = _mm_mul_pd(rsq21,rinv21);
1665
1666             /* Calculate table index by multiplying r with table scale and truncate to integer */
1667             rt               = _mm_mul_pd(r21,vftabscale);
1668             vfitab           = _mm_cvttpd_epi32(rt);
1669 #ifdef __XOP__
1670             vfeps            = _mm_frcz_pd(rt);
1671 #else
1672             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1673 #endif
1674             twovfeps         = _mm_add_pd(vfeps,vfeps);
1675             vfitab           = _mm_slli_epi32(vfitab,2);
1676
1677             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1678             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1679             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1680             GMX_MM_TRANSPOSE2_PD(Y,F);
1681             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1682             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1683             GMX_MM_TRANSPOSE2_PD(G,H);
1684             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1685             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1686             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1687
1688             fscal            = felec;
1689
1690             /* Update vectorial force */
1691             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1692             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1693             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1694             
1695             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1696             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1697             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1698
1699             /**************************
1700              * CALCULATE INTERACTIONS *
1701              **************************/
1702
1703             r22              = _mm_mul_pd(rsq22,rinv22);
1704
1705             /* Calculate table index by multiplying r with table scale and truncate to integer */
1706             rt               = _mm_mul_pd(r22,vftabscale);
1707             vfitab           = _mm_cvttpd_epi32(rt);
1708 #ifdef __XOP__
1709             vfeps            = _mm_frcz_pd(rt);
1710 #else
1711             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1712 #endif
1713             twovfeps         = _mm_add_pd(vfeps,vfeps);
1714             vfitab           = _mm_slli_epi32(vfitab,2);
1715
1716             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1717             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1718             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1719             GMX_MM_TRANSPOSE2_PD(Y,F);
1720             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1721             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1722             GMX_MM_TRANSPOSE2_PD(G,H);
1723             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1724             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1725             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1726
1727             fscal            = felec;
1728
1729             /* Update vectorial force */
1730             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1731             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1732             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1733             
1734             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1735             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1736             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1737
1738             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1739
1740             /* Inner loop uses 378 flops */
1741         }
1742
1743         if(jidx<j_index_end)
1744         {
1745
1746             jnrA             = jjnr[jidx];
1747             j_coord_offsetA  = DIM*jnrA;
1748
1749             /* load j atom coordinates */
1750             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1751                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1752
1753             /* Calculate displacement vector */
1754             dx00             = _mm_sub_pd(ix0,jx0);
1755             dy00             = _mm_sub_pd(iy0,jy0);
1756             dz00             = _mm_sub_pd(iz0,jz0);
1757             dx01             = _mm_sub_pd(ix0,jx1);
1758             dy01             = _mm_sub_pd(iy0,jy1);
1759             dz01             = _mm_sub_pd(iz0,jz1);
1760             dx02             = _mm_sub_pd(ix0,jx2);
1761             dy02             = _mm_sub_pd(iy0,jy2);
1762             dz02             = _mm_sub_pd(iz0,jz2);
1763             dx10             = _mm_sub_pd(ix1,jx0);
1764             dy10             = _mm_sub_pd(iy1,jy0);
1765             dz10             = _mm_sub_pd(iz1,jz0);
1766             dx11             = _mm_sub_pd(ix1,jx1);
1767             dy11             = _mm_sub_pd(iy1,jy1);
1768             dz11             = _mm_sub_pd(iz1,jz1);
1769             dx12             = _mm_sub_pd(ix1,jx2);
1770             dy12             = _mm_sub_pd(iy1,jy2);
1771             dz12             = _mm_sub_pd(iz1,jz2);
1772             dx20             = _mm_sub_pd(ix2,jx0);
1773             dy20             = _mm_sub_pd(iy2,jy0);
1774             dz20             = _mm_sub_pd(iz2,jz0);
1775             dx21             = _mm_sub_pd(ix2,jx1);
1776             dy21             = _mm_sub_pd(iy2,jy1);
1777             dz21             = _mm_sub_pd(iz2,jz1);
1778             dx22             = _mm_sub_pd(ix2,jx2);
1779             dy22             = _mm_sub_pd(iy2,jy2);
1780             dz22             = _mm_sub_pd(iz2,jz2);
1781
1782             /* Calculate squared distance and things based on it */
1783             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1784             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1785             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1786             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1787             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1788             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1789             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1790             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1791             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1792
1793             rinv00           = avx128fma_invsqrt_d(rsq00);
1794             rinv01           = avx128fma_invsqrt_d(rsq01);
1795             rinv02           = avx128fma_invsqrt_d(rsq02);
1796             rinv10           = avx128fma_invsqrt_d(rsq10);
1797             rinv11           = avx128fma_invsqrt_d(rsq11);
1798             rinv12           = avx128fma_invsqrt_d(rsq12);
1799             rinv20           = avx128fma_invsqrt_d(rsq20);
1800             rinv21           = avx128fma_invsqrt_d(rsq21);
1801             rinv22           = avx128fma_invsqrt_d(rsq22);
1802
1803             fjx0             = _mm_setzero_pd();
1804             fjy0             = _mm_setzero_pd();
1805             fjz0             = _mm_setzero_pd();
1806             fjx1             = _mm_setzero_pd();
1807             fjy1             = _mm_setzero_pd();
1808             fjz1             = _mm_setzero_pd();
1809             fjx2             = _mm_setzero_pd();
1810             fjy2             = _mm_setzero_pd();
1811             fjz2             = _mm_setzero_pd();
1812
1813             /**************************
1814              * CALCULATE INTERACTIONS *
1815              **************************/
1816
1817             r00              = _mm_mul_pd(rsq00,rinv00);
1818
1819             /* Calculate table index by multiplying r with table scale and truncate to integer */
1820             rt               = _mm_mul_pd(r00,vftabscale);
1821             vfitab           = _mm_cvttpd_epi32(rt);
1822 #ifdef __XOP__
1823             vfeps            = _mm_frcz_pd(rt);
1824 #else
1825             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1826 #endif
1827             twovfeps         = _mm_add_pd(vfeps,vfeps);
1828             vfitab           = _mm_slli_epi32(vfitab,2);
1829
1830             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1831             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1832             F                = _mm_setzero_pd();
1833             GMX_MM_TRANSPOSE2_PD(Y,F);
1834             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1835             H                = _mm_setzero_pd();
1836             GMX_MM_TRANSPOSE2_PD(G,H);
1837             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1838             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1839             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1840
1841             fscal            = felec;
1842
1843             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1844
1845             /* Update vectorial force */
1846             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1847             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1848             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1849             
1850             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1851             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1852             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1853
1854             /**************************
1855              * CALCULATE INTERACTIONS *
1856              **************************/
1857
1858             r01              = _mm_mul_pd(rsq01,rinv01);
1859
1860             /* Calculate table index by multiplying r with table scale and truncate to integer */
1861             rt               = _mm_mul_pd(r01,vftabscale);
1862             vfitab           = _mm_cvttpd_epi32(rt);
1863 #ifdef __XOP__
1864             vfeps            = _mm_frcz_pd(rt);
1865 #else
1866             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1867 #endif
1868             twovfeps         = _mm_add_pd(vfeps,vfeps);
1869             vfitab           = _mm_slli_epi32(vfitab,2);
1870
1871             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1872             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1873             F                = _mm_setzero_pd();
1874             GMX_MM_TRANSPOSE2_PD(Y,F);
1875             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1876             H                = _mm_setzero_pd();
1877             GMX_MM_TRANSPOSE2_PD(G,H);
1878             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1879             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1880             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1881
1882             fscal            = felec;
1883
1884             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1885
1886             /* Update vectorial force */
1887             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1888             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1889             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1890             
1891             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1892             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1893             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1894
1895             /**************************
1896              * CALCULATE INTERACTIONS *
1897              **************************/
1898
1899             r02              = _mm_mul_pd(rsq02,rinv02);
1900
1901             /* Calculate table index by multiplying r with table scale and truncate to integer */
1902             rt               = _mm_mul_pd(r02,vftabscale);
1903             vfitab           = _mm_cvttpd_epi32(rt);
1904 #ifdef __XOP__
1905             vfeps            = _mm_frcz_pd(rt);
1906 #else
1907             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1908 #endif
1909             twovfeps         = _mm_add_pd(vfeps,vfeps);
1910             vfitab           = _mm_slli_epi32(vfitab,2);
1911
1912             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1913             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1914             F                = _mm_setzero_pd();
1915             GMX_MM_TRANSPOSE2_PD(Y,F);
1916             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1917             H                = _mm_setzero_pd();
1918             GMX_MM_TRANSPOSE2_PD(G,H);
1919             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1920             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1921             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1922
1923             fscal            = felec;
1924
1925             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1926
1927             /* Update vectorial force */
1928             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1929             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1930             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1931             
1932             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1933             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1934             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1935
1936             /**************************
1937              * CALCULATE INTERACTIONS *
1938              **************************/
1939
1940             r10              = _mm_mul_pd(rsq10,rinv10);
1941
1942             /* Calculate table index by multiplying r with table scale and truncate to integer */
1943             rt               = _mm_mul_pd(r10,vftabscale);
1944             vfitab           = _mm_cvttpd_epi32(rt);
1945 #ifdef __XOP__
1946             vfeps            = _mm_frcz_pd(rt);
1947 #else
1948             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1949 #endif
1950             twovfeps         = _mm_add_pd(vfeps,vfeps);
1951             vfitab           = _mm_slli_epi32(vfitab,2);
1952
1953             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1954             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1955             F                = _mm_setzero_pd();
1956             GMX_MM_TRANSPOSE2_PD(Y,F);
1957             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1958             H                = _mm_setzero_pd();
1959             GMX_MM_TRANSPOSE2_PD(G,H);
1960             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1961             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1962             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1963
1964             fscal            = felec;
1965
1966             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1967
1968             /* Update vectorial force */
1969             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1970             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1971             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1972             
1973             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1974             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1975             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1976
1977             /**************************
1978              * CALCULATE INTERACTIONS *
1979              **************************/
1980
1981             r11              = _mm_mul_pd(rsq11,rinv11);
1982
1983             /* Calculate table index by multiplying r with table scale and truncate to integer */
1984             rt               = _mm_mul_pd(r11,vftabscale);
1985             vfitab           = _mm_cvttpd_epi32(rt);
1986 #ifdef __XOP__
1987             vfeps            = _mm_frcz_pd(rt);
1988 #else
1989             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1990 #endif
1991             twovfeps         = _mm_add_pd(vfeps,vfeps);
1992             vfitab           = _mm_slli_epi32(vfitab,2);
1993
1994             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1995             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1996             F                = _mm_setzero_pd();
1997             GMX_MM_TRANSPOSE2_PD(Y,F);
1998             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1999             H                = _mm_setzero_pd();
2000             GMX_MM_TRANSPOSE2_PD(G,H);
2001             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2002             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2003             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2004
2005             fscal            = felec;
2006
2007             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2008
2009             /* Update vectorial force */
2010             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2011             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2012             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2013             
2014             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2015             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2016             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2017
2018             /**************************
2019              * CALCULATE INTERACTIONS *
2020              **************************/
2021
2022             r12              = _mm_mul_pd(rsq12,rinv12);
2023
2024             /* Calculate table index by multiplying r with table scale and truncate to integer */
2025             rt               = _mm_mul_pd(r12,vftabscale);
2026             vfitab           = _mm_cvttpd_epi32(rt);
2027 #ifdef __XOP__
2028             vfeps            = _mm_frcz_pd(rt);
2029 #else
2030             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2031 #endif
2032             twovfeps         = _mm_add_pd(vfeps,vfeps);
2033             vfitab           = _mm_slli_epi32(vfitab,2);
2034
2035             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2036             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2037             F                = _mm_setzero_pd();
2038             GMX_MM_TRANSPOSE2_PD(Y,F);
2039             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2040             H                = _mm_setzero_pd();
2041             GMX_MM_TRANSPOSE2_PD(G,H);
2042             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2043             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2044             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2045
2046             fscal            = felec;
2047
2048             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2049
2050             /* Update vectorial force */
2051             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2052             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2053             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2054             
2055             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2056             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2057             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2058
2059             /**************************
2060              * CALCULATE INTERACTIONS *
2061              **************************/
2062
2063             r20              = _mm_mul_pd(rsq20,rinv20);
2064
2065             /* Calculate table index by multiplying r with table scale and truncate to integer */
2066             rt               = _mm_mul_pd(r20,vftabscale);
2067             vfitab           = _mm_cvttpd_epi32(rt);
2068 #ifdef __XOP__
2069             vfeps            = _mm_frcz_pd(rt);
2070 #else
2071             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2072 #endif
2073             twovfeps         = _mm_add_pd(vfeps,vfeps);
2074             vfitab           = _mm_slli_epi32(vfitab,2);
2075
2076             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2077             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2078             F                = _mm_setzero_pd();
2079             GMX_MM_TRANSPOSE2_PD(Y,F);
2080             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2081             H                = _mm_setzero_pd();
2082             GMX_MM_TRANSPOSE2_PD(G,H);
2083             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2084             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2085             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2086
2087             fscal            = felec;
2088
2089             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2090
2091             /* Update vectorial force */
2092             fix2             = _mm_macc_pd(dx20,fscal,fix2);
2093             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
2094             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
2095             
2096             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
2097             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
2098             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
2099
2100             /**************************
2101              * CALCULATE INTERACTIONS *
2102              **************************/
2103
2104             r21              = _mm_mul_pd(rsq21,rinv21);
2105
2106             /* Calculate table index by multiplying r with table scale and truncate to integer */
2107             rt               = _mm_mul_pd(r21,vftabscale);
2108             vfitab           = _mm_cvttpd_epi32(rt);
2109 #ifdef __XOP__
2110             vfeps            = _mm_frcz_pd(rt);
2111 #else
2112             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2113 #endif
2114             twovfeps         = _mm_add_pd(vfeps,vfeps);
2115             vfitab           = _mm_slli_epi32(vfitab,2);
2116
2117             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2118             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2119             F                = _mm_setzero_pd();
2120             GMX_MM_TRANSPOSE2_PD(Y,F);
2121             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2122             H                = _mm_setzero_pd();
2123             GMX_MM_TRANSPOSE2_PD(G,H);
2124             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2125             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2126             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2127
2128             fscal            = felec;
2129
2130             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2131
2132             /* Update vectorial force */
2133             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2134             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2135             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2136             
2137             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2138             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2139             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2140
2141             /**************************
2142              * CALCULATE INTERACTIONS *
2143              **************************/
2144
2145             r22              = _mm_mul_pd(rsq22,rinv22);
2146
2147             /* Calculate table index by multiplying r with table scale and truncate to integer */
2148             rt               = _mm_mul_pd(r22,vftabscale);
2149             vfitab           = _mm_cvttpd_epi32(rt);
2150 #ifdef __XOP__
2151             vfeps            = _mm_frcz_pd(rt);
2152 #else
2153             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2154 #endif
2155             twovfeps         = _mm_add_pd(vfeps,vfeps);
2156             vfitab           = _mm_slli_epi32(vfitab,2);
2157
2158             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2159             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2160             F                = _mm_setzero_pd();
2161             GMX_MM_TRANSPOSE2_PD(Y,F);
2162             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2163             H                = _mm_setzero_pd();
2164             GMX_MM_TRANSPOSE2_PD(G,H);
2165             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2166             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2167             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2168
2169             fscal            = felec;
2170
2171             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2172
2173             /* Update vectorial force */
2174             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2175             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2176             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2177             
2178             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2179             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2180             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2181
2182             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2183
2184             /* Inner loop uses 378 flops */
2185         }
2186
2187         /* End of innermost loop */
2188
2189         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2190                                               f+i_coord_offset,fshift+i_shift_offset);
2191
2192         /* Increment number of inner iterations */
2193         inneriter                  += j_index_end - j_index_start;
2194
2195         /* Outer loop uses 18 flops */
2196     }
2197
2198     /* Increment number of outer iterations */
2199     outeriter        += nri;
2200
2201     /* Update outer/inner flops */
2202
2203     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*378);
2204 }