Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecCSTab_VdwNone_GeomW3W3_avx_256_double.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_double
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            None
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     real *           vdwioffsetptr0;
84     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     real *           vdwioffsetptr1;
86     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     real *           vdwioffsetptr2;
88     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     __m128i          vfitab;
107     __m128i          ifour       = _mm_set1_epi32(4);
108     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
109     real             *vftab;
110     __m256d          dummy_mask,cutoff_mask;
111     __m128           tmpmask0,tmpmask1;
112     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
113     __m256d          one     = _mm256_set1_pd(1.0);
114     __m256d          two     = _mm256_set1_pd(2.0);
115     x                = xx[0];
116     f                = ff[0];
117
118     nri              = nlist->nri;
119     iinr             = nlist->iinr;
120     jindex           = nlist->jindex;
121     jjnr             = nlist->jjnr;
122     shiftidx         = nlist->shift;
123     gid              = nlist->gid;
124     shiftvec         = fr->shift_vec[0];
125     fshift           = fr->fshift[0];
126     facel            = _mm256_set1_pd(fr->ic->epsfac);
127     charge           = mdatoms->chargeA;
128
129     vftab            = kernel_data->table_elec->data;
130     vftabscale       = _mm256_set1_pd(kernel_data->table_elec->scale);
131
132     /* Setup water-specific parameters */
133     inr              = nlist->iinr[0];
134     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
135     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
136     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
137
138     jq0              = _mm256_set1_pd(charge[inr+0]);
139     jq1              = _mm256_set1_pd(charge[inr+1]);
140     jq2              = _mm256_set1_pd(charge[inr+2]);
141     qq00             = _mm256_mul_pd(iq0,jq0);
142     qq01             = _mm256_mul_pd(iq0,jq1);
143     qq02             = _mm256_mul_pd(iq0,jq2);
144     qq10             = _mm256_mul_pd(iq1,jq0);
145     qq11             = _mm256_mul_pd(iq1,jq1);
146     qq12             = _mm256_mul_pd(iq1,jq2);
147     qq20             = _mm256_mul_pd(iq2,jq0);
148     qq21             = _mm256_mul_pd(iq2,jq1);
149     qq22             = _mm256_mul_pd(iq2,jq2);
150
151     /* Avoid stupid compiler warnings */
152     jnrA = jnrB = jnrC = jnrD = 0;
153     j_coord_offsetA = 0;
154     j_coord_offsetB = 0;
155     j_coord_offsetC = 0;
156     j_coord_offsetD = 0;
157
158     outeriter        = 0;
159     inneriter        = 0;
160
161     for(iidx=0;iidx<4*DIM;iidx++)
162     {
163         scratch[iidx] = 0.0;
164     }
165
166     /* Start outer loop over neighborlists */
167     for(iidx=0; iidx<nri; iidx++)
168     {
169         /* Load shift vector for this list */
170         i_shift_offset   = DIM*shiftidx[iidx];
171
172         /* Load limits for loop over neighbors */
173         j_index_start    = jindex[iidx];
174         j_index_end      = jindex[iidx+1];
175
176         /* Get outer coordinate index */
177         inr              = iinr[iidx];
178         i_coord_offset   = DIM*inr;
179
180         /* Load i particle coords and add shift vector */
181         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
182                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
183
184         fix0             = _mm256_setzero_pd();
185         fiy0             = _mm256_setzero_pd();
186         fiz0             = _mm256_setzero_pd();
187         fix1             = _mm256_setzero_pd();
188         fiy1             = _mm256_setzero_pd();
189         fiz1             = _mm256_setzero_pd();
190         fix2             = _mm256_setzero_pd();
191         fiy2             = _mm256_setzero_pd();
192         fiz2             = _mm256_setzero_pd();
193
194         /* Reset potential sums */
195         velecsum         = _mm256_setzero_pd();
196
197         /* Start inner kernel loop */
198         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
199         {
200
201             /* Get j neighbor index, and coordinate index */
202             jnrA             = jjnr[jidx];
203             jnrB             = jjnr[jidx+1];
204             jnrC             = jjnr[jidx+2];
205             jnrD             = jjnr[jidx+3];
206             j_coord_offsetA  = DIM*jnrA;
207             j_coord_offsetB  = DIM*jnrB;
208             j_coord_offsetC  = DIM*jnrC;
209             j_coord_offsetD  = DIM*jnrD;
210
211             /* load j atom coordinates */
212             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
213                                                  x+j_coord_offsetC,x+j_coord_offsetD,
214                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
215
216             /* Calculate displacement vector */
217             dx00             = _mm256_sub_pd(ix0,jx0);
218             dy00             = _mm256_sub_pd(iy0,jy0);
219             dz00             = _mm256_sub_pd(iz0,jz0);
220             dx01             = _mm256_sub_pd(ix0,jx1);
221             dy01             = _mm256_sub_pd(iy0,jy1);
222             dz01             = _mm256_sub_pd(iz0,jz1);
223             dx02             = _mm256_sub_pd(ix0,jx2);
224             dy02             = _mm256_sub_pd(iy0,jy2);
225             dz02             = _mm256_sub_pd(iz0,jz2);
226             dx10             = _mm256_sub_pd(ix1,jx0);
227             dy10             = _mm256_sub_pd(iy1,jy0);
228             dz10             = _mm256_sub_pd(iz1,jz0);
229             dx11             = _mm256_sub_pd(ix1,jx1);
230             dy11             = _mm256_sub_pd(iy1,jy1);
231             dz11             = _mm256_sub_pd(iz1,jz1);
232             dx12             = _mm256_sub_pd(ix1,jx2);
233             dy12             = _mm256_sub_pd(iy1,jy2);
234             dz12             = _mm256_sub_pd(iz1,jz2);
235             dx20             = _mm256_sub_pd(ix2,jx0);
236             dy20             = _mm256_sub_pd(iy2,jy0);
237             dz20             = _mm256_sub_pd(iz2,jz0);
238             dx21             = _mm256_sub_pd(ix2,jx1);
239             dy21             = _mm256_sub_pd(iy2,jy1);
240             dz21             = _mm256_sub_pd(iz2,jz1);
241             dx22             = _mm256_sub_pd(ix2,jx2);
242             dy22             = _mm256_sub_pd(iy2,jy2);
243             dz22             = _mm256_sub_pd(iz2,jz2);
244
245             /* Calculate squared distance and things based on it */
246             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
247             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
248             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
249             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
250             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
251             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
252             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
253             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
254             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
255
256             rinv00           = avx256_invsqrt_d(rsq00);
257             rinv01           = avx256_invsqrt_d(rsq01);
258             rinv02           = avx256_invsqrt_d(rsq02);
259             rinv10           = avx256_invsqrt_d(rsq10);
260             rinv11           = avx256_invsqrt_d(rsq11);
261             rinv12           = avx256_invsqrt_d(rsq12);
262             rinv20           = avx256_invsqrt_d(rsq20);
263             rinv21           = avx256_invsqrt_d(rsq21);
264             rinv22           = avx256_invsqrt_d(rsq22);
265
266             fjx0             = _mm256_setzero_pd();
267             fjy0             = _mm256_setzero_pd();
268             fjz0             = _mm256_setzero_pd();
269             fjx1             = _mm256_setzero_pd();
270             fjy1             = _mm256_setzero_pd();
271             fjz1             = _mm256_setzero_pd();
272             fjx2             = _mm256_setzero_pd();
273             fjy2             = _mm256_setzero_pd();
274             fjz2             = _mm256_setzero_pd();
275
276             /**************************
277              * CALCULATE INTERACTIONS *
278              **************************/
279
280             r00              = _mm256_mul_pd(rsq00,rinv00);
281
282             /* Calculate table index by multiplying r with table scale and truncate to integer */
283             rt               = _mm256_mul_pd(r00,vftabscale);
284             vfitab           = _mm256_cvttpd_epi32(rt);
285             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
286             vfitab           = _mm_slli_epi32(vfitab,2);
287
288             /* CUBIC SPLINE TABLE ELECTROSTATICS */
289             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
290             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
291             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
292             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
293             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
294             Heps             = _mm256_mul_pd(vfeps,H);
295             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
296             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
297             velec            = _mm256_mul_pd(qq00,VV);
298             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
299             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
300
301             /* Update potential sum for this i atom from the interaction with this j atom. */
302             velecsum         = _mm256_add_pd(velecsum,velec);
303
304             fscal            = felec;
305
306             /* Calculate temporary vectorial force */
307             tx               = _mm256_mul_pd(fscal,dx00);
308             ty               = _mm256_mul_pd(fscal,dy00);
309             tz               = _mm256_mul_pd(fscal,dz00);
310
311             /* Update vectorial force */
312             fix0             = _mm256_add_pd(fix0,tx);
313             fiy0             = _mm256_add_pd(fiy0,ty);
314             fiz0             = _mm256_add_pd(fiz0,tz);
315
316             fjx0             = _mm256_add_pd(fjx0,tx);
317             fjy0             = _mm256_add_pd(fjy0,ty);
318             fjz0             = _mm256_add_pd(fjz0,tz);
319
320             /**************************
321              * CALCULATE INTERACTIONS *
322              **************************/
323
324             r01              = _mm256_mul_pd(rsq01,rinv01);
325
326             /* Calculate table index by multiplying r with table scale and truncate to integer */
327             rt               = _mm256_mul_pd(r01,vftabscale);
328             vfitab           = _mm256_cvttpd_epi32(rt);
329             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
330             vfitab           = _mm_slli_epi32(vfitab,2);
331
332             /* CUBIC SPLINE TABLE ELECTROSTATICS */
333             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
334             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
335             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
336             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
337             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
338             Heps             = _mm256_mul_pd(vfeps,H);
339             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
340             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
341             velec            = _mm256_mul_pd(qq01,VV);
342             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
343             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
344
345             /* Update potential sum for this i atom from the interaction with this j atom. */
346             velecsum         = _mm256_add_pd(velecsum,velec);
347
348             fscal            = felec;
349
350             /* Calculate temporary vectorial force */
351             tx               = _mm256_mul_pd(fscal,dx01);
352             ty               = _mm256_mul_pd(fscal,dy01);
353             tz               = _mm256_mul_pd(fscal,dz01);
354
355             /* Update vectorial force */
356             fix0             = _mm256_add_pd(fix0,tx);
357             fiy0             = _mm256_add_pd(fiy0,ty);
358             fiz0             = _mm256_add_pd(fiz0,tz);
359
360             fjx1             = _mm256_add_pd(fjx1,tx);
361             fjy1             = _mm256_add_pd(fjy1,ty);
362             fjz1             = _mm256_add_pd(fjz1,tz);
363
364             /**************************
365              * CALCULATE INTERACTIONS *
366              **************************/
367
368             r02              = _mm256_mul_pd(rsq02,rinv02);
369
370             /* Calculate table index by multiplying r with table scale and truncate to integer */
371             rt               = _mm256_mul_pd(r02,vftabscale);
372             vfitab           = _mm256_cvttpd_epi32(rt);
373             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
374             vfitab           = _mm_slli_epi32(vfitab,2);
375
376             /* CUBIC SPLINE TABLE ELECTROSTATICS */
377             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
378             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
379             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
380             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
381             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
382             Heps             = _mm256_mul_pd(vfeps,H);
383             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
384             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
385             velec            = _mm256_mul_pd(qq02,VV);
386             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
387             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velecsum         = _mm256_add_pd(velecsum,velec);
391
392             fscal            = felec;
393
394             /* Calculate temporary vectorial force */
395             tx               = _mm256_mul_pd(fscal,dx02);
396             ty               = _mm256_mul_pd(fscal,dy02);
397             tz               = _mm256_mul_pd(fscal,dz02);
398
399             /* Update vectorial force */
400             fix0             = _mm256_add_pd(fix0,tx);
401             fiy0             = _mm256_add_pd(fiy0,ty);
402             fiz0             = _mm256_add_pd(fiz0,tz);
403
404             fjx2             = _mm256_add_pd(fjx2,tx);
405             fjy2             = _mm256_add_pd(fjy2,ty);
406             fjz2             = _mm256_add_pd(fjz2,tz);
407
408             /**************************
409              * CALCULATE INTERACTIONS *
410              **************************/
411
412             r10              = _mm256_mul_pd(rsq10,rinv10);
413
414             /* Calculate table index by multiplying r with table scale and truncate to integer */
415             rt               = _mm256_mul_pd(r10,vftabscale);
416             vfitab           = _mm256_cvttpd_epi32(rt);
417             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
418             vfitab           = _mm_slli_epi32(vfitab,2);
419
420             /* CUBIC SPLINE TABLE ELECTROSTATICS */
421             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
422             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
423             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
424             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
425             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
426             Heps             = _mm256_mul_pd(vfeps,H);
427             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
428             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
429             velec            = _mm256_mul_pd(qq10,VV);
430             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
431             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
432
433             /* Update potential sum for this i atom from the interaction with this j atom. */
434             velecsum         = _mm256_add_pd(velecsum,velec);
435
436             fscal            = felec;
437
438             /* Calculate temporary vectorial force */
439             tx               = _mm256_mul_pd(fscal,dx10);
440             ty               = _mm256_mul_pd(fscal,dy10);
441             tz               = _mm256_mul_pd(fscal,dz10);
442
443             /* Update vectorial force */
444             fix1             = _mm256_add_pd(fix1,tx);
445             fiy1             = _mm256_add_pd(fiy1,ty);
446             fiz1             = _mm256_add_pd(fiz1,tz);
447
448             fjx0             = _mm256_add_pd(fjx0,tx);
449             fjy0             = _mm256_add_pd(fjy0,ty);
450             fjz0             = _mm256_add_pd(fjz0,tz);
451
452             /**************************
453              * CALCULATE INTERACTIONS *
454              **************************/
455
456             r11              = _mm256_mul_pd(rsq11,rinv11);
457
458             /* Calculate table index by multiplying r with table scale and truncate to integer */
459             rt               = _mm256_mul_pd(r11,vftabscale);
460             vfitab           = _mm256_cvttpd_epi32(rt);
461             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
462             vfitab           = _mm_slli_epi32(vfitab,2);
463
464             /* CUBIC SPLINE TABLE ELECTROSTATICS */
465             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
466             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
467             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
468             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
469             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
470             Heps             = _mm256_mul_pd(vfeps,H);
471             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
472             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
473             velec            = _mm256_mul_pd(qq11,VV);
474             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
475             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
476
477             /* Update potential sum for this i atom from the interaction with this j atom. */
478             velecsum         = _mm256_add_pd(velecsum,velec);
479
480             fscal            = felec;
481
482             /* Calculate temporary vectorial force */
483             tx               = _mm256_mul_pd(fscal,dx11);
484             ty               = _mm256_mul_pd(fscal,dy11);
485             tz               = _mm256_mul_pd(fscal,dz11);
486
487             /* Update vectorial force */
488             fix1             = _mm256_add_pd(fix1,tx);
489             fiy1             = _mm256_add_pd(fiy1,ty);
490             fiz1             = _mm256_add_pd(fiz1,tz);
491
492             fjx1             = _mm256_add_pd(fjx1,tx);
493             fjy1             = _mm256_add_pd(fjy1,ty);
494             fjz1             = _mm256_add_pd(fjz1,tz);
495
496             /**************************
497              * CALCULATE INTERACTIONS *
498              **************************/
499
500             r12              = _mm256_mul_pd(rsq12,rinv12);
501
502             /* Calculate table index by multiplying r with table scale and truncate to integer */
503             rt               = _mm256_mul_pd(r12,vftabscale);
504             vfitab           = _mm256_cvttpd_epi32(rt);
505             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
506             vfitab           = _mm_slli_epi32(vfitab,2);
507
508             /* CUBIC SPLINE TABLE ELECTROSTATICS */
509             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
510             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
511             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
512             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
513             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
514             Heps             = _mm256_mul_pd(vfeps,H);
515             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
516             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
517             velec            = _mm256_mul_pd(qq12,VV);
518             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
519             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
520
521             /* Update potential sum for this i atom from the interaction with this j atom. */
522             velecsum         = _mm256_add_pd(velecsum,velec);
523
524             fscal            = felec;
525
526             /* Calculate temporary vectorial force */
527             tx               = _mm256_mul_pd(fscal,dx12);
528             ty               = _mm256_mul_pd(fscal,dy12);
529             tz               = _mm256_mul_pd(fscal,dz12);
530
531             /* Update vectorial force */
532             fix1             = _mm256_add_pd(fix1,tx);
533             fiy1             = _mm256_add_pd(fiy1,ty);
534             fiz1             = _mm256_add_pd(fiz1,tz);
535
536             fjx2             = _mm256_add_pd(fjx2,tx);
537             fjy2             = _mm256_add_pd(fjy2,ty);
538             fjz2             = _mm256_add_pd(fjz2,tz);
539
540             /**************************
541              * CALCULATE INTERACTIONS *
542              **************************/
543
544             r20              = _mm256_mul_pd(rsq20,rinv20);
545
546             /* Calculate table index by multiplying r with table scale and truncate to integer */
547             rt               = _mm256_mul_pd(r20,vftabscale);
548             vfitab           = _mm256_cvttpd_epi32(rt);
549             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
550             vfitab           = _mm_slli_epi32(vfitab,2);
551
552             /* CUBIC SPLINE TABLE ELECTROSTATICS */
553             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
554             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
555             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
556             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
557             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
558             Heps             = _mm256_mul_pd(vfeps,H);
559             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
560             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
561             velec            = _mm256_mul_pd(qq20,VV);
562             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
563             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
564
565             /* Update potential sum for this i atom from the interaction with this j atom. */
566             velecsum         = _mm256_add_pd(velecsum,velec);
567
568             fscal            = felec;
569
570             /* Calculate temporary vectorial force */
571             tx               = _mm256_mul_pd(fscal,dx20);
572             ty               = _mm256_mul_pd(fscal,dy20);
573             tz               = _mm256_mul_pd(fscal,dz20);
574
575             /* Update vectorial force */
576             fix2             = _mm256_add_pd(fix2,tx);
577             fiy2             = _mm256_add_pd(fiy2,ty);
578             fiz2             = _mm256_add_pd(fiz2,tz);
579
580             fjx0             = _mm256_add_pd(fjx0,tx);
581             fjy0             = _mm256_add_pd(fjy0,ty);
582             fjz0             = _mm256_add_pd(fjz0,tz);
583
584             /**************************
585              * CALCULATE INTERACTIONS *
586              **************************/
587
588             r21              = _mm256_mul_pd(rsq21,rinv21);
589
590             /* Calculate table index by multiplying r with table scale and truncate to integer */
591             rt               = _mm256_mul_pd(r21,vftabscale);
592             vfitab           = _mm256_cvttpd_epi32(rt);
593             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
594             vfitab           = _mm_slli_epi32(vfitab,2);
595
596             /* CUBIC SPLINE TABLE ELECTROSTATICS */
597             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
598             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
599             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
600             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
601             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
602             Heps             = _mm256_mul_pd(vfeps,H);
603             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
604             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
605             velec            = _mm256_mul_pd(qq21,VV);
606             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
607             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
608
609             /* Update potential sum for this i atom from the interaction with this j atom. */
610             velecsum         = _mm256_add_pd(velecsum,velec);
611
612             fscal            = felec;
613
614             /* Calculate temporary vectorial force */
615             tx               = _mm256_mul_pd(fscal,dx21);
616             ty               = _mm256_mul_pd(fscal,dy21);
617             tz               = _mm256_mul_pd(fscal,dz21);
618
619             /* Update vectorial force */
620             fix2             = _mm256_add_pd(fix2,tx);
621             fiy2             = _mm256_add_pd(fiy2,ty);
622             fiz2             = _mm256_add_pd(fiz2,tz);
623
624             fjx1             = _mm256_add_pd(fjx1,tx);
625             fjy1             = _mm256_add_pd(fjy1,ty);
626             fjz1             = _mm256_add_pd(fjz1,tz);
627
628             /**************************
629              * CALCULATE INTERACTIONS *
630              **************************/
631
632             r22              = _mm256_mul_pd(rsq22,rinv22);
633
634             /* Calculate table index by multiplying r with table scale and truncate to integer */
635             rt               = _mm256_mul_pd(r22,vftabscale);
636             vfitab           = _mm256_cvttpd_epi32(rt);
637             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
638             vfitab           = _mm_slli_epi32(vfitab,2);
639
640             /* CUBIC SPLINE TABLE ELECTROSTATICS */
641             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
642             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
643             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
644             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
645             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
646             Heps             = _mm256_mul_pd(vfeps,H);
647             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
648             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
649             velec            = _mm256_mul_pd(qq22,VV);
650             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
651             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
652
653             /* Update potential sum for this i atom from the interaction with this j atom. */
654             velecsum         = _mm256_add_pd(velecsum,velec);
655
656             fscal            = felec;
657
658             /* Calculate temporary vectorial force */
659             tx               = _mm256_mul_pd(fscal,dx22);
660             ty               = _mm256_mul_pd(fscal,dy22);
661             tz               = _mm256_mul_pd(fscal,dz22);
662
663             /* Update vectorial force */
664             fix2             = _mm256_add_pd(fix2,tx);
665             fiy2             = _mm256_add_pd(fiy2,ty);
666             fiz2             = _mm256_add_pd(fiz2,tz);
667
668             fjx2             = _mm256_add_pd(fjx2,tx);
669             fjy2             = _mm256_add_pd(fjy2,ty);
670             fjz2             = _mm256_add_pd(fjz2,tz);
671
672             fjptrA             = f+j_coord_offsetA;
673             fjptrB             = f+j_coord_offsetB;
674             fjptrC             = f+j_coord_offsetC;
675             fjptrD             = f+j_coord_offsetD;
676
677             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
678                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
679
680             /* Inner loop uses 387 flops */
681         }
682
683         if(jidx<j_index_end)
684         {
685
686             /* Get j neighbor index, and coordinate index */
687             jnrlistA         = jjnr[jidx];
688             jnrlistB         = jjnr[jidx+1];
689             jnrlistC         = jjnr[jidx+2];
690             jnrlistD         = jjnr[jidx+3];
691             /* Sign of each element will be negative for non-real atoms.
692              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
693              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
694              */
695             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
696
697             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
698             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
699             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
700
701             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
702             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
703             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
704             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
705             j_coord_offsetA  = DIM*jnrA;
706             j_coord_offsetB  = DIM*jnrB;
707             j_coord_offsetC  = DIM*jnrC;
708             j_coord_offsetD  = DIM*jnrD;
709
710             /* load j atom coordinates */
711             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
712                                                  x+j_coord_offsetC,x+j_coord_offsetD,
713                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
714
715             /* Calculate displacement vector */
716             dx00             = _mm256_sub_pd(ix0,jx0);
717             dy00             = _mm256_sub_pd(iy0,jy0);
718             dz00             = _mm256_sub_pd(iz0,jz0);
719             dx01             = _mm256_sub_pd(ix0,jx1);
720             dy01             = _mm256_sub_pd(iy0,jy1);
721             dz01             = _mm256_sub_pd(iz0,jz1);
722             dx02             = _mm256_sub_pd(ix0,jx2);
723             dy02             = _mm256_sub_pd(iy0,jy2);
724             dz02             = _mm256_sub_pd(iz0,jz2);
725             dx10             = _mm256_sub_pd(ix1,jx0);
726             dy10             = _mm256_sub_pd(iy1,jy0);
727             dz10             = _mm256_sub_pd(iz1,jz0);
728             dx11             = _mm256_sub_pd(ix1,jx1);
729             dy11             = _mm256_sub_pd(iy1,jy1);
730             dz11             = _mm256_sub_pd(iz1,jz1);
731             dx12             = _mm256_sub_pd(ix1,jx2);
732             dy12             = _mm256_sub_pd(iy1,jy2);
733             dz12             = _mm256_sub_pd(iz1,jz2);
734             dx20             = _mm256_sub_pd(ix2,jx0);
735             dy20             = _mm256_sub_pd(iy2,jy0);
736             dz20             = _mm256_sub_pd(iz2,jz0);
737             dx21             = _mm256_sub_pd(ix2,jx1);
738             dy21             = _mm256_sub_pd(iy2,jy1);
739             dz21             = _mm256_sub_pd(iz2,jz1);
740             dx22             = _mm256_sub_pd(ix2,jx2);
741             dy22             = _mm256_sub_pd(iy2,jy2);
742             dz22             = _mm256_sub_pd(iz2,jz2);
743
744             /* Calculate squared distance and things based on it */
745             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
746             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
747             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
748             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
749             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
750             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
751             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
752             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
753             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
754
755             rinv00           = avx256_invsqrt_d(rsq00);
756             rinv01           = avx256_invsqrt_d(rsq01);
757             rinv02           = avx256_invsqrt_d(rsq02);
758             rinv10           = avx256_invsqrt_d(rsq10);
759             rinv11           = avx256_invsqrt_d(rsq11);
760             rinv12           = avx256_invsqrt_d(rsq12);
761             rinv20           = avx256_invsqrt_d(rsq20);
762             rinv21           = avx256_invsqrt_d(rsq21);
763             rinv22           = avx256_invsqrt_d(rsq22);
764
765             fjx0             = _mm256_setzero_pd();
766             fjy0             = _mm256_setzero_pd();
767             fjz0             = _mm256_setzero_pd();
768             fjx1             = _mm256_setzero_pd();
769             fjy1             = _mm256_setzero_pd();
770             fjz1             = _mm256_setzero_pd();
771             fjx2             = _mm256_setzero_pd();
772             fjy2             = _mm256_setzero_pd();
773             fjz2             = _mm256_setzero_pd();
774
775             /**************************
776              * CALCULATE INTERACTIONS *
777              **************************/
778
779             r00              = _mm256_mul_pd(rsq00,rinv00);
780             r00              = _mm256_andnot_pd(dummy_mask,r00);
781
782             /* Calculate table index by multiplying r with table scale and truncate to integer */
783             rt               = _mm256_mul_pd(r00,vftabscale);
784             vfitab           = _mm256_cvttpd_epi32(rt);
785             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
786             vfitab           = _mm_slli_epi32(vfitab,2);
787
788             /* CUBIC SPLINE TABLE ELECTROSTATICS */
789             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
790             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
791             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
792             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
793             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
794             Heps             = _mm256_mul_pd(vfeps,H);
795             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
796             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
797             velec            = _mm256_mul_pd(qq00,VV);
798             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
799             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
800
801             /* Update potential sum for this i atom from the interaction with this j atom. */
802             velec            = _mm256_andnot_pd(dummy_mask,velec);
803             velecsum         = _mm256_add_pd(velecsum,velec);
804
805             fscal            = felec;
806
807             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
808
809             /* Calculate temporary vectorial force */
810             tx               = _mm256_mul_pd(fscal,dx00);
811             ty               = _mm256_mul_pd(fscal,dy00);
812             tz               = _mm256_mul_pd(fscal,dz00);
813
814             /* Update vectorial force */
815             fix0             = _mm256_add_pd(fix0,tx);
816             fiy0             = _mm256_add_pd(fiy0,ty);
817             fiz0             = _mm256_add_pd(fiz0,tz);
818
819             fjx0             = _mm256_add_pd(fjx0,tx);
820             fjy0             = _mm256_add_pd(fjy0,ty);
821             fjz0             = _mm256_add_pd(fjz0,tz);
822
823             /**************************
824              * CALCULATE INTERACTIONS *
825              **************************/
826
827             r01              = _mm256_mul_pd(rsq01,rinv01);
828             r01              = _mm256_andnot_pd(dummy_mask,r01);
829
830             /* Calculate table index by multiplying r with table scale and truncate to integer */
831             rt               = _mm256_mul_pd(r01,vftabscale);
832             vfitab           = _mm256_cvttpd_epi32(rt);
833             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
834             vfitab           = _mm_slli_epi32(vfitab,2);
835
836             /* CUBIC SPLINE TABLE ELECTROSTATICS */
837             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
838             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
839             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
840             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
841             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
842             Heps             = _mm256_mul_pd(vfeps,H);
843             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
844             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
845             velec            = _mm256_mul_pd(qq01,VV);
846             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
847             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
848
849             /* Update potential sum for this i atom from the interaction with this j atom. */
850             velec            = _mm256_andnot_pd(dummy_mask,velec);
851             velecsum         = _mm256_add_pd(velecsum,velec);
852
853             fscal            = felec;
854
855             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
856
857             /* Calculate temporary vectorial force */
858             tx               = _mm256_mul_pd(fscal,dx01);
859             ty               = _mm256_mul_pd(fscal,dy01);
860             tz               = _mm256_mul_pd(fscal,dz01);
861
862             /* Update vectorial force */
863             fix0             = _mm256_add_pd(fix0,tx);
864             fiy0             = _mm256_add_pd(fiy0,ty);
865             fiz0             = _mm256_add_pd(fiz0,tz);
866
867             fjx1             = _mm256_add_pd(fjx1,tx);
868             fjy1             = _mm256_add_pd(fjy1,ty);
869             fjz1             = _mm256_add_pd(fjz1,tz);
870
871             /**************************
872              * CALCULATE INTERACTIONS *
873              **************************/
874
875             r02              = _mm256_mul_pd(rsq02,rinv02);
876             r02              = _mm256_andnot_pd(dummy_mask,r02);
877
878             /* Calculate table index by multiplying r with table scale and truncate to integer */
879             rt               = _mm256_mul_pd(r02,vftabscale);
880             vfitab           = _mm256_cvttpd_epi32(rt);
881             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
882             vfitab           = _mm_slli_epi32(vfitab,2);
883
884             /* CUBIC SPLINE TABLE ELECTROSTATICS */
885             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
886             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
887             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
888             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
889             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
890             Heps             = _mm256_mul_pd(vfeps,H);
891             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
892             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
893             velec            = _mm256_mul_pd(qq02,VV);
894             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
895             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
896
897             /* Update potential sum for this i atom from the interaction with this j atom. */
898             velec            = _mm256_andnot_pd(dummy_mask,velec);
899             velecsum         = _mm256_add_pd(velecsum,velec);
900
901             fscal            = felec;
902
903             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
904
905             /* Calculate temporary vectorial force */
906             tx               = _mm256_mul_pd(fscal,dx02);
907             ty               = _mm256_mul_pd(fscal,dy02);
908             tz               = _mm256_mul_pd(fscal,dz02);
909
910             /* Update vectorial force */
911             fix0             = _mm256_add_pd(fix0,tx);
912             fiy0             = _mm256_add_pd(fiy0,ty);
913             fiz0             = _mm256_add_pd(fiz0,tz);
914
915             fjx2             = _mm256_add_pd(fjx2,tx);
916             fjy2             = _mm256_add_pd(fjy2,ty);
917             fjz2             = _mm256_add_pd(fjz2,tz);
918
919             /**************************
920              * CALCULATE INTERACTIONS *
921              **************************/
922
923             r10              = _mm256_mul_pd(rsq10,rinv10);
924             r10              = _mm256_andnot_pd(dummy_mask,r10);
925
926             /* Calculate table index by multiplying r with table scale and truncate to integer */
927             rt               = _mm256_mul_pd(r10,vftabscale);
928             vfitab           = _mm256_cvttpd_epi32(rt);
929             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
930             vfitab           = _mm_slli_epi32(vfitab,2);
931
932             /* CUBIC SPLINE TABLE ELECTROSTATICS */
933             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
934             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
935             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
936             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
937             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
938             Heps             = _mm256_mul_pd(vfeps,H);
939             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
940             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
941             velec            = _mm256_mul_pd(qq10,VV);
942             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
943             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
944
945             /* Update potential sum for this i atom from the interaction with this j atom. */
946             velec            = _mm256_andnot_pd(dummy_mask,velec);
947             velecsum         = _mm256_add_pd(velecsum,velec);
948
949             fscal            = felec;
950
951             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
952
953             /* Calculate temporary vectorial force */
954             tx               = _mm256_mul_pd(fscal,dx10);
955             ty               = _mm256_mul_pd(fscal,dy10);
956             tz               = _mm256_mul_pd(fscal,dz10);
957
958             /* Update vectorial force */
959             fix1             = _mm256_add_pd(fix1,tx);
960             fiy1             = _mm256_add_pd(fiy1,ty);
961             fiz1             = _mm256_add_pd(fiz1,tz);
962
963             fjx0             = _mm256_add_pd(fjx0,tx);
964             fjy0             = _mm256_add_pd(fjy0,ty);
965             fjz0             = _mm256_add_pd(fjz0,tz);
966
967             /**************************
968              * CALCULATE INTERACTIONS *
969              **************************/
970
971             r11              = _mm256_mul_pd(rsq11,rinv11);
972             r11              = _mm256_andnot_pd(dummy_mask,r11);
973
974             /* Calculate table index by multiplying r with table scale and truncate to integer */
975             rt               = _mm256_mul_pd(r11,vftabscale);
976             vfitab           = _mm256_cvttpd_epi32(rt);
977             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
978             vfitab           = _mm_slli_epi32(vfitab,2);
979
980             /* CUBIC SPLINE TABLE ELECTROSTATICS */
981             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
982             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
983             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
984             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
985             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
986             Heps             = _mm256_mul_pd(vfeps,H);
987             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
988             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
989             velec            = _mm256_mul_pd(qq11,VV);
990             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
991             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
992
993             /* Update potential sum for this i atom from the interaction with this j atom. */
994             velec            = _mm256_andnot_pd(dummy_mask,velec);
995             velecsum         = _mm256_add_pd(velecsum,velec);
996
997             fscal            = felec;
998
999             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1000
1001             /* Calculate temporary vectorial force */
1002             tx               = _mm256_mul_pd(fscal,dx11);
1003             ty               = _mm256_mul_pd(fscal,dy11);
1004             tz               = _mm256_mul_pd(fscal,dz11);
1005
1006             /* Update vectorial force */
1007             fix1             = _mm256_add_pd(fix1,tx);
1008             fiy1             = _mm256_add_pd(fiy1,ty);
1009             fiz1             = _mm256_add_pd(fiz1,tz);
1010
1011             fjx1             = _mm256_add_pd(fjx1,tx);
1012             fjy1             = _mm256_add_pd(fjy1,ty);
1013             fjz1             = _mm256_add_pd(fjz1,tz);
1014
1015             /**************************
1016              * CALCULATE INTERACTIONS *
1017              **************************/
1018
1019             r12              = _mm256_mul_pd(rsq12,rinv12);
1020             r12              = _mm256_andnot_pd(dummy_mask,r12);
1021
1022             /* Calculate table index by multiplying r with table scale and truncate to integer */
1023             rt               = _mm256_mul_pd(r12,vftabscale);
1024             vfitab           = _mm256_cvttpd_epi32(rt);
1025             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1026             vfitab           = _mm_slli_epi32(vfitab,2);
1027
1028             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1029             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1030             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1031             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1032             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1033             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1034             Heps             = _mm256_mul_pd(vfeps,H);
1035             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1036             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1037             velec            = _mm256_mul_pd(qq12,VV);
1038             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1039             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1040
1041             /* Update potential sum for this i atom from the interaction with this j atom. */
1042             velec            = _mm256_andnot_pd(dummy_mask,velec);
1043             velecsum         = _mm256_add_pd(velecsum,velec);
1044
1045             fscal            = felec;
1046
1047             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1048
1049             /* Calculate temporary vectorial force */
1050             tx               = _mm256_mul_pd(fscal,dx12);
1051             ty               = _mm256_mul_pd(fscal,dy12);
1052             tz               = _mm256_mul_pd(fscal,dz12);
1053
1054             /* Update vectorial force */
1055             fix1             = _mm256_add_pd(fix1,tx);
1056             fiy1             = _mm256_add_pd(fiy1,ty);
1057             fiz1             = _mm256_add_pd(fiz1,tz);
1058
1059             fjx2             = _mm256_add_pd(fjx2,tx);
1060             fjy2             = _mm256_add_pd(fjy2,ty);
1061             fjz2             = _mm256_add_pd(fjz2,tz);
1062
1063             /**************************
1064              * CALCULATE INTERACTIONS *
1065              **************************/
1066
1067             r20              = _mm256_mul_pd(rsq20,rinv20);
1068             r20              = _mm256_andnot_pd(dummy_mask,r20);
1069
1070             /* Calculate table index by multiplying r with table scale and truncate to integer */
1071             rt               = _mm256_mul_pd(r20,vftabscale);
1072             vfitab           = _mm256_cvttpd_epi32(rt);
1073             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1074             vfitab           = _mm_slli_epi32(vfitab,2);
1075
1076             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1077             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1078             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1079             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1080             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1081             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1082             Heps             = _mm256_mul_pd(vfeps,H);
1083             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1084             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1085             velec            = _mm256_mul_pd(qq20,VV);
1086             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1087             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1088
1089             /* Update potential sum for this i atom from the interaction with this j atom. */
1090             velec            = _mm256_andnot_pd(dummy_mask,velec);
1091             velecsum         = _mm256_add_pd(velecsum,velec);
1092
1093             fscal            = felec;
1094
1095             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1096
1097             /* Calculate temporary vectorial force */
1098             tx               = _mm256_mul_pd(fscal,dx20);
1099             ty               = _mm256_mul_pd(fscal,dy20);
1100             tz               = _mm256_mul_pd(fscal,dz20);
1101
1102             /* Update vectorial force */
1103             fix2             = _mm256_add_pd(fix2,tx);
1104             fiy2             = _mm256_add_pd(fiy2,ty);
1105             fiz2             = _mm256_add_pd(fiz2,tz);
1106
1107             fjx0             = _mm256_add_pd(fjx0,tx);
1108             fjy0             = _mm256_add_pd(fjy0,ty);
1109             fjz0             = _mm256_add_pd(fjz0,tz);
1110
1111             /**************************
1112              * CALCULATE INTERACTIONS *
1113              **************************/
1114
1115             r21              = _mm256_mul_pd(rsq21,rinv21);
1116             r21              = _mm256_andnot_pd(dummy_mask,r21);
1117
1118             /* Calculate table index by multiplying r with table scale and truncate to integer */
1119             rt               = _mm256_mul_pd(r21,vftabscale);
1120             vfitab           = _mm256_cvttpd_epi32(rt);
1121             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1122             vfitab           = _mm_slli_epi32(vfitab,2);
1123
1124             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1125             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1126             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1127             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1128             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1129             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1130             Heps             = _mm256_mul_pd(vfeps,H);
1131             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1132             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1133             velec            = _mm256_mul_pd(qq21,VV);
1134             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1135             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1136
1137             /* Update potential sum for this i atom from the interaction with this j atom. */
1138             velec            = _mm256_andnot_pd(dummy_mask,velec);
1139             velecsum         = _mm256_add_pd(velecsum,velec);
1140
1141             fscal            = felec;
1142
1143             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1144
1145             /* Calculate temporary vectorial force */
1146             tx               = _mm256_mul_pd(fscal,dx21);
1147             ty               = _mm256_mul_pd(fscal,dy21);
1148             tz               = _mm256_mul_pd(fscal,dz21);
1149
1150             /* Update vectorial force */
1151             fix2             = _mm256_add_pd(fix2,tx);
1152             fiy2             = _mm256_add_pd(fiy2,ty);
1153             fiz2             = _mm256_add_pd(fiz2,tz);
1154
1155             fjx1             = _mm256_add_pd(fjx1,tx);
1156             fjy1             = _mm256_add_pd(fjy1,ty);
1157             fjz1             = _mm256_add_pd(fjz1,tz);
1158
1159             /**************************
1160              * CALCULATE INTERACTIONS *
1161              **************************/
1162
1163             r22              = _mm256_mul_pd(rsq22,rinv22);
1164             r22              = _mm256_andnot_pd(dummy_mask,r22);
1165
1166             /* Calculate table index by multiplying r with table scale and truncate to integer */
1167             rt               = _mm256_mul_pd(r22,vftabscale);
1168             vfitab           = _mm256_cvttpd_epi32(rt);
1169             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1170             vfitab           = _mm_slli_epi32(vfitab,2);
1171
1172             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1173             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1174             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1175             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1176             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1177             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1178             Heps             = _mm256_mul_pd(vfeps,H);
1179             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1180             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1181             velec            = _mm256_mul_pd(qq22,VV);
1182             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1183             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1184
1185             /* Update potential sum for this i atom from the interaction with this j atom. */
1186             velec            = _mm256_andnot_pd(dummy_mask,velec);
1187             velecsum         = _mm256_add_pd(velecsum,velec);
1188
1189             fscal            = felec;
1190
1191             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1192
1193             /* Calculate temporary vectorial force */
1194             tx               = _mm256_mul_pd(fscal,dx22);
1195             ty               = _mm256_mul_pd(fscal,dy22);
1196             tz               = _mm256_mul_pd(fscal,dz22);
1197
1198             /* Update vectorial force */
1199             fix2             = _mm256_add_pd(fix2,tx);
1200             fiy2             = _mm256_add_pd(fiy2,ty);
1201             fiz2             = _mm256_add_pd(fiz2,tz);
1202
1203             fjx2             = _mm256_add_pd(fjx2,tx);
1204             fjy2             = _mm256_add_pd(fjy2,ty);
1205             fjz2             = _mm256_add_pd(fjz2,tz);
1206
1207             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1208             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1209             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1210             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1211
1212             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1213                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1214
1215             /* Inner loop uses 396 flops */
1216         }
1217
1218         /* End of innermost loop */
1219
1220         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1221                                                  f+i_coord_offset,fshift+i_shift_offset);
1222
1223         ggid                        = gid[iidx];
1224         /* Update potential energies */
1225         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1226
1227         /* Increment number of inner iterations */
1228         inneriter                  += j_index_end - j_index_start;
1229
1230         /* Outer loop uses 19 flops */
1231     }
1232
1233     /* Increment number of outer iterations */
1234     outeriter        += nri;
1235
1236     /* Update outer/inner flops */
1237
1238     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
1239 }
1240 /*
1241  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_double
1242  * Electrostatics interaction: CubicSplineTable
1243  * VdW interaction:            None
1244  * Geometry:                   Water3-Water3
1245  * Calculate force/pot:        Force
1246  */
1247 void
1248 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_double
1249                     (t_nblist                    * gmx_restrict       nlist,
1250                      rvec                        * gmx_restrict          xx,
1251                      rvec                        * gmx_restrict          ff,
1252                      struct t_forcerec           * gmx_restrict          fr,
1253                      t_mdatoms                   * gmx_restrict     mdatoms,
1254                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1255                      t_nrnb                      * gmx_restrict        nrnb)
1256 {
1257     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1258      * just 0 for non-waters.
1259      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1260      * jnr indices corresponding to data put in the four positions in the SIMD register.
1261      */
1262     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1263     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1264     int              jnrA,jnrB,jnrC,jnrD;
1265     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1266     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1267     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1268     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1269     real             rcutoff_scalar;
1270     real             *shiftvec,*fshift,*x,*f;
1271     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1272     real             scratch[4*DIM];
1273     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1274     real *           vdwioffsetptr0;
1275     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1276     real *           vdwioffsetptr1;
1277     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1278     real *           vdwioffsetptr2;
1279     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1280     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1281     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1282     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1283     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1284     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1285     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1286     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1287     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1288     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1289     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1290     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1291     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1292     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1293     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1294     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1295     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1296     real             *charge;
1297     __m128i          vfitab;
1298     __m128i          ifour       = _mm_set1_epi32(4);
1299     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1300     real             *vftab;
1301     __m256d          dummy_mask,cutoff_mask;
1302     __m128           tmpmask0,tmpmask1;
1303     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1304     __m256d          one     = _mm256_set1_pd(1.0);
1305     __m256d          two     = _mm256_set1_pd(2.0);
1306     x                = xx[0];
1307     f                = ff[0];
1308
1309     nri              = nlist->nri;
1310     iinr             = nlist->iinr;
1311     jindex           = nlist->jindex;
1312     jjnr             = nlist->jjnr;
1313     shiftidx         = nlist->shift;
1314     gid              = nlist->gid;
1315     shiftvec         = fr->shift_vec[0];
1316     fshift           = fr->fshift[0];
1317     facel            = _mm256_set1_pd(fr->ic->epsfac);
1318     charge           = mdatoms->chargeA;
1319
1320     vftab            = kernel_data->table_elec->data;
1321     vftabscale       = _mm256_set1_pd(kernel_data->table_elec->scale);
1322
1323     /* Setup water-specific parameters */
1324     inr              = nlist->iinr[0];
1325     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1326     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1327     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1328
1329     jq0              = _mm256_set1_pd(charge[inr+0]);
1330     jq1              = _mm256_set1_pd(charge[inr+1]);
1331     jq2              = _mm256_set1_pd(charge[inr+2]);
1332     qq00             = _mm256_mul_pd(iq0,jq0);
1333     qq01             = _mm256_mul_pd(iq0,jq1);
1334     qq02             = _mm256_mul_pd(iq0,jq2);
1335     qq10             = _mm256_mul_pd(iq1,jq0);
1336     qq11             = _mm256_mul_pd(iq1,jq1);
1337     qq12             = _mm256_mul_pd(iq1,jq2);
1338     qq20             = _mm256_mul_pd(iq2,jq0);
1339     qq21             = _mm256_mul_pd(iq2,jq1);
1340     qq22             = _mm256_mul_pd(iq2,jq2);
1341
1342     /* Avoid stupid compiler warnings */
1343     jnrA = jnrB = jnrC = jnrD = 0;
1344     j_coord_offsetA = 0;
1345     j_coord_offsetB = 0;
1346     j_coord_offsetC = 0;
1347     j_coord_offsetD = 0;
1348
1349     outeriter        = 0;
1350     inneriter        = 0;
1351
1352     for(iidx=0;iidx<4*DIM;iidx++)
1353     {
1354         scratch[iidx] = 0.0;
1355     }
1356
1357     /* Start outer loop over neighborlists */
1358     for(iidx=0; iidx<nri; iidx++)
1359     {
1360         /* Load shift vector for this list */
1361         i_shift_offset   = DIM*shiftidx[iidx];
1362
1363         /* Load limits for loop over neighbors */
1364         j_index_start    = jindex[iidx];
1365         j_index_end      = jindex[iidx+1];
1366
1367         /* Get outer coordinate index */
1368         inr              = iinr[iidx];
1369         i_coord_offset   = DIM*inr;
1370
1371         /* Load i particle coords and add shift vector */
1372         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1373                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1374
1375         fix0             = _mm256_setzero_pd();
1376         fiy0             = _mm256_setzero_pd();
1377         fiz0             = _mm256_setzero_pd();
1378         fix1             = _mm256_setzero_pd();
1379         fiy1             = _mm256_setzero_pd();
1380         fiz1             = _mm256_setzero_pd();
1381         fix2             = _mm256_setzero_pd();
1382         fiy2             = _mm256_setzero_pd();
1383         fiz2             = _mm256_setzero_pd();
1384
1385         /* Start inner kernel loop */
1386         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1387         {
1388
1389             /* Get j neighbor index, and coordinate index */
1390             jnrA             = jjnr[jidx];
1391             jnrB             = jjnr[jidx+1];
1392             jnrC             = jjnr[jidx+2];
1393             jnrD             = jjnr[jidx+3];
1394             j_coord_offsetA  = DIM*jnrA;
1395             j_coord_offsetB  = DIM*jnrB;
1396             j_coord_offsetC  = DIM*jnrC;
1397             j_coord_offsetD  = DIM*jnrD;
1398
1399             /* load j atom coordinates */
1400             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1401                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1402                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1403
1404             /* Calculate displacement vector */
1405             dx00             = _mm256_sub_pd(ix0,jx0);
1406             dy00             = _mm256_sub_pd(iy0,jy0);
1407             dz00             = _mm256_sub_pd(iz0,jz0);
1408             dx01             = _mm256_sub_pd(ix0,jx1);
1409             dy01             = _mm256_sub_pd(iy0,jy1);
1410             dz01             = _mm256_sub_pd(iz0,jz1);
1411             dx02             = _mm256_sub_pd(ix0,jx2);
1412             dy02             = _mm256_sub_pd(iy0,jy2);
1413             dz02             = _mm256_sub_pd(iz0,jz2);
1414             dx10             = _mm256_sub_pd(ix1,jx0);
1415             dy10             = _mm256_sub_pd(iy1,jy0);
1416             dz10             = _mm256_sub_pd(iz1,jz0);
1417             dx11             = _mm256_sub_pd(ix1,jx1);
1418             dy11             = _mm256_sub_pd(iy1,jy1);
1419             dz11             = _mm256_sub_pd(iz1,jz1);
1420             dx12             = _mm256_sub_pd(ix1,jx2);
1421             dy12             = _mm256_sub_pd(iy1,jy2);
1422             dz12             = _mm256_sub_pd(iz1,jz2);
1423             dx20             = _mm256_sub_pd(ix2,jx0);
1424             dy20             = _mm256_sub_pd(iy2,jy0);
1425             dz20             = _mm256_sub_pd(iz2,jz0);
1426             dx21             = _mm256_sub_pd(ix2,jx1);
1427             dy21             = _mm256_sub_pd(iy2,jy1);
1428             dz21             = _mm256_sub_pd(iz2,jz1);
1429             dx22             = _mm256_sub_pd(ix2,jx2);
1430             dy22             = _mm256_sub_pd(iy2,jy2);
1431             dz22             = _mm256_sub_pd(iz2,jz2);
1432
1433             /* Calculate squared distance and things based on it */
1434             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1435             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1436             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1437             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1438             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1439             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1440             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1441             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1442             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1443
1444             rinv00           = avx256_invsqrt_d(rsq00);
1445             rinv01           = avx256_invsqrt_d(rsq01);
1446             rinv02           = avx256_invsqrt_d(rsq02);
1447             rinv10           = avx256_invsqrt_d(rsq10);
1448             rinv11           = avx256_invsqrt_d(rsq11);
1449             rinv12           = avx256_invsqrt_d(rsq12);
1450             rinv20           = avx256_invsqrt_d(rsq20);
1451             rinv21           = avx256_invsqrt_d(rsq21);
1452             rinv22           = avx256_invsqrt_d(rsq22);
1453
1454             fjx0             = _mm256_setzero_pd();
1455             fjy0             = _mm256_setzero_pd();
1456             fjz0             = _mm256_setzero_pd();
1457             fjx1             = _mm256_setzero_pd();
1458             fjy1             = _mm256_setzero_pd();
1459             fjz1             = _mm256_setzero_pd();
1460             fjx2             = _mm256_setzero_pd();
1461             fjy2             = _mm256_setzero_pd();
1462             fjz2             = _mm256_setzero_pd();
1463
1464             /**************************
1465              * CALCULATE INTERACTIONS *
1466              **************************/
1467
1468             r00              = _mm256_mul_pd(rsq00,rinv00);
1469
1470             /* Calculate table index by multiplying r with table scale and truncate to integer */
1471             rt               = _mm256_mul_pd(r00,vftabscale);
1472             vfitab           = _mm256_cvttpd_epi32(rt);
1473             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1474             vfitab           = _mm_slli_epi32(vfitab,2);
1475
1476             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1477             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1478             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1479             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1480             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1481             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1482             Heps             = _mm256_mul_pd(vfeps,H);
1483             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1484             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1485             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1486
1487             fscal            = felec;
1488
1489             /* Calculate temporary vectorial force */
1490             tx               = _mm256_mul_pd(fscal,dx00);
1491             ty               = _mm256_mul_pd(fscal,dy00);
1492             tz               = _mm256_mul_pd(fscal,dz00);
1493
1494             /* Update vectorial force */
1495             fix0             = _mm256_add_pd(fix0,tx);
1496             fiy0             = _mm256_add_pd(fiy0,ty);
1497             fiz0             = _mm256_add_pd(fiz0,tz);
1498
1499             fjx0             = _mm256_add_pd(fjx0,tx);
1500             fjy0             = _mm256_add_pd(fjy0,ty);
1501             fjz0             = _mm256_add_pd(fjz0,tz);
1502
1503             /**************************
1504              * CALCULATE INTERACTIONS *
1505              **************************/
1506
1507             r01              = _mm256_mul_pd(rsq01,rinv01);
1508
1509             /* Calculate table index by multiplying r with table scale and truncate to integer */
1510             rt               = _mm256_mul_pd(r01,vftabscale);
1511             vfitab           = _mm256_cvttpd_epi32(rt);
1512             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1513             vfitab           = _mm_slli_epi32(vfitab,2);
1514
1515             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1516             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1517             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1518             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1519             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1520             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1521             Heps             = _mm256_mul_pd(vfeps,H);
1522             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1523             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1524             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
1525
1526             fscal            = felec;
1527
1528             /* Calculate temporary vectorial force */
1529             tx               = _mm256_mul_pd(fscal,dx01);
1530             ty               = _mm256_mul_pd(fscal,dy01);
1531             tz               = _mm256_mul_pd(fscal,dz01);
1532
1533             /* Update vectorial force */
1534             fix0             = _mm256_add_pd(fix0,tx);
1535             fiy0             = _mm256_add_pd(fiy0,ty);
1536             fiz0             = _mm256_add_pd(fiz0,tz);
1537
1538             fjx1             = _mm256_add_pd(fjx1,tx);
1539             fjy1             = _mm256_add_pd(fjy1,ty);
1540             fjz1             = _mm256_add_pd(fjz1,tz);
1541
1542             /**************************
1543              * CALCULATE INTERACTIONS *
1544              **************************/
1545
1546             r02              = _mm256_mul_pd(rsq02,rinv02);
1547
1548             /* Calculate table index by multiplying r with table scale and truncate to integer */
1549             rt               = _mm256_mul_pd(r02,vftabscale);
1550             vfitab           = _mm256_cvttpd_epi32(rt);
1551             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1552             vfitab           = _mm_slli_epi32(vfitab,2);
1553
1554             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1555             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1556             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1557             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1558             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1559             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1560             Heps             = _mm256_mul_pd(vfeps,H);
1561             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1562             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1563             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
1564
1565             fscal            = felec;
1566
1567             /* Calculate temporary vectorial force */
1568             tx               = _mm256_mul_pd(fscal,dx02);
1569             ty               = _mm256_mul_pd(fscal,dy02);
1570             tz               = _mm256_mul_pd(fscal,dz02);
1571
1572             /* Update vectorial force */
1573             fix0             = _mm256_add_pd(fix0,tx);
1574             fiy0             = _mm256_add_pd(fiy0,ty);
1575             fiz0             = _mm256_add_pd(fiz0,tz);
1576
1577             fjx2             = _mm256_add_pd(fjx2,tx);
1578             fjy2             = _mm256_add_pd(fjy2,ty);
1579             fjz2             = _mm256_add_pd(fjz2,tz);
1580
1581             /**************************
1582              * CALCULATE INTERACTIONS *
1583              **************************/
1584
1585             r10              = _mm256_mul_pd(rsq10,rinv10);
1586
1587             /* Calculate table index by multiplying r with table scale and truncate to integer */
1588             rt               = _mm256_mul_pd(r10,vftabscale);
1589             vfitab           = _mm256_cvttpd_epi32(rt);
1590             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1591             vfitab           = _mm_slli_epi32(vfitab,2);
1592
1593             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1594             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1595             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1596             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1597             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1598             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1599             Heps             = _mm256_mul_pd(vfeps,H);
1600             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1601             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1602             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
1603
1604             fscal            = felec;
1605
1606             /* Calculate temporary vectorial force */
1607             tx               = _mm256_mul_pd(fscal,dx10);
1608             ty               = _mm256_mul_pd(fscal,dy10);
1609             tz               = _mm256_mul_pd(fscal,dz10);
1610
1611             /* Update vectorial force */
1612             fix1             = _mm256_add_pd(fix1,tx);
1613             fiy1             = _mm256_add_pd(fiy1,ty);
1614             fiz1             = _mm256_add_pd(fiz1,tz);
1615
1616             fjx0             = _mm256_add_pd(fjx0,tx);
1617             fjy0             = _mm256_add_pd(fjy0,ty);
1618             fjz0             = _mm256_add_pd(fjz0,tz);
1619
1620             /**************************
1621              * CALCULATE INTERACTIONS *
1622              **************************/
1623
1624             r11              = _mm256_mul_pd(rsq11,rinv11);
1625
1626             /* Calculate table index by multiplying r with table scale and truncate to integer */
1627             rt               = _mm256_mul_pd(r11,vftabscale);
1628             vfitab           = _mm256_cvttpd_epi32(rt);
1629             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1630             vfitab           = _mm_slli_epi32(vfitab,2);
1631
1632             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1633             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1634             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1635             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1636             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1637             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1638             Heps             = _mm256_mul_pd(vfeps,H);
1639             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1640             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1641             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1642
1643             fscal            = felec;
1644
1645             /* Calculate temporary vectorial force */
1646             tx               = _mm256_mul_pd(fscal,dx11);
1647             ty               = _mm256_mul_pd(fscal,dy11);
1648             tz               = _mm256_mul_pd(fscal,dz11);
1649
1650             /* Update vectorial force */
1651             fix1             = _mm256_add_pd(fix1,tx);
1652             fiy1             = _mm256_add_pd(fiy1,ty);
1653             fiz1             = _mm256_add_pd(fiz1,tz);
1654
1655             fjx1             = _mm256_add_pd(fjx1,tx);
1656             fjy1             = _mm256_add_pd(fjy1,ty);
1657             fjz1             = _mm256_add_pd(fjz1,tz);
1658
1659             /**************************
1660              * CALCULATE INTERACTIONS *
1661              **************************/
1662
1663             r12              = _mm256_mul_pd(rsq12,rinv12);
1664
1665             /* Calculate table index by multiplying r with table scale and truncate to integer */
1666             rt               = _mm256_mul_pd(r12,vftabscale);
1667             vfitab           = _mm256_cvttpd_epi32(rt);
1668             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1669             vfitab           = _mm_slli_epi32(vfitab,2);
1670
1671             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1672             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1673             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1674             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1675             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1676             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1677             Heps             = _mm256_mul_pd(vfeps,H);
1678             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1679             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1680             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1681
1682             fscal            = felec;
1683
1684             /* Calculate temporary vectorial force */
1685             tx               = _mm256_mul_pd(fscal,dx12);
1686             ty               = _mm256_mul_pd(fscal,dy12);
1687             tz               = _mm256_mul_pd(fscal,dz12);
1688
1689             /* Update vectorial force */
1690             fix1             = _mm256_add_pd(fix1,tx);
1691             fiy1             = _mm256_add_pd(fiy1,ty);
1692             fiz1             = _mm256_add_pd(fiz1,tz);
1693
1694             fjx2             = _mm256_add_pd(fjx2,tx);
1695             fjy2             = _mm256_add_pd(fjy2,ty);
1696             fjz2             = _mm256_add_pd(fjz2,tz);
1697
1698             /**************************
1699              * CALCULATE INTERACTIONS *
1700              **************************/
1701
1702             r20              = _mm256_mul_pd(rsq20,rinv20);
1703
1704             /* Calculate table index by multiplying r with table scale and truncate to integer */
1705             rt               = _mm256_mul_pd(r20,vftabscale);
1706             vfitab           = _mm256_cvttpd_epi32(rt);
1707             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1708             vfitab           = _mm_slli_epi32(vfitab,2);
1709
1710             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1711             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1712             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1713             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1714             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1715             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1716             Heps             = _mm256_mul_pd(vfeps,H);
1717             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1718             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1719             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1720
1721             fscal            = felec;
1722
1723             /* Calculate temporary vectorial force */
1724             tx               = _mm256_mul_pd(fscal,dx20);
1725             ty               = _mm256_mul_pd(fscal,dy20);
1726             tz               = _mm256_mul_pd(fscal,dz20);
1727
1728             /* Update vectorial force */
1729             fix2             = _mm256_add_pd(fix2,tx);
1730             fiy2             = _mm256_add_pd(fiy2,ty);
1731             fiz2             = _mm256_add_pd(fiz2,tz);
1732
1733             fjx0             = _mm256_add_pd(fjx0,tx);
1734             fjy0             = _mm256_add_pd(fjy0,ty);
1735             fjz0             = _mm256_add_pd(fjz0,tz);
1736
1737             /**************************
1738              * CALCULATE INTERACTIONS *
1739              **************************/
1740
1741             r21              = _mm256_mul_pd(rsq21,rinv21);
1742
1743             /* Calculate table index by multiplying r with table scale and truncate to integer */
1744             rt               = _mm256_mul_pd(r21,vftabscale);
1745             vfitab           = _mm256_cvttpd_epi32(rt);
1746             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1747             vfitab           = _mm_slli_epi32(vfitab,2);
1748
1749             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1750             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1751             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1752             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1753             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1754             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1755             Heps             = _mm256_mul_pd(vfeps,H);
1756             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1757             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1758             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1759
1760             fscal            = felec;
1761
1762             /* Calculate temporary vectorial force */
1763             tx               = _mm256_mul_pd(fscal,dx21);
1764             ty               = _mm256_mul_pd(fscal,dy21);
1765             tz               = _mm256_mul_pd(fscal,dz21);
1766
1767             /* Update vectorial force */
1768             fix2             = _mm256_add_pd(fix2,tx);
1769             fiy2             = _mm256_add_pd(fiy2,ty);
1770             fiz2             = _mm256_add_pd(fiz2,tz);
1771
1772             fjx1             = _mm256_add_pd(fjx1,tx);
1773             fjy1             = _mm256_add_pd(fjy1,ty);
1774             fjz1             = _mm256_add_pd(fjz1,tz);
1775
1776             /**************************
1777              * CALCULATE INTERACTIONS *
1778              **************************/
1779
1780             r22              = _mm256_mul_pd(rsq22,rinv22);
1781
1782             /* Calculate table index by multiplying r with table scale and truncate to integer */
1783             rt               = _mm256_mul_pd(r22,vftabscale);
1784             vfitab           = _mm256_cvttpd_epi32(rt);
1785             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1786             vfitab           = _mm_slli_epi32(vfitab,2);
1787
1788             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1789             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1790             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1791             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1792             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1793             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1794             Heps             = _mm256_mul_pd(vfeps,H);
1795             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1796             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1797             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1798
1799             fscal            = felec;
1800
1801             /* Calculate temporary vectorial force */
1802             tx               = _mm256_mul_pd(fscal,dx22);
1803             ty               = _mm256_mul_pd(fscal,dy22);
1804             tz               = _mm256_mul_pd(fscal,dz22);
1805
1806             /* Update vectorial force */
1807             fix2             = _mm256_add_pd(fix2,tx);
1808             fiy2             = _mm256_add_pd(fiy2,ty);
1809             fiz2             = _mm256_add_pd(fiz2,tz);
1810
1811             fjx2             = _mm256_add_pd(fjx2,tx);
1812             fjy2             = _mm256_add_pd(fjy2,ty);
1813             fjz2             = _mm256_add_pd(fjz2,tz);
1814
1815             fjptrA             = f+j_coord_offsetA;
1816             fjptrB             = f+j_coord_offsetB;
1817             fjptrC             = f+j_coord_offsetC;
1818             fjptrD             = f+j_coord_offsetD;
1819
1820             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1821                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1822
1823             /* Inner loop uses 351 flops */
1824         }
1825
1826         if(jidx<j_index_end)
1827         {
1828
1829             /* Get j neighbor index, and coordinate index */
1830             jnrlistA         = jjnr[jidx];
1831             jnrlistB         = jjnr[jidx+1];
1832             jnrlistC         = jjnr[jidx+2];
1833             jnrlistD         = jjnr[jidx+3];
1834             /* Sign of each element will be negative for non-real atoms.
1835              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1836              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1837              */
1838             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1839
1840             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1841             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1842             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1843
1844             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1845             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1846             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1847             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1848             j_coord_offsetA  = DIM*jnrA;
1849             j_coord_offsetB  = DIM*jnrB;
1850             j_coord_offsetC  = DIM*jnrC;
1851             j_coord_offsetD  = DIM*jnrD;
1852
1853             /* load j atom coordinates */
1854             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1855                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1856                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1857
1858             /* Calculate displacement vector */
1859             dx00             = _mm256_sub_pd(ix0,jx0);
1860             dy00             = _mm256_sub_pd(iy0,jy0);
1861             dz00             = _mm256_sub_pd(iz0,jz0);
1862             dx01             = _mm256_sub_pd(ix0,jx1);
1863             dy01             = _mm256_sub_pd(iy0,jy1);
1864             dz01             = _mm256_sub_pd(iz0,jz1);
1865             dx02             = _mm256_sub_pd(ix0,jx2);
1866             dy02             = _mm256_sub_pd(iy0,jy2);
1867             dz02             = _mm256_sub_pd(iz0,jz2);
1868             dx10             = _mm256_sub_pd(ix1,jx0);
1869             dy10             = _mm256_sub_pd(iy1,jy0);
1870             dz10             = _mm256_sub_pd(iz1,jz0);
1871             dx11             = _mm256_sub_pd(ix1,jx1);
1872             dy11             = _mm256_sub_pd(iy1,jy1);
1873             dz11             = _mm256_sub_pd(iz1,jz1);
1874             dx12             = _mm256_sub_pd(ix1,jx2);
1875             dy12             = _mm256_sub_pd(iy1,jy2);
1876             dz12             = _mm256_sub_pd(iz1,jz2);
1877             dx20             = _mm256_sub_pd(ix2,jx0);
1878             dy20             = _mm256_sub_pd(iy2,jy0);
1879             dz20             = _mm256_sub_pd(iz2,jz0);
1880             dx21             = _mm256_sub_pd(ix2,jx1);
1881             dy21             = _mm256_sub_pd(iy2,jy1);
1882             dz21             = _mm256_sub_pd(iz2,jz1);
1883             dx22             = _mm256_sub_pd(ix2,jx2);
1884             dy22             = _mm256_sub_pd(iy2,jy2);
1885             dz22             = _mm256_sub_pd(iz2,jz2);
1886
1887             /* Calculate squared distance and things based on it */
1888             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1889             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1890             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1891             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1892             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1893             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1894             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1895             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1896             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1897
1898             rinv00           = avx256_invsqrt_d(rsq00);
1899             rinv01           = avx256_invsqrt_d(rsq01);
1900             rinv02           = avx256_invsqrt_d(rsq02);
1901             rinv10           = avx256_invsqrt_d(rsq10);
1902             rinv11           = avx256_invsqrt_d(rsq11);
1903             rinv12           = avx256_invsqrt_d(rsq12);
1904             rinv20           = avx256_invsqrt_d(rsq20);
1905             rinv21           = avx256_invsqrt_d(rsq21);
1906             rinv22           = avx256_invsqrt_d(rsq22);
1907
1908             fjx0             = _mm256_setzero_pd();
1909             fjy0             = _mm256_setzero_pd();
1910             fjz0             = _mm256_setzero_pd();
1911             fjx1             = _mm256_setzero_pd();
1912             fjy1             = _mm256_setzero_pd();
1913             fjz1             = _mm256_setzero_pd();
1914             fjx2             = _mm256_setzero_pd();
1915             fjy2             = _mm256_setzero_pd();
1916             fjz2             = _mm256_setzero_pd();
1917
1918             /**************************
1919              * CALCULATE INTERACTIONS *
1920              **************************/
1921
1922             r00              = _mm256_mul_pd(rsq00,rinv00);
1923             r00              = _mm256_andnot_pd(dummy_mask,r00);
1924
1925             /* Calculate table index by multiplying r with table scale and truncate to integer */
1926             rt               = _mm256_mul_pd(r00,vftabscale);
1927             vfitab           = _mm256_cvttpd_epi32(rt);
1928             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1929             vfitab           = _mm_slli_epi32(vfitab,2);
1930
1931             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1932             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1933             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1934             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1935             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1936             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1937             Heps             = _mm256_mul_pd(vfeps,H);
1938             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1939             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1940             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1941
1942             fscal            = felec;
1943
1944             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1945
1946             /* Calculate temporary vectorial force */
1947             tx               = _mm256_mul_pd(fscal,dx00);
1948             ty               = _mm256_mul_pd(fscal,dy00);
1949             tz               = _mm256_mul_pd(fscal,dz00);
1950
1951             /* Update vectorial force */
1952             fix0             = _mm256_add_pd(fix0,tx);
1953             fiy0             = _mm256_add_pd(fiy0,ty);
1954             fiz0             = _mm256_add_pd(fiz0,tz);
1955
1956             fjx0             = _mm256_add_pd(fjx0,tx);
1957             fjy0             = _mm256_add_pd(fjy0,ty);
1958             fjz0             = _mm256_add_pd(fjz0,tz);
1959
1960             /**************************
1961              * CALCULATE INTERACTIONS *
1962              **************************/
1963
1964             r01              = _mm256_mul_pd(rsq01,rinv01);
1965             r01              = _mm256_andnot_pd(dummy_mask,r01);
1966
1967             /* Calculate table index by multiplying r with table scale and truncate to integer */
1968             rt               = _mm256_mul_pd(r01,vftabscale);
1969             vfitab           = _mm256_cvttpd_epi32(rt);
1970             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1971             vfitab           = _mm_slli_epi32(vfitab,2);
1972
1973             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1974             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1975             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1976             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1977             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1978             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1979             Heps             = _mm256_mul_pd(vfeps,H);
1980             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1981             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1982             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
1983
1984             fscal            = felec;
1985
1986             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1987
1988             /* Calculate temporary vectorial force */
1989             tx               = _mm256_mul_pd(fscal,dx01);
1990             ty               = _mm256_mul_pd(fscal,dy01);
1991             tz               = _mm256_mul_pd(fscal,dz01);
1992
1993             /* Update vectorial force */
1994             fix0             = _mm256_add_pd(fix0,tx);
1995             fiy0             = _mm256_add_pd(fiy0,ty);
1996             fiz0             = _mm256_add_pd(fiz0,tz);
1997
1998             fjx1             = _mm256_add_pd(fjx1,tx);
1999             fjy1             = _mm256_add_pd(fjy1,ty);
2000             fjz1             = _mm256_add_pd(fjz1,tz);
2001
2002             /**************************
2003              * CALCULATE INTERACTIONS *
2004              **************************/
2005
2006             r02              = _mm256_mul_pd(rsq02,rinv02);
2007             r02              = _mm256_andnot_pd(dummy_mask,r02);
2008
2009             /* Calculate table index by multiplying r with table scale and truncate to integer */
2010             rt               = _mm256_mul_pd(r02,vftabscale);
2011             vfitab           = _mm256_cvttpd_epi32(rt);
2012             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2013             vfitab           = _mm_slli_epi32(vfitab,2);
2014
2015             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2016             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2017             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2018             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2019             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2020             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2021             Heps             = _mm256_mul_pd(vfeps,H);
2022             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2023             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2024             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
2025
2026             fscal            = felec;
2027
2028             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2029
2030             /* Calculate temporary vectorial force */
2031             tx               = _mm256_mul_pd(fscal,dx02);
2032             ty               = _mm256_mul_pd(fscal,dy02);
2033             tz               = _mm256_mul_pd(fscal,dz02);
2034
2035             /* Update vectorial force */
2036             fix0             = _mm256_add_pd(fix0,tx);
2037             fiy0             = _mm256_add_pd(fiy0,ty);
2038             fiz0             = _mm256_add_pd(fiz0,tz);
2039
2040             fjx2             = _mm256_add_pd(fjx2,tx);
2041             fjy2             = _mm256_add_pd(fjy2,ty);
2042             fjz2             = _mm256_add_pd(fjz2,tz);
2043
2044             /**************************
2045              * CALCULATE INTERACTIONS *
2046              **************************/
2047
2048             r10              = _mm256_mul_pd(rsq10,rinv10);
2049             r10              = _mm256_andnot_pd(dummy_mask,r10);
2050
2051             /* Calculate table index by multiplying r with table scale and truncate to integer */
2052             rt               = _mm256_mul_pd(r10,vftabscale);
2053             vfitab           = _mm256_cvttpd_epi32(rt);
2054             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2055             vfitab           = _mm_slli_epi32(vfitab,2);
2056
2057             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2058             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2059             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2060             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2061             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2062             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2063             Heps             = _mm256_mul_pd(vfeps,H);
2064             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2065             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2066             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
2067
2068             fscal            = felec;
2069
2070             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2071
2072             /* Calculate temporary vectorial force */
2073             tx               = _mm256_mul_pd(fscal,dx10);
2074             ty               = _mm256_mul_pd(fscal,dy10);
2075             tz               = _mm256_mul_pd(fscal,dz10);
2076
2077             /* Update vectorial force */
2078             fix1             = _mm256_add_pd(fix1,tx);
2079             fiy1             = _mm256_add_pd(fiy1,ty);
2080             fiz1             = _mm256_add_pd(fiz1,tz);
2081
2082             fjx0             = _mm256_add_pd(fjx0,tx);
2083             fjy0             = _mm256_add_pd(fjy0,ty);
2084             fjz0             = _mm256_add_pd(fjz0,tz);
2085
2086             /**************************
2087              * CALCULATE INTERACTIONS *
2088              **************************/
2089
2090             r11              = _mm256_mul_pd(rsq11,rinv11);
2091             r11              = _mm256_andnot_pd(dummy_mask,r11);
2092
2093             /* Calculate table index by multiplying r with table scale and truncate to integer */
2094             rt               = _mm256_mul_pd(r11,vftabscale);
2095             vfitab           = _mm256_cvttpd_epi32(rt);
2096             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2097             vfitab           = _mm_slli_epi32(vfitab,2);
2098
2099             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2100             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2101             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2102             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2103             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2104             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2105             Heps             = _mm256_mul_pd(vfeps,H);
2106             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2107             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2108             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2109
2110             fscal            = felec;
2111
2112             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2113
2114             /* Calculate temporary vectorial force */
2115             tx               = _mm256_mul_pd(fscal,dx11);
2116             ty               = _mm256_mul_pd(fscal,dy11);
2117             tz               = _mm256_mul_pd(fscal,dz11);
2118
2119             /* Update vectorial force */
2120             fix1             = _mm256_add_pd(fix1,tx);
2121             fiy1             = _mm256_add_pd(fiy1,ty);
2122             fiz1             = _mm256_add_pd(fiz1,tz);
2123
2124             fjx1             = _mm256_add_pd(fjx1,tx);
2125             fjy1             = _mm256_add_pd(fjy1,ty);
2126             fjz1             = _mm256_add_pd(fjz1,tz);
2127
2128             /**************************
2129              * CALCULATE INTERACTIONS *
2130              **************************/
2131
2132             r12              = _mm256_mul_pd(rsq12,rinv12);
2133             r12              = _mm256_andnot_pd(dummy_mask,r12);
2134
2135             /* Calculate table index by multiplying r with table scale and truncate to integer */
2136             rt               = _mm256_mul_pd(r12,vftabscale);
2137             vfitab           = _mm256_cvttpd_epi32(rt);
2138             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2139             vfitab           = _mm_slli_epi32(vfitab,2);
2140
2141             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2142             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2143             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2144             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2145             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2146             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2147             Heps             = _mm256_mul_pd(vfeps,H);
2148             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2149             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2150             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2151
2152             fscal            = felec;
2153
2154             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2155
2156             /* Calculate temporary vectorial force */
2157             tx               = _mm256_mul_pd(fscal,dx12);
2158             ty               = _mm256_mul_pd(fscal,dy12);
2159             tz               = _mm256_mul_pd(fscal,dz12);
2160
2161             /* Update vectorial force */
2162             fix1             = _mm256_add_pd(fix1,tx);
2163             fiy1             = _mm256_add_pd(fiy1,ty);
2164             fiz1             = _mm256_add_pd(fiz1,tz);
2165
2166             fjx2             = _mm256_add_pd(fjx2,tx);
2167             fjy2             = _mm256_add_pd(fjy2,ty);
2168             fjz2             = _mm256_add_pd(fjz2,tz);
2169
2170             /**************************
2171              * CALCULATE INTERACTIONS *
2172              **************************/
2173
2174             r20              = _mm256_mul_pd(rsq20,rinv20);
2175             r20              = _mm256_andnot_pd(dummy_mask,r20);
2176
2177             /* Calculate table index by multiplying r with table scale and truncate to integer */
2178             rt               = _mm256_mul_pd(r20,vftabscale);
2179             vfitab           = _mm256_cvttpd_epi32(rt);
2180             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2181             vfitab           = _mm_slli_epi32(vfitab,2);
2182
2183             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2184             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2185             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2186             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2187             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2188             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2189             Heps             = _mm256_mul_pd(vfeps,H);
2190             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2191             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2192             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
2193
2194             fscal            = felec;
2195
2196             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2197
2198             /* Calculate temporary vectorial force */
2199             tx               = _mm256_mul_pd(fscal,dx20);
2200             ty               = _mm256_mul_pd(fscal,dy20);
2201             tz               = _mm256_mul_pd(fscal,dz20);
2202
2203             /* Update vectorial force */
2204             fix2             = _mm256_add_pd(fix2,tx);
2205             fiy2             = _mm256_add_pd(fiy2,ty);
2206             fiz2             = _mm256_add_pd(fiz2,tz);
2207
2208             fjx0             = _mm256_add_pd(fjx0,tx);
2209             fjy0             = _mm256_add_pd(fjy0,ty);
2210             fjz0             = _mm256_add_pd(fjz0,tz);
2211
2212             /**************************
2213              * CALCULATE INTERACTIONS *
2214              **************************/
2215
2216             r21              = _mm256_mul_pd(rsq21,rinv21);
2217             r21              = _mm256_andnot_pd(dummy_mask,r21);
2218
2219             /* Calculate table index by multiplying r with table scale and truncate to integer */
2220             rt               = _mm256_mul_pd(r21,vftabscale);
2221             vfitab           = _mm256_cvttpd_epi32(rt);
2222             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2223             vfitab           = _mm_slli_epi32(vfitab,2);
2224
2225             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2226             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2227             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2228             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2229             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2230             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2231             Heps             = _mm256_mul_pd(vfeps,H);
2232             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2233             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2234             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2235
2236             fscal            = felec;
2237
2238             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2239
2240             /* Calculate temporary vectorial force */
2241             tx               = _mm256_mul_pd(fscal,dx21);
2242             ty               = _mm256_mul_pd(fscal,dy21);
2243             tz               = _mm256_mul_pd(fscal,dz21);
2244
2245             /* Update vectorial force */
2246             fix2             = _mm256_add_pd(fix2,tx);
2247             fiy2             = _mm256_add_pd(fiy2,ty);
2248             fiz2             = _mm256_add_pd(fiz2,tz);
2249
2250             fjx1             = _mm256_add_pd(fjx1,tx);
2251             fjy1             = _mm256_add_pd(fjy1,ty);
2252             fjz1             = _mm256_add_pd(fjz1,tz);
2253
2254             /**************************
2255              * CALCULATE INTERACTIONS *
2256              **************************/
2257
2258             r22              = _mm256_mul_pd(rsq22,rinv22);
2259             r22              = _mm256_andnot_pd(dummy_mask,r22);
2260
2261             /* Calculate table index by multiplying r with table scale and truncate to integer */
2262             rt               = _mm256_mul_pd(r22,vftabscale);
2263             vfitab           = _mm256_cvttpd_epi32(rt);
2264             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2265             vfitab           = _mm_slli_epi32(vfitab,2);
2266
2267             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2268             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2269             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2270             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2271             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2272             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2273             Heps             = _mm256_mul_pd(vfeps,H);
2274             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2275             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2276             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2277
2278             fscal            = felec;
2279
2280             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2281
2282             /* Calculate temporary vectorial force */
2283             tx               = _mm256_mul_pd(fscal,dx22);
2284             ty               = _mm256_mul_pd(fscal,dy22);
2285             tz               = _mm256_mul_pd(fscal,dz22);
2286
2287             /* Update vectorial force */
2288             fix2             = _mm256_add_pd(fix2,tx);
2289             fiy2             = _mm256_add_pd(fiy2,ty);
2290             fiz2             = _mm256_add_pd(fiz2,tz);
2291
2292             fjx2             = _mm256_add_pd(fjx2,tx);
2293             fjy2             = _mm256_add_pd(fjy2,ty);
2294             fjz2             = _mm256_add_pd(fjz2,tz);
2295
2296             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2297             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2298             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2299             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2300
2301             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2302                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2303
2304             /* Inner loop uses 360 flops */
2305         }
2306
2307         /* End of innermost loop */
2308
2309         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2310                                                  f+i_coord_offset,fshift+i_shift_offset);
2311
2312         /* Increment number of inner iterations */
2313         inneriter                  += j_index_end - j_index_start;
2314
2315         /* Outer loop uses 18 flops */
2316     }
2317
2318     /* Increment number of outer iterations */
2319     outeriter        += nri;
2320
2321     /* Update outer/inner flops */
2322
2323     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);
2324 }