697cfd637482a100852cb9be82ce96d9120afc6e
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecCSTab_VdwNone_GeomW3W3_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_double
52  * Electrostatics interaction: CubicSplineTable
53  * VdW interaction:            None
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82     real             scratch[4*DIM];
83     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84     real *           vdwioffsetptr0;
85     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86     real *           vdwioffsetptr1;
87     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88     real *           vdwioffsetptr2;
89     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
106     real             *charge;
107     __m128i          vfitab;
108     __m128i          ifour       = _mm_set1_epi32(4);
109     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
110     real             *vftab;
111     __m256d          dummy_mask,cutoff_mask;
112     __m128           tmpmask0,tmpmask1;
113     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
114     __m256d          one     = _mm256_set1_pd(1.0);
115     __m256d          two     = _mm256_set1_pd(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm256_set1_pd(fr->epsfac);
128     charge           = mdatoms->chargeA;
129
130     vftab            = kernel_data->table_elec->data;
131     vftabscale       = _mm256_set1_pd(kernel_data->table_elec->scale);
132
133     /* Setup water-specific parameters */
134     inr              = nlist->iinr[0];
135     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
136     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
137     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
138
139     jq0              = _mm256_set1_pd(charge[inr+0]);
140     jq1              = _mm256_set1_pd(charge[inr+1]);
141     jq2              = _mm256_set1_pd(charge[inr+2]);
142     qq00             = _mm256_mul_pd(iq0,jq0);
143     qq01             = _mm256_mul_pd(iq0,jq1);
144     qq02             = _mm256_mul_pd(iq0,jq2);
145     qq10             = _mm256_mul_pd(iq1,jq0);
146     qq11             = _mm256_mul_pd(iq1,jq1);
147     qq12             = _mm256_mul_pd(iq1,jq2);
148     qq20             = _mm256_mul_pd(iq2,jq0);
149     qq21             = _mm256_mul_pd(iq2,jq1);
150     qq22             = _mm256_mul_pd(iq2,jq2);
151
152     /* Avoid stupid compiler warnings */
153     jnrA = jnrB = jnrC = jnrD = 0;
154     j_coord_offsetA = 0;
155     j_coord_offsetB = 0;
156     j_coord_offsetC = 0;
157     j_coord_offsetD = 0;
158
159     outeriter        = 0;
160     inneriter        = 0;
161
162     for(iidx=0;iidx<4*DIM;iidx++)
163     {
164         scratch[iidx] = 0.0;
165     }
166
167     /* Start outer loop over neighborlists */
168     for(iidx=0; iidx<nri; iidx++)
169     {
170         /* Load shift vector for this list */
171         i_shift_offset   = DIM*shiftidx[iidx];
172
173         /* Load limits for loop over neighbors */
174         j_index_start    = jindex[iidx];
175         j_index_end      = jindex[iidx+1];
176
177         /* Get outer coordinate index */
178         inr              = iinr[iidx];
179         i_coord_offset   = DIM*inr;
180
181         /* Load i particle coords and add shift vector */
182         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
184
185         fix0             = _mm256_setzero_pd();
186         fiy0             = _mm256_setzero_pd();
187         fiz0             = _mm256_setzero_pd();
188         fix1             = _mm256_setzero_pd();
189         fiy1             = _mm256_setzero_pd();
190         fiz1             = _mm256_setzero_pd();
191         fix2             = _mm256_setzero_pd();
192         fiy2             = _mm256_setzero_pd();
193         fiz2             = _mm256_setzero_pd();
194
195         /* Reset potential sums */
196         velecsum         = _mm256_setzero_pd();
197
198         /* Start inner kernel loop */
199         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200         {
201
202             /* Get j neighbor index, and coordinate index */
203             jnrA             = jjnr[jidx];
204             jnrB             = jjnr[jidx+1];
205             jnrC             = jjnr[jidx+2];
206             jnrD             = jjnr[jidx+3];
207             j_coord_offsetA  = DIM*jnrA;
208             j_coord_offsetB  = DIM*jnrB;
209             j_coord_offsetC  = DIM*jnrC;
210             j_coord_offsetD  = DIM*jnrD;
211
212             /* load j atom coordinates */
213             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214                                                  x+j_coord_offsetC,x+j_coord_offsetD,
215                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
216
217             /* Calculate displacement vector */
218             dx00             = _mm256_sub_pd(ix0,jx0);
219             dy00             = _mm256_sub_pd(iy0,jy0);
220             dz00             = _mm256_sub_pd(iz0,jz0);
221             dx01             = _mm256_sub_pd(ix0,jx1);
222             dy01             = _mm256_sub_pd(iy0,jy1);
223             dz01             = _mm256_sub_pd(iz0,jz1);
224             dx02             = _mm256_sub_pd(ix0,jx2);
225             dy02             = _mm256_sub_pd(iy0,jy2);
226             dz02             = _mm256_sub_pd(iz0,jz2);
227             dx10             = _mm256_sub_pd(ix1,jx0);
228             dy10             = _mm256_sub_pd(iy1,jy0);
229             dz10             = _mm256_sub_pd(iz1,jz0);
230             dx11             = _mm256_sub_pd(ix1,jx1);
231             dy11             = _mm256_sub_pd(iy1,jy1);
232             dz11             = _mm256_sub_pd(iz1,jz1);
233             dx12             = _mm256_sub_pd(ix1,jx2);
234             dy12             = _mm256_sub_pd(iy1,jy2);
235             dz12             = _mm256_sub_pd(iz1,jz2);
236             dx20             = _mm256_sub_pd(ix2,jx0);
237             dy20             = _mm256_sub_pd(iy2,jy0);
238             dz20             = _mm256_sub_pd(iz2,jz0);
239             dx21             = _mm256_sub_pd(ix2,jx1);
240             dy21             = _mm256_sub_pd(iy2,jy1);
241             dz21             = _mm256_sub_pd(iz2,jz1);
242             dx22             = _mm256_sub_pd(ix2,jx2);
243             dy22             = _mm256_sub_pd(iy2,jy2);
244             dz22             = _mm256_sub_pd(iz2,jz2);
245
246             /* Calculate squared distance and things based on it */
247             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
248             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
249             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
250             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
251             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
252             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
253             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
254             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
255             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
256
257             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
258             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
259             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
260             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
261             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
262             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
263             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
264             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
265             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
266
267             fjx0             = _mm256_setzero_pd();
268             fjy0             = _mm256_setzero_pd();
269             fjz0             = _mm256_setzero_pd();
270             fjx1             = _mm256_setzero_pd();
271             fjy1             = _mm256_setzero_pd();
272             fjz1             = _mm256_setzero_pd();
273             fjx2             = _mm256_setzero_pd();
274             fjy2             = _mm256_setzero_pd();
275             fjz2             = _mm256_setzero_pd();
276
277             /**************************
278              * CALCULATE INTERACTIONS *
279              **************************/
280
281             r00              = _mm256_mul_pd(rsq00,rinv00);
282
283             /* Calculate table index by multiplying r with table scale and truncate to integer */
284             rt               = _mm256_mul_pd(r00,vftabscale);
285             vfitab           = _mm256_cvttpd_epi32(rt);
286             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
287             vfitab           = _mm_slli_epi32(vfitab,2);
288
289             /* CUBIC SPLINE TABLE ELECTROSTATICS */
290             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
291             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
292             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
293             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
294             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
295             Heps             = _mm256_mul_pd(vfeps,H);
296             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
297             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
298             velec            = _mm256_mul_pd(qq00,VV);
299             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
300             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
301
302             /* Update potential sum for this i atom from the interaction with this j atom. */
303             velecsum         = _mm256_add_pd(velecsum,velec);
304
305             fscal            = felec;
306
307             /* Calculate temporary vectorial force */
308             tx               = _mm256_mul_pd(fscal,dx00);
309             ty               = _mm256_mul_pd(fscal,dy00);
310             tz               = _mm256_mul_pd(fscal,dz00);
311
312             /* Update vectorial force */
313             fix0             = _mm256_add_pd(fix0,tx);
314             fiy0             = _mm256_add_pd(fiy0,ty);
315             fiz0             = _mm256_add_pd(fiz0,tz);
316
317             fjx0             = _mm256_add_pd(fjx0,tx);
318             fjy0             = _mm256_add_pd(fjy0,ty);
319             fjz0             = _mm256_add_pd(fjz0,tz);
320
321             /**************************
322              * CALCULATE INTERACTIONS *
323              **************************/
324
325             r01              = _mm256_mul_pd(rsq01,rinv01);
326
327             /* Calculate table index by multiplying r with table scale and truncate to integer */
328             rt               = _mm256_mul_pd(r01,vftabscale);
329             vfitab           = _mm256_cvttpd_epi32(rt);
330             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
331             vfitab           = _mm_slli_epi32(vfitab,2);
332
333             /* CUBIC SPLINE TABLE ELECTROSTATICS */
334             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
335             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
336             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
337             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
338             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
339             Heps             = _mm256_mul_pd(vfeps,H);
340             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
341             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
342             velec            = _mm256_mul_pd(qq01,VV);
343             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
344             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
345
346             /* Update potential sum for this i atom from the interaction with this j atom. */
347             velecsum         = _mm256_add_pd(velecsum,velec);
348
349             fscal            = felec;
350
351             /* Calculate temporary vectorial force */
352             tx               = _mm256_mul_pd(fscal,dx01);
353             ty               = _mm256_mul_pd(fscal,dy01);
354             tz               = _mm256_mul_pd(fscal,dz01);
355
356             /* Update vectorial force */
357             fix0             = _mm256_add_pd(fix0,tx);
358             fiy0             = _mm256_add_pd(fiy0,ty);
359             fiz0             = _mm256_add_pd(fiz0,tz);
360
361             fjx1             = _mm256_add_pd(fjx1,tx);
362             fjy1             = _mm256_add_pd(fjy1,ty);
363             fjz1             = _mm256_add_pd(fjz1,tz);
364
365             /**************************
366              * CALCULATE INTERACTIONS *
367              **************************/
368
369             r02              = _mm256_mul_pd(rsq02,rinv02);
370
371             /* Calculate table index by multiplying r with table scale and truncate to integer */
372             rt               = _mm256_mul_pd(r02,vftabscale);
373             vfitab           = _mm256_cvttpd_epi32(rt);
374             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
375             vfitab           = _mm_slli_epi32(vfitab,2);
376
377             /* CUBIC SPLINE TABLE ELECTROSTATICS */
378             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
379             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
380             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
381             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
382             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
383             Heps             = _mm256_mul_pd(vfeps,H);
384             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
385             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
386             velec            = _mm256_mul_pd(qq02,VV);
387             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
388             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
389
390             /* Update potential sum for this i atom from the interaction with this j atom. */
391             velecsum         = _mm256_add_pd(velecsum,velec);
392
393             fscal            = felec;
394
395             /* Calculate temporary vectorial force */
396             tx               = _mm256_mul_pd(fscal,dx02);
397             ty               = _mm256_mul_pd(fscal,dy02);
398             tz               = _mm256_mul_pd(fscal,dz02);
399
400             /* Update vectorial force */
401             fix0             = _mm256_add_pd(fix0,tx);
402             fiy0             = _mm256_add_pd(fiy0,ty);
403             fiz0             = _mm256_add_pd(fiz0,tz);
404
405             fjx2             = _mm256_add_pd(fjx2,tx);
406             fjy2             = _mm256_add_pd(fjy2,ty);
407             fjz2             = _mm256_add_pd(fjz2,tz);
408
409             /**************************
410              * CALCULATE INTERACTIONS *
411              **************************/
412
413             r10              = _mm256_mul_pd(rsq10,rinv10);
414
415             /* Calculate table index by multiplying r with table scale and truncate to integer */
416             rt               = _mm256_mul_pd(r10,vftabscale);
417             vfitab           = _mm256_cvttpd_epi32(rt);
418             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
419             vfitab           = _mm_slli_epi32(vfitab,2);
420
421             /* CUBIC SPLINE TABLE ELECTROSTATICS */
422             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
423             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
424             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
425             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
426             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
427             Heps             = _mm256_mul_pd(vfeps,H);
428             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
429             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
430             velec            = _mm256_mul_pd(qq10,VV);
431             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
432             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
433
434             /* Update potential sum for this i atom from the interaction with this j atom. */
435             velecsum         = _mm256_add_pd(velecsum,velec);
436
437             fscal            = felec;
438
439             /* Calculate temporary vectorial force */
440             tx               = _mm256_mul_pd(fscal,dx10);
441             ty               = _mm256_mul_pd(fscal,dy10);
442             tz               = _mm256_mul_pd(fscal,dz10);
443
444             /* Update vectorial force */
445             fix1             = _mm256_add_pd(fix1,tx);
446             fiy1             = _mm256_add_pd(fiy1,ty);
447             fiz1             = _mm256_add_pd(fiz1,tz);
448
449             fjx0             = _mm256_add_pd(fjx0,tx);
450             fjy0             = _mm256_add_pd(fjy0,ty);
451             fjz0             = _mm256_add_pd(fjz0,tz);
452
453             /**************************
454              * CALCULATE INTERACTIONS *
455              **************************/
456
457             r11              = _mm256_mul_pd(rsq11,rinv11);
458
459             /* Calculate table index by multiplying r with table scale and truncate to integer */
460             rt               = _mm256_mul_pd(r11,vftabscale);
461             vfitab           = _mm256_cvttpd_epi32(rt);
462             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
463             vfitab           = _mm_slli_epi32(vfitab,2);
464
465             /* CUBIC SPLINE TABLE ELECTROSTATICS */
466             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
467             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
468             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
469             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
470             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
471             Heps             = _mm256_mul_pd(vfeps,H);
472             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
473             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
474             velec            = _mm256_mul_pd(qq11,VV);
475             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
476             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
477
478             /* Update potential sum for this i atom from the interaction with this j atom. */
479             velecsum         = _mm256_add_pd(velecsum,velec);
480
481             fscal            = felec;
482
483             /* Calculate temporary vectorial force */
484             tx               = _mm256_mul_pd(fscal,dx11);
485             ty               = _mm256_mul_pd(fscal,dy11);
486             tz               = _mm256_mul_pd(fscal,dz11);
487
488             /* Update vectorial force */
489             fix1             = _mm256_add_pd(fix1,tx);
490             fiy1             = _mm256_add_pd(fiy1,ty);
491             fiz1             = _mm256_add_pd(fiz1,tz);
492
493             fjx1             = _mm256_add_pd(fjx1,tx);
494             fjy1             = _mm256_add_pd(fjy1,ty);
495             fjz1             = _mm256_add_pd(fjz1,tz);
496
497             /**************************
498              * CALCULATE INTERACTIONS *
499              **************************/
500
501             r12              = _mm256_mul_pd(rsq12,rinv12);
502
503             /* Calculate table index by multiplying r with table scale and truncate to integer */
504             rt               = _mm256_mul_pd(r12,vftabscale);
505             vfitab           = _mm256_cvttpd_epi32(rt);
506             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
507             vfitab           = _mm_slli_epi32(vfitab,2);
508
509             /* CUBIC SPLINE TABLE ELECTROSTATICS */
510             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
511             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
512             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
513             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
514             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
515             Heps             = _mm256_mul_pd(vfeps,H);
516             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
517             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
518             velec            = _mm256_mul_pd(qq12,VV);
519             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
520             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
521
522             /* Update potential sum for this i atom from the interaction with this j atom. */
523             velecsum         = _mm256_add_pd(velecsum,velec);
524
525             fscal            = felec;
526
527             /* Calculate temporary vectorial force */
528             tx               = _mm256_mul_pd(fscal,dx12);
529             ty               = _mm256_mul_pd(fscal,dy12);
530             tz               = _mm256_mul_pd(fscal,dz12);
531
532             /* Update vectorial force */
533             fix1             = _mm256_add_pd(fix1,tx);
534             fiy1             = _mm256_add_pd(fiy1,ty);
535             fiz1             = _mm256_add_pd(fiz1,tz);
536
537             fjx2             = _mm256_add_pd(fjx2,tx);
538             fjy2             = _mm256_add_pd(fjy2,ty);
539             fjz2             = _mm256_add_pd(fjz2,tz);
540
541             /**************************
542              * CALCULATE INTERACTIONS *
543              **************************/
544
545             r20              = _mm256_mul_pd(rsq20,rinv20);
546
547             /* Calculate table index by multiplying r with table scale and truncate to integer */
548             rt               = _mm256_mul_pd(r20,vftabscale);
549             vfitab           = _mm256_cvttpd_epi32(rt);
550             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
551             vfitab           = _mm_slli_epi32(vfitab,2);
552
553             /* CUBIC SPLINE TABLE ELECTROSTATICS */
554             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
555             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
556             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
557             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
558             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
559             Heps             = _mm256_mul_pd(vfeps,H);
560             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
561             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
562             velec            = _mm256_mul_pd(qq20,VV);
563             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
564             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
565
566             /* Update potential sum for this i atom from the interaction with this j atom. */
567             velecsum         = _mm256_add_pd(velecsum,velec);
568
569             fscal            = felec;
570
571             /* Calculate temporary vectorial force */
572             tx               = _mm256_mul_pd(fscal,dx20);
573             ty               = _mm256_mul_pd(fscal,dy20);
574             tz               = _mm256_mul_pd(fscal,dz20);
575
576             /* Update vectorial force */
577             fix2             = _mm256_add_pd(fix2,tx);
578             fiy2             = _mm256_add_pd(fiy2,ty);
579             fiz2             = _mm256_add_pd(fiz2,tz);
580
581             fjx0             = _mm256_add_pd(fjx0,tx);
582             fjy0             = _mm256_add_pd(fjy0,ty);
583             fjz0             = _mm256_add_pd(fjz0,tz);
584
585             /**************************
586              * CALCULATE INTERACTIONS *
587              **************************/
588
589             r21              = _mm256_mul_pd(rsq21,rinv21);
590
591             /* Calculate table index by multiplying r with table scale and truncate to integer */
592             rt               = _mm256_mul_pd(r21,vftabscale);
593             vfitab           = _mm256_cvttpd_epi32(rt);
594             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
595             vfitab           = _mm_slli_epi32(vfitab,2);
596
597             /* CUBIC SPLINE TABLE ELECTROSTATICS */
598             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
599             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
600             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
601             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
602             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
603             Heps             = _mm256_mul_pd(vfeps,H);
604             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
605             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
606             velec            = _mm256_mul_pd(qq21,VV);
607             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
608             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
609
610             /* Update potential sum for this i atom from the interaction with this j atom. */
611             velecsum         = _mm256_add_pd(velecsum,velec);
612
613             fscal            = felec;
614
615             /* Calculate temporary vectorial force */
616             tx               = _mm256_mul_pd(fscal,dx21);
617             ty               = _mm256_mul_pd(fscal,dy21);
618             tz               = _mm256_mul_pd(fscal,dz21);
619
620             /* Update vectorial force */
621             fix2             = _mm256_add_pd(fix2,tx);
622             fiy2             = _mm256_add_pd(fiy2,ty);
623             fiz2             = _mm256_add_pd(fiz2,tz);
624
625             fjx1             = _mm256_add_pd(fjx1,tx);
626             fjy1             = _mm256_add_pd(fjy1,ty);
627             fjz1             = _mm256_add_pd(fjz1,tz);
628
629             /**************************
630              * CALCULATE INTERACTIONS *
631              **************************/
632
633             r22              = _mm256_mul_pd(rsq22,rinv22);
634
635             /* Calculate table index by multiplying r with table scale and truncate to integer */
636             rt               = _mm256_mul_pd(r22,vftabscale);
637             vfitab           = _mm256_cvttpd_epi32(rt);
638             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
639             vfitab           = _mm_slli_epi32(vfitab,2);
640
641             /* CUBIC SPLINE TABLE ELECTROSTATICS */
642             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
643             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
644             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
645             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
646             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
647             Heps             = _mm256_mul_pd(vfeps,H);
648             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
649             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
650             velec            = _mm256_mul_pd(qq22,VV);
651             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
652             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
653
654             /* Update potential sum for this i atom from the interaction with this j atom. */
655             velecsum         = _mm256_add_pd(velecsum,velec);
656
657             fscal            = felec;
658
659             /* Calculate temporary vectorial force */
660             tx               = _mm256_mul_pd(fscal,dx22);
661             ty               = _mm256_mul_pd(fscal,dy22);
662             tz               = _mm256_mul_pd(fscal,dz22);
663
664             /* Update vectorial force */
665             fix2             = _mm256_add_pd(fix2,tx);
666             fiy2             = _mm256_add_pd(fiy2,ty);
667             fiz2             = _mm256_add_pd(fiz2,tz);
668
669             fjx2             = _mm256_add_pd(fjx2,tx);
670             fjy2             = _mm256_add_pd(fjy2,ty);
671             fjz2             = _mm256_add_pd(fjz2,tz);
672
673             fjptrA             = f+j_coord_offsetA;
674             fjptrB             = f+j_coord_offsetB;
675             fjptrC             = f+j_coord_offsetC;
676             fjptrD             = f+j_coord_offsetD;
677
678             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
679                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
680
681             /* Inner loop uses 387 flops */
682         }
683
684         if(jidx<j_index_end)
685         {
686
687             /* Get j neighbor index, and coordinate index */
688             jnrlistA         = jjnr[jidx];
689             jnrlistB         = jjnr[jidx+1];
690             jnrlistC         = jjnr[jidx+2];
691             jnrlistD         = jjnr[jidx+3];
692             /* Sign of each element will be negative for non-real atoms.
693              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
694              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
695              */
696             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
697
698             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
699             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
700             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
701
702             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
703             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
704             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
705             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
706             j_coord_offsetA  = DIM*jnrA;
707             j_coord_offsetB  = DIM*jnrB;
708             j_coord_offsetC  = DIM*jnrC;
709             j_coord_offsetD  = DIM*jnrD;
710
711             /* load j atom coordinates */
712             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
713                                                  x+j_coord_offsetC,x+j_coord_offsetD,
714                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
715
716             /* Calculate displacement vector */
717             dx00             = _mm256_sub_pd(ix0,jx0);
718             dy00             = _mm256_sub_pd(iy0,jy0);
719             dz00             = _mm256_sub_pd(iz0,jz0);
720             dx01             = _mm256_sub_pd(ix0,jx1);
721             dy01             = _mm256_sub_pd(iy0,jy1);
722             dz01             = _mm256_sub_pd(iz0,jz1);
723             dx02             = _mm256_sub_pd(ix0,jx2);
724             dy02             = _mm256_sub_pd(iy0,jy2);
725             dz02             = _mm256_sub_pd(iz0,jz2);
726             dx10             = _mm256_sub_pd(ix1,jx0);
727             dy10             = _mm256_sub_pd(iy1,jy0);
728             dz10             = _mm256_sub_pd(iz1,jz0);
729             dx11             = _mm256_sub_pd(ix1,jx1);
730             dy11             = _mm256_sub_pd(iy1,jy1);
731             dz11             = _mm256_sub_pd(iz1,jz1);
732             dx12             = _mm256_sub_pd(ix1,jx2);
733             dy12             = _mm256_sub_pd(iy1,jy2);
734             dz12             = _mm256_sub_pd(iz1,jz2);
735             dx20             = _mm256_sub_pd(ix2,jx0);
736             dy20             = _mm256_sub_pd(iy2,jy0);
737             dz20             = _mm256_sub_pd(iz2,jz0);
738             dx21             = _mm256_sub_pd(ix2,jx1);
739             dy21             = _mm256_sub_pd(iy2,jy1);
740             dz21             = _mm256_sub_pd(iz2,jz1);
741             dx22             = _mm256_sub_pd(ix2,jx2);
742             dy22             = _mm256_sub_pd(iy2,jy2);
743             dz22             = _mm256_sub_pd(iz2,jz2);
744
745             /* Calculate squared distance and things based on it */
746             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
747             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
748             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
749             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
750             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
751             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
752             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
753             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
754             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
755
756             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
757             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
758             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
759             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
760             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
761             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
762             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
763             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
764             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
765
766             fjx0             = _mm256_setzero_pd();
767             fjy0             = _mm256_setzero_pd();
768             fjz0             = _mm256_setzero_pd();
769             fjx1             = _mm256_setzero_pd();
770             fjy1             = _mm256_setzero_pd();
771             fjz1             = _mm256_setzero_pd();
772             fjx2             = _mm256_setzero_pd();
773             fjy2             = _mm256_setzero_pd();
774             fjz2             = _mm256_setzero_pd();
775
776             /**************************
777              * CALCULATE INTERACTIONS *
778              **************************/
779
780             r00              = _mm256_mul_pd(rsq00,rinv00);
781             r00              = _mm256_andnot_pd(dummy_mask,r00);
782
783             /* Calculate table index by multiplying r with table scale and truncate to integer */
784             rt               = _mm256_mul_pd(r00,vftabscale);
785             vfitab           = _mm256_cvttpd_epi32(rt);
786             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
787             vfitab           = _mm_slli_epi32(vfitab,2);
788
789             /* CUBIC SPLINE TABLE ELECTROSTATICS */
790             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
791             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
792             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
793             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
794             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
795             Heps             = _mm256_mul_pd(vfeps,H);
796             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
797             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
798             velec            = _mm256_mul_pd(qq00,VV);
799             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
800             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
801
802             /* Update potential sum for this i atom from the interaction with this j atom. */
803             velec            = _mm256_andnot_pd(dummy_mask,velec);
804             velecsum         = _mm256_add_pd(velecsum,velec);
805
806             fscal            = felec;
807
808             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
809
810             /* Calculate temporary vectorial force */
811             tx               = _mm256_mul_pd(fscal,dx00);
812             ty               = _mm256_mul_pd(fscal,dy00);
813             tz               = _mm256_mul_pd(fscal,dz00);
814
815             /* Update vectorial force */
816             fix0             = _mm256_add_pd(fix0,tx);
817             fiy0             = _mm256_add_pd(fiy0,ty);
818             fiz0             = _mm256_add_pd(fiz0,tz);
819
820             fjx0             = _mm256_add_pd(fjx0,tx);
821             fjy0             = _mm256_add_pd(fjy0,ty);
822             fjz0             = _mm256_add_pd(fjz0,tz);
823
824             /**************************
825              * CALCULATE INTERACTIONS *
826              **************************/
827
828             r01              = _mm256_mul_pd(rsq01,rinv01);
829             r01              = _mm256_andnot_pd(dummy_mask,r01);
830
831             /* Calculate table index by multiplying r with table scale and truncate to integer */
832             rt               = _mm256_mul_pd(r01,vftabscale);
833             vfitab           = _mm256_cvttpd_epi32(rt);
834             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
835             vfitab           = _mm_slli_epi32(vfitab,2);
836
837             /* CUBIC SPLINE TABLE ELECTROSTATICS */
838             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
839             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
840             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
841             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
842             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
843             Heps             = _mm256_mul_pd(vfeps,H);
844             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
845             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
846             velec            = _mm256_mul_pd(qq01,VV);
847             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
848             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
849
850             /* Update potential sum for this i atom from the interaction with this j atom. */
851             velec            = _mm256_andnot_pd(dummy_mask,velec);
852             velecsum         = _mm256_add_pd(velecsum,velec);
853
854             fscal            = felec;
855
856             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
857
858             /* Calculate temporary vectorial force */
859             tx               = _mm256_mul_pd(fscal,dx01);
860             ty               = _mm256_mul_pd(fscal,dy01);
861             tz               = _mm256_mul_pd(fscal,dz01);
862
863             /* Update vectorial force */
864             fix0             = _mm256_add_pd(fix0,tx);
865             fiy0             = _mm256_add_pd(fiy0,ty);
866             fiz0             = _mm256_add_pd(fiz0,tz);
867
868             fjx1             = _mm256_add_pd(fjx1,tx);
869             fjy1             = _mm256_add_pd(fjy1,ty);
870             fjz1             = _mm256_add_pd(fjz1,tz);
871
872             /**************************
873              * CALCULATE INTERACTIONS *
874              **************************/
875
876             r02              = _mm256_mul_pd(rsq02,rinv02);
877             r02              = _mm256_andnot_pd(dummy_mask,r02);
878
879             /* Calculate table index by multiplying r with table scale and truncate to integer */
880             rt               = _mm256_mul_pd(r02,vftabscale);
881             vfitab           = _mm256_cvttpd_epi32(rt);
882             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
883             vfitab           = _mm_slli_epi32(vfitab,2);
884
885             /* CUBIC SPLINE TABLE ELECTROSTATICS */
886             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
887             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
888             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
889             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
890             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
891             Heps             = _mm256_mul_pd(vfeps,H);
892             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
893             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
894             velec            = _mm256_mul_pd(qq02,VV);
895             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
896             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
897
898             /* Update potential sum for this i atom from the interaction with this j atom. */
899             velec            = _mm256_andnot_pd(dummy_mask,velec);
900             velecsum         = _mm256_add_pd(velecsum,velec);
901
902             fscal            = felec;
903
904             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
905
906             /* Calculate temporary vectorial force */
907             tx               = _mm256_mul_pd(fscal,dx02);
908             ty               = _mm256_mul_pd(fscal,dy02);
909             tz               = _mm256_mul_pd(fscal,dz02);
910
911             /* Update vectorial force */
912             fix0             = _mm256_add_pd(fix0,tx);
913             fiy0             = _mm256_add_pd(fiy0,ty);
914             fiz0             = _mm256_add_pd(fiz0,tz);
915
916             fjx2             = _mm256_add_pd(fjx2,tx);
917             fjy2             = _mm256_add_pd(fjy2,ty);
918             fjz2             = _mm256_add_pd(fjz2,tz);
919
920             /**************************
921              * CALCULATE INTERACTIONS *
922              **************************/
923
924             r10              = _mm256_mul_pd(rsq10,rinv10);
925             r10              = _mm256_andnot_pd(dummy_mask,r10);
926
927             /* Calculate table index by multiplying r with table scale and truncate to integer */
928             rt               = _mm256_mul_pd(r10,vftabscale);
929             vfitab           = _mm256_cvttpd_epi32(rt);
930             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
931             vfitab           = _mm_slli_epi32(vfitab,2);
932
933             /* CUBIC SPLINE TABLE ELECTROSTATICS */
934             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
935             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
936             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
937             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
938             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
939             Heps             = _mm256_mul_pd(vfeps,H);
940             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
941             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
942             velec            = _mm256_mul_pd(qq10,VV);
943             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
944             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
945
946             /* Update potential sum for this i atom from the interaction with this j atom. */
947             velec            = _mm256_andnot_pd(dummy_mask,velec);
948             velecsum         = _mm256_add_pd(velecsum,velec);
949
950             fscal            = felec;
951
952             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
953
954             /* Calculate temporary vectorial force */
955             tx               = _mm256_mul_pd(fscal,dx10);
956             ty               = _mm256_mul_pd(fscal,dy10);
957             tz               = _mm256_mul_pd(fscal,dz10);
958
959             /* Update vectorial force */
960             fix1             = _mm256_add_pd(fix1,tx);
961             fiy1             = _mm256_add_pd(fiy1,ty);
962             fiz1             = _mm256_add_pd(fiz1,tz);
963
964             fjx0             = _mm256_add_pd(fjx0,tx);
965             fjy0             = _mm256_add_pd(fjy0,ty);
966             fjz0             = _mm256_add_pd(fjz0,tz);
967
968             /**************************
969              * CALCULATE INTERACTIONS *
970              **************************/
971
972             r11              = _mm256_mul_pd(rsq11,rinv11);
973             r11              = _mm256_andnot_pd(dummy_mask,r11);
974
975             /* Calculate table index by multiplying r with table scale and truncate to integer */
976             rt               = _mm256_mul_pd(r11,vftabscale);
977             vfitab           = _mm256_cvttpd_epi32(rt);
978             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
979             vfitab           = _mm_slli_epi32(vfitab,2);
980
981             /* CUBIC SPLINE TABLE ELECTROSTATICS */
982             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
983             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
984             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
985             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
986             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
987             Heps             = _mm256_mul_pd(vfeps,H);
988             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
989             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
990             velec            = _mm256_mul_pd(qq11,VV);
991             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
992             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
993
994             /* Update potential sum for this i atom from the interaction with this j atom. */
995             velec            = _mm256_andnot_pd(dummy_mask,velec);
996             velecsum         = _mm256_add_pd(velecsum,velec);
997
998             fscal            = felec;
999
1000             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1001
1002             /* Calculate temporary vectorial force */
1003             tx               = _mm256_mul_pd(fscal,dx11);
1004             ty               = _mm256_mul_pd(fscal,dy11);
1005             tz               = _mm256_mul_pd(fscal,dz11);
1006
1007             /* Update vectorial force */
1008             fix1             = _mm256_add_pd(fix1,tx);
1009             fiy1             = _mm256_add_pd(fiy1,ty);
1010             fiz1             = _mm256_add_pd(fiz1,tz);
1011
1012             fjx1             = _mm256_add_pd(fjx1,tx);
1013             fjy1             = _mm256_add_pd(fjy1,ty);
1014             fjz1             = _mm256_add_pd(fjz1,tz);
1015
1016             /**************************
1017              * CALCULATE INTERACTIONS *
1018              **************************/
1019
1020             r12              = _mm256_mul_pd(rsq12,rinv12);
1021             r12              = _mm256_andnot_pd(dummy_mask,r12);
1022
1023             /* Calculate table index by multiplying r with table scale and truncate to integer */
1024             rt               = _mm256_mul_pd(r12,vftabscale);
1025             vfitab           = _mm256_cvttpd_epi32(rt);
1026             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1027             vfitab           = _mm_slli_epi32(vfitab,2);
1028
1029             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1030             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1031             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1032             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1033             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1034             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1035             Heps             = _mm256_mul_pd(vfeps,H);
1036             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1037             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1038             velec            = _mm256_mul_pd(qq12,VV);
1039             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1040             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1041
1042             /* Update potential sum for this i atom from the interaction with this j atom. */
1043             velec            = _mm256_andnot_pd(dummy_mask,velec);
1044             velecsum         = _mm256_add_pd(velecsum,velec);
1045
1046             fscal            = felec;
1047
1048             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1049
1050             /* Calculate temporary vectorial force */
1051             tx               = _mm256_mul_pd(fscal,dx12);
1052             ty               = _mm256_mul_pd(fscal,dy12);
1053             tz               = _mm256_mul_pd(fscal,dz12);
1054
1055             /* Update vectorial force */
1056             fix1             = _mm256_add_pd(fix1,tx);
1057             fiy1             = _mm256_add_pd(fiy1,ty);
1058             fiz1             = _mm256_add_pd(fiz1,tz);
1059
1060             fjx2             = _mm256_add_pd(fjx2,tx);
1061             fjy2             = _mm256_add_pd(fjy2,ty);
1062             fjz2             = _mm256_add_pd(fjz2,tz);
1063
1064             /**************************
1065              * CALCULATE INTERACTIONS *
1066              **************************/
1067
1068             r20              = _mm256_mul_pd(rsq20,rinv20);
1069             r20              = _mm256_andnot_pd(dummy_mask,r20);
1070
1071             /* Calculate table index by multiplying r with table scale and truncate to integer */
1072             rt               = _mm256_mul_pd(r20,vftabscale);
1073             vfitab           = _mm256_cvttpd_epi32(rt);
1074             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1075             vfitab           = _mm_slli_epi32(vfitab,2);
1076
1077             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1078             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1079             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1080             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1081             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1082             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1083             Heps             = _mm256_mul_pd(vfeps,H);
1084             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1085             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1086             velec            = _mm256_mul_pd(qq20,VV);
1087             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1088             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1089
1090             /* Update potential sum for this i atom from the interaction with this j atom. */
1091             velec            = _mm256_andnot_pd(dummy_mask,velec);
1092             velecsum         = _mm256_add_pd(velecsum,velec);
1093
1094             fscal            = felec;
1095
1096             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1097
1098             /* Calculate temporary vectorial force */
1099             tx               = _mm256_mul_pd(fscal,dx20);
1100             ty               = _mm256_mul_pd(fscal,dy20);
1101             tz               = _mm256_mul_pd(fscal,dz20);
1102
1103             /* Update vectorial force */
1104             fix2             = _mm256_add_pd(fix2,tx);
1105             fiy2             = _mm256_add_pd(fiy2,ty);
1106             fiz2             = _mm256_add_pd(fiz2,tz);
1107
1108             fjx0             = _mm256_add_pd(fjx0,tx);
1109             fjy0             = _mm256_add_pd(fjy0,ty);
1110             fjz0             = _mm256_add_pd(fjz0,tz);
1111
1112             /**************************
1113              * CALCULATE INTERACTIONS *
1114              **************************/
1115
1116             r21              = _mm256_mul_pd(rsq21,rinv21);
1117             r21              = _mm256_andnot_pd(dummy_mask,r21);
1118
1119             /* Calculate table index by multiplying r with table scale and truncate to integer */
1120             rt               = _mm256_mul_pd(r21,vftabscale);
1121             vfitab           = _mm256_cvttpd_epi32(rt);
1122             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1123             vfitab           = _mm_slli_epi32(vfitab,2);
1124
1125             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1126             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1127             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1128             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1129             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1130             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1131             Heps             = _mm256_mul_pd(vfeps,H);
1132             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1133             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1134             velec            = _mm256_mul_pd(qq21,VV);
1135             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1136             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1137
1138             /* Update potential sum for this i atom from the interaction with this j atom. */
1139             velec            = _mm256_andnot_pd(dummy_mask,velec);
1140             velecsum         = _mm256_add_pd(velecsum,velec);
1141
1142             fscal            = felec;
1143
1144             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1145
1146             /* Calculate temporary vectorial force */
1147             tx               = _mm256_mul_pd(fscal,dx21);
1148             ty               = _mm256_mul_pd(fscal,dy21);
1149             tz               = _mm256_mul_pd(fscal,dz21);
1150
1151             /* Update vectorial force */
1152             fix2             = _mm256_add_pd(fix2,tx);
1153             fiy2             = _mm256_add_pd(fiy2,ty);
1154             fiz2             = _mm256_add_pd(fiz2,tz);
1155
1156             fjx1             = _mm256_add_pd(fjx1,tx);
1157             fjy1             = _mm256_add_pd(fjy1,ty);
1158             fjz1             = _mm256_add_pd(fjz1,tz);
1159
1160             /**************************
1161              * CALCULATE INTERACTIONS *
1162              **************************/
1163
1164             r22              = _mm256_mul_pd(rsq22,rinv22);
1165             r22              = _mm256_andnot_pd(dummy_mask,r22);
1166
1167             /* Calculate table index by multiplying r with table scale and truncate to integer */
1168             rt               = _mm256_mul_pd(r22,vftabscale);
1169             vfitab           = _mm256_cvttpd_epi32(rt);
1170             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1171             vfitab           = _mm_slli_epi32(vfitab,2);
1172
1173             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1174             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1175             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1176             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1177             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1178             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1179             Heps             = _mm256_mul_pd(vfeps,H);
1180             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1181             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
1182             velec            = _mm256_mul_pd(qq22,VV);
1183             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1184             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1185
1186             /* Update potential sum for this i atom from the interaction with this j atom. */
1187             velec            = _mm256_andnot_pd(dummy_mask,velec);
1188             velecsum         = _mm256_add_pd(velecsum,velec);
1189
1190             fscal            = felec;
1191
1192             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1193
1194             /* Calculate temporary vectorial force */
1195             tx               = _mm256_mul_pd(fscal,dx22);
1196             ty               = _mm256_mul_pd(fscal,dy22);
1197             tz               = _mm256_mul_pd(fscal,dz22);
1198
1199             /* Update vectorial force */
1200             fix2             = _mm256_add_pd(fix2,tx);
1201             fiy2             = _mm256_add_pd(fiy2,ty);
1202             fiz2             = _mm256_add_pd(fiz2,tz);
1203
1204             fjx2             = _mm256_add_pd(fjx2,tx);
1205             fjy2             = _mm256_add_pd(fjy2,ty);
1206             fjz2             = _mm256_add_pd(fjz2,tz);
1207
1208             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1209             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1210             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1211             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1212
1213             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1214                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1215
1216             /* Inner loop uses 396 flops */
1217         }
1218
1219         /* End of innermost loop */
1220
1221         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1222                                                  f+i_coord_offset,fshift+i_shift_offset);
1223
1224         ggid                        = gid[iidx];
1225         /* Update potential energies */
1226         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1227
1228         /* Increment number of inner iterations */
1229         inneriter                  += j_index_end - j_index_start;
1230
1231         /* Outer loop uses 19 flops */
1232     }
1233
1234     /* Increment number of outer iterations */
1235     outeriter        += nri;
1236
1237     /* Update outer/inner flops */
1238
1239     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
1240 }
1241 /*
1242  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_double
1243  * Electrostatics interaction: CubicSplineTable
1244  * VdW interaction:            None
1245  * Geometry:                   Water3-Water3
1246  * Calculate force/pot:        Force
1247  */
1248 void
1249 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_double
1250                     (t_nblist                    * gmx_restrict       nlist,
1251                      rvec                        * gmx_restrict          xx,
1252                      rvec                        * gmx_restrict          ff,
1253                      t_forcerec                  * gmx_restrict          fr,
1254                      t_mdatoms                   * gmx_restrict     mdatoms,
1255                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1256                      t_nrnb                      * gmx_restrict        nrnb)
1257 {
1258     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1259      * just 0 for non-waters.
1260      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1261      * jnr indices corresponding to data put in the four positions in the SIMD register.
1262      */
1263     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1264     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1265     int              jnrA,jnrB,jnrC,jnrD;
1266     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1267     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1268     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1269     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1270     real             rcutoff_scalar;
1271     real             *shiftvec,*fshift,*x,*f;
1272     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1273     real             scratch[4*DIM];
1274     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1275     real *           vdwioffsetptr0;
1276     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1277     real *           vdwioffsetptr1;
1278     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1279     real *           vdwioffsetptr2;
1280     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1281     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1282     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1283     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1284     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1285     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1286     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1287     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1288     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1289     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1290     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1291     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1292     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1293     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1294     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1295     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1296     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1297     real             *charge;
1298     __m128i          vfitab;
1299     __m128i          ifour       = _mm_set1_epi32(4);
1300     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1301     real             *vftab;
1302     __m256d          dummy_mask,cutoff_mask;
1303     __m128           tmpmask0,tmpmask1;
1304     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1305     __m256d          one     = _mm256_set1_pd(1.0);
1306     __m256d          two     = _mm256_set1_pd(2.0);
1307     x                = xx[0];
1308     f                = ff[0];
1309
1310     nri              = nlist->nri;
1311     iinr             = nlist->iinr;
1312     jindex           = nlist->jindex;
1313     jjnr             = nlist->jjnr;
1314     shiftidx         = nlist->shift;
1315     gid              = nlist->gid;
1316     shiftvec         = fr->shift_vec[0];
1317     fshift           = fr->fshift[0];
1318     facel            = _mm256_set1_pd(fr->epsfac);
1319     charge           = mdatoms->chargeA;
1320
1321     vftab            = kernel_data->table_elec->data;
1322     vftabscale       = _mm256_set1_pd(kernel_data->table_elec->scale);
1323
1324     /* Setup water-specific parameters */
1325     inr              = nlist->iinr[0];
1326     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1327     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1328     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1329
1330     jq0              = _mm256_set1_pd(charge[inr+0]);
1331     jq1              = _mm256_set1_pd(charge[inr+1]);
1332     jq2              = _mm256_set1_pd(charge[inr+2]);
1333     qq00             = _mm256_mul_pd(iq0,jq0);
1334     qq01             = _mm256_mul_pd(iq0,jq1);
1335     qq02             = _mm256_mul_pd(iq0,jq2);
1336     qq10             = _mm256_mul_pd(iq1,jq0);
1337     qq11             = _mm256_mul_pd(iq1,jq1);
1338     qq12             = _mm256_mul_pd(iq1,jq2);
1339     qq20             = _mm256_mul_pd(iq2,jq0);
1340     qq21             = _mm256_mul_pd(iq2,jq1);
1341     qq22             = _mm256_mul_pd(iq2,jq2);
1342
1343     /* Avoid stupid compiler warnings */
1344     jnrA = jnrB = jnrC = jnrD = 0;
1345     j_coord_offsetA = 0;
1346     j_coord_offsetB = 0;
1347     j_coord_offsetC = 0;
1348     j_coord_offsetD = 0;
1349
1350     outeriter        = 0;
1351     inneriter        = 0;
1352
1353     for(iidx=0;iidx<4*DIM;iidx++)
1354     {
1355         scratch[iidx] = 0.0;
1356     }
1357
1358     /* Start outer loop over neighborlists */
1359     for(iidx=0; iidx<nri; iidx++)
1360     {
1361         /* Load shift vector for this list */
1362         i_shift_offset   = DIM*shiftidx[iidx];
1363
1364         /* Load limits for loop over neighbors */
1365         j_index_start    = jindex[iidx];
1366         j_index_end      = jindex[iidx+1];
1367
1368         /* Get outer coordinate index */
1369         inr              = iinr[iidx];
1370         i_coord_offset   = DIM*inr;
1371
1372         /* Load i particle coords and add shift vector */
1373         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1374                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1375
1376         fix0             = _mm256_setzero_pd();
1377         fiy0             = _mm256_setzero_pd();
1378         fiz0             = _mm256_setzero_pd();
1379         fix1             = _mm256_setzero_pd();
1380         fiy1             = _mm256_setzero_pd();
1381         fiz1             = _mm256_setzero_pd();
1382         fix2             = _mm256_setzero_pd();
1383         fiy2             = _mm256_setzero_pd();
1384         fiz2             = _mm256_setzero_pd();
1385
1386         /* Start inner kernel loop */
1387         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1388         {
1389
1390             /* Get j neighbor index, and coordinate index */
1391             jnrA             = jjnr[jidx];
1392             jnrB             = jjnr[jidx+1];
1393             jnrC             = jjnr[jidx+2];
1394             jnrD             = jjnr[jidx+3];
1395             j_coord_offsetA  = DIM*jnrA;
1396             j_coord_offsetB  = DIM*jnrB;
1397             j_coord_offsetC  = DIM*jnrC;
1398             j_coord_offsetD  = DIM*jnrD;
1399
1400             /* load j atom coordinates */
1401             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1402                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1403                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1404
1405             /* Calculate displacement vector */
1406             dx00             = _mm256_sub_pd(ix0,jx0);
1407             dy00             = _mm256_sub_pd(iy0,jy0);
1408             dz00             = _mm256_sub_pd(iz0,jz0);
1409             dx01             = _mm256_sub_pd(ix0,jx1);
1410             dy01             = _mm256_sub_pd(iy0,jy1);
1411             dz01             = _mm256_sub_pd(iz0,jz1);
1412             dx02             = _mm256_sub_pd(ix0,jx2);
1413             dy02             = _mm256_sub_pd(iy0,jy2);
1414             dz02             = _mm256_sub_pd(iz0,jz2);
1415             dx10             = _mm256_sub_pd(ix1,jx0);
1416             dy10             = _mm256_sub_pd(iy1,jy0);
1417             dz10             = _mm256_sub_pd(iz1,jz0);
1418             dx11             = _mm256_sub_pd(ix1,jx1);
1419             dy11             = _mm256_sub_pd(iy1,jy1);
1420             dz11             = _mm256_sub_pd(iz1,jz1);
1421             dx12             = _mm256_sub_pd(ix1,jx2);
1422             dy12             = _mm256_sub_pd(iy1,jy2);
1423             dz12             = _mm256_sub_pd(iz1,jz2);
1424             dx20             = _mm256_sub_pd(ix2,jx0);
1425             dy20             = _mm256_sub_pd(iy2,jy0);
1426             dz20             = _mm256_sub_pd(iz2,jz0);
1427             dx21             = _mm256_sub_pd(ix2,jx1);
1428             dy21             = _mm256_sub_pd(iy2,jy1);
1429             dz21             = _mm256_sub_pd(iz2,jz1);
1430             dx22             = _mm256_sub_pd(ix2,jx2);
1431             dy22             = _mm256_sub_pd(iy2,jy2);
1432             dz22             = _mm256_sub_pd(iz2,jz2);
1433
1434             /* Calculate squared distance and things based on it */
1435             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1436             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1437             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1438             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1439             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1440             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1441             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1442             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1443             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1444
1445             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1446             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1447             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1448             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1449             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1450             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1451             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1452             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1453             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1454
1455             fjx0             = _mm256_setzero_pd();
1456             fjy0             = _mm256_setzero_pd();
1457             fjz0             = _mm256_setzero_pd();
1458             fjx1             = _mm256_setzero_pd();
1459             fjy1             = _mm256_setzero_pd();
1460             fjz1             = _mm256_setzero_pd();
1461             fjx2             = _mm256_setzero_pd();
1462             fjy2             = _mm256_setzero_pd();
1463             fjz2             = _mm256_setzero_pd();
1464
1465             /**************************
1466              * CALCULATE INTERACTIONS *
1467              **************************/
1468
1469             r00              = _mm256_mul_pd(rsq00,rinv00);
1470
1471             /* Calculate table index by multiplying r with table scale and truncate to integer */
1472             rt               = _mm256_mul_pd(r00,vftabscale);
1473             vfitab           = _mm256_cvttpd_epi32(rt);
1474             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1475             vfitab           = _mm_slli_epi32(vfitab,2);
1476
1477             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1478             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1479             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1480             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1481             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1482             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1483             Heps             = _mm256_mul_pd(vfeps,H);
1484             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1485             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1486             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1487
1488             fscal            = felec;
1489
1490             /* Calculate temporary vectorial force */
1491             tx               = _mm256_mul_pd(fscal,dx00);
1492             ty               = _mm256_mul_pd(fscal,dy00);
1493             tz               = _mm256_mul_pd(fscal,dz00);
1494
1495             /* Update vectorial force */
1496             fix0             = _mm256_add_pd(fix0,tx);
1497             fiy0             = _mm256_add_pd(fiy0,ty);
1498             fiz0             = _mm256_add_pd(fiz0,tz);
1499
1500             fjx0             = _mm256_add_pd(fjx0,tx);
1501             fjy0             = _mm256_add_pd(fjy0,ty);
1502             fjz0             = _mm256_add_pd(fjz0,tz);
1503
1504             /**************************
1505              * CALCULATE INTERACTIONS *
1506              **************************/
1507
1508             r01              = _mm256_mul_pd(rsq01,rinv01);
1509
1510             /* Calculate table index by multiplying r with table scale and truncate to integer */
1511             rt               = _mm256_mul_pd(r01,vftabscale);
1512             vfitab           = _mm256_cvttpd_epi32(rt);
1513             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1514             vfitab           = _mm_slli_epi32(vfitab,2);
1515
1516             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1517             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1518             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1519             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1520             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1521             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1522             Heps             = _mm256_mul_pd(vfeps,H);
1523             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1524             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1525             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
1526
1527             fscal            = felec;
1528
1529             /* Calculate temporary vectorial force */
1530             tx               = _mm256_mul_pd(fscal,dx01);
1531             ty               = _mm256_mul_pd(fscal,dy01);
1532             tz               = _mm256_mul_pd(fscal,dz01);
1533
1534             /* Update vectorial force */
1535             fix0             = _mm256_add_pd(fix0,tx);
1536             fiy0             = _mm256_add_pd(fiy0,ty);
1537             fiz0             = _mm256_add_pd(fiz0,tz);
1538
1539             fjx1             = _mm256_add_pd(fjx1,tx);
1540             fjy1             = _mm256_add_pd(fjy1,ty);
1541             fjz1             = _mm256_add_pd(fjz1,tz);
1542
1543             /**************************
1544              * CALCULATE INTERACTIONS *
1545              **************************/
1546
1547             r02              = _mm256_mul_pd(rsq02,rinv02);
1548
1549             /* Calculate table index by multiplying r with table scale and truncate to integer */
1550             rt               = _mm256_mul_pd(r02,vftabscale);
1551             vfitab           = _mm256_cvttpd_epi32(rt);
1552             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1553             vfitab           = _mm_slli_epi32(vfitab,2);
1554
1555             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1556             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1557             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1558             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1559             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1560             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1561             Heps             = _mm256_mul_pd(vfeps,H);
1562             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1563             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1564             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
1565
1566             fscal            = felec;
1567
1568             /* Calculate temporary vectorial force */
1569             tx               = _mm256_mul_pd(fscal,dx02);
1570             ty               = _mm256_mul_pd(fscal,dy02);
1571             tz               = _mm256_mul_pd(fscal,dz02);
1572
1573             /* Update vectorial force */
1574             fix0             = _mm256_add_pd(fix0,tx);
1575             fiy0             = _mm256_add_pd(fiy0,ty);
1576             fiz0             = _mm256_add_pd(fiz0,tz);
1577
1578             fjx2             = _mm256_add_pd(fjx2,tx);
1579             fjy2             = _mm256_add_pd(fjy2,ty);
1580             fjz2             = _mm256_add_pd(fjz2,tz);
1581
1582             /**************************
1583              * CALCULATE INTERACTIONS *
1584              **************************/
1585
1586             r10              = _mm256_mul_pd(rsq10,rinv10);
1587
1588             /* Calculate table index by multiplying r with table scale and truncate to integer */
1589             rt               = _mm256_mul_pd(r10,vftabscale);
1590             vfitab           = _mm256_cvttpd_epi32(rt);
1591             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1592             vfitab           = _mm_slli_epi32(vfitab,2);
1593
1594             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1595             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1596             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1597             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1598             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1599             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1600             Heps             = _mm256_mul_pd(vfeps,H);
1601             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1602             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1603             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
1604
1605             fscal            = felec;
1606
1607             /* Calculate temporary vectorial force */
1608             tx               = _mm256_mul_pd(fscal,dx10);
1609             ty               = _mm256_mul_pd(fscal,dy10);
1610             tz               = _mm256_mul_pd(fscal,dz10);
1611
1612             /* Update vectorial force */
1613             fix1             = _mm256_add_pd(fix1,tx);
1614             fiy1             = _mm256_add_pd(fiy1,ty);
1615             fiz1             = _mm256_add_pd(fiz1,tz);
1616
1617             fjx0             = _mm256_add_pd(fjx0,tx);
1618             fjy0             = _mm256_add_pd(fjy0,ty);
1619             fjz0             = _mm256_add_pd(fjz0,tz);
1620
1621             /**************************
1622              * CALCULATE INTERACTIONS *
1623              **************************/
1624
1625             r11              = _mm256_mul_pd(rsq11,rinv11);
1626
1627             /* Calculate table index by multiplying r with table scale and truncate to integer */
1628             rt               = _mm256_mul_pd(r11,vftabscale);
1629             vfitab           = _mm256_cvttpd_epi32(rt);
1630             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1631             vfitab           = _mm_slli_epi32(vfitab,2);
1632
1633             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1634             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1635             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1636             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1637             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1638             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1639             Heps             = _mm256_mul_pd(vfeps,H);
1640             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1641             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1642             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
1643
1644             fscal            = felec;
1645
1646             /* Calculate temporary vectorial force */
1647             tx               = _mm256_mul_pd(fscal,dx11);
1648             ty               = _mm256_mul_pd(fscal,dy11);
1649             tz               = _mm256_mul_pd(fscal,dz11);
1650
1651             /* Update vectorial force */
1652             fix1             = _mm256_add_pd(fix1,tx);
1653             fiy1             = _mm256_add_pd(fiy1,ty);
1654             fiz1             = _mm256_add_pd(fiz1,tz);
1655
1656             fjx1             = _mm256_add_pd(fjx1,tx);
1657             fjy1             = _mm256_add_pd(fjy1,ty);
1658             fjz1             = _mm256_add_pd(fjz1,tz);
1659
1660             /**************************
1661              * CALCULATE INTERACTIONS *
1662              **************************/
1663
1664             r12              = _mm256_mul_pd(rsq12,rinv12);
1665
1666             /* Calculate table index by multiplying r with table scale and truncate to integer */
1667             rt               = _mm256_mul_pd(r12,vftabscale);
1668             vfitab           = _mm256_cvttpd_epi32(rt);
1669             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1670             vfitab           = _mm_slli_epi32(vfitab,2);
1671
1672             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1673             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1674             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1675             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1676             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1677             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1678             Heps             = _mm256_mul_pd(vfeps,H);
1679             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1680             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1681             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
1682
1683             fscal            = felec;
1684
1685             /* Calculate temporary vectorial force */
1686             tx               = _mm256_mul_pd(fscal,dx12);
1687             ty               = _mm256_mul_pd(fscal,dy12);
1688             tz               = _mm256_mul_pd(fscal,dz12);
1689
1690             /* Update vectorial force */
1691             fix1             = _mm256_add_pd(fix1,tx);
1692             fiy1             = _mm256_add_pd(fiy1,ty);
1693             fiz1             = _mm256_add_pd(fiz1,tz);
1694
1695             fjx2             = _mm256_add_pd(fjx2,tx);
1696             fjy2             = _mm256_add_pd(fjy2,ty);
1697             fjz2             = _mm256_add_pd(fjz2,tz);
1698
1699             /**************************
1700              * CALCULATE INTERACTIONS *
1701              **************************/
1702
1703             r20              = _mm256_mul_pd(rsq20,rinv20);
1704
1705             /* Calculate table index by multiplying r with table scale and truncate to integer */
1706             rt               = _mm256_mul_pd(r20,vftabscale);
1707             vfitab           = _mm256_cvttpd_epi32(rt);
1708             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1709             vfitab           = _mm_slli_epi32(vfitab,2);
1710
1711             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1712             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1713             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1714             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1715             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1716             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1717             Heps             = _mm256_mul_pd(vfeps,H);
1718             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1719             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1720             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
1721
1722             fscal            = felec;
1723
1724             /* Calculate temporary vectorial force */
1725             tx               = _mm256_mul_pd(fscal,dx20);
1726             ty               = _mm256_mul_pd(fscal,dy20);
1727             tz               = _mm256_mul_pd(fscal,dz20);
1728
1729             /* Update vectorial force */
1730             fix2             = _mm256_add_pd(fix2,tx);
1731             fiy2             = _mm256_add_pd(fiy2,ty);
1732             fiz2             = _mm256_add_pd(fiz2,tz);
1733
1734             fjx0             = _mm256_add_pd(fjx0,tx);
1735             fjy0             = _mm256_add_pd(fjy0,ty);
1736             fjz0             = _mm256_add_pd(fjz0,tz);
1737
1738             /**************************
1739              * CALCULATE INTERACTIONS *
1740              **************************/
1741
1742             r21              = _mm256_mul_pd(rsq21,rinv21);
1743
1744             /* Calculate table index by multiplying r with table scale and truncate to integer */
1745             rt               = _mm256_mul_pd(r21,vftabscale);
1746             vfitab           = _mm256_cvttpd_epi32(rt);
1747             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1748             vfitab           = _mm_slli_epi32(vfitab,2);
1749
1750             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1751             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1752             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1753             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1754             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1755             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1756             Heps             = _mm256_mul_pd(vfeps,H);
1757             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1758             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1759             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
1760
1761             fscal            = felec;
1762
1763             /* Calculate temporary vectorial force */
1764             tx               = _mm256_mul_pd(fscal,dx21);
1765             ty               = _mm256_mul_pd(fscal,dy21);
1766             tz               = _mm256_mul_pd(fscal,dz21);
1767
1768             /* Update vectorial force */
1769             fix2             = _mm256_add_pd(fix2,tx);
1770             fiy2             = _mm256_add_pd(fiy2,ty);
1771             fiz2             = _mm256_add_pd(fiz2,tz);
1772
1773             fjx1             = _mm256_add_pd(fjx1,tx);
1774             fjy1             = _mm256_add_pd(fjy1,ty);
1775             fjz1             = _mm256_add_pd(fjz1,tz);
1776
1777             /**************************
1778              * CALCULATE INTERACTIONS *
1779              **************************/
1780
1781             r22              = _mm256_mul_pd(rsq22,rinv22);
1782
1783             /* Calculate table index by multiplying r with table scale and truncate to integer */
1784             rt               = _mm256_mul_pd(r22,vftabscale);
1785             vfitab           = _mm256_cvttpd_epi32(rt);
1786             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1787             vfitab           = _mm_slli_epi32(vfitab,2);
1788
1789             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1790             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1791             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1792             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1793             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1794             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1795             Heps             = _mm256_mul_pd(vfeps,H);
1796             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1797             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1798             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
1799
1800             fscal            = felec;
1801
1802             /* Calculate temporary vectorial force */
1803             tx               = _mm256_mul_pd(fscal,dx22);
1804             ty               = _mm256_mul_pd(fscal,dy22);
1805             tz               = _mm256_mul_pd(fscal,dz22);
1806
1807             /* Update vectorial force */
1808             fix2             = _mm256_add_pd(fix2,tx);
1809             fiy2             = _mm256_add_pd(fiy2,ty);
1810             fiz2             = _mm256_add_pd(fiz2,tz);
1811
1812             fjx2             = _mm256_add_pd(fjx2,tx);
1813             fjy2             = _mm256_add_pd(fjy2,ty);
1814             fjz2             = _mm256_add_pd(fjz2,tz);
1815
1816             fjptrA             = f+j_coord_offsetA;
1817             fjptrB             = f+j_coord_offsetB;
1818             fjptrC             = f+j_coord_offsetC;
1819             fjptrD             = f+j_coord_offsetD;
1820
1821             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1822                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1823
1824             /* Inner loop uses 351 flops */
1825         }
1826
1827         if(jidx<j_index_end)
1828         {
1829
1830             /* Get j neighbor index, and coordinate index */
1831             jnrlistA         = jjnr[jidx];
1832             jnrlistB         = jjnr[jidx+1];
1833             jnrlistC         = jjnr[jidx+2];
1834             jnrlistD         = jjnr[jidx+3];
1835             /* Sign of each element will be negative for non-real atoms.
1836              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1837              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1838              */
1839             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1840
1841             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1842             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1843             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1844
1845             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1846             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1847             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1848             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1849             j_coord_offsetA  = DIM*jnrA;
1850             j_coord_offsetB  = DIM*jnrB;
1851             j_coord_offsetC  = DIM*jnrC;
1852             j_coord_offsetD  = DIM*jnrD;
1853
1854             /* load j atom coordinates */
1855             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1856                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1857                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1858
1859             /* Calculate displacement vector */
1860             dx00             = _mm256_sub_pd(ix0,jx0);
1861             dy00             = _mm256_sub_pd(iy0,jy0);
1862             dz00             = _mm256_sub_pd(iz0,jz0);
1863             dx01             = _mm256_sub_pd(ix0,jx1);
1864             dy01             = _mm256_sub_pd(iy0,jy1);
1865             dz01             = _mm256_sub_pd(iz0,jz1);
1866             dx02             = _mm256_sub_pd(ix0,jx2);
1867             dy02             = _mm256_sub_pd(iy0,jy2);
1868             dz02             = _mm256_sub_pd(iz0,jz2);
1869             dx10             = _mm256_sub_pd(ix1,jx0);
1870             dy10             = _mm256_sub_pd(iy1,jy0);
1871             dz10             = _mm256_sub_pd(iz1,jz0);
1872             dx11             = _mm256_sub_pd(ix1,jx1);
1873             dy11             = _mm256_sub_pd(iy1,jy1);
1874             dz11             = _mm256_sub_pd(iz1,jz1);
1875             dx12             = _mm256_sub_pd(ix1,jx2);
1876             dy12             = _mm256_sub_pd(iy1,jy2);
1877             dz12             = _mm256_sub_pd(iz1,jz2);
1878             dx20             = _mm256_sub_pd(ix2,jx0);
1879             dy20             = _mm256_sub_pd(iy2,jy0);
1880             dz20             = _mm256_sub_pd(iz2,jz0);
1881             dx21             = _mm256_sub_pd(ix2,jx1);
1882             dy21             = _mm256_sub_pd(iy2,jy1);
1883             dz21             = _mm256_sub_pd(iz2,jz1);
1884             dx22             = _mm256_sub_pd(ix2,jx2);
1885             dy22             = _mm256_sub_pd(iy2,jy2);
1886             dz22             = _mm256_sub_pd(iz2,jz2);
1887
1888             /* Calculate squared distance and things based on it */
1889             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1890             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1891             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1892             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1893             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1894             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1895             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1896             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1897             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1898
1899             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1900             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1901             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1902             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1903             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1904             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1905             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1906             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1907             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1908
1909             fjx0             = _mm256_setzero_pd();
1910             fjy0             = _mm256_setzero_pd();
1911             fjz0             = _mm256_setzero_pd();
1912             fjx1             = _mm256_setzero_pd();
1913             fjy1             = _mm256_setzero_pd();
1914             fjz1             = _mm256_setzero_pd();
1915             fjx2             = _mm256_setzero_pd();
1916             fjy2             = _mm256_setzero_pd();
1917             fjz2             = _mm256_setzero_pd();
1918
1919             /**************************
1920              * CALCULATE INTERACTIONS *
1921              **************************/
1922
1923             r00              = _mm256_mul_pd(rsq00,rinv00);
1924             r00              = _mm256_andnot_pd(dummy_mask,r00);
1925
1926             /* Calculate table index by multiplying r with table scale and truncate to integer */
1927             rt               = _mm256_mul_pd(r00,vftabscale);
1928             vfitab           = _mm256_cvttpd_epi32(rt);
1929             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1930             vfitab           = _mm_slli_epi32(vfitab,2);
1931
1932             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1933             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1934             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1935             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1936             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1937             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1938             Heps             = _mm256_mul_pd(vfeps,H);
1939             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1940             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1941             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq00,FF),_mm256_mul_pd(vftabscale,rinv00)));
1942
1943             fscal            = felec;
1944
1945             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1946
1947             /* Calculate temporary vectorial force */
1948             tx               = _mm256_mul_pd(fscal,dx00);
1949             ty               = _mm256_mul_pd(fscal,dy00);
1950             tz               = _mm256_mul_pd(fscal,dz00);
1951
1952             /* Update vectorial force */
1953             fix0             = _mm256_add_pd(fix0,tx);
1954             fiy0             = _mm256_add_pd(fiy0,ty);
1955             fiz0             = _mm256_add_pd(fiz0,tz);
1956
1957             fjx0             = _mm256_add_pd(fjx0,tx);
1958             fjy0             = _mm256_add_pd(fjy0,ty);
1959             fjz0             = _mm256_add_pd(fjz0,tz);
1960
1961             /**************************
1962              * CALCULATE INTERACTIONS *
1963              **************************/
1964
1965             r01              = _mm256_mul_pd(rsq01,rinv01);
1966             r01              = _mm256_andnot_pd(dummy_mask,r01);
1967
1968             /* Calculate table index by multiplying r with table scale and truncate to integer */
1969             rt               = _mm256_mul_pd(r01,vftabscale);
1970             vfitab           = _mm256_cvttpd_epi32(rt);
1971             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1972             vfitab           = _mm_slli_epi32(vfitab,2);
1973
1974             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1975             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1976             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1977             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1978             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1979             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1980             Heps             = _mm256_mul_pd(vfeps,H);
1981             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1982             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1983             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq01,FF),_mm256_mul_pd(vftabscale,rinv01)));
1984
1985             fscal            = felec;
1986
1987             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1988
1989             /* Calculate temporary vectorial force */
1990             tx               = _mm256_mul_pd(fscal,dx01);
1991             ty               = _mm256_mul_pd(fscal,dy01);
1992             tz               = _mm256_mul_pd(fscal,dz01);
1993
1994             /* Update vectorial force */
1995             fix0             = _mm256_add_pd(fix0,tx);
1996             fiy0             = _mm256_add_pd(fiy0,ty);
1997             fiz0             = _mm256_add_pd(fiz0,tz);
1998
1999             fjx1             = _mm256_add_pd(fjx1,tx);
2000             fjy1             = _mm256_add_pd(fjy1,ty);
2001             fjz1             = _mm256_add_pd(fjz1,tz);
2002
2003             /**************************
2004              * CALCULATE INTERACTIONS *
2005              **************************/
2006
2007             r02              = _mm256_mul_pd(rsq02,rinv02);
2008             r02              = _mm256_andnot_pd(dummy_mask,r02);
2009
2010             /* Calculate table index by multiplying r with table scale and truncate to integer */
2011             rt               = _mm256_mul_pd(r02,vftabscale);
2012             vfitab           = _mm256_cvttpd_epi32(rt);
2013             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2014             vfitab           = _mm_slli_epi32(vfitab,2);
2015
2016             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2017             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2018             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2019             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2020             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2021             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2022             Heps             = _mm256_mul_pd(vfeps,H);
2023             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2024             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2025             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq02,FF),_mm256_mul_pd(vftabscale,rinv02)));
2026
2027             fscal            = felec;
2028
2029             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2030
2031             /* Calculate temporary vectorial force */
2032             tx               = _mm256_mul_pd(fscal,dx02);
2033             ty               = _mm256_mul_pd(fscal,dy02);
2034             tz               = _mm256_mul_pd(fscal,dz02);
2035
2036             /* Update vectorial force */
2037             fix0             = _mm256_add_pd(fix0,tx);
2038             fiy0             = _mm256_add_pd(fiy0,ty);
2039             fiz0             = _mm256_add_pd(fiz0,tz);
2040
2041             fjx2             = _mm256_add_pd(fjx2,tx);
2042             fjy2             = _mm256_add_pd(fjy2,ty);
2043             fjz2             = _mm256_add_pd(fjz2,tz);
2044
2045             /**************************
2046              * CALCULATE INTERACTIONS *
2047              **************************/
2048
2049             r10              = _mm256_mul_pd(rsq10,rinv10);
2050             r10              = _mm256_andnot_pd(dummy_mask,r10);
2051
2052             /* Calculate table index by multiplying r with table scale and truncate to integer */
2053             rt               = _mm256_mul_pd(r10,vftabscale);
2054             vfitab           = _mm256_cvttpd_epi32(rt);
2055             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2056             vfitab           = _mm_slli_epi32(vfitab,2);
2057
2058             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2059             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2060             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2061             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2062             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2063             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2064             Heps             = _mm256_mul_pd(vfeps,H);
2065             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2066             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2067             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq10,FF),_mm256_mul_pd(vftabscale,rinv10)));
2068
2069             fscal            = felec;
2070
2071             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2072
2073             /* Calculate temporary vectorial force */
2074             tx               = _mm256_mul_pd(fscal,dx10);
2075             ty               = _mm256_mul_pd(fscal,dy10);
2076             tz               = _mm256_mul_pd(fscal,dz10);
2077
2078             /* Update vectorial force */
2079             fix1             = _mm256_add_pd(fix1,tx);
2080             fiy1             = _mm256_add_pd(fiy1,ty);
2081             fiz1             = _mm256_add_pd(fiz1,tz);
2082
2083             fjx0             = _mm256_add_pd(fjx0,tx);
2084             fjy0             = _mm256_add_pd(fjy0,ty);
2085             fjz0             = _mm256_add_pd(fjz0,tz);
2086
2087             /**************************
2088              * CALCULATE INTERACTIONS *
2089              **************************/
2090
2091             r11              = _mm256_mul_pd(rsq11,rinv11);
2092             r11              = _mm256_andnot_pd(dummy_mask,r11);
2093
2094             /* Calculate table index by multiplying r with table scale and truncate to integer */
2095             rt               = _mm256_mul_pd(r11,vftabscale);
2096             vfitab           = _mm256_cvttpd_epi32(rt);
2097             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2098             vfitab           = _mm_slli_epi32(vfitab,2);
2099
2100             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2101             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2102             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2103             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2104             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2105             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2106             Heps             = _mm256_mul_pd(vfeps,H);
2107             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2108             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2109             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq11,FF),_mm256_mul_pd(vftabscale,rinv11)));
2110
2111             fscal            = felec;
2112
2113             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2114
2115             /* Calculate temporary vectorial force */
2116             tx               = _mm256_mul_pd(fscal,dx11);
2117             ty               = _mm256_mul_pd(fscal,dy11);
2118             tz               = _mm256_mul_pd(fscal,dz11);
2119
2120             /* Update vectorial force */
2121             fix1             = _mm256_add_pd(fix1,tx);
2122             fiy1             = _mm256_add_pd(fiy1,ty);
2123             fiz1             = _mm256_add_pd(fiz1,tz);
2124
2125             fjx1             = _mm256_add_pd(fjx1,tx);
2126             fjy1             = _mm256_add_pd(fjy1,ty);
2127             fjz1             = _mm256_add_pd(fjz1,tz);
2128
2129             /**************************
2130              * CALCULATE INTERACTIONS *
2131              **************************/
2132
2133             r12              = _mm256_mul_pd(rsq12,rinv12);
2134             r12              = _mm256_andnot_pd(dummy_mask,r12);
2135
2136             /* Calculate table index by multiplying r with table scale and truncate to integer */
2137             rt               = _mm256_mul_pd(r12,vftabscale);
2138             vfitab           = _mm256_cvttpd_epi32(rt);
2139             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2140             vfitab           = _mm_slli_epi32(vfitab,2);
2141
2142             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2143             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2144             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2145             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2146             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2147             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2148             Heps             = _mm256_mul_pd(vfeps,H);
2149             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2150             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2151             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq12,FF),_mm256_mul_pd(vftabscale,rinv12)));
2152
2153             fscal            = felec;
2154
2155             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2156
2157             /* Calculate temporary vectorial force */
2158             tx               = _mm256_mul_pd(fscal,dx12);
2159             ty               = _mm256_mul_pd(fscal,dy12);
2160             tz               = _mm256_mul_pd(fscal,dz12);
2161
2162             /* Update vectorial force */
2163             fix1             = _mm256_add_pd(fix1,tx);
2164             fiy1             = _mm256_add_pd(fiy1,ty);
2165             fiz1             = _mm256_add_pd(fiz1,tz);
2166
2167             fjx2             = _mm256_add_pd(fjx2,tx);
2168             fjy2             = _mm256_add_pd(fjy2,ty);
2169             fjz2             = _mm256_add_pd(fjz2,tz);
2170
2171             /**************************
2172              * CALCULATE INTERACTIONS *
2173              **************************/
2174
2175             r20              = _mm256_mul_pd(rsq20,rinv20);
2176             r20              = _mm256_andnot_pd(dummy_mask,r20);
2177
2178             /* Calculate table index by multiplying r with table scale and truncate to integer */
2179             rt               = _mm256_mul_pd(r20,vftabscale);
2180             vfitab           = _mm256_cvttpd_epi32(rt);
2181             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2182             vfitab           = _mm_slli_epi32(vfitab,2);
2183
2184             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2185             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2186             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2187             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2188             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2189             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2190             Heps             = _mm256_mul_pd(vfeps,H);
2191             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2192             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2193             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq20,FF),_mm256_mul_pd(vftabscale,rinv20)));
2194
2195             fscal            = felec;
2196
2197             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2198
2199             /* Calculate temporary vectorial force */
2200             tx               = _mm256_mul_pd(fscal,dx20);
2201             ty               = _mm256_mul_pd(fscal,dy20);
2202             tz               = _mm256_mul_pd(fscal,dz20);
2203
2204             /* Update vectorial force */
2205             fix2             = _mm256_add_pd(fix2,tx);
2206             fiy2             = _mm256_add_pd(fiy2,ty);
2207             fiz2             = _mm256_add_pd(fiz2,tz);
2208
2209             fjx0             = _mm256_add_pd(fjx0,tx);
2210             fjy0             = _mm256_add_pd(fjy0,ty);
2211             fjz0             = _mm256_add_pd(fjz0,tz);
2212
2213             /**************************
2214              * CALCULATE INTERACTIONS *
2215              **************************/
2216
2217             r21              = _mm256_mul_pd(rsq21,rinv21);
2218             r21              = _mm256_andnot_pd(dummy_mask,r21);
2219
2220             /* Calculate table index by multiplying r with table scale and truncate to integer */
2221             rt               = _mm256_mul_pd(r21,vftabscale);
2222             vfitab           = _mm256_cvttpd_epi32(rt);
2223             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2224             vfitab           = _mm_slli_epi32(vfitab,2);
2225
2226             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2227             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2228             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2229             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2230             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2231             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2232             Heps             = _mm256_mul_pd(vfeps,H);
2233             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2234             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2235             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq21,FF),_mm256_mul_pd(vftabscale,rinv21)));
2236
2237             fscal            = felec;
2238
2239             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2240
2241             /* Calculate temporary vectorial force */
2242             tx               = _mm256_mul_pd(fscal,dx21);
2243             ty               = _mm256_mul_pd(fscal,dy21);
2244             tz               = _mm256_mul_pd(fscal,dz21);
2245
2246             /* Update vectorial force */
2247             fix2             = _mm256_add_pd(fix2,tx);
2248             fiy2             = _mm256_add_pd(fiy2,ty);
2249             fiz2             = _mm256_add_pd(fiz2,tz);
2250
2251             fjx1             = _mm256_add_pd(fjx1,tx);
2252             fjy1             = _mm256_add_pd(fjy1,ty);
2253             fjz1             = _mm256_add_pd(fjz1,tz);
2254
2255             /**************************
2256              * CALCULATE INTERACTIONS *
2257              **************************/
2258
2259             r22              = _mm256_mul_pd(rsq22,rinv22);
2260             r22              = _mm256_andnot_pd(dummy_mask,r22);
2261
2262             /* Calculate table index by multiplying r with table scale and truncate to integer */
2263             rt               = _mm256_mul_pd(r22,vftabscale);
2264             vfitab           = _mm256_cvttpd_epi32(rt);
2265             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
2266             vfitab           = _mm_slli_epi32(vfitab,2);
2267
2268             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2269             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2270             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
2271             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
2272             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
2273             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
2274             Heps             = _mm256_mul_pd(vfeps,H);
2275             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
2276             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
2277             felec            = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_mul_pd(qq22,FF),_mm256_mul_pd(vftabscale,rinv22)));
2278
2279             fscal            = felec;
2280
2281             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2282
2283             /* Calculate temporary vectorial force */
2284             tx               = _mm256_mul_pd(fscal,dx22);
2285             ty               = _mm256_mul_pd(fscal,dy22);
2286             tz               = _mm256_mul_pd(fscal,dz22);
2287
2288             /* Update vectorial force */
2289             fix2             = _mm256_add_pd(fix2,tx);
2290             fiy2             = _mm256_add_pd(fiy2,ty);
2291             fiz2             = _mm256_add_pd(fiz2,tz);
2292
2293             fjx2             = _mm256_add_pd(fjx2,tx);
2294             fjy2             = _mm256_add_pd(fjy2,ty);
2295             fjz2             = _mm256_add_pd(fjz2,tz);
2296
2297             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2298             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2299             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2300             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2301
2302             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2303                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2304
2305             /* Inner loop uses 360 flops */
2306         }
2307
2308         /* End of innermost loop */
2309
2310         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2311                                                  f+i_coord_offset,fshift+i_shift_offset);
2312
2313         /* Increment number of inner iterations */
2314         inneriter                  += j_index_end - j_index_start;
2315
2316         /* Outer loop uses 18 flops */
2317     }
2318
2319     /* Increment number of outer iterations */
2320     outeriter        += nri;
2321
2322     /* Update outer/inner flops */
2323
2324     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);
2325 }