made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecCSTab_VdwNone_GeomW3W3_avx_256_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_single
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            None
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56      * jnr indices corresponding to data put in the eight positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrE,jnrF,jnrG,jnrH;
62     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
67     real             rcutoff_scalar;
68     real             *shiftvec,*fshift,*x,*f;
69     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
70     real             scratch[4*DIM];
71     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72     real *           vdwioffsetptr0;
73     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74     real *           vdwioffsetptr1;
75     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76     real *           vdwioffsetptr2;
77     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
94     real             *charge;
95     __m256i          vfitab;
96     __m128i          vfitab_lo,vfitab_hi;
97     __m128i          ifour       = _mm_set1_epi32(4);
98     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
99     real             *vftab;
100     __m256           dummy_mask,cutoff_mask;
101     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
102     __m256           one     = _mm256_set1_ps(1.0);
103     __m256           two     = _mm256_set1_ps(2.0);
104     x                = xx[0];
105     f                = ff[0];
106
107     nri              = nlist->nri;
108     iinr             = nlist->iinr;
109     jindex           = nlist->jindex;
110     jjnr             = nlist->jjnr;
111     shiftidx         = nlist->shift;
112     gid              = nlist->gid;
113     shiftvec         = fr->shift_vec[0];
114     fshift           = fr->fshift[0];
115     facel            = _mm256_set1_ps(fr->epsfac);
116     charge           = mdatoms->chargeA;
117
118     vftab            = kernel_data->table_elec->data;
119     vftabscale       = _mm256_set1_ps(kernel_data->table_elec->scale);
120
121     /* Setup water-specific parameters */
122     inr              = nlist->iinr[0];
123     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
124     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
125     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
126
127     jq0              = _mm256_set1_ps(charge[inr+0]);
128     jq1              = _mm256_set1_ps(charge[inr+1]);
129     jq2              = _mm256_set1_ps(charge[inr+2]);
130     qq00             = _mm256_mul_ps(iq0,jq0);
131     qq01             = _mm256_mul_ps(iq0,jq1);
132     qq02             = _mm256_mul_ps(iq0,jq2);
133     qq10             = _mm256_mul_ps(iq1,jq0);
134     qq11             = _mm256_mul_ps(iq1,jq1);
135     qq12             = _mm256_mul_ps(iq1,jq2);
136     qq20             = _mm256_mul_ps(iq2,jq0);
137     qq21             = _mm256_mul_ps(iq2,jq1);
138     qq22             = _mm256_mul_ps(iq2,jq2);
139
140     /* Avoid stupid compiler warnings */
141     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
142     j_coord_offsetA = 0;
143     j_coord_offsetB = 0;
144     j_coord_offsetC = 0;
145     j_coord_offsetD = 0;
146     j_coord_offsetE = 0;
147     j_coord_offsetF = 0;
148     j_coord_offsetG = 0;
149     j_coord_offsetH = 0;
150
151     outeriter        = 0;
152     inneriter        = 0;
153
154     for(iidx=0;iidx<4*DIM;iidx++)
155     {
156         scratch[iidx] = 0.0;
157     }
158
159     /* Start outer loop over neighborlists */
160     for(iidx=0; iidx<nri; iidx++)
161     {
162         /* Load shift vector for this list */
163         i_shift_offset   = DIM*shiftidx[iidx];
164
165         /* Load limits for loop over neighbors */
166         j_index_start    = jindex[iidx];
167         j_index_end      = jindex[iidx+1];
168
169         /* Get outer coordinate index */
170         inr              = iinr[iidx];
171         i_coord_offset   = DIM*inr;
172
173         /* Load i particle coords and add shift vector */
174         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
175                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
176
177         fix0             = _mm256_setzero_ps();
178         fiy0             = _mm256_setzero_ps();
179         fiz0             = _mm256_setzero_ps();
180         fix1             = _mm256_setzero_ps();
181         fiy1             = _mm256_setzero_ps();
182         fiz1             = _mm256_setzero_ps();
183         fix2             = _mm256_setzero_ps();
184         fiy2             = _mm256_setzero_ps();
185         fiz2             = _mm256_setzero_ps();
186
187         /* Reset potential sums */
188         velecsum         = _mm256_setzero_ps();
189
190         /* Start inner kernel loop */
191         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
192         {
193
194             /* Get j neighbor index, and coordinate index */
195             jnrA             = jjnr[jidx];
196             jnrB             = jjnr[jidx+1];
197             jnrC             = jjnr[jidx+2];
198             jnrD             = jjnr[jidx+3];
199             jnrE             = jjnr[jidx+4];
200             jnrF             = jjnr[jidx+5];
201             jnrG             = jjnr[jidx+6];
202             jnrH             = jjnr[jidx+7];
203             j_coord_offsetA  = DIM*jnrA;
204             j_coord_offsetB  = DIM*jnrB;
205             j_coord_offsetC  = DIM*jnrC;
206             j_coord_offsetD  = DIM*jnrD;
207             j_coord_offsetE  = DIM*jnrE;
208             j_coord_offsetF  = DIM*jnrF;
209             j_coord_offsetG  = DIM*jnrG;
210             j_coord_offsetH  = DIM*jnrH;
211
212             /* load j atom coordinates */
213             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
214                                                  x+j_coord_offsetC,x+j_coord_offsetD,
215                                                  x+j_coord_offsetE,x+j_coord_offsetF,
216                                                  x+j_coord_offsetG,x+j_coord_offsetH,
217                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
218
219             /* Calculate displacement vector */
220             dx00             = _mm256_sub_ps(ix0,jx0);
221             dy00             = _mm256_sub_ps(iy0,jy0);
222             dz00             = _mm256_sub_ps(iz0,jz0);
223             dx01             = _mm256_sub_ps(ix0,jx1);
224             dy01             = _mm256_sub_ps(iy0,jy1);
225             dz01             = _mm256_sub_ps(iz0,jz1);
226             dx02             = _mm256_sub_ps(ix0,jx2);
227             dy02             = _mm256_sub_ps(iy0,jy2);
228             dz02             = _mm256_sub_ps(iz0,jz2);
229             dx10             = _mm256_sub_ps(ix1,jx0);
230             dy10             = _mm256_sub_ps(iy1,jy0);
231             dz10             = _mm256_sub_ps(iz1,jz0);
232             dx11             = _mm256_sub_ps(ix1,jx1);
233             dy11             = _mm256_sub_ps(iy1,jy1);
234             dz11             = _mm256_sub_ps(iz1,jz1);
235             dx12             = _mm256_sub_ps(ix1,jx2);
236             dy12             = _mm256_sub_ps(iy1,jy2);
237             dz12             = _mm256_sub_ps(iz1,jz2);
238             dx20             = _mm256_sub_ps(ix2,jx0);
239             dy20             = _mm256_sub_ps(iy2,jy0);
240             dz20             = _mm256_sub_ps(iz2,jz0);
241             dx21             = _mm256_sub_ps(ix2,jx1);
242             dy21             = _mm256_sub_ps(iy2,jy1);
243             dz21             = _mm256_sub_ps(iz2,jz1);
244             dx22             = _mm256_sub_ps(ix2,jx2);
245             dy22             = _mm256_sub_ps(iy2,jy2);
246             dz22             = _mm256_sub_ps(iz2,jz2);
247
248             /* Calculate squared distance and things based on it */
249             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
250             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
251             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
252             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
253             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
254             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
255             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
256             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
257             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
258
259             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
260             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
261             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
262             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
263             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
264             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
265             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
266             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
267             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
268
269             fjx0             = _mm256_setzero_ps();
270             fjy0             = _mm256_setzero_ps();
271             fjz0             = _mm256_setzero_ps();
272             fjx1             = _mm256_setzero_ps();
273             fjy1             = _mm256_setzero_ps();
274             fjz1             = _mm256_setzero_ps();
275             fjx2             = _mm256_setzero_ps();
276             fjy2             = _mm256_setzero_ps();
277             fjz2             = _mm256_setzero_ps();
278
279             /**************************
280              * CALCULATE INTERACTIONS *
281              **************************/
282
283             r00              = _mm256_mul_ps(rsq00,rinv00);
284
285             /* Calculate table index by multiplying r with table scale and truncate to integer */
286             rt               = _mm256_mul_ps(r00,vftabscale);
287             vfitab           = _mm256_cvttps_epi32(rt);
288             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
289             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
290             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
291             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
292             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
293             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
294
295             /* CUBIC SPLINE TABLE ELECTROSTATICS */
296             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
297                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
298             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
299                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
300             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
301                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
302             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
303                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
304             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
305             Heps             = _mm256_mul_ps(vfeps,H);
306             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
307             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
308             velec            = _mm256_mul_ps(qq00,VV);
309             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
310             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
311
312             /* Update potential sum for this i atom from the interaction with this j atom. */
313             velecsum         = _mm256_add_ps(velecsum,velec);
314
315             fscal            = felec;
316
317             /* Calculate temporary vectorial force */
318             tx               = _mm256_mul_ps(fscal,dx00);
319             ty               = _mm256_mul_ps(fscal,dy00);
320             tz               = _mm256_mul_ps(fscal,dz00);
321
322             /* Update vectorial force */
323             fix0             = _mm256_add_ps(fix0,tx);
324             fiy0             = _mm256_add_ps(fiy0,ty);
325             fiz0             = _mm256_add_ps(fiz0,tz);
326
327             fjx0             = _mm256_add_ps(fjx0,tx);
328             fjy0             = _mm256_add_ps(fjy0,ty);
329             fjz0             = _mm256_add_ps(fjz0,tz);
330
331             /**************************
332              * CALCULATE INTERACTIONS *
333              **************************/
334
335             r01              = _mm256_mul_ps(rsq01,rinv01);
336
337             /* Calculate table index by multiplying r with table scale and truncate to integer */
338             rt               = _mm256_mul_ps(r01,vftabscale);
339             vfitab           = _mm256_cvttps_epi32(rt);
340             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
341             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
342             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
343             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
344             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
345             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
346
347             /* CUBIC SPLINE TABLE ELECTROSTATICS */
348             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
349                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
350             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
351                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
352             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
353                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
354             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
355                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
356             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
357             Heps             = _mm256_mul_ps(vfeps,H);
358             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
359             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
360             velec            = _mm256_mul_ps(qq01,VV);
361             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
362             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
363
364             /* Update potential sum for this i atom from the interaction with this j atom. */
365             velecsum         = _mm256_add_ps(velecsum,velec);
366
367             fscal            = felec;
368
369             /* Calculate temporary vectorial force */
370             tx               = _mm256_mul_ps(fscal,dx01);
371             ty               = _mm256_mul_ps(fscal,dy01);
372             tz               = _mm256_mul_ps(fscal,dz01);
373
374             /* Update vectorial force */
375             fix0             = _mm256_add_ps(fix0,tx);
376             fiy0             = _mm256_add_ps(fiy0,ty);
377             fiz0             = _mm256_add_ps(fiz0,tz);
378
379             fjx1             = _mm256_add_ps(fjx1,tx);
380             fjy1             = _mm256_add_ps(fjy1,ty);
381             fjz1             = _mm256_add_ps(fjz1,tz);
382
383             /**************************
384              * CALCULATE INTERACTIONS *
385              **************************/
386
387             r02              = _mm256_mul_ps(rsq02,rinv02);
388
389             /* Calculate table index by multiplying r with table scale and truncate to integer */
390             rt               = _mm256_mul_ps(r02,vftabscale);
391             vfitab           = _mm256_cvttps_epi32(rt);
392             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
393             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
394             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
395             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
396             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
397             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
398
399             /* CUBIC SPLINE TABLE ELECTROSTATICS */
400             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
401                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
402             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
403                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
404             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
405                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
406             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
407                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
408             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
409             Heps             = _mm256_mul_ps(vfeps,H);
410             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
411             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
412             velec            = _mm256_mul_ps(qq02,VV);
413             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
414             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
415
416             /* Update potential sum for this i atom from the interaction with this j atom. */
417             velecsum         = _mm256_add_ps(velecsum,velec);
418
419             fscal            = felec;
420
421             /* Calculate temporary vectorial force */
422             tx               = _mm256_mul_ps(fscal,dx02);
423             ty               = _mm256_mul_ps(fscal,dy02);
424             tz               = _mm256_mul_ps(fscal,dz02);
425
426             /* Update vectorial force */
427             fix0             = _mm256_add_ps(fix0,tx);
428             fiy0             = _mm256_add_ps(fiy0,ty);
429             fiz0             = _mm256_add_ps(fiz0,tz);
430
431             fjx2             = _mm256_add_ps(fjx2,tx);
432             fjy2             = _mm256_add_ps(fjy2,ty);
433             fjz2             = _mm256_add_ps(fjz2,tz);
434
435             /**************************
436              * CALCULATE INTERACTIONS *
437              **************************/
438
439             r10              = _mm256_mul_ps(rsq10,rinv10);
440
441             /* Calculate table index by multiplying r with table scale and truncate to integer */
442             rt               = _mm256_mul_ps(r10,vftabscale);
443             vfitab           = _mm256_cvttps_epi32(rt);
444             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
445             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
446             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
447             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
448             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
449             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
450
451             /* CUBIC SPLINE TABLE ELECTROSTATICS */
452             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
453                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
454             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
455                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
456             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
457                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
458             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
459                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
460             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
461             Heps             = _mm256_mul_ps(vfeps,H);
462             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
463             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
464             velec            = _mm256_mul_ps(qq10,VV);
465             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
466             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
467
468             /* Update potential sum for this i atom from the interaction with this j atom. */
469             velecsum         = _mm256_add_ps(velecsum,velec);
470
471             fscal            = felec;
472
473             /* Calculate temporary vectorial force */
474             tx               = _mm256_mul_ps(fscal,dx10);
475             ty               = _mm256_mul_ps(fscal,dy10);
476             tz               = _mm256_mul_ps(fscal,dz10);
477
478             /* Update vectorial force */
479             fix1             = _mm256_add_ps(fix1,tx);
480             fiy1             = _mm256_add_ps(fiy1,ty);
481             fiz1             = _mm256_add_ps(fiz1,tz);
482
483             fjx0             = _mm256_add_ps(fjx0,tx);
484             fjy0             = _mm256_add_ps(fjy0,ty);
485             fjz0             = _mm256_add_ps(fjz0,tz);
486
487             /**************************
488              * CALCULATE INTERACTIONS *
489              **************************/
490
491             r11              = _mm256_mul_ps(rsq11,rinv11);
492
493             /* Calculate table index by multiplying r with table scale and truncate to integer */
494             rt               = _mm256_mul_ps(r11,vftabscale);
495             vfitab           = _mm256_cvttps_epi32(rt);
496             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
497             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
498             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
499             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
500             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
501             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
502
503             /* CUBIC SPLINE TABLE ELECTROSTATICS */
504             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
505                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
506             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
507                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
508             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
509                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
510             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
511                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
512             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
513             Heps             = _mm256_mul_ps(vfeps,H);
514             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
515             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
516             velec            = _mm256_mul_ps(qq11,VV);
517             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
518             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
519
520             /* Update potential sum for this i atom from the interaction with this j atom. */
521             velecsum         = _mm256_add_ps(velecsum,velec);
522
523             fscal            = felec;
524
525             /* Calculate temporary vectorial force */
526             tx               = _mm256_mul_ps(fscal,dx11);
527             ty               = _mm256_mul_ps(fscal,dy11);
528             tz               = _mm256_mul_ps(fscal,dz11);
529
530             /* Update vectorial force */
531             fix1             = _mm256_add_ps(fix1,tx);
532             fiy1             = _mm256_add_ps(fiy1,ty);
533             fiz1             = _mm256_add_ps(fiz1,tz);
534
535             fjx1             = _mm256_add_ps(fjx1,tx);
536             fjy1             = _mm256_add_ps(fjy1,ty);
537             fjz1             = _mm256_add_ps(fjz1,tz);
538
539             /**************************
540              * CALCULATE INTERACTIONS *
541              **************************/
542
543             r12              = _mm256_mul_ps(rsq12,rinv12);
544
545             /* Calculate table index by multiplying r with table scale and truncate to integer */
546             rt               = _mm256_mul_ps(r12,vftabscale);
547             vfitab           = _mm256_cvttps_epi32(rt);
548             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
549             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
550             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
551             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
552             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
553             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
554
555             /* CUBIC SPLINE TABLE ELECTROSTATICS */
556             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
557                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
558             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
559                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
560             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
561                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
562             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
563                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
564             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
565             Heps             = _mm256_mul_ps(vfeps,H);
566             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
567             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
568             velec            = _mm256_mul_ps(qq12,VV);
569             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
570             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
571
572             /* Update potential sum for this i atom from the interaction with this j atom. */
573             velecsum         = _mm256_add_ps(velecsum,velec);
574
575             fscal            = felec;
576
577             /* Calculate temporary vectorial force */
578             tx               = _mm256_mul_ps(fscal,dx12);
579             ty               = _mm256_mul_ps(fscal,dy12);
580             tz               = _mm256_mul_ps(fscal,dz12);
581
582             /* Update vectorial force */
583             fix1             = _mm256_add_ps(fix1,tx);
584             fiy1             = _mm256_add_ps(fiy1,ty);
585             fiz1             = _mm256_add_ps(fiz1,tz);
586
587             fjx2             = _mm256_add_ps(fjx2,tx);
588             fjy2             = _mm256_add_ps(fjy2,ty);
589             fjz2             = _mm256_add_ps(fjz2,tz);
590
591             /**************************
592              * CALCULATE INTERACTIONS *
593              **************************/
594
595             r20              = _mm256_mul_ps(rsq20,rinv20);
596
597             /* Calculate table index by multiplying r with table scale and truncate to integer */
598             rt               = _mm256_mul_ps(r20,vftabscale);
599             vfitab           = _mm256_cvttps_epi32(rt);
600             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
601             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
602             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
603             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
604             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
605             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
606
607             /* CUBIC SPLINE TABLE ELECTROSTATICS */
608             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
609                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
610             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
611                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
612             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
613                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
614             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
615                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
616             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
617             Heps             = _mm256_mul_ps(vfeps,H);
618             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
619             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
620             velec            = _mm256_mul_ps(qq20,VV);
621             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
622             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
623
624             /* Update potential sum for this i atom from the interaction with this j atom. */
625             velecsum         = _mm256_add_ps(velecsum,velec);
626
627             fscal            = felec;
628
629             /* Calculate temporary vectorial force */
630             tx               = _mm256_mul_ps(fscal,dx20);
631             ty               = _mm256_mul_ps(fscal,dy20);
632             tz               = _mm256_mul_ps(fscal,dz20);
633
634             /* Update vectorial force */
635             fix2             = _mm256_add_ps(fix2,tx);
636             fiy2             = _mm256_add_ps(fiy2,ty);
637             fiz2             = _mm256_add_ps(fiz2,tz);
638
639             fjx0             = _mm256_add_ps(fjx0,tx);
640             fjy0             = _mm256_add_ps(fjy0,ty);
641             fjz0             = _mm256_add_ps(fjz0,tz);
642
643             /**************************
644              * CALCULATE INTERACTIONS *
645              **************************/
646
647             r21              = _mm256_mul_ps(rsq21,rinv21);
648
649             /* Calculate table index by multiplying r with table scale and truncate to integer */
650             rt               = _mm256_mul_ps(r21,vftabscale);
651             vfitab           = _mm256_cvttps_epi32(rt);
652             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
653             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
654             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
655             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
656             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
657             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
658
659             /* CUBIC SPLINE TABLE ELECTROSTATICS */
660             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
661                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
662             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
663                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
664             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
665                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
666             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
667                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
668             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
669             Heps             = _mm256_mul_ps(vfeps,H);
670             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
671             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
672             velec            = _mm256_mul_ps(qq21,VV);
673             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
674             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
675
676             /* Update potential sum for this i atom from the interaction with this j atom. */
677             velecsum         = _mm256_add_ps(velecsum,velec);
678
679             fscal            = felec;
680
681             /* Calculate temporary vectorial force */
682             tx               = _mm256_mul_ps(fscal,dx21);
683             ty               = _mm256_mul_ps(fscal,dy21);
684             tz               = _mm256_mul_ps(fscal,dz21);
685
686             /* Update vectorial force */
687             fix2             = _mm256_add_ps(fix2,tx);
688             fiy2             = _mm256_add_ps(fiy2,ty);
689             fiz2             = _mm256_add_ps(fiz2,tz);
690
691             fjx1             = _mm256_add_ps(fjx1,tx);
692             fjy1             = _mm256_add_ps(fjy1,ty);
693             fjz1             = _mm256_add_ps(fjz1,tz);
694
695             /**************************
696              * CALCULATE INTERACTIONS *
697              **************************/
698
699             r22              = _mm256_mul_ps(rsq22,rinv22);
700
701             /* Calculate table index by multiplying r with table scale and truncate to integer */
702             rt               = _mm256_mul_ps(r22,vftabscale);
703             vfitab           = _mm256_cvttps_epi32(rt);
704             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
705             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
706             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
707             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
708             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
709             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
710
711             /* CUBIC SPLINE TABLE ELECTROSTATICS */
712             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
713                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
714             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
715                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
716             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
717                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
718             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
719                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
720             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
721             Heps             = _mm256_mul_ps(vfeps,H);
722             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
723             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
724             velec            = _mm256_mul_ps(qq22,VV);
725             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
726             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
727
728             /* Update potential sum for this i atom from the interaction with this j atom. */
729             velecsum         = _mm256_add_ps(velecsum,velec);
730
731             fscal            = felec;
732
733             /* Calculate temporary vectorial force */
734             tx               = _mm256_mul_ps(fscal,dx22);
735             ty               = _mm256_mul_ps(fscal,dy22);
736             tz               = _mm256_mul_ps(fscal,dz22);
737
738             /* Update vectorial force */
739             fix2             = _mm256_add_ps(fix2,tx);
740             fiy2             = _mm256_add_ps(fiy2,ty);
741             fiz2             = _mm256_add_ps(fiz2,tz);
742
743             fjx2             = _mm256_add_ps(fjx2,tx);
744             fjy2             = _mm256_add_ps(fjy2,ty);
745             fjz2             = _mm256_add_ps(fjz2,tz);
746
747             fjptrA             = f+j_coord_offsetA;
748             fjptrB             = f+j_coord_offsetB;
749             fjptrC             = f+j_coord_offsetC;
750             fjptrD             = f+j_coord_offsetD;
751             fjptrE             = f+j_coord_offsetE;
752             fjptrF             = f+j_coord_offsetF;
753             fjptrG             = f+j_coord_offsetG;
754             fjptrH             = f+j_coord_offsetH;
755
756             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
757                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
758
759             /* Inner loop uses 387 flops */
760         }
761
762         if(jidx<j_index_end)
763         {
764
765             /* Get j neighbor index, and coordinate index */
766             jnrlistA         = jjnr[jidx];
767             jnrlistB         = jjnr[jidx+1];
768             jnrlistC         = jjnr[jidx+2];
769             jnrlistD         = jjnr[jidx+3];
770             jnrlistE         = jjnr[jidx+4];
771             jnrlistF         = jjnr[jidx+5];
772             jnrlistG         = jjnr[jidx+6];
773             jnrlistH         = jjnr[jidx+7];
774             /* Sign of each element will be negative for non-real atoms.
775              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
776              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
777              */
778             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
779                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
780                                             
781             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
782             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
783             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
784             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
785             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
786             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
787             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
788             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
789             j_coord_offsetA  = DIM*jnrA;
790             j_coord_offsetB  = DIM*jnrB;
791             j_coord_offsetC  = DIM*jnrC;
792             j_coord_offsetD  = DIM*jnrD;
793             j_coord_offsetE  = DIM*jnrE;
794             j_coord_offsetF  = DIM*jnrF;
795             j_coord_offsetG  = DIM*jnrG;
796             j_coord_offsetH  = DIM*jnrH;
797
798             /* load j atom coordinates */
799             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
800                                                  x+j_coord_offsetC,x+j_coord_offsetD,
801                                                  x+j_coord_offsetE,x+j_coord_offsetF,
802                                                  x+j_coord_offsetG,x+j_coord_offsetH,
803                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
804
805             /* Calculate displacement vector */
806             dx00             = _mm256_sub_ps(ix0,jx0);
807             dy00             = _mm256_sub_ps(iy0,jy0);
808             dz00             = _mm256_sub_ps(iz0,jz0);
809             dx01             = _mm256_sub_ps(ix0,jx1);
810             dy01             = _mm256_sub_ps(iy0,jy1);
811             dz01             = _mm256_sub_ps(iz0,jz1);
812             dx02             = _mm256_sub_ps(ix0,jx2);
813             dy02             = _mm256_sub_ps(iy0,jy2);
814             dz02             = _mm256_sub_ps(iz0,jz2);
815             dx10             = _mm256_sub_ps(ix1,jx0);
816             dy10             = _mm256_sub_ps(iy1,jy0);
817             dz10             = _mm256_sub_ps(iz1,jz0);
818             dx11             = _mm256_sub_ps(ix1,jx1);
819             dy11             = _mm256_sub_ps(iy1,jy1);
820             dz11             = _mm256_sub_ps(iz1,jz1);
821             dx12             = _mm256_sub_ps(ix1,jx2);
822             dy12             = _mm256_sub_ps(iy1,jy2);
823             dz12             = _mm256_sub_ps(iz1,jz2);
824             dx20             = _mm256_sub_ps(ix2,jx0);
825             dy20             = _mm256_sub_ps(iy2,jy0);
826             dz20             = _mm256_sub_ps(iz2,jz0);
827             dx21             = _mm256_sub_ps(ix2,jx1);
828             dy21             = _mm256_sub_ps(iy2,jy1);
829             dz21             = _mm256_sub_ps(iz2,jz1);
830             dx22             = _mm256_sub_ps(ix2,jx2);
831             dy22             = _mm256_sub_ps(iy2,jy2);
832             dz22             = _mm256_sub_ps(iz2,jz2);
833
834             /* Calculate squared distance and things based on it */
835             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
836             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
837             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
838             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
839             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
840             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
841             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
842             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
843             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
844
845             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
846             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
847             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
848             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
849             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
850             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
851             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
852             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
853             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
854
855             fjx0             = _mm256_setzero_ps();
856             fjy0             = _mm256_setzero_ps();
857             fjz0             = _mm256_setzero_ps();
858             fjx1             = _mm256_setzero_ps();
859             fjy1             = _mm256_setzero_ps();
860             fjz1             = _mm256_setzero_ps();
861             fjx2             = _mm256_setzero_ps();
862             fjy2             = _mm256_setzero_ps();
863             fjz2             = _mm256_setzero_ps();
864
865             /**************************
866              * CALCULATE INTERACTIONS *
867              **************************/
868
869             r00              = _mm256_mul_ps(rsq00,rinv00);
870             r00              = _mm256_andnot_ps(dummy_mask,r00);
871
872             /* Calculate table index by multiplying r with table scale and truncate to integer */
873             rt               = _mm256_mul_ps(r00,vftabscale);
874             vfitab           = _mm256_cvttps_epi32(rt);
875             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
876             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
877             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
878             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
879             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
880             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
881
882             /* CUBIC SPLINE TABLE ELECTROSTATICS */
883             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
884                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
885             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
886                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
887             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
888                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
889             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
890                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
891             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
892             Heps             = _mm256_mul_ps(vfeps,H);
893             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
894             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
895             velec            = _mm256_mul_ps(qq00,VV);
896             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
897             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
898
899             /* Update potential sum for this i atom from the interaction with this j atom. */
900             velec            = _mm256_andnot_ps(dummy_mask,velec);
901             velecsum         = _mm256_add_ps(velecsum,velec);
902
903             fscal            = felec;
904
905             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
906
907             /* Calculate temporary vectorial force */
908             tx               = _mm256_mul_ps(fscal,dx00);
909             ty               = _mm256_mul_ps(fscal,dy00);
910             tz               = _mm256_mul_ps(fscal,dz00);
911
912             /* Update vectorial force */
913             fix0             = _mm256_add_ps(fix0,tx);
914             fiy0             = _mm256_add_ps(fiy0,ty);
915             fiz0             = _mm256_add_ps(fiz0,tz);
916
917             fjx0             = _mm256_add_ps(fjx0,tx);
918             fjy0             = _mm256_add_ps(fjy0,ty);
919             fjz0             = _mm256_add_ps(fjz0,tz);
920
921             /**************************
922              * CALCULATE INTERACTIONS *
923              **************************/
924
925             r01              = _mm256_mul_ps(rsq01,rinv01);
926             r01              = _mm256_andnot_ps(dummy_mask,r01);
927
928             /* Calculate table index by multiplying r with table scale and truncate to integer */
929             rt               = _mm256_mul_ps(r01,vftabscale);
930             vfitab           = _mm256_cvttps_epi32(rt);
931             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
932             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
933             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
934             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
935             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
936             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
937
938             /* CUBIC SPLINE TABLE ELECTROSTATICS */
939             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
940                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
941             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
942                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
943             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
944                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
945             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
946                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
947             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
948             Heps             = _mm256_mul_ps(vfeps,H);
949             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
950             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
951             velec            = _mm256_mul_ps(qq01,VV);
952             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
953             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
954
955             /* Update potential sum for this i atom from the interaction with this j atom. */
956             velec            = _mm256_andnot_ps(dummy_mask,velec);
957             velecsum         = _mm256_add_ps(velecsum,velec);
958
959             fscal            = felec;
960
961             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
962
963             /* Calculate temporary vectorial force */
964             tx               = _mm256_mul_ps(fscal,dx01);
965             ty               = _mm256_mul_ps(fscal,dy01);
966             tz               = _mm256_mul_ps(fscal,dz01);
967
968             /* Update vectorial force */
969             fix0             = _mm256_add_ps(fix0,tx);
970             fiy0             = _mm256_add_ps(fiy0,ty);
971             fiz0             = _mm256_add_ps(fiz0,tz);
972
973             fjx1             = _mm256_add_ps(fjx1,tx);
974             fjy1             = _mm256_add_ps(fjy1,ty);
975             fjz1             = _mm256_add_ps(fjz1,tz);
976
977             /**************************
978              * CALCULATE INTERACTIONS *
979              **************************/
980
981             r02              = _mm256_mul_ps(rsq02,rinv02);
982             r02              = _mm256_andnot_ps(dummy_mask,r02);
983
984             /* Calculate table index by multiplying r with table scale and truncate to integer */
985             rt               = _mm256_mul_ps(r02,vftabscale);
986             vfitab           = _mm256_cvttps_epi32(rt);
987             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
988             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
989             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
990             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
991             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
992             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
993
994             /* CUBIC SPLINE TABLE ELECTROSTATICS */
995             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
996                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
997             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
998                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
999             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1000                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1001             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1002                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1003             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1004             Heps             = _mm256_mul_ps(vfeps,H);
1005             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1006             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1007             velec            = _mm256_mul_ps(qq02,VV);
1008             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1009             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1010
1011             /* Update potential sum for this i atom from the interaction with this j atom. */
1012             velec            = _mm256_andnot_ps(dummy_mask,velec);
1013             velecsum         = _mm256_add_ps(velecsum,velec);
1014
1015             fscal            = felec;
1016
1017             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1018
1019             /* Calculate temporary vectorial force */
1020             tx               = _mm256_mul_ps(fscal,dx02);
1021             ty               = _mm256_mul_ps(fscal,dy02);
1022             tz               = _mm256_mul_ps(fscal,dz02);
1023
1024             /* Update vectorial force */
1025             fix0             = _mm256_add_ps(fix0,tx);
1026             fiy0             = _mm256_add_ps(fiy0,ty);
1027             fiz0             = _mm256_add_ps(fiz0,tz);
1028
1029             fjx2             = _mm256_add_ps(fjx2,tx);
1030             fjy2             = _mm256_add_ps(fjy2,ty);
1031             fjz2             = _mm256_add_ps(fjz2,tz);
1032
1033             /**************************
1034              * CALCULATE INTERACTIONS *
1035              **************************/
1036
1037             r10              = _mm256_mul_ps(rsq10,rinv10);
1038             r10              = _mm256_andnot_ps(dummy_mask,r10);
1039
1040             /* Calculate table index by multiplying r with table scale and truncate to integer */
1041             rt               = _mm256_mul_ps(r10,vftabscale);
1042             vfitab           = _mm256_cvttps_epi32(rt);
1043             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1044             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1045             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1046             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1047             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1048             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1049
1050             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1051             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1052                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1053             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1054                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1055             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1056                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1057             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1058                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1059             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1060             Heps             = _mm256_mul_ps(vfeps,H);
1061             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1062             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1063             velec            = _mm256_mul_ps(qq10,VV);
1064             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1065             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1066
1067             /* Update potential sum for this i atom from the interaction with this j atom. */
1068             velec            = _mm256_andnot_ps(dummy_mask,velec);
1069             velecsum         = _mm256_add_ps(velecsum,velec);
1070
1071             fscal            = felec;
1072
1073             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1074
1075             /* Calculate temporary vectorial force */
1076             tx               = _mm256_mul_ps(fscal,dx10);
1077             ty               = _mm256_mul_ps(fscal,dy10);
1078             tz               = _mm256_mul_ps(fscal,dz10);
1079
1080             /* Update vectorial force */
1081             fix1             = _mm256_add_ps(fix1,tx);
1082             fiy1             = _mm256_add_ps(fiy1,ty);
1083             fiz1             = _mm256_add_ps(fiz1,tz);
1084
1085             fjx0             = _mm256_add_ps(fjx0,tx);
1086             fjy0             = _mm256_add_ps(fjy0,ty);
1087             fjz0             = _mm256_add_ps(fjz0,tz);
1088
1089             /**************************
1090              * CALCULATE INTERACTIONS *
1091              **************************/
1092
1093             r11              = _mm256_mul_ps(rsq11,rinv11);
1094             r11              = _mm256_andnot_ps(dummy_mask,r11);
1095
1096             /* Calculate table index by multiplying r with table scale and truncate to integer */
1097             rt               = _mm256_mul_ps(r11,vftabscale);
1098             vfitab           = _mm256_cvttps_epi32(rt);
1099             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1100             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1101             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1102             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1103             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1104             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1105
1106             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1107             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1108                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1109             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1110                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1111             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1112                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1113             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1114                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1115             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1116             Heps             = _mm256_mul_ps(vfeps,H);
1117             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1118             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1119             velec            = _mm256_mul_ps(qq11,VV);
1120             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1121             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1122
1123             /* Update potential sum for this i atom from the interaction with this j atom. */
1124             velec            = _mm256_andnot_ps(dummy_mask,velec);
1125             velecsum         = _mm256_add_ps(velecsum,velec);
1126
1127             fscal            = felec;
1128
1129             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1130
1131             /* Calculate temporary vectorial force */
1132             tx               = _mm256_mul_ps(fscal,dx11);
1133             ty               = _mm256_mul_ps(fscal,dy11);
1134             tz               = _mm256_mul_ps(fscal,dz11);
1135
1136             /* Update vectorial force */
1137             fix1             = _mm256_add_ps(fix1,tx);
1138             fiy1             = _mm256_add_ps(fiy1,ty);
1139             fiz1             = _mm256_add_ps(fiz1,tz);
1140
1141             fjx1             = _mm256_add_ps(fjx1,tx);
1142             fjy1             = _mm256_add_ps(fjy1,ty);
1143             fjz1             = _mm256_add_ps(fjz1,tz);
1144
1145             /**************************
1146              * CALCULATE INTERACTIONS *
1147              **************************/
1148
1149             r12              = _mm256_mul_ps(rsq12,rinv12);
1150             r12              = _mm256_andnot_ps(dummy_mask,r12);
1151
1152             /* Calculate table index by multiplying r with table scale and truncate to integer */
1153             rt               = _mm256_mul_ps(r12,vftabscale);
1154             vfitab           = _mm256_cvttps_epi32(rt);
1155             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1156             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1157             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1158             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1159             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1160             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1161
1162             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1163             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1164                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1165             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1166                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1167             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1168                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1169             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1170                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1171             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1172             Heps             = _mm256_mul_ps(vfeps,H);
1173             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1174             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1175             velec            = _mm256_mul_ps(qq12,VV);
1176             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1177             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1178
1179             /* Update potential sum for this i atom from the interaction with this j atom. */
1180             velec            = _mm256_andnot_ps(dummy_mask,velec);
1181             velecsum         = _mm256_add_ps(velecsum,velec);
1182
1183             fscal            = felec;
1184
1185             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1186
1187             /* Calculate temporary vectorial force */
1188             tx               = _mm256_mul_ps(fscal,dx12);
1189             ty               = _mm256_mul_ps(fscal,dy12);
1190             tz               = _mm256_mul_ps(fscal,dz12);
1191
1192             /* Update vectorial force */
1193             fix1             = _mm256_add_ps(fix1,tx);
1194             fiy1             = _mm256_add_ps(fiy1,ty);
1195             fiz1             = _mm256_add_ps(fiz1,tz);
1196
1197             fjx2             = _mm256_add_ps(fjx2,tx);
1198             fjy2             = _mm256_add_ps(fjy2,ty);
1199             fjz2             = _mm256_add_ps(fjz2,tz);
1200
1201             /**************************
1202              * CALCULATE INTERACTIONS *
1203              **************************/
1204
1205             r20              = _mm256_mul_ps(rsq20,rinv20);
1206             r20              = _mm256_andnot_ps(dummy_mask,r20);
1207
1208             /* Calculate table index by multiplying r with table scale and truncate to integer */
1209             rt               = _mm256_mul_ps(r20,vftabscale);
1210             vfitab           = _mm256_cvttps_epi32(rt);
1211             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1212             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1213             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1214             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1215             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1216             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1217
1218             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1219             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1220                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1221             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1222                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1223             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1224                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1225             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1226                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1227             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1228             Heps             = _mm256_mul_ps(vfeps,H);
1229             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1230             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1231             velec            = _mm256_mul_ps(qq20,VV);
1232             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1233             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1234
1235             /* Update potential sum for this i atom from the interaction with this j atom. */
1236             velec            = _mm256_andnot_ps(dummy_mask,velec);
1237             velecsum         = _mm256_add_ps(velecsum,velec);
1238
1239             fscal            = felec;
1240
1241             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1242
1243             /* Calculate temporary vectorial force */
1244             tx               = _mm256_mul_ps(fscal,dx20);
1245             ty               = _mm256_mul_ps(fscal,dy20);
1246             tz               = _mm256_mul_ps(fscal,dz20);
1247
1248             /* Update vectorial force */
1249             fix2             = _mm256_add_ps(fix2,tx);
1250             fiy2             = _mm256_add_ps(fiy2,ty);
1251             fiz2             = _mm256_add_ps(fiz2,tz);
1252
1253             fjx0             = _mm256_add_ps(fjx0,tx);
1254             fjy0             = _mm256_add_ps(fjy0,ty);
1255             fjz0             = _mm256_add_ps(fjz0,tz);
1256
1257             /**************************
1258              * CALCULATE INTERACTIONS *
1259              **************************/
1260
1261             r21              = _mm256_mul_ps(rsq21,rinv21);
1262             r21              = _mm256_andnot_ps(dummy_mask,r21);
1263
1264             /* Calculate table index by multiplying r with table scale and truncate to integer */
1265             rt               = _mm256_mul_ps(r21,vftabscale);
1266             vfitab           = _mm256_cvttps_epi32(rt);
1267             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1268             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1269             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1270             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1271             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1272             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1273
1274             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1275             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1276                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1277             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1278                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1279             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1280                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1281             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1282                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1283             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1284             Heps             = _mm256_mul_ps(vfeps,H);
1285             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1286             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1287             velec            = _mm256_mul_ps(qq21,VV);
1288             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1289             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1290
1291             /* Update potential sum for this i atom from the interaction with this j atom. */
1292             velec            = _mm256_andnot_ps(dummy_mask,velec);
1293             velecsum         = _mm256_add_ps(velecsum,velec);
1294
1295             fscal            = felec;
1296
1297             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1298
1299             /* Calculate temporary vectorial force */
1300             tx               = _mm256_mul_ps(fscal,dx21);
1301             ty               = _mm256_mul_ps(fscal,dy21);
1302             tz               = _mm256_mul_ps(fscal,dz21);
1303
1304             /* Update vectorial force */
1305             fix2             = _mm256_add_ps(fix2,tx);
1306             fiy2             = _mm256_add_ps(fiy2,ty);
1307             fiz2             = _mm256_add_ps(fiz2,tz);
1308
1309             fjx1             = _mm256_add_ps(fjx1,tx);
1310             fjy1             = _mm256_add_ps(fjy1,ty);
1311             fjz1             = _mm256_add_ps(fjz1,tz);
1312
1313             /**************************
1314              * CALCULATE INTERACTIONS *
1315              **************************/
1316
1317             r22              = _mm256_mul_ps(rsq22,rinv22);
1318             r22              = _mm256_andnot_ps(dummy_mask,r22);
1319
1320             /* Calculate table index by multiplying r with table scale and truncate to integer */
1321             rt               = _mm256_mul_ps(r22,vftabscale);
1322             vfitab           = _mm256_cvttps_epi32(rt);
1323             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1324             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1325             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1326             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1327             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1328             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1329
1330             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1331             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1332                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1333             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1334                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1335             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1336                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1337             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1338                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1339             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1340             Heps             = _mm256_mul_ps(vfeps,H);
1341             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1342             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1343             velec            = _mm256_mul_ps(qq22,VV);
1344             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1345             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1346
1347             /* Update potential sum for this i atom from the interaction with this j atom. */
1348             velec            = _mm256_andnot_ps(dummy_mask,velec);
1349             velecsum         = _mm256_add_ps(velecsum,velec);
1350
1351             fscal            = felec;
1352
1353             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1354
1355             /* Calculate temporary vectorial force */
1356             tx               = _mm256_mul_ps(fscal,dx22);
1357             ty               = _mm256_mul_ps(fscal,dy22);
1358             tz               = _mm256_mul_ps(fscal,dz22);
1359
1360             /* Update vectorial force */
1361             fix2             = _mm256_add_ps(fix2,tx);
1362             fiy2             = _mm256_add_ps(fiy2,ty);
1363             fiz2             = _mm256_add_ps(fiz2,tz);
1364
1365             fjx2             = _mm256_add_ps(fjx2,tx);
1366             fjy2             = _mm256_add_ps(fjy2,ty);
1367             fjz2             = _mm256_add_ps(fjz2,tz);
1368
1369             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1370             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1371             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1372             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1373             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1374             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1375             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1376             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1377
1378             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1379                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1380
1381             /* Inner loop uses 396 flops */
1382         }
1383
1384         /* End of innermost loop */
1385
1386         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1387                                                  f+i_coord_offset,fshift+i_shift_offset);
1388
1389         ggid                        = gid[iidx];
1390         /* Update potential energies */
1391         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1392
1393         /* Increment number of inner iterations */
1394         inneriter                  += j_index_end - j_index_start;
1395
1396         /* Outer loop uses 19 flops */
1397     }
1398
1399     /* Increment number of outer iterations */
1400     outeriter        += nri;
1401
1402     /* Update outer/inner flops */
1403
1404     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
1405 }
1406 /*
1407  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_single
1408  * Electrostatics interaction: CubicSplineTable
1409  * VdW interaction:            None
1410  * Geometry:                   Water3-Water3
1411  * Calculate force/pot:        Force
1412  */
1413 void
1414 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_single
1415                     (t_nblist * gmx_restrict                nlist,
1416                      rvec * gmx_restrict                    xx,
1417                      rvec * gmx_restrict                    ff,
1418                      t_forcerec * gmx_restrict              fr,
1419                      t_mdatoms * gmx_restrict               mdatoms,
1420                      nb_kernel_data_t * gmx_restrict        kernel_data,
1421                      t_nrnb * gmx_restrict                  nrnb)
1422 {
1423     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1424      * just 0 for non-waters.
1425      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1426      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1427      */
1428     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1429     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1430     int              jnrA,jnrB,jnrC,jnrD;
1431     int              jnrE,jnrF,jnrG,jnrH;
1432     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1433     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1434     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1435     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1436     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1437     real             rcutoff_scalar;
1438     real             *shiftvec,*fshift,*x,*f;
1439     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1440     real             scratch[4*DIM];
1441     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1442     real *           vdwioffsetptr0;
1443     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1444     real *           vdwioffsetptr1;
1445     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1446     real *           vdwioffsetptr2;
1447     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1448     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1449     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1450     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1451     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1452     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1453     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1454     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1455     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1456     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1457     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1458     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1459     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1460     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1461     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1462     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1463     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1464     real             *charge;
1465     __m256i          vfitab;
1466     __m128i          vfitab_lo,vfitab_hi;
1467     __m128i          ifour       = _mm_set1_epi32(4);
1468     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1469     real             *vftab;
1470     __m256           dummy_mask,cutoff_mask;
1471     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1472     __m256           one     = _mm256_set1_ps(1.0);
1473     __m256           two     = _mm256_set1_ps(2.0);
1474     x                = xx[0];
1475     f                = ff[0];
1476
1477     nri              = nlist->nri;
1478     iinr             = nlist->iinr;
1479     jindex           = nlist->jindex;
1480     jjnr             = nlist->jjnr;
1481     shiftidx         = nlist->shift;
1482     gid              = nlist->gid;
1483     shiftvec         = fr->shift_vec[0];
1484     fshift           = fr->fshift[0];
1485     facel            = _mm256_set1_ps(fr->epsfac);
1486     charge           = mdatoms->chargeA;
1487
1488     vftab            = kernel_data->table_elec->data;
1489     vftabscale       = _mm256_set1_ps(kernel_data->table_elec->scale);
1490
1491     /* Setup water-specific parameters */
1492     inr              = nlist->iinr[0];
1493     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1494     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1495     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1496
1497     jq0              = _mm256_set1_ps(charge[inr+0]);
1498     jq1              = _mm256_set1_ps(charge[inr+1]);
1499     jq2              = _mm256_set1_ps(charge[inr+2]);
1500     qq00             = _mm256_mul_ps(iq0,jq0);
1501     qq01             = _mm256_mul_ps(iq0,jq1);
1502     qq02             = _mm256_mul_ps(iq0,jq2);
1503     qq10             = _mm256_mul_ps(iq1,jq0);
1504     qq11             = _mm256_mul_ps(iq1,jq1);
1505     qq12             = _mm256_mul_ps(iq1,jq2);
1506     qq20             = _mm256_mul_ps(iq2,jq0);
1507     qq21             = _mm256_mul_ps(iq2,jq1);
1508     qq22             = _mm256_mul_ps(iq2,jq2);
1509
1510     /* Avoid stupid compiler warnings */
1511     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1512     j_coord_offsetA = 0;
1513     j_coord_offsetB = 0;
1514     j_coord_offsetC = 0;
1515     j_coord_offsetD = 0;
1516     j_coord_offsetE = 0;
1517     j_coord_offsetF = 0;
1518     j_coord_offsetG = 0;
1519     j_coord_offsetH = 0;
1520
1521     outeriter        = 0;
1522     inneriter        = 0;
1523
1524     for(iidx=0;iidx<4*DIM;iidx++)
1525     {
1526         scratch[iidx] = 0.0;
1527     }
1528
1529     /* Start outer loop over neighborlists */
1530     for(iidx=0; iidx<nri; iidx++)
1531     {
1532         /* Load shift vector for this list */
1533         i_shift_offset   = DIM*shiftidx[iidx];
1534
1535         /* Load limits for loop over neighbors */
1536         j_index_start    = jindex[iidx];
1537         j_index_end      = jindex[iidx+1];
1538
1539         /* Get outer coordinate index */
1540         inr              = iinr[iidx];
1541         i_coord_offset   = DIM*inr;
1542
1543         /* Load i particle coords and add shift vector */
1544         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1545                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1546
1547         fix0             = _mm256_setzero_ps();
1548         fiy0             = _mm256_setzero_ps();
1549         fiz0             = _mm256_setzero_ps();
1550         fix1             = _mm256_setzero_ps();
1551         fiy1             = _mm256_setzero_ps();
1552         fiz1             = _mm256_setzero_ps();
1553         fix2             = _mm256_setzero_ps();
1554         fiy2             = _mm256_setzero_ps();
1555         fiz2             = _mm256_setzero_ps();
1556
1557         /* Start inner kernel loop */
1558         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1559         {
1560
1561             /* Get j neighbor index, and coordinate index */
1562             jnrA             = jjnr[jidx];
1563             jnrB             = jjnr[jidx+1];
1564             jnrC             = jjnr[jidx+2];
1565             jnrD             = jjnr[jidx+3];
1566             jnrE             = jjnr[jidx+4];
1567             jnrF             = jjnr[jidx+5];
1568             jnrG             = jjnr[jidx+6];
1569             jnrH             = jjnr[jidx+7];
1570             j_coord_offsetA  = DIM*jnrA;
1571             j_coord_offsetB  = DIM*jnrB;
1572             j_coord_offsetC  = DIM*jnrC;
1573             j_coord_offsetD  = DIM*jnrD;
1574             j_coord_offsetE  = DIM*jnrE;
1575             j_coord_offsetF  = DIM*jnrF;
1576             j_coord_offsetG  = DIM*jnrG;
1577             j_coord_offsetH  = DIM*jnrH;
1578
1579             /* load j atom coordinates */
1580             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1581                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1582                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1583                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1584                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1585
1586             /* Calculate displacement vector */
1587             dx00             = _mm256_sub_ps(ix0,jx0);
1588             dy00             = _mm256_sub_ps(iy0,jy0);
1589             dz00             = _mm256_sub_ps(iz0,jz0);
1590             dx01             = _mm256_sub_ps(ix0,jx1);
1591             dy01             = _mm256_sub_ps(iy0,jy1);
1592             dz01             = _mm256_sub_ps(iz0,jz1);
1593             dx02             = _mm256_sub_ps(ix0,jx2);
1594             dy02             = _mm256_sub_ps(iy0,jy2);
1595             dz02             = _mm256_sub_ps(iz0,jz2);
1596             dx10             = _mm256_sub_ps(ix1,jx0);
1597             dy10             = _mm256_sub_ps(iy1,jy0);
1598             dz10             = _mm256_sub_ps(iz1,jz0);
1599             dx11             = _mm256_sub_ps(ix1,jx1);
1600             dy11             = _mm256_sub_ps(iy1,jy1);
1601             dz11             = _mm256_sub_ps(iz1,jz1);
1602             dx12             = _mm256_sub_ps(ix1,jx2);
1603             dy12             = _mm256_sub_ps(iy1,jy2);
1604             dz12             = _mm256_sub_ps(iz1,jz2);
1605             dx20             = _mm256_sub_ps(ix2,jx0);
1606             dy20             = _mm256_sub_ps(iy2,jy0);
1607             dz20             = _mm256_sub_ps(iz2,jz0);
1608             dx21             = _mm256_sub_ps(ix2,jx1);
1609             dy21             = _mm256_sub_ps(iy2,jy1);
1610             dz21             = _mm256_sub_ps(iz2,jz1);
1611             dx22             = _mm256_sub_ps(ix2,jx2);
1612             dy22             = _mm256_sub_ps(iy2,jy2);
1613             dz22             = _mm256_sub_ps(iz2,jz2);
1614
1615             /* Calculate squared distance and things based on it */
1616             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1617             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1618             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1619             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1620             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1621             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1622             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1623             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1624             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1625
1626             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1627             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1628             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1629             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1630             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1631             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1632             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1633             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1634             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1635
1636             fjx0             = _mm256_setzero_ps();
1637             fjy0             = _mm256_setzero_ps();
1638             fjz0             = _mm256_setzero_ps();
1639             fjx1             = _mm256_setzero_ps();
1640             fjy1             = _mm256_setzero_ps();
1641             fjz1             = _mm256_setzero_ps();
1642             fjx2             = _mm256_setzero_ps();
1643             fjy2             = _mm256_setzero_ps();
1644             fjz2             = _mm256_setzero_ps();
1645
1646             /**************************
1647              * CALCULATE INTERACTIONS *
1648              **************************/
1649
1650             r00              = _mm256_mul_ps(rsq00,rinv00);
1651
1652             /* Calculate table index by multiplying r with table scale and truncate to integer */
1653             rt               = _mm256_mul_ps(r00,vftabscale);
1654             vfitab           = _mm256_cvttps_epi32(rt);
1655             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1656             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1657             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1658             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1659             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1660             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1661
1662             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1663             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1664                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1665             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1666                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1667             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1668                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1669             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1670                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1671             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1672             Heps             = _mm256_mul_ps(vfeps,H);
1673             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1674             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1675             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1676
1677             fscal            = felec;
1678
1679             /* Calculate temporary vectorial force */
1680             tx               = _mm256_mul_ps(fscal,dx00);
1681             ty               = _mm256_mul_ps(fscal,dy00);
1682             tz               = _mm256_mul_ps(fscal,dz00);
1683
1684             /* Update vectorial force */
1685             fix0             = _mm256_add_ps(fix0,tx);
1686             fiy0             = _mm256_add_ps(fiy0,ty);
1687             fiz0             = _mm256_add_ps(fiz0,tz);
1688
1689             fjx0             = _mm256_add_ps(fjx0,tx);
1690             fjy0             = _mm256_add_ps(fjy0,ty);
1691             fjz0             = _mm256_add_ps(fjz0,tz);
1692
1693             /**************************
1694              * CALCULATE INTERACTIONS *
1695              **************************/
1696
1697             r01              = _mm256_mul_ps(rsq01,rinv01);
1698
1699             /* Calculate table index by multiplying r with table scale and truncate to integer */
1700             rt               = _mm256_mul_ps(r01,vftabscale);
1701             vfitab           = _mm256_cvttps_epi32(rt);
1702             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1703             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1704             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1705             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1706             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1707             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1708
1709             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1710             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1711                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1712             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1713                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1714             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1715                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1716             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1717                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1718             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1719             Heps             = _mm256_mul_ps(vfeps,H);
1720             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1721             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1722             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1723
1724             fscal            = felec;
1725
1726             /* Calculate temporary vectorial force */
1727             tx               = _mm256_mul_ps(fscal,dx01);
1728             ty               = _mm256_mul_ps(fscal,dy01);
1729             tz               = _mm256_mul_ps(fscal,dz01);
1730
1731             /* Update vectorial force */
1732             fix0             = _mm256_add_ps(fix0,tx);
1733             fiy0             = _mm256_add_ps(fiy0,ty);
1734             fiz0             = _mm256_add_ps(fiz0,tz);
1735
1736             fjx1             = _mm256_add_ps(fjx1,tx);
1737             fjy1             = _mm256_add_ps(fjy1,ty);
1738             fjz1             = _mm256_add_ps(fjz1,tz);
1739
1740             /**************************
1741              * CALCULATE INTERACTIONS *
1742              **************************/
1743
1744             r02              = _mm256_mul_ps(rsq02,rinv02);
1745
1746             /* Calculate table index by multiplying r with table scale and truncate to integer */
1747             rt               = _mm256_mul_ps(r02,vftabscale);
1748             vfitab           = _mm256_cvttps_epi32(rt);
1749             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1750             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1751             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1752             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1753             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1754             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1755
1756             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1757             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1758                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1759             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1760                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1761             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1762                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1763             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1764                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1765             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1766             Heps             = _mm256_mul_ps(vfeps,H);
1767             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1768             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1769             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1770
1771             fscal            = felec;
1772
1773             /* Calculate temporary vectorial force */
1774             tx               = _mm256_mul_ps(fscal,dx02);
1775             ty               = _mm256_mul_ps(fscal,dy02);
1776             tz               = _mm256_mul_ps(fscal,dz02);
1777
1778             /* Update vectorial force */
1779             fix0             = _mm256_add_ps(fix0,tx);
1780             fiy0             = _mm256_add_ps(fiy0,ty);
1781             fiz0             = _mm256_add_ps(fiz0,tz);
1782
1783             fjx2             = _mm256_add_ps(fjx2,tx);
1784             fjy2             = _mm256_add_ps(fjy2,ty);
1785             fjz2             = _mm256_add_ps(fjz2,tz);
1786
1787             /**************************
1788              * CALCULATE INTERACTIONS *
1789              **************************/
1790
1791             r10              = _mm256_mul_ps(rsq10,rinv10);
1792
1793             /* Calculate table index by multiplying r with table scale and truncate to integer */
1794             rt               = _mm256_mul_ps(r10,vftabscale);
1795             vfitab           = _mm256_cvttps_epi32(rt);
1796             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1797             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1798             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1799             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1800             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1801             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1802
1803             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1804             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1805                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1806             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1807                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1808             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1809                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1810             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1811                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1812             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1813             Heps             = _mm256_mul_ps(vfeps,H);
1814             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1815             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1816             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1817
1818             fscal            = felec;
1819
1820             /* Calculate temporary vectorial force */
1821             tx               = _mm256_mul_ps(fscal,dx10);
1822             ty               = _mm256_mul_ps(fscal,dy10);
1823             tz               = _mm256_mul_ps(fscal,dz10);
1824
1825             /* Update vectorial force */
1826             fix1             = _mm256_add_ps(fix1,tx);
1827             fiy1             = _mm256_add_ps(fiy1,ty);
1828             fiz1             = _mm256_add_ps(fiz1,tz);
1829
1830             fjx0             = _mm256_add_ps(fjx0,tx);
1831             fjy0             = _mm256_add_ps(fjy0,ty);
1832             fjz0             = _mm256_add_ps(fjz0,tz);
1833
1834             /**************************
1835              * CALCULATE INTERACTIONS *
1836              **************************/
1837
1838             r11              = _mm256_mul_ps(rsq11,rinv11);
1839
1840             /* Calculate table index by multiplying r with table scale and truncate to integer */
1841             rt               = _mm256_mul_ps(r11,vftabscale);
1842             vfitab           = _mm256_cvttps_epi32(rt);
1843             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1844             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1845             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1846             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1847             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1848             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1849
1850             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1851             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1852                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1853             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1854                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1855             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1856                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1857             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1858                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1859             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1860             Heps             = _mm256_mul_ps(vfeps,H);
1861             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1862             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1863             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1864
1865             fscal            = felec;
1866
1867             /* Calculate temporary vectorial force */
1868             tx               = _mm256_mul_ps(fscal,dx11);
1869             ty               = _mm256_mul_ps(fscal,dy11);
1870             tz               = _mm256_mul_ps(fscal,dz11);
1871
1872             /* Update vectorial force */
1873             fix1             = _mm256_add_ps(fix1,tx);
1874             fiy1             = _mm256_add_ps(fiy1,ty);
1875             fiz1             = _mm256_add_ps(fiz1,tz);
1876
1877             fjx1             = _mm256_add_ps(fjx1,tx);
1878             fjy1             = _mm256_add_ps(fjy1,ty);
1879             fjz1             = _mm256_add_ps(fjz1,tz);
1880
1881             /**************************
1882              * CALCULATE INTERACTIONS *
1883              **************************/
1884
1885             r12              = _mm256_mul_ps(rsq12,rinv12);
1886
1887             /* Calculate table index by multiplying r with table scale and truncate to integer */
1888             rt               = _mm256_mul_ps(r12,vftabscale);
1889             vfitab           = _mm256_cvttps_epi32(rt);
1890             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1891             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1892             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1893             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1894             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1895             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1896
1897             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1898             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1899                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1900             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1901                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1902             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1903                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1904             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1905                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1906             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1907             Heps             = _mm256_mul_ps(vfeps,H);
1908             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1909             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1910             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1911
1912             fscal            = felec;
1913
1914             /* Calculate temporary vectorial force */
1915             tx               = _mm256_mul_ps(fscal,dx12);
1916             ty               = _mm256_mul_ps(fscal,dy12);
1917             tz               = _mm256_mul_ps(fscal,dz12);
1918
1919             /* Update vectorial force */
1920             fix1             = _mm256_add_ps(fix1,tx);
1921             fiy1             = _mm256_add_ps(fiy1,ty);
1922             fiz1             = _mm256_add_ps(fiz1,tz);
1923
1924             fjx2             = _mm256_add_ps(fjx2,tx);
1925             fjy2             = _mm256_add_ps(fjy2,ty);
1926             fjz2             = _mm256_add_ps(fjz2,tz);
1927
1928             /**************************
1929              * CALCULATE INTERACTIONS *
1930              **************************/
1931
1932             r20              = _mm256_mul_ps(rsq20,rinv20);
1933
1934             /* Calculate table index by multiplying r with table scale and truncate to integer */
1935             rt               = _mm256_mul_ps(r20,vftabscale);
1936             vfitab           = _mm256_cvttps_epi32(rt);
1937             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1938             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1939             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1940             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1941             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1942             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1943
1944             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1945             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1946                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1947             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1948                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1949             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1950                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1951             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1952                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1953             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1954             Heps             = _mm256_mul_ps(vfeps,H);
1955             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1956             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1957             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1958
1959             fscal            = felec;
1960
1961             /* Calculate temporary vectorial force */
1962             tx               = _mm256_mul_ps(fscal,dx20);
1963             ty               = _mm256_mul_ps(fscal,dy20);
1964             tz               = _mm256_mul_ps(fscal,dz20);
1965
1966             /* Update vectorial force */
1967             fix2             = _mm256_add_ps(fix2,tx);
1968             fiy2             = _mm256_add_ps(fiy2,ty);
1969             fiz2             = _mm256_add_ps(fiz2,tz);
1970
1971             fjx0             = _mm256_add_ps(fjx0,tx);
1972             fjy0             = _mm256_add_ps(fjy0,ty);
1973             fjz0             = _mm256_add_ps(fjz0,tz);
1974
1975             /**************************
1976              * CALCULATE INTERACTIONS *
1977              **************************/
1978
1979             r21              = _mm256_mul_ps(rsq21,rinv21);
1980
1981             /* Calculate table index by multiplying r with table scale and truncate to integer */
1982             rt               = _mm256_mul_ps(r21,vftabscale);
1983             vfitab           = _mm256_cvttps_epi32(rt);
1984             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1985             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1986             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1987             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1988             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
1989             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
1990
1991             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1992             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1993                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1994             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1995                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1996             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1997                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1998             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1999                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2000             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2001             Heps             = _mm256_mul_ps(vfeps,H);
2002             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2003             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2004             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2005
2006             fscal            = felec;
2007
2008             /* Calculate temporary vectorial force */
2009             tx               = _mm256_mul_ps(fscal,dx21);
2010             ty               = _mm256_mul_ps(fscal,dy21);
2011             tz               = _mm256_mul_ps(fscal,dz21);
2012
2013             /* Update vectorial force */
2014             fix2             = _mm256_add_ps(fix2,tx);
2015             fiy2             = _mm256_add_ps(fiy2,ty);
2016             fiz2             = _mm256_add_ps(fiz2,tz);
2017
2018             fjx1             = _mm256_add_ps(fjx1,tx);
2019             fjy1             = _mm256_add_ps(fjy1,ty);
2020             fjz1             = _mm256_add_ps(fjz1,tz);
2021
2022             /**************************
2023              * CALCULATE INTERACTIONS *
2024              **************************/
2025
2026             r22              = _mm256_mul_ps(rsq22,rinv22);
2027
2028             /* Calculate table index by multiplying r with table scale and truncate to integer */
2029             rt               = _mm256_mul_ps(r22,vftabscale);
2030             vfitab           = _mm256_cvttps_epi32(rt);
2031             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2032             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2033             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2034             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2035             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2036             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2037
2038             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2039             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2040                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2041             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2042                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2043             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2044                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2045             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2046                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2047             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2048             Heps             = _mm256_mul_ps(vfeps,H);
2049             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2050             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2051             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2052
2053             fscal            = felec;
2054
2055             /* Calculate temporary vectorial force */
2056             tx               = _mm256_mul_ps(fscal,dx22);
2057             ty               = _mm256_mul_ps(fscal,dy22);
2058             tz               = _mm256_mul_ps(fscal,dz22);
2059
2060             /* Update vectorial force */
2061             fix2             = _mm256_add_ps(fix2,tx);
2062             fiy2             = _mm256_add_ps(fiy2,ty);
2063             fiz2             = _mm256_add_ps(fiz2,tz);
2064
2065             fjx2             = _mm256_add_ps(fjx2,tx);
2066             fjy2             = _mm256_add_ps(fjy2,ty);
2067             fjz2             = _mm256_add_ps(fjz2,tz);
2068
2069             fjptrA             = f+j_coord_offsetA;
2070             fjptrB             = f+j_coord_offsetB;
2071             fjptrC             = f+j_coord_offsetC;
2072             fjptrD             = f+j_coord_offsetD;
2073             fjptrE             = f+j_coord_offsetE;
2074             fjptrF             = f+j_coord_offsetF;
2075             fjptrG             = f+j_coord_offsetG;
2076             fjptrH             = f+j_coord_offsetH;
2077
2078             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2079                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2080
2081             /* Inner loop uses 351 flops */
2082         }
2083
2084         if(jidx<j_index_end)
2085         {
2086
2087             /* Get j neighbor index, and coordinate index */
2088             jnrlistA         = jjnr[jidx];
2089             jnrlistB         = jjnr[jidx+1];
2090             jnrlistC         = jjnr[jidx+2];
2091             jnrlistD         = jjnr[jidx+3];
2092             jnrlistE         = jjnr[jidx+4];
2093             jnrlistF         = jjnr[jidx+5];
2094             jnrlistG         = jjnr[jidx+6];
2095             jnrlistH         = jjnr[jidx+7];
2096             /* Sign of each element will be negative for non-real atoms.
2097              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2098              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2099              */
2100             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2101                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2102                                             
2103             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2104             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2105             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2106             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2107             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2108             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2109             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2110             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2111             j_coord_offsetA  = DIM*jnrA;
2112             j_coord_offsetB  = DIM*jnrB;
2113             j_coord_offsetC  = DIM*jnrC;
2114             j_coord_offsetD  = DIM*jnrD;
2115             j_coord_offsetE  = DIM*jnrE;
2116             j_coord_offsetF  = DIM*jnrF;
2117             j_coord_offsetG  = DIM*jnrG;
2118             j_coord_offsetH  = DIM*jnrH;
2119
2120             /* load j atom coordinates */
2121             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2122                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2123                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2124                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2125                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2126
2127             /* Calculate displacement vector */
2128             dx00             = _mm256_sub_ps(ix0,jx0);
2129             dy00             = _mm256_sub_ps(iy0,jy0);
2130             dz00             = _mm256_sub_ps(iz0,jz0);
2131             dx01             = _mm256_sub_ps(ix0,jx1);
2132             dy01             = _mm256_sub_ps(iy0,jy1);
2133             dz01             = _mm256_sub_ps(iz0,jz1);
2134             dx02             = _mm256_sub_ps(ix0,jx2);
2135             dy02             = _mm256_sub_ps(iy0,jy2);
2136             dz02             = _mm256_sub_ps(iz0,jz2);
2137             dx10             = _mm256_sub_ps(ix1,jx0);
2138             dy10             = _mm256_sub_ps(iy1,jy0);
2139             dz10             = _mm256_sub_ps(iz1,jz0);
2140             dx11             = _mm256_sub_ps(ix1,jx1);
2141             dy11             = _mm256_sub_ps(iy1,jy1);
2142             dz11             = _mm256_sub_ps(iz1,jz1);
2143             dx12             = _mm256_sub_ps(ix1,jx2);
2144             dy12             = _mm256_sub_ps(iy1,jy2);
2145             dz12             = _mm256_sub_ps(iz1,jz2);
2146             dx20             = _mm256_sub_ps(ix2,jx0);
2147             dy20             = _mm256_sub_ps(iy2,jy0);
2148             dz20             = _mm256_sub_ps(iz2,jz0);
2149             dx21             = _mm256_sub_ps(ix2,jx1);
2150             dy21             = _mm256_sub_ps(iy2,jy1);
2151             dz21             = _mm256_sub_ps(iz2,jz1);
2152             dx22             = _mm256_sub_ps(ix2,jx2);
2153             dy22             = _mm256_sub_ps(iy2,jy2);
2154             dz22             = _mm256_sub_ps(iz2,jz2);
2155
2156             /* Calculate squared distance and things based on it */
2157             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2158             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2159             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2160             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2161             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2162             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2163             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2164             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2165             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2166
2167             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
2168             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
2169             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
2170             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
2171             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2172             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2173             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
2174             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2175             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2176
2177             fjx0             = _mm256_setzero_ps();
2178             fjy0             = _mm256_setzero_ps();
2179             fjz0             = _mm256_setzero_ps();
2180             fjx1             = _mm256_setzero_ps();
2181             fjy1             = _mm256_setzero_ps();
2182             fjz1             = _mm256_setzero_ps();
2183             fjx2             = _mm256_setzero_ps();
2184             fjy2             = _mm256_setzero_ps();
2185             fjz2             = _mm256_setzero_ps();
2186
2187             /**************************
2188              * CALCULATE INTERACTIONS *
2189              **************************/
2190
2191             r00              = _mm256_mul_ps(rsq00,rinv00);
2192             r00              = _mm256_andnot_ps(dummy_mask,r00);
2193
2194             /* Calculate table index by multiplying r with table scale and truncate to integer */
2195             rt               = _mm256_mul_ps(r00,vftabscale);
2196             vfitab           = _mm256_cvttps_epi32(rt);
2197             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2198             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2199             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2200             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2201             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2202             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2203
2204             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2205             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2206                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2207             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2208                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2209             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2210                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2211             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2212                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2213             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2214             Heps             = _mm256_mul_ps(vfeps,H);
2215             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2216             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2217             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2218
2219             fscal            = felec;
2220
2221             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2222
2223             /* Calculate temporary vectorial force */
2224             tx               = _mm256_mul_ps(fscal,dx00);
2225             ty               = _mm256_mul_ps(fscal,dy00);
2226             tz               = _mm256_mul_ps(fscal,dz00);
2227
2228             /* Update vectorial force */
2229             fix0             = _mm256_add_ps(fix0,tx);
2230             fiy0             = _mm256_add_ps(fiy0,ty);
2231             fiz0             = _mm256_add_ps(fiz0,tz);
2232
2233             fjx0             = _mm256_add_ps(fjx0,tx);
2234             fjy0             = _mm256_add_ps(fjy0,ty);
2235             fjz0             = _mm256_add_ps(fjz0,tz);
2236
2237             /**************************
2238              * CALCULATE INTERACTIONS *
2239              **************************/
2240
2241             r01              = _mm256_mul_ps(rsq01,rinv01);
2242             r01              = _mm256_andnot_ps(dummy_mask,r01);
2243
2244             /* Calculate table index by multiplying r with table scale and truncate to integer */
2245             rt               = _mm256_mul_ps(r01,vftabscale);
2246             vfitab           = _mm256_cvttps_epi32(rt);
2247             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2248             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2249             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2250             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2251             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2252             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2253
2254             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2255             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2256                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2257             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2258                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2259             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2260                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2261             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2262                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2263             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2264             Heps             = _mm256_mul_ps(vfeps,H);
2265             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2266             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2267             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2268
2269             fscal            = felec;
2270
2271             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2272
2273             /* Calculate temporary vectorial force */
2274             tx               = _mm256_mul_ps(fscal,dx01);
2275             ty               = _mm256_mul_ps(fscal,dy01);
2276             tz               = _mm256_mul_ps(fscal,dz01);
2277
2278             /* Update vectorial force */
2279             fix0             = _mm256_add_ps(fix0,tx);
2280             fiy0             = _mm256_add_ps(fiy0,ty);
2281             fiz0             = _mm256_add_ps(fiz0,tz);
2282
2283             fjx1             = _mm256_add_ps(fjx1,tx);
2284             fjy1             = _mm256_add_ps(fjy1,ty);
2285             fjz1             = _mm256_add_ps(fjz1,tz);
2286
2287             /**************************
2288              * CALCULATE INTERACTIONS *
2289              **************************/
2290
2291             r02              = _mm256_mul_ps(rsq02,rinv02);
2292             r02              = _mm256_andnot_ps(dummy_mask,r02);
2293
2294             /* Calculate table index by multiplying r with table scale and truncate to integer */
2295             rt               = _mm256_mul_ps(r02,vftabscale);
2296             vfitab           = _mm256_cvttps_epi32(rt);
2297             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2298             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2299             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2300             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2301             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2302             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2303
2304             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2305             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2306                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2307             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2308                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2309             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2310                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2311             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2312                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2313             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2314             Heps             = _mm256_mul_ps(vfeps,H);
2315             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2316             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2317             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2318
2319             fscal            = felec;
2320
2321             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2322
2323             /* Calculate temporary vectorial force */
2324             tx               = _mm256_mul_ps(fscal,dx02);
2325             ty               = _mm256_mul_ps(fscal,dy02);
2326             tz               = _mm256_mul_ps(fscal,dz02);
2327
2328             /* Update vectorial force */
2329             fix0             = _mm256_add_ps(fix0,tx);
2330             fiy0             = _mm256_add_ps(fiy0,ty);
2331             fiz0             = _mm256_add_ps(fiz0,tz);
2332
2333             fjx2             = _mm256_add_ps(fjx2,tx);
2334             fjy2             = _mm256_add_ps(fjy2,ty);
2335             fjz2             = _mm256_add_ps(fjz2,tz);
2336
2337             /**************************
2338              * CALCULATE INTERACTIONS *
2339              **************************/
2340
2341             r10              = _mm256_mul_ps(rsq10,rinv10);
2342             r10              = _mm256_andnot_ps(dummy_mask,r10);
2343
2344             /* Calculate table index by multiplying r with table scale and truncate to integer */
2345             rt               = _mm256_mul_ps(r10,vftabscale);
2346             vfitab           = _mm256_cvttps_epi32(rt);
2347             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2348             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2349             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2350             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2351             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2352             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2353
2354             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2355             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2356                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2357             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2358                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2359             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2360                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2361             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2362                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2363             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2364             Heps             = _mm256_mul_ps(vfeps,H);
2365             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2366             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2367             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2368
2369             fscal            = felec;
2370
2371             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2372
2373             /* Calculate temporary vectorial force */
2374             tx               = _mm256_mul_ps(fscal,dx10);
2375             ty               = _mm256_mul_ps(fscal,dy10);
2376             tz               = _mm256_mul_ps(fscal,dz10);
2377
2378             /* Update vectorial force */
2379             fix1             = _mm256_add_ps(fix1,tx);
2380             fiy1             = _mm256_add_ps(fiy1,ty);
2381             fiz1             = _mm256_add_ps(fiz1,tz);
2382
2383             fjx0             = _mm256_add_ps(fjx0,tx);
2384             fjy0             = _mm256_add_ps(fjy0,ty);
2385             fjz0             = _mm256_add_ps(fjz0,tz);
2386
2387             /**************************
2388              * CALCULATE INTERACTIONS *
2389              **************************/
2390
2391             r11              = _mm256_mul_ps(rsq11,rinv11);
2392             r11              = _mm256_andnot_ps(dummy_mask,r11);
2393
2394             /* Calculate table index by multiplying r with table scale and truncate to integer */
2395             rt               = _mm256_mul_ps(r11,vftabscale);
2396             vfitab           = _mm256_cvttps_epi32(rt);
2397             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2398             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2399             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2400             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2401             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2402             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2403
2404             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2405             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2406                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2407             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2408                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2409             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2410                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2411             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2412                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2413             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2414             Heps             = _mm256_mul_ps(vfeps,H);
2415             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2416             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2417             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2418
2419             fscal            = felec;
2420
2421             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2422
2423             /* Calculate temporary vectorial force */
2424             tx               = _mm256_mul_ps(fscal,dx11);
2425             ty               = _mm256_mul_ps(fscal,dy11);
2426             tz               = _mm256_mul_ps(fscal,dz11);
2427
2428             /* Update vectorial force */
2429             fix1             = _mm256_add_ps(fix1,tx);
2430             fiy1             = _mm256_add_ps(fiy1,ty);
2431             fiz1             = _mm256_add_ps(fiz1,tz);
2432
2433             fjx1             = _mm256_add_ps(fjx1,tx);
2434             fjy1             = _mm256_add_ps(fjy1,ty);
2435             fjz1             = _mm256_add_ps(fjz1,tz);
2436
2437             /**************************
2438              * CALCULATE INTERACTIONS *
2439              **************************/
2440
2441             r12              = _mm256_mul_ps(rsq12,rinv12);
2442             r12              = _mm256_andnot_ps(dummy_mask,r12);
2443
2444             /* Calculate table index by multiplying r with table scale and truncate to integer */
2445             rt               = _mm256_mul_ps(r12,vftabscale);
2446             vfitab           = _mm256_cvttps_epi32(rt);
2447             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2448             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2449             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2450             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2451             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2452             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2453
2454             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2455             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2456                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2457             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2458                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2459             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2460                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2461             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2462                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2463             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2464             Heps             = _mm256_mul_ps(vfeps,H);
2465             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2466             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2467             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2468
2469             fscal            = felec;
2470
2471             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2472
2473             /* Calculate temporary vectorial force */
2474             tx               = _mm256_mul_ps(fscal,dx12);
2475             ty               = _mm256_mul_ps(fscal,dy12);
2476             tz               = _mm256_mul_ps(fscal,dz12);
2477
2478             /* Update vectorial force */
2479             fix1             = _mm256_add_ps(fix1,tx);
2480             fiy1             = _mm256_add_ps(fiy1,ty);
2481             fiz1             = _mm256_add_ps(fiz1,tz);
2482
2483             fjx2             = _mm256_add_ps(fjx2,tx);
2484             fjy2             = _mm256_add_ps(fjy2,ty);
2485             fjz2             = _mm256_add_ps(fjz2,tz);
2486
2487             /**************************
2488              * CALCULATE INTERACTIONS *
2489              **************************/
2490
2491             r20              = _mm256_mul_ps(rsq20,rinv20);
2492             r20              = _mm256_andnot_ps(dummy_mask,r20);
2493
2494             /* Calculate table index by multiplying r with table scale and truncate to integer */
2495             rt               = _mm256_mul_ps(r20,vftabscale);
2496             vfitab           = _mm256_cvttps_epi32(rt);
2497             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2498             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2499             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2500             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2501             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2502             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2503
2504             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2505             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2506                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2507             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2508                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2509             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2510                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2511             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2512                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2513             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2514             Heps             = _mm256_mul_ps(vfeps,H);
2515             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2516             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2517             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2518
2519             fscal            = felec;
2520
2521             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2522
2523             /* Calculate temporary vectorial force */
2524             tx               = _mm256_mul_ps(fscal,dx20);
2525             ty               = _mm256_mul_ps(fscal,dy20);
2526             tz               = _mm256_mul_ps(fscal,dz20);
2527
2528             /* Update vectorial force */
2529             fix2             = _mm256_add_ps(fix2,tx);
2530             fiy2             = _mm256_add_ps(fiy2,ty);
2531             fiz2             = _mm256_add_ps(fiz2,tz);
2532
2533             fjx0             = _mm256_add_ps(fjx0,tx);
2534             fjy0             = _mm256_add_ps(fjy0,ty);
2535             fjz0             = _mm256_add_ps(fjz0,tz);
2536
2537             /**************************
2538              * CALCULATE INTERACTIONS *
2539              **************************/
2540
2541             r21              = _mm256_mul_ps(rsq21,rinv21);
2542             r21              = _mm256_andnot_ps(dummy_mask,r21);
2543
2544             /* Calculate table index by multiplying r with table scale and truncate to integer */
2545             rt               = _mm256_mul_ps(r21,vftabscale);
2546             vfitab           = _mm256_cvttps_epi32(rt);
2547             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2548             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2549             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2550             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2551             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2552             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2553
2554             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2555             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2556                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2557             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2558                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2559             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2560                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2561             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2562                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2563             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2564             Heps             = _mm256_mul_ps(vfeps,H);
2565             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2566             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2567             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2568
2569             fscal            = felec;
2570
2571             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2572
2573             /* Calculate temporary vectorial force */
2574             tx               = _mm256_mul_ps(fscal,dx21);
2575             ty               = _mm256_mul_ps(fscal,dy21);
2576             tz               = _mm256_mul_ps(fscal,dz21);
2577
2578             /* Update vectorial force */
2579             fix2             = _mm256_add_ps(fix2,tx);
2580             fiy2             = _mm256_add_ps(fiy2,ty);
2581             fiz2             = _mm256_add_ps(fiz2,tz);
2582
2583             fjx1             = _mm256_add_ps(fjx1,tx);
2584             fjy1             = _mm256_add_ps(fjy1,ty);
2585             fjz1             = _mm256_add_ps(fjz1,tz);
2586
2587             /**************************
2588              * CALCULATE INTERACTIONS *
2589              **************************/
2590
2591             r22              = _mm256_mul_ps(rsq22,rinv22);
2592             r22              = _mm256_andnot_ps(dummy_mask,r22);
2593
2594             /* Calculate table index by multiplying r with table scale and truncate to integer */
2595             rt               = _mm256_mul_ps(r22,vftabscale);
2596             vfitab           = _mm256_cvttps_epi32(rt);
2597             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2598             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2599             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
2600             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
2601             vfitab_lo        = _mm_slli_epi32(vfitab_lo,2);
2602             vfitab_hi        = _mm_slli_epi32(vfitab_hi,2);
2603
2604             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2605             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2606                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2607             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2608                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2609             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2610                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2611             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2612                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2613             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2614             Heps             = _mm256_mul_ps(vfeps,H);
2615             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2616             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2617             felec            = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2618
2619             fscal            = felec;
2620
2621             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2622
2623             /* Calculate temporary vectorial force */
2624             tx               = _mm256_mul_ps(fscal,dx22);
2625             ty               = _mm256_mul_ps(fscal,dy22);
2626             tz               = _mm256_mul_ps(fscal,dz22);
2627
2628             /* Update vectorial force */
2629             fix2             = _mm256_add_ps(fix2,tx);
2630             fiy2             = _mm256_add_ps(fiy2,ty);
2631             fiz2             = _mm256_add_ps(fiz2,tz);
2632
2633             fjx2             = _mm256_add_ps(fjx2,tx);
2634             fjy2             = _mm256_add_ps(fjy2,ty);
2635             fjz2             = _mm256_add_ps(fjz2,tz);
2636
2637             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2638             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2639             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2640             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2641             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2642             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2643             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2644             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2645
2646             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2647                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2648
2649             /* Inner loop uses 360 flops */
2650         }
2651
2652         /* End of innermost loop */
2653
2654         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2655                                                  f+i_coord_offset,fshift+i_shift_offset);
2656
2657         /* Increment number of inner iterations */
2658         inneriter                  += j_index_end - j_index_start;
2659
2660         /* Outer loop uses 18 flops */
2661     }
2662
2663     /* Increment number of outer iterations */
2664     outeriter        += nri;
2665
2666     /* Update outer/inner flops */
2667
2668     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);
2669 }