Created SIMD module
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecRF_VdwCSTab_GeomW4W4_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_double
54  * Electrostatics interaction: ReactionField
55  * VdW interaction:            CubicSplineTable
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84     real             scratch[4*DIM];
85     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     real *           vdwioffsetptr3;
93     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
95     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
97     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
99     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
101     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
102     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
103     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
106     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
109     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
110     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
111     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
112     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
113     real             *charge;
114     int              nvdwtype;
115     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
116     int              *vdwtype;
117     real             *vdwparam;
118     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
119     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
120     __m128i          vfitab;
121     __m128i          ifour       = _mm_set1_epi32(4);
122     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
123     real             *vftab;
124     __m256d          dummy_mask,cutoff_mask;
125     __m128           tmpmask0,tmpmask1;
126     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
127     __m256d          one     = _mm256_set1_pd(1.0);
128     __m256d          two     = _mm256_set1_pd(2.0);
129     x                = xx[0];
130     f                = ff[0];
131
132     nri              = nlist->nri;
133     iinr             = nlist->iinr;
134     jindex           = nlist->jindex;
135     jjnr             = nlist->jjnr;
136     shiftidx         = nlist->shift;
137     gid              = nlist->gid;
138     shiftvec         = fr->shift_vec[0];
139     fshift           = fr->fshift[0];
140     facel            = _mm256_set1_pd(fr->epsfac);
141     charge           = mdatoms->chargeA;
142     krf              = _mm256_set1_pd(fr->ic->k_rf);
143     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
144     crf              = _mm256_set1_pd(fr->ic->c_rf);
145     nvdwtype         = fr->ntype;
146     vdwparam         = fr->nbfp;
147     vdwtype          = mdatoms->typeA;
148
149     vftab            = kernel_data->table_vdw->data;
150     vftabscale       = _mm256_set1_pd(kernel_data->table_vdw->scale);
151
152     /* Setup water-specific parameters */
153     inr              = nlist->iinr[0];
154     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
155     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
156     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
157     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
158
159     jq1              = _mm256_set1_pd(charge[inr+1]);
160     jq2              = _mm256_set1_pd(charge[inr+2]);
161     jq3              = _mm256_set1_pd(charge[inr+3]);
162     vdwjidx0A        = 2*vdwtype[inr+0];
163     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
164     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
165     qq11             = _mm256_mul_pd(iq1,jq1);
166     qq12             = _mm256_mul_pd(iq1,jq2);
167     qq13             = _mm256_mul_pd(iq1,jq3);
168     qq21             = _mm256_mul_pd(iq2,jq1);
169     qq22             = _mm256_mul_pd(iq2,jq2);
170     qq23             = _mm256_mul_pd(iq2,jq3);
171     qq31             = _mm256_mul_pd(iq3,jq1);
172     qq32             = _mm256_mul_pd(iq3,jq2);
173     qq33             = _mm256_mul_pd(iq3,jq3);
174
175     /* Avoid stupid compiler warnings */
176     jnrA = jnrB = jnrC = jnrD = 0;
177     j_coord_offsetA = 0;
178     j_coord_offsetB = 0;
179     j_coord_offsetC = 0;
180     j_coord_offsetD = 0;
181
182     outeriter        = 0;
183     inneriter        = 0;
184
185     for(iidx=0;iidx<4*DIM;iidx++)
186     {
187         scratch[iidx] = 0.0;
188     }
189
190     /* Start outer loop over neighborlists */
191     for(iidx=0; iidx<nri; iidx++)
192     {
193         /* Load shift vector for this list */
194         i_shift_offset   = DIM*shiftidx[iidx];
195
196         /* Load limits for loop over neighbors */
197         j_index_start    = jindex[iidx];
198         j_index_end      = jindex[iidx+1];
199
200         /* Get outer coordinate index */
201         inr              = iinr[iidx];
202         i_coord_offset   = DIM*inr;
203
204         /* Load i particle coords and add shift vector */
205         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
206                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
207
208         fix0             = _mm256_setzero_pd();
209         fiy0             = _mm256_setzero_pd();
210         fiz0             = _mm256_setzero_pd();
211         fix1             = _mm256_setzero_pd();
212         fiy1             = _mm256_setzero_pd();
213         fiz1             = _mm256_setzero_pd();
214         fix2             = _mm256_setzero_pd();
215         fiy2             = _mm256_setzero_pd();
216         fiz2             = _mm256_setzero_pd();
217         fix3             = _mm256_setzero_pd();
218         fiy3             = _mm256_setzero_pd();
219         fiz3             = _mm256_setzero_pd();
220
221         /* Reset potential sums */
222         velecsum         = _mm256_setzero_pd();
223         vvdwsum          = _mm256_setzero_pd();
224
225         /* Start inner kernel loop */
226         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
227         {
228
229             /* Get j neighbor index, and coordinate index */
230             jnrA             = jjnr[jidx];
231             jnrB             = jjnr[jidx+1];
232             jnrC             = jjnr[jidx+2];
233             jnrD             = jjnr[jidx+3];
234             j_coord_offsetA  = DIM*jnrA;
235             j_coord_offsetB  = DIM*jnrB;
236             j_coord_offsetC  = DIM*jnrC;
237             j_coord_offsetD  = DIM*jnrD;
238
239             /* load j atom coordinates */
240             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
241                                                  x+j_coord_offsetC,x+j_coord_offsetD,
242                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
243                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
244
245             /* Calculate displacement vector */
246             dx00             = _mm256_sub_pd(ix0,jx0);
247             dy00             = _mm256_sub_pd(iy0,jy0);
248             dz00             = _mm256_sub_pd(iz0,jz0);
249             dx11             = _mm256_sub_pd(ix1,jx1);
250             dy11             = _mm256_sub_pd(iy1,jy1);
251             dz11             = _mm256_sub_pd(iz1,jz1);
252             dx12             = _mm256_sub_pd(ix1,jx2);
253             dy12             = _mm256_sub_pd(iy1,jy2);
254             dz12             = _mm256_sub_pd(iz1,jz2);
255             dx13             = _mm256_sub_pd(ix1,jx3);
256             dy13             = _mm256_sub_pd(iy1,jy3);
257             dz13             = _mm256_sub_pd(iz1,jz3);
258             dx21             = _mm256_sub_pd(ix2,jx1);
259             dy21             = _mm256_sub_pd(iy2,jy1);
260             dz21             = _mm256_sub_pd(iz2,jz1);
261             dx22             = _mm256_sub_pd(ix2,jx2);
262             dy22             = _mm256_sub_pd(iy2,jy2);
263             dz22             = _mm256_sub_pd(iz2,jz2);
264             dx23             = _mm256_sub_pd(ix2,jx3);
265             dy23             = _mm256_sub_pd(iy2,jy3);
266             dz23             = _mm256_sub_pd(iz2,jz3);
267             dx31             = _mm256_sub_pd(ix3,jx1);
268             dy31             = _mm256_sub_pd(iy3,jy1);
269             dz31             = _mm256_sub_pd(iz3,jz1);
270             dx32             = _mm256_sub_pd(ix3,jx2);
271             dy32             = _mm256_sub_pd(iy3,jy2);
272             dz32             = _mm256_sub_pd(iz3,jz2);
273             dx33             = _mm256_sub_pd(ix3,jx3);
274             dy33             = _mm256_sub_pd(iy3,jy3);
275             dz33             = _mm256_sub_pd(iz3,jz3);
276
277             /* Calculate squared distance and things based on it */
278             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
279             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
280             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
281             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
282             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
283             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
284             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
285             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
286             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
287             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
288
289             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
290             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
291             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
292             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
293             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
294             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
295             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
296             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
297             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
298             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
299
300             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
301             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
302             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
303             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
304             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
305             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
306             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
307             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
308             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
309
310             fjx0             = _mm256_setzero_pd();
311             fjy0             = _mm256_setzero_pd();
312             fjz0             = _mm256_setzero_pd();
313             fjx1             = _mm256_setzero_pd();
314             fjy1             = _mm256_setzero_pd();
315             fjz1             = _mm256_setzero_pd();
316             fjx2             = _mm256_setzero_pd();
317             fjy2             = _mm256_setzero_pd();
318             fjz2             = _mm256_setzero_pd();
319             fjx3             = _mm256_setzero_pd();
320             fjy3             = _mm256_setzero_pd();
321             fjz3             = _mm256_setzero_pd();
322
323             /**************************
324              * CALCULATE INTERACTIONS *
325              **************************/
326
327             r00              = _mm256_mul_pd(rsq00,rinv00);
328
329             /* Calculate table index by multiplying r with table scale and truncate to integer */
330             rt               = _mm256_mul_pd(r00,vftabscale);
331             vfitab           = _mm256_cvttpd_epi32(rt);
332             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
333             vfitab           = _mm_slli_epi32(vfitab,3);
334
335             /* CUBIC SPLINE TABLE DISPERSION */
336             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
337             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
338             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
339             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
340             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
341             Heps             = _mm256_mul_pd(vfeps,H);
342             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
343             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
344             vvdw6            = _mm256_mul_pd(c6_00,VV);
345             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
346             fvdw6            = _mm256_mul_pd(c6_00,FF);
347
348             /* CUBIC SPLINE TABLE REPULSION */
349             vfitab           = _mm_add_epi32(vfitab,ifour);
350             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
351             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
352             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
353             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
354             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
355             Heps             = _mm256_mul_pd(vfeps,H);
356             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
357             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
358             vvdw12           = _mm256_mul_pd(c12_00,VV);
359             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
360             fvdw12           = _mm256_mul_pd(c12_00,FF);
361             vvdw             = _mm256_add_pd(vvdw12,vvdw6);
362             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
363
364             /* Update potential sum for this i atom from the interaction with this j atom. */
365             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
366
367             fscal            = fvdw;
368
369             /* Calculate temporary vectorial force */
370             tx               = _mm256_mul_pd(fscal,dx00);
371             ty               = _mm256_mul_pd(fscal,dy00);
372             tz               = _mm256_mul_pd(fscal,dz00);
373
374             /* Update vectorial force */
375             fix0             = _mm256_add_pd(fix0,tx);
376             fiy0             = _mm256_add_pd(fiy0,ty);
377             fiz0             = _mm256_add_pd(fiz0,tz);
378
379             fjx0             = _mm256_add_pd(fjx0,tx);
380             fjy0             = _mm256_add_pd(fjy0,ty);
381             fjz0             = _mm256_add_pd(fjz0,tz);
382
383             /**************************
384              * CALCULATE INTERACTIONS *
385              **************************/
386
387             /* REACTION-FIELD ELECTROSTATICS */
388             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
389             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
390
391             /* Update potential sum for this i atom from the interaction with this j atom. */
392             velecsum         = _mm256_add_pd(velecsum,velec);
393
394             fscal            = felec;
395
396             /* Calculate temporary vectorial force */
397             tx               = _mm256_mul_pd(fscal,dx11);
398             ty               = _mm256_mul_pd(fscal,dy11);
399             tz               = _mm256_mul_pd(fscal,dz11);
400
401             /* Update vectorial force */
402             fix1             = _mm256_add_pd(fix1,tx);
403             fiy1             = _mm256_add_pd(fiy1,ty);
404             fiz1             = _mm256_add_pd(fiz1,tz);
405
406             fjx1             = _mm256_add_pd(fjx1,tx);
407             fjy1             = _mm256_add_pd(fjy1,ty);
408             fjz1             = _mm256_add_pd(fjz1,tz);
409
410             /**************************
411              * CALCULATE INTERACTIONS *
412              **************************/
413
414             /* REACTION-FIELD ELECTROSTATICS */
415             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
416             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
417
418             /* Update potential sum for this i atom from the interaction with this j atom. */
419             velecsum         = _mm256_add_pd(velecsum,velec);
420
421             fscal            = felec;
422
423             /* Calculate temporary vectorial force */
424             tx               = _mm256_mul_pd(fscal,dx12);
425             ty               = _mm256_mul_pd(fscal,dy12);
426             tz               = _mm256_mul_pd(fscal,dz12);
427
428             /* Update vectorial force */
429             fix1             = _mm256_add_pd(fix1,tx);
430             fiy1             = _mm256_add_pd(fiy1,ty);
431             fiz1             = _mm256_add_pd(fiz1,tz);
432
433             fjx2             = _mm256_add_pd(fjx2,tx);
434             fjy2             = _mm256_add_pd(fjy2,ty);
435             fjz2             = _mm256_add_pd(fjz2,tz);
436
437             /**************************
438              * CALCULATE INTERACTIONS *
439              **************************/
440
441             /* REACTION-FIELD ELECTROSTATICS */
442             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
443             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
444
445             /* Update potential sum for this i atom from the interaction with this j atom. */
446             velecsum         = _mm256_add_pd(velecsum,velec);
447
448             fscal            = felec;
449
450             /* Calculate temporary vectorial force */
451             tx               = _mm256_mul_pd(fscal,dx13);
452             ty               = _mm256_mul_pd(fscal,dy13);
453             tz               = _mm256_mul_pd(fscal,dz13);
454
455             /* Update vectorial force */
456             fix1             = _mm256_add_pd(fix1,tx);
457             fiy1             = _mm256_add_pd(fiy1,ty);
458             fiz1             = _mm256_add_pd(fiz1,tz);
459
460             fjx3             = _mm256_add_pd(fjx3,tx);
461             fjy3             = _mm256_add_pd(fjy3,ty);
462             fjz3             = _mm256_add_pd(fjz3,tz);
463
464             /**************************
465              * CALCULATE INTERACTIONS *
466              **************************/
467
468             /* REACTION-FIELD ELECTROSTATICS */
469             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
470             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
471
472             /* Update potential sum for this i atom from the interaction with this j atom. */
473             velecsum         = _mm256_add_pd(velecsum,velec);
474
475             fscal            = felec;
476
477             /* Calculate temporary vectorial force */
478             tx               = _mm256_mul_pd(fscal,dx21);
479             ty               = _mm256_mul_pd(fscal,dy21);
480             tz               = _mm256_mul_pd(fscal,dz21);
481
482             /* Update vectorial force */
483             fix2             = _mm256_add_pd(fix2,tx);
484             fiy2             = _mm256_add_pd(fiy2,ty);
485             fiz2             = _mm256_add_pd(fiz2,tz);
486
487             fjx1             = _mm256_add_pd(fjx1,tx);
488             fjy1             = _mm256_add_pd(fjy1,ty);
489             fjz1             = _mm256_add_pd(fjz1,tz);
490
491             /**************************
492              * CALCULATE INTERACTIONS *
493              **************************/
494
495             /* REACTION-FIELD ELECTROSTATICS */
496             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
497             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
498
499             /* Update potential sum for this i atom from the interaction with this j atom. */
500             velecsum         = _mm256_add_pd(velecsum,velec);
501
502             fscal            = felec;
503
504             /* Calculate temporary vectorial force */
505             tx               = _mm256_mul_pd(fscal,dx22);
506             ty               = _mm256_mul_pd(fscal,dy22);
507             tz               = _mm256_mul_pd(fscal,dz22);
508
509             /* Update vectorial force */
510             fix2             = _mm256_add_pd(fix2,tx);
511             fiy2             = _mm256_add_pd(fiy2,ty);
512             fiz2             = _mm256_add_pd(fiz2,tz);
513
514             fjx2             = _mm256_add_pd(fjx2,tx);
515             fjy2             = _mm256_add_pd(fjy2,ty);
516             fjz2             = _mm256_add_pd(fjz2,tz);
517
518             /**************************
519              * CALCULATE INTERACTIONS *
520              **************************/
521
522             /* REACTION-FIELD ELECTROSTATICS */
523             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
524             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
525
526             /* Update potential sum for this i atom from the interaction with this j atom. */
527             velecsum         = _mm256_add_pd(velecsum,velec);
528
529             fscal            = felec;
530
531             /* Calculate temporary vectorial force */
532             tx               = _mm256_mul_pd(fscal,dx23);
533             ty               = _mm256_mul_pd(fscal,dy23);
534             tz               = _mm256_mul_pd(fscal,dz23);
535
536             /* Update vectorial force */
537             fix2             = _mm256_add_pd(fix2,tx);
538             fiy2             = _mm256_add_pd(fiy2,ty);
539             fiz2             = _mm256_add_pd(fiz2,tz);
540
541             fjx3             = _mm256_add_pd(fjx3,tx);
542             fjy3             = _mm256_add_pd(fjy3,ty);
543             fjz3             = _mm256_add_pd(fjz3,tz);
544
545             /**************************
546              * CALCULATE INTERACTIONS *
547              **************************/
548
549             /* REACTION-FIELD ELECTROSTATICS */
550             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
551             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
552
553             /* Update potential sum for this i atom from the interaction with this j atom. */
554             velecsum         = _mm256_add_pd(velecsum,velec);
555
556             fscal            = felec;
557
558             /* Calculate temporary vectorial force */
559             tx               = _mm256_mul_pd(fscal,dx31);
560             ty               = _mm256_mul_pd(fscal,dy31);
561             tz               = _mm256_mul_pd(fscal,dz31);
562
563             /* Update vectorial force */
564             fix3             = _mm256_add_pd(fix3,tx);
565             fiy3             = _mm256_add_pd(fiy3,ty);
566             fiz3             = _mm256_add_pd(fiz3,tz);
567
568             fjx1             = _mm256_add_pd(fjx1,tx);
569             fjy1             = _mm256_add_pd(fjy1,ty);
570             fjz1             = _mm256_add_pd(fjz1,tz);
571
572             /**************************
573              * CALCULATE INTERACTIONS *
574              **************************/
575
576             /* REACTION-FIELD ELECTROSTATICS */
577             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
578             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
579
580             /* Update potential sum for this i atom from the interaction with this j atom. */
581             velecsum         = _mm256_add_pd(velecsum,velec);
582
583             fscal            = felec;
584
585             /* Calculate temporary vectorial force */
586             tx               = _mm256_mul_pd(fscal,dx32);
587             ty               = _mm256_mul_pd(fscal,dy32);
588             tz               = _mm256_mul_pd(fscal,dz32);
589
590             /* Update vectorial force */
591             fix3             = _mm256_add_pd(fix3,tx);
592             fiy3             = _mm256_add_pd(fiy3,ty);
593             fiz3             = _mm256_add_pd(fiz3,tz);
594
595             fjx2             = _mm256_add_pd(fjx2,tx);
596             fjy2             = _mm256_add_pd(fjy2,ty);
597             fjz2             = _mm256_add_pd(fjz2,tz);
598
599             /**************************
600              * CALCULATE INTERACTIONS *
601              **************************/
602
603             /* REACTION-FIELD ELECTROSTATICS */
604             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
605             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
606
607             /* Update potential sum for this i atom from the interaction with this j atom. */
608             velecsum         = _mm256_add_pd(velecsum,velec);
609
610             fscal            = felec;
611
612             /* Calculate temporary vectorial force */
613             tx               = _mm256_mul_pd(fscal,dx33);
614             ty               = _mm256_mul_pd(fscal,dy33);
615             tz               = _mm256_mul_pd(fscal,dz33);
616
617             /* Update vectorial force */
618             fix3             = _mm256_add_pd(fix3,tx);
619             fiy3             = _mm256_add_pd(fiy3,ty);
620             fiz3             = _mm256_add_pd(fiz3,tz);
621
622             fjx3             = _mm256_add_pd(fjx3,tx);
623             fjy3             = _mm256_add_pd(fjy3,ty);
624             fjz3             = _mm256_add_pd(fjz3,tz);
625
626             fjptrA             = f+j_coord_offsetA;
627             fjptrB             = f+j_coord_offsetB;
628             fjptrC             = f+j_coord_offsetC;
629             fjptrD             = f+j_coord_offsetD;
630
631             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
632                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
633                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
634
635             /* Inner loop uses 347 flops */
636         }
637
638         if(jidx<j_index_end)
639         {
640
641             /* Get j neighbor index, and coordinate index */
642             jnrlistA         = jjnr[jidx];
643             jnrlistB         = jjnr[jidx+1];
644             jnrlistC         = jjnr[jidx+2];
645             jnrlistD         = jjnr[jidx+3];
646             /* Sign of each element will be negative for non-real atoms.
647              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
648              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
649              */
650             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
651
652             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
653             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
654             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
655
656             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
657             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
658             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
659             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
660             j_coord_offsetA  = DIM*jnrA;
661             j_coord_offsetB  = DIM*jnrB;
662             j_coord_offsetC  = DIM*jnrC;
663             j_coord_offsetD  = DIM*jnrD;
664
665             /* load j atom coordinates */
666             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
667                                                  x+j_coord_offsetC,x+j_coord_offsetD,
668                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
669                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
670
671             /* Calculate displacement vector */
672             dx00             = _mm256_sub_pd(ix0,jx0);
673             dy00             = _mm256_sub_pd(iy0,jy0);
674             dz00             = _mm256_sub_pd(iz0,jz0);
675             dx11             = _mm256_sub_pd(ix1,jx1);
676             dy11             = _mm256_sub_pd(iy1,jy1);
677             dz11             = _mm256_sub_pd(iz1,jz1);
678             dx12             = _mm256_sub_pd(ix1,jx2);
679             dy12             = _mm256_sub_pd(iy1,jy2);
680             dz12             = _mm256_sub_pd(iz1,jz2);
681             dx13             = _mm256_sub_pd(ix1,jx3);
682             dy13             = _mm256_sub_pd(iy1,jy3);
683             dz13             = _mm256_sub_pd(iz1,jz3);
684             dx21             = _mm256_sub_pd(ix2,jx1);
685             dy21             = _mm256_sub_pd(iy2,jy1);
686             dz21             = _mm256_sub_pd(iz2,jz1);
687             dx22             = _mm256_sub_pd(ix2,jx2);
688             dy22             = _mm256_sub_pd(iy2,jy2);
689             dz22             = _mm256_sub_pd(iz2,jz2);
690             dx23             = _mm256_sub_pd(ix2,jx3);
691             dy23             = _mm256_sub_pd(iy2,jy3);
692             dz23             = _mm256_sub_pd(iz2,jz3);
693             dx31             = _mm256_sub_pd(ix3,jx1);
694             dy31             = _mm256_sub_pd(iy3,jy1);
695             dz31             = _mm256_sub_pd(iz3,jz1);
696             dx32             = _mm256_sub_pd(ix3,jx2);
697             dy32             = _mm256_sub_pd(iy3,jy2);
698             dz32             = _mm256_sub_pd(iz3,jz2);
699             dx33             = _mm256_sub_pd(ix3,jx3);
700             dy33             = _mm256_sub_pd(iy3,jy3);
701             dz33             = _mm256_sub_pd(iz3,jz3);
702
703             /* Calculate squared distance and things based on it */
704             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
705             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
706             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
707             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
708             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
709             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
710             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
711             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
712             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
713             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
714
715             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
716             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
717             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
718             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
719             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
720             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
721             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
722             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
723             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
724             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
725
726             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
727             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
728             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
729             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
730             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
731             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
732             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
733             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
734             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
735
736             fjx0             = _mm256_setzero_pd();
737             fjy0             = _mm256_setzero_pd();
738             fjz0             = _mm256_setzero_pd();
739             fjx1             = _mm256_setzero_pd();
740             fjy1             = _mm256_setzero_pd();
741             fjz1             = _mm256_setzero_pd();
742             fjx2             = _mm256_setzero_pd();
743             fjy2             = _mm256_setzero_pd();
744             fjz2             = _mm256_setzero_pd();
745             fjx3             = _mm256_setzero_pd();
746             fjy3             = _mm256_setzero_pd();
747             fjz3             = _mm256_setzero_pd();
748
749             /**************************
750              * CALCULATE INTERACTIONS *
751              **************************/
752
753             r00              = _mm256_mul_pd(rsq00,rinv00);
754             r00              = _mm256_andnot_pd(dummy_mask,r00);
755
756             /* Calculate table index by multiplying r with table scale and truncate to integer */
757             rt               = _mm256_mul_pd(r00,vftabscale);
758             vfitab           = _mm256_cvttpd_epi32(rt);
759             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
760             vfitab           = _mm_slli_epi32(vfitab,3);
761
762             /* CUBIC SPLINE TABLE DISPERSION */
763             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
764             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
765             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
766             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
767             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
768             Heps             = _mm256_mul_pd(vfeps,H);
769             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
770             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
771             vvdw6            = _mm256_mul_pd(c6_00,VV);
772             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
773             fvdw6            = _mm256_mul_pd(c6_00,FF);
774
775             /* CUBIC SPLINE TABLE REPULSION */
776             vfitab           = _mm_add_epi32(vfitab,ifour);
777             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
778             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
779             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
780             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
781             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
782             Heps             = _mm256_mul_pd(vfeps,H);
783             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
784             VV               = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
785             vvdw12           = _mm256_mul_pd(c12_00,VV);
786             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
787             fvdw12           = _mm256_mul_pd(c12_00,FF);
788             vvdw             = _mm256_add_pd(vvdw12,vvdw6);
789             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
790
791             /* Update potential sum for this i atom from the interaction with this j atom. */
792             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
793             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
794
795             fscal            = fvdw;
796
797             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
798
799             /* Calculate temporary vectorial force */
800             tx               = _mm256_mul_pd(fscal,dx00);
801             ty               = _mm256_mul_pd(fscal,dy00);
802             tz               = _mm256_mul_pd(fscal,dz00);
803
804             /* Update vectorial force */
805             fix0             = _mm256_add_pd(fix0,tx);
806             fiy0             = _mm256_add_pd(fiy0,ty);
807             fiz0             = _mm256_add_pd(fiz0,tz);
808
809             fjx0             = _mm256_add_pd(fjx0,tx);
810             fjy0             = _mm256_add_pd(fjy0,ty);
811             fjz0             = _mm256_add_pd(fjz0,tz);
812
813             /**************************
814              * CALCULATE INTERACTIONS *
815              **************************/
816
817             /* REACTION-FIELD ELECTROSTATICS */
818             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
819             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
820
821             /* Update potential sum for this i atom from the interaction with this j atom. */
822             velec            = _mm256_andnot_pd(dummy_mask,velec);
823             velecsum         = _mm256_add_pd(velecsum,velec);
824
825             fscal            = felec;
826
827             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
828
829             /* Calculate temporary vectorial force */
830             tx               = _mm256_mul_pd(fscal,dx11);
831             ty               = _mm256_mul_pd(fscal,dy11);
832             tz               = _mm256_mul_pd(fscal,dz11);
833
834             /* Update vectorial force */
835             fix1             = _mm256_add_pd(fix1,tx);
836             fiy1             = _mm256_add_pd(fiy1,ty);
837             fiz1             = _mm256_add_pd(fiz1,tz);
838
839             fjx1             = _mm256_add_pd(fjx1,tx);
840             fjy1             = _mm256_add_pd(fjy1,ty);
841             fjz1             = _mm256_add_pd(fjz1,tz);
842
843             /**************************
844              * CALCULATE INTERACTIONS *
845              **************************/
846
847             /* REACTION-FIELD ELECTROSTATICS */
848             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
849             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
850
851             /* Update potential sum for this i atom from the interaction with this j atom. */
852             velec            = _mm256_andnot_pd(dummy_mask,velec);
853             velecsum         = _mm256_add_pd(velecsum,velec);
854
855             fscal            = felec;
856
857             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
858
859             /* Calculate temporary vectorial force */
860             tx               = _mm256_mul_pd(fscal,dx12);
861             ty               = _mm256_mul_pd(fscal,dy12);
862             tz               = _mm256_mul_pd(fscal,dz12);
863
864             /* Update vectorial force */
865             fix1             = _mm256_add_pd(fix1,tx);
866             fiy1             = _mm256_add_pd(fiy1,ty);
867             fiz1             = _mm256_add_pd(fiz1,tz);
868
869             fjx2             = _mm256_add_pd(fjx2,tx);
870             fjy2             = _mm256_add_pd(fjy2,ty);
871             fjz2             = _mm256_add_pd(fjz2,tz);
872
873             /**************************
874              * CALCULATE INTERACTIONS *
875              **************************/
876
877             /* REACTION-FIELD ELECTROSTATICS */
878             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
879             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
880
881             /* Update potential sum for this i atom from the interaction with this j atom. */
882             velec            = _mm256_andnot_pd(dummy_mask,velec);
883             velecsum         = _mm256_add_pd(velecsum,velec);
884
885             fscal            = felec;
886
887             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
888
889             /* Calculate temporary vectorial force */
890             tx               = _mm256_mul_pd(fscal,dx13);
891             ty               = _mm256_mul_pd(fscal,dy13);
892             tz               = _mm256_mul_pd(fscal,dz13);
893
894             /* Update vectorial force */
895             fix1             = _mm256_add_pd(fix1,tx);
896             fiy1             = _mm256_add_pd(fiy1,ty);
897             fiz1             = _mm256_add_pd(fiz1,tz);
898
899             fjx3             = _mm256_add_pd(fjx3,tx);
900             fjy3             = _mm256_add_pd(fjy3,ty);
901             fjz3             = _mm256_add_pd(fjz3,tz);
902
903             /**************************
904              * CALCULATE INTERACTIONS *
905              **************************/
906
907             /* REACTION-FIELD ELECTROSTATICS */
908             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
909             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
910
911             /* Update potential sum for this i atom from the interaction with this j atom. */
912             velec            = _mm256_andnot_pd(dummy_mask,velec);
913             velecsum         = _mm256_add_pd(velecsum,velec);
914
915             fscal            = felec;
916
917             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
918
919             /* Calculate temporary vectorial force */
920             tx               = _mm256_mul_pd(fscal,dx21);
921             ty               = _mm256_mul_pd(fscal,dy21);
922             tz               = _mm256_mul_pd(fscal,dz21);
923
924             /* Update vectorial force */
925             fix2             = _mm256_add_pd(fix2,tx);
926             fiy2             = _mm256_add_pd(fiy2,ty);
927             fiz2             = _mm256_add_pd(fiz2,tz);
928
929             fjx1             = _mm256_add_pd(fjx1,tx);
930             fjy1             = _mm256_add_pd(fjy1,ty);
931             fjz1             = _mm256_add_pd(fjz1,tz);
932
933             /**************************
934              * CALCULATE INTERACTIONS *
935              **************************/
936
937             /* REACTION-FIELD ELECTROSTATICS */
938             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
939             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
940
941             /* Update potential sum for this i atom from the interaction with this j atom. */
942             velec            = _mm256_andnot_pd(dummy_mask,velec);
943             velecsum         = _mm256_add_pd(velecsum,velec);
944
945             fscal            = felec;
946
947             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
948
949             /* Calculate temporary vectorial force */
950             tx               = _mm256_mul_pd(fscal,dx22);
951             ty               = _mm256_mul_pd(fscal,dy22);
952             tz               = _mm256_mul_pd(fscal,dz22);
953
954             /* Update vectorial force */
955             fix2             = _mm256_add_pd(fix2,tx);
956             fiy2             = _mm256_add_pd(fiy2,ty);
957             fiz2             = _mm256_add_pd(fiz2,tz);
958
959             fjx2             = _mm256_add_pd(fjx2,tx);
960             fjy2             = _mm256_add_pd(fjy2,ty);
961             fjz2             = _mm256_add_pd(fjz2,tz);
962
963             /**************************
964              * CALCULATE INTERACTIONS *
965              **************************/
966
967             /* REACTION-FIELD ELECTROSTATICS */
968             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
969             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
970
971             /* Update potential sum for this i atom from the interaction with this j atom. */
972             velec            = _mm256_andnot_pd(dummy_mask,velec);
973             velecsum         = _mm256_add_pd(velecsum,velec);
974
975             fscal            = felec;
976
977             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
978
979             /* Calculate temporary vectorial force */
980             tx               = _mm256_mul_pd(fscal,dx23);
981             ty               = _mm256_mul_pd(fscal,dy23);
982             tz               = _mm256_mul_pd(fscal,dz23);
983
984             /* Update vectorial force */
985             fix2             = _mm256_add_pd(fix2,tx);
986             fiy2             = _mm256_add_pd(fiy2,ty);
987             fiz2             = _mm256_add_pd(fiz2,tz);
988
989             fjx3             = _mm256_add_pd(fjx3,tx);
990             fjy3             = _mm256_add_pd(fjy3,ty);
991             fjz3             = _mm256_add_pd(fjz3,tz);
992
993             /**************************
994              * CALCULATE INTERACTIONS *
995              **************************/
996
997             /* REACTION-FIELD ELECTROSTATICS */
998             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
999             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1000
1001             /* Update potential sum for this i atom from the interaction with this j atom. */
1002             velec            = _mm256_andnot_pd(dummy_mask,velec);
1003             velecsum         = _mm256_add_pd(velecsum,velec);
1004
1005             fscal            = felec;
1006
1007             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1008
1009             /* Calculate temporary vectorial force */
1010             tx               = _mm256_mul_pd(fscal,dx31);
1011             ty               = _mm256_mul_pd(fscal,dy31);
1012             tz               = _mm256_mul_pd(fscal,dz31);
1013
1014             /* Update vectorial force */
1015             fix3             = _mm256_add_pd(fix3,tx);
1016             fiy3             = _mm256_add_pd(fiy3,ty);
1017             fiz3             = _mm256_add_pd(fiz3,tz);
1018
1019             fjx1             = _mm256_add_pd(fjx1,tx);
1020             fjy1             = _mm256_add_pd(fjy1,ty);
1021             fjz1             = _mm256_add_pd(fjz1,tz);
1022
1023             /**************************
1024              * CALCULATE INTERACTIONS *
1025              **************************/
1026
1027             /* REACTION-FIELD ELECTROSTATICS */
1028             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
1029             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1030
1031             /* Update potential sum for this i atom from the interaction with this j atom. */
1032             velec            = _mm256_andnot_pd(dummy_mask,velec);
1033             velecsum         = _mm256_add_pd(velecsum,velec);
1034
1035             fscal            = felec;
1036
1037             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1038
1039             /* Calculate temporary vectorial force */
1040             tx               = _mm256_mul_pd(fscal,dx32);
1041             ty               = _mm256_mul_pd(fscal,dy32);
1042             tz               = _mm256_mul_pd(fscal,dz32);
1043
1044             /* Update vectorial force */
1045             fix3             = _mm256_add_pd(fix3,tx);
1046             fiy3             = _mm256_add_pd(fiy3,ty);
1047             fiz3             = _mm256_add_pd(fiz3,tz);
1048
1049             fjx2             = _mm256_add_pd(fjx2,tx);
1050             fjy2             = _mm256_add_pd(fjy2,ty);
1051             fjz2             = _mm256_add_pd(fjz2,tz);
1052
1053             /**************************
1054              * CALCULATE INTERACTIONS *
1055              **************************/
1056
1057             /* REACTION-FIELD ELECTROSTATICS */
1058             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
1059             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1060
1061             /* Update potential sum for this i atom from the interaction with this j atom. */
1062             velec            = _mm256_andnot_pd(dummy_mask,velec);
1063             velecsum         = _mm256_add_pd(velecsum,velec);
1064
1065             fscal            = felec;
1066
1067             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1068
1069             /* Calculate temporary vectorial force */
1070             tx               = _mm256_mul_pd(fscal,dx33);
1071             ty               = _mm256_mul_pd(fscal,dy33);
1072             tz               = _mm256_mul_pd(fscal,dz33);
1073
1074             /* Update vectorial force */
1075             fix3             = _mm256_add_pd(fix3,tx);
1076             fiy3             = _mm256_add_pd(fiy3,ty);
1077             fiz3             = _mm256_add_pd(fiz3,tz);
1078
1079             fjx3             = _mm256_add_pd(fjx3,tx);
1080             fjy3             = _mm256_add_pd(fjy3,ty);
1081             fjz3             = _mm256_add_pd(fjz3,tz);
1082
1083             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1084             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1085             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1086             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1087
1088             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1089                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1090                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1091
1092             /* Inner loop uses 348 flops */
1093         }
1094
1095         /* End of innermost loop */
1096
1097         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1098                                                  f+i_coord_offset,fshift+i_shift_offset);
1099
1100         ggid                        = gid[iidx];
1101         /* Update potential energies */
1102         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1103         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1104
1105         /* Increment number of inner iterations */
1106         inneriter                  += j_index_end - j_index_start;
1107
1108         /* Outer loop uses 26 flops */
1109     }
1110
1111     /* Increment number of outer iterations */
1112     outeriter        += nri;
1113
1114     /* Update outer/inner flops */
1115
1116     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*348);
1117 }
1118 /*
1119  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_double
1120  * Electrostatics interaction: ReactionField
1121  * VdW interaction:            CubicSplineTable
1122  * Geometry:                   Water4-Water4
1123  * Calculate force/pot:        Force
1124  */
1125 void
1126 nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_double
1127                     (t_nblist                    * gmx_restrict       nlist,
1128                      rvec                        * gmx_restrict          xx,
1129                      rvec                        * gmx_restrict          ff,
1130                      t_forcerec                  * gmx_restrict          fr,
1131                      t_mdatoms                   * gmx_restrict     mdatoms,
1132                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1133                      t_nrnb                      * gmx_restrict        nrnb)
1134 {
1135     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1136      * just 0 for non-waters.
1137      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1138      * jnr indices corresponding to data put in the four positions in the SIMD register.
1139      */
1140     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1141     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1142     int              jnrA,jnrB,jnrC,jnrD;
1143     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1144     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1145     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1146     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1147     real             rcutoff_scalar;
1148     real             *shiftvec,*fshift,*x,*f;
1149     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1150     real             scratch[4*DIM];
1151     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1152     real *           vdwioffsetptr0;
1153     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1154     real *           vdwioffsetptr1;
1155     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1156     real *           vdwioffsetptr2;
1157     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1158     real *           vdwioffsetptr3;
1159     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1160     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1161     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1162     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1163     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1164     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1165     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1166     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1167     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1168     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1169     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1170     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1171     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1172     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1173     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1174     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1175     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1176     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1177     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1178     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1179     real             *charge;
1180     int              nvdwtype;
1181     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1182     int              *vdwtype;
1183     real             *vdwparam;
1184     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1185     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1186     __m128i          vfitab;
1187     __m128i          ifour       = _mm_set1_epi32(4);
1188     __m256d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1189     real             *vftab;
1190     __m256d          dummy_mask,cutoff_mask;
1191     __m128           tmpmask0,tmpmask1;
1192     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1193     __m256d          one     = _mm256_set1_pd(1.0);
1194     __m256d          two     = _mm256_set1_pd(2.0);
1195     x                = xx[0];
1196     f                = ff[0];
1197
1198     nri              = nlist->nri;
1199     iinr             = nlist->iinr;
1200     jindex           = nlist->jindex;
1201     jjnr             = nlist->jjnr;
1202     shiftidx         = nlist->shift;
1203     gid              = nlist->gid;
1204     shiftvec         = fr->shift_vec[0];
1205     fshift           = fr->fshift[0];
1206     facel            = _mm256_set1_pd(fr->epsfac);
1207     charge           = mdatoms->chargeA;
1208     krf              = _mm256_set1_pd(fr->ic->k_rf);
1209     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
1210     crf              = _mm256_set1_pd(fr->ic->c_rf);
1211     nvdwtype         = fr->ntype;
1212     vdwparam         = fr->nbfp;
1213     vdwtype          = mdatoms->typeA;
1214
1215     vftab            = kernel_data->table_vdw->data;
1216     vftabscale       = _mm256_set1_pd(kernel_data->table_vdw->scale);
1217
1218     /* Setup water-specific parameters */
1219     inr              = nlist->iinr[0];
1220     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1221     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1222     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1223     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1224
1225     jq1              = _mm256_set1_pd(charge[inr+1]);
1226     jq2              = _mm256_set1_pd(charge[inr+2]);
1227     jq3              = _mm256_set1_pd(charge[inr+3]);
1228     vdwjidx0A        = 2*vdwtype[inr+0];
1229     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1230     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1231     qq11             = _mm256_mul_pd(iq1,jq1);
1232     qq12             = _mm256_mul_pd(iq1,jq2);
1233     qq13             = _mm256_mul_pd(iq1,jq3);
1234     qq21             = _mm256_mul_pd(iq2,jq1);
1235     qq22             = _mm256_mul_pd(iq2,jq2);
1236     qq23             = _mm256_mul_pd(iq2,jq3);
1237     qq31             = _mm256_mul_pd(iq3,jq1);
1238     qq32             = _mm256_mul_pd(iq3,jq2);
1239     qq33             = _mm256_mul_pd(iq3,jq3);
1240
1241     /* Avoid stupid compiler warnings */
1242     jnrA = jnrB = jnrC = jnrD = 0;
1243     j_coord_offsetA = 0;
1244     j_coord_offsetB = 0;
1245     j_coord_offsetC = 0;
1246     j_coord_offsetD = 0;
1247
1248     outeriter        = 0;
1249     inneriter        = 0;
1250
1251     for(iidx=0;iidx<4*DIM;iidx++)
1252     {
1253         scratch[iidx] = 0.0;
1254     }
1255
1256     /* Start outer loop over neighborlists */
1257     for(iidx=0; iidx<nri; iidx++)
1258     {
1259         /* Load shift vector for this list */
1260         i_shift_offset   = DIM*shiftidx[iidx];
1261
1262         /* Load limits for loop over neighbors */
1263         j_index_start    = jindex[iidx];
1264         j_index_end      = jindex[iidx+1];
1265
1266         /* Get outer coordinate index */
1267         inr              = iinr[iidx];
1268         i_coord_offset   = DIM*inr;
1269
1270         /* Load i particle coords and add shift vector */
1271         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1272                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1273
1274         fix0             = _mm256_setzero_pd();
1275         fiy0             = _mm256_setzero_pd();
1276         fiz0             = _mm256_setzero_pd();
1277         fix1             = _mm256_setzero_pd();
1278         fiy1             = _mm256_setzero_pd();
1279         fiz1             = _mm256_setzero_pd();
1280         fix2             = _mm256_setzero_pd();
1281         fiy2             = _mm256_setzero_pd();
1282         fiz2             = _mm256_setzero_pd();
1283         fix3             = _mm256_setzero_pd();
1284         fiy3             = _mm256_setzero_pd();
1285         fiz3             = _mm256_setzero_pd();
1286
1287         /* Start inner kernel loop */
1288         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1289         {
1290
1291             /* Get j neighbor index, and coordinate index */
1292             jnrA             = jjnr[jidx];
1293             jnrB             = jjnr[jidx+1];
1294             jnrC             = jjnr[jidx+2];
1295             jnrD             = jjnr[jidx+3];
1296             j_coord_offsetA  = DIM*jnrA;
1297             j_coord_offsetB  = DIM*jnrB;
1298             j_coord_offsetC  = DIM*jnrC;
1299             j_coord_offsetD  = DIM*jnrD;
1300
1301             /* load j atom coordinates */
1302             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1303                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1304                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1305                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1306
1307             /* Calculate displacement vector */
1308             dx00             = _mm256_sub_pd(ix0,jx0);
1309             dy00             = _mm256_sub_pd(iy0,jy0);
1310             dz00             = _mm256_sub_pd(iz0,jz0);
1311             dx11             = _mm256_sub_pd(ix1,jx1);
1312             dy11             = _mm256_sub_pd(iy1,jy1);
1313             dz11             = _mm256_sub_pd(iz1,jz1);
1314             dx12             = _mm256_sub_pd(ix1,jx2);
1315             dy12             = _mm256_sub_pd(iy1,jy2);
1316             dz12             = _mm256_sub_pd(iz1,jz2);
1317             dx13             = _mm256_sub_pd(ix1,jx3);
1318             dy13             = _mm256_sub_pd(iy1,jy3);
1319             dz13             = _mm256_sub_pd(iz1,jz3);
1320             dx21             = _mm256_sub_pd(ix2,jx1);
1321             dy21             = _mm256_sub_pd(iy2,jy1);
1322             dz21             = _mm256_sub_pd(iz2,jz1);
1323             dx22             = _mm256_sub_pd(ix2,jx2);
1324             dy22             = _mm256_sub_pd(iy2,jy2);
1325             dz22             = _mm256_sub_pd(iz2,jz2);
1326             dx23             = _mm256_sub_pd(ix2,jx3);
1327             dy23             = _mm256_sub_pd(iy2,jy3);
1328             dz23             = _mm256_sub_pd(iz2,jz3);
1329             dx31             = _mm256_sub_pd(ix3,jx1);
1330             dy31             = _mm256_sub_pd(iy3,jy1);
1331             dz31             = _mm256_sub_pd(iz3,jz1);
1332             dx32             = _mm256_sub_pd(ix3,jx2);
1333             dy32             = _mm256_sub_pd(iy3,jy2);
1334             dz32             = _mm256_sub_pd(iz3,jz2);
1335             dx33             = _mm256_sub_pd(ix3,jx3);
1336             dy33             = _mm256_sub_pd(iy3,jy3);
1337             dz33             = _mm256_sub_pd(iz3,jz3);
1338
1339             /* Calculate squared distance and things based on it */
1340             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1341             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1342             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1343             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1344             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1345             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1346             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1347             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1348             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1349             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1350
1351             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1352             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1353             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1354             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1355             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1356             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1357             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1358             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1359             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1360             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1361
1362             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1363             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1364             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1365             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1366             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1367             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1368             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1369             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1370             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1371
1372             fjx0             = _mm256_setzero_pd();
1373             fjy0             = _mm256_setzero_pd();
1374             fjz0             = _mm256_setzero_pd();
1375             fjx1             = _mm256_setzero_pd();
1376             fjy1             = _mm256_setzero_pd();
1377             fjz1             = _mm256_setzero_pd();
1378             fjx2             = _mm256_setzero_pd();
1379             fjy2             = _mm256_setzero_pd();
1380             fjz2             = _mm256_setzero_pd();
1381             fjx3             = _mm256_setzero_pd();
1382             fjy3             = _mm256_setzero_pd();
1383             fjz3             = _mm256_setzero_pd();
1384
1385             /**************************
1386              * CALCULATE INTERACTIONS *
1387              **************************/
1388
1389             r00              = _mm256_mul_pd(rsq00,rinv00);
1390
1391             /* Calculate table index by multiplying r with table scale and truncate to integer */
1392             rt               = _mm256_mul_pd(r00,vftabscale);
1393             vfitab           = _mm256_cvttpd_epi32(rt);
1394             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1395             vfitab           = _mm_slli_epi32(vfitab,3);
1396
1397             /* CUBIC SPLINE TABLE DISPERSION */
1398             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1399             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1400             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1401             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1402             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1403             Heps             = _mm256_mul_pd(vfeps,H);
1404             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1405             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1406             fvdw6            = _mm256_mul_pd(c6_00,FF);
1407
1408             /* CUBIC SPLINE TABLE REPULSION */
1409             vfitab           = _mm_add_epi32(vfitab,ifour);
1410             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1411             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1412             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1413             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1414             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1415             Heps             = _mm256_mul_pd(vfeps,H);
1416             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1417             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1418             fvdw12           = _mm256_mul_pd(c12_00,FF);
1419             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1420
1421             fscal            = fvdw;
1422
1423             /* Calculate temporary vectorial force */
1424             tx               = _mm256_mul_pd(fscal,dx00);
1425             ty               = _mm256_mul_pd(fscal,dy00);
1426             tz               = _mm256_mul_pd(fscal,dz00);
1427
1428             /* Update vectorial force */
1429             fix0             = _mm256_add_pd(fix0,tx);
1430             fiy0             = _mm256_add_pd(fiy0,ty);
1431             fiz0             = _mm256_add_pd(fiz0,tz);
1432
1433             fjx0             = _mm256_add_pd(fjx0,tx);
1434             fjy0             = _mm256_add_pd(fjy0,ty);
1435             fjz0             = _mm256_add_pd(fjz0,tz);
1436
1437             /**************************
1438              * CALCULATE INTERACTIONS *
1439              **************************/
1440
1441             /* REACTION-FIELD ELECTROSTATICS */
1442             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1443
1444             fscal            = felec;
1445
1446             /* Calculate temporary vectorial force */
1447             tx               = _mm256_mul_pd(fscal,dx11);
1448             ty               = _mm256_mul_pd(fscal,dy11);
1449             tz               = _mm256_mul_pd(fscal,dz11);
1450
1451             /* Update vectorial force */
1452             fix1             = _mm256_add_pd(fix1,tx);
1453             fiy1             = _mm256_add_pd(fiy1,ty);
1454             fiz1             = _mm256_add_pd(fiz1,tz);
1455
1456             fjx1             = _mm256_add_pd(fjx1,tx);
1457             fjy1             = _mm256_add_pd(fjy1,ty);
1458             fjz1             = _mm256_add_pd(fjz1,tz);
1459
1460             /**************************
1461              * CALCULATE INTERACTIONS *
1462              **************************/
1463
1464             /* REACTION-FIELD ELECTROSTATICS */
1465             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1466
1467             fscal            = felec;
1468
1469             /* Calculate temporary vectorial force */
1470             tx               = _mm256_mul_pd(fscal,dx12);
1471             ty               = _mm256_mul_pd(fscal,dy12);
1472             tz               = _mm256_mul_pd(fscal,dz12);
1473
1474             /* Update vectorial force */
1475             fix1             = _mm256_add_pd(fix1,tx);
1476             fiy1             = _mm256_add_pd(fiy1,ty);
1477             fiz1             = _mm256_add_pd(fiz1,tz);
1478
1479             fjx2             = _mm256_add_pd(fjx2,tx);
1480             fjy2             = _mm256_add_pd(fjy2,ty);
1481             fjz2             = _mm256_add_pd(fjz2,tz);
1482
1483             /**************************
1484              * CALCULATE INTERACTIONS *
1485              **************************/
1486
1487             /* REACTION-FIELD ELECTROSTATICS */
1488             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1489
1490             fscal            = felec;
1491
1492             /* Calculate temporary vectorial force */
1493             tx               = _mm256_mul_pd(fscal,dx13);
1494             ty               = _mm256_mul_pd(fscal,dy13);
1495             tz               = _mm256_mul_pd(fscal,dz13);
1496
1497             /* Update vectorial force */
1498             fix1             = _mm256_add_pd(fix1,tx);
1499             fiy1             = _mm256_add_pd(fiy1,ty);
1500             fiz1             = _mm256_add_pd(fiz1,tz);
1501
1502             fjx3             = _mm256_add_pd(fjx3,tx);
1503             fjy3             = _mm256_add_pd(fjy3,ty);
1504             fjz3             = _mm256_add_pd(fjz3,tz);
1505
1506             /**************************
1507              * CALCULATE INTERACTIONS *
1508              **************************/
1509
1510             /* REACTION-FIELD ELECTROSTATICS */
1511             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1512
1513             fscal            = felec;
1514
1515             /* Calculate temporary vectorial force */
1516             tx               = _mm256_mul_pd(fscal,dx21);
1517             ty               = _mm256_mul_pd(fscal,dy21);
1518             tz               = _mm256_mul_pd(fscal,dz21);
1519
1520             /* Update vectorial force */
1521             fix2             = _mm256_add_pd(fix2,tx);
1522             fiy2             = _mm256_add_pd(fiy2,ty);
1523             fiz2             = _mm256_add_pd(fiz2,tz);
1524
1525             fjx1             = _mm256_add_pd(fjx1,tx);
1526             fjy1             = _mm256_add_pd(fjy1,ty);
1527             fjz1             = _mm256_add_pd(fjz1,tz);
1528
1529             /**************************
1530              * CALCULATE INTERACTIONS *
1531              **************************/
1532
1533             /* REACTION-FIELD ELECTROSTATICS */
1534             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1535
1536             fscal            = felec;
1537
1538             /* Calculate temporary vectorial force */
1539             tx               = _mm256_mul_pd(fscal,dx22);
1540             ty               = _mm256_mul_pd(fscal,dy22);
1541             tz               = _mm256_mul_pd(fscal,dz22);
1542
1543             /* Update vectorial force */
1544             fix2             = _mm256_add_pd(fix2,tx);
1545             fiy2             = _mm256_add_pd(fiy2,ty);
1546             fiz2             = _mm256_add_pd(fiz2,tz);
1547
1548             fjx2             = _mm256_add_pd(fjx2,tx);
1549             fjy2             = _mm256_add_pd(fjy2,ty);
1550             fjz2             = _mm256_add_pd(fjz2,tz);
1551
1552             /**************************
1553              * CALCULATE INTERACTIONS *
1554              **************************/
1555
1556             /* REACTION-FIELD ELECTROSTATICS */
1557             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1558
1559             fscal            = felec;
1560
1561             /* Calculate temporary vectorial force */
1562             tx               = _mm256_mul_pd(fscal,dx23);
1563             ty               = _mm256_mul_pd(fscal,dy23);
1564             tz               = _mm256_mul_pd(fscal,dz23);
1565
1566             /* Update vectorial force */
1567             fix2             = _mm256_add_pd(fix2,tx);
1568             fiy2             = _mm256_add_pd(fiy2,ty);
1569             fiz2             = _mm256_add_pd(fiz2,tz);
1570
1571             fjx3             = _mm256_add_pd(fjx3,tx);
1572             fjy3             = _mm256_add_pd(fjy3,ty);
1573             fjz3             = _mm256_add_pd(fjz3,tz);
1574
1575             /**************************
1576              * CALCULATE INTERACTIONS *
1577              **************************/
1578
1579             /* REACTION-FIELD ELECTROSTATICS */
1580             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1581
1582             fscal            = felec;
1583
1584             /* Calculate temporary vectorial force */
1585             tx               = _mm256_mul_pd(fscal,dx31);
1586             ty               = _mm256_mul_pd(fscal,dy31);
1587             tz               = _mm256_mul_pd(fscal,dz31);
1588
1589             /* Update vectorial force */
1590             fix3             = _mm256_add_pd(fix3,tx);
1591             fiy3             = _mm256_add_pd(fiy3,ty);
1592             fiz3             = _mm256_add_pd(fiz3,tz);
1593
1594             fjx1             = _mm256_add_pd(fjx1,tx);
1595             fjy1             = _mm256_add_pd(fjy1,ty);
1596             fjz1             = _mm256_add_pd(fjz1,tz);
1597
1598             /**************************
1599              * CALCULATE INTERACTIONS *
1600              **************************/
1601
1602             /* REACTION-FIELD ELECTROSTATICS */
1603             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1604
1605             fscal            = felec;
1606
1607             /* Calculate temporary vectorial force */
1608             tx               = _mm256_mul_pd(fscal,dx32);
1609             ty               = _mm256_mul_pd(fscal,dy32);
1610             tz               = _mm256_mul_pd(fscal,dz32);
1611
1612             /* Update vectorial force */
1613             fix3             = _mm256_add_pd(fix3,tx);
1614             fiy3             = _mm256_add_pd(fiy3,ty);
1615             fiz3             = _mm256_add_pd(fiz3,tz);
1616
1617             fjx2             = _mm256_add_pd(fjx2,tx);
1618             fjy2             = _mm256_add_pd(fjy2,ty);
1619             fjz2             = _mm256_add_pd(fjz2,tz);
1620
1621             /**************************
1622              * CALCULATE INTERACTIONS *
1623              **************************/
1624
1625             /* REACTION-FIELD ELECTROSTATICS */
1626             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1627
1628             fscal            = felec;
1629
1630             /* Calculate temporary vectorial force */
1631             tx               = _mm256_mul_pd(fscal,dx33);
1632             ty               = _mm256_mul_pd(fscal,dy33);
1633             tz               = _mm256_mul_pd(fscal,dz33);
1634
1635             /* Update vectorial force */
1636             fix3             = _mm256_add_pd(fix3,tx);
1637             fiy3             = _mm256_add_pd(fiy3,ty);
1638             fiz3             = _mm256_add_pd(fiz3,tz);
1639
1640             fjx3             = _mm256_add_pd(fjx3,tx);
1641             fjy3             = _mm256_add_pd(fjy3,ty);
1642             fjz3             = _mm256_add_pd(fjz3,tz);
1643
1644             fjptrA             = f+j_coord_offsetA;
1645             fjptrB             = f+j_coord_offsetB;
1646             fjptrC             = f+j_coord_offsetC;
1647             fjptrD             = f+j_coord_offsetD;
1648
1649             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1650                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1651                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1652
1653             /* Inner loop uses 294 flops */
1654         }
1655
1656         if(jidx<j_index_end)
1657         {
1658
1659             /* Get j neighbor index, and coordinate index */
1660             jnrlistA         = jjnr[jidx];
1661             jnrlistB         = jjnr[jidx+1];
1662             jnrlistC         = jjnr[jidx+2];
1663             jnrlistD         = jjnr[jidx+3];
1664             /* Sign of each element will be negative for non-real atoms.
1665              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1666              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1667              */
1668             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1669
1670             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1671             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1672             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1673
1674             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1675             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1676             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1677             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1678             j_coord_offsetA  = DIM*jnrA;
1679             j_coord_offsetB  = DIM*jnrB;
1680             j_coord_offsetC  = DIM*jnrC;
1681             j_coord_offsetD  = DIM*jnrD;
1682
1683             /* load j atom coordinates */
1684             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1685                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1686                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1687                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1688
1689             /* Calculate displacement vector */
1690             dx00             = _mm256_sub_pd(ix0,jx0);
1691             dy00             = _mm256_sub_pd(iy0,jy0);
1692             dz00             = _mm256_sub_pd(iz0,jz0);
1693             dx11             = _mm256_sub_pd(ix1,jx1);
1694             dy11             = _mm256_sub_pd(iy1,jy1);
1695             dz11             = _mm256_sub_pd(iz1,jz1);
1696             dx12             = _mm256_sub_pd(ix1,jx2);
1697             dy12             = _mm256_sub_pd(iy1,jy2);
1698             dz12             = _mm256_sub_pd(iz1,jz2);
1699             dx13             = _mm256_sub_pd(ix1,jx3);
1700             dy13             = _mm256_sub_pd(iy1,jy3);
1701             dz13             = _mm256_sub_pd(iz1,jz3);
1702             dx21             = _mm256_sub_pd(ix2,jx1);
1703             dy21             = _mm256_sub_pd(iy2,jy1);
1704             dz21             = _mm256_sub_pd(iz2,jz1);
1705             dx22             = _mm256_sub_pd(ix2,jx2);
1706             dy22             = _mm256_sub_pd(iy2,jy2);
1707             dz22             = _mm256_sub_pd(iz2,jz2);
1708             dx23             = _mm256_sub_pd(ix2,jx3);
1709             dy23             = _mm256_sub_pd(iy2,jy3);
1710             dz23             = _mm256_sub_pd(iz2,jz3);
1711             dx31             = _mm256_sub_pd(ix3,jx1);
1712             dy31             = _mm256_sub_pd(iy3,jy1);
1713             dz31             = _mm256_sub_pd(iz3,jz1);
1714             dx32             = _mm256_sub_pd(ix3,jx2);
1715             dy32             = _mm256_sub_pd(iy3,jy2);
1716             dz32             = _mm256_sub_pd(iz3,jz2);
1717             dx33             = _mm256_sub_pd(ix3,jx3);
1718             dy33             = _mm256_sub_pd(iy3,jy3);
1719             dz33             = _mm256_sub_pd(iz3,jz3);
1720
1721             /* Calculate squared distance and things based on it */
1722             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1723             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1724             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1725             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1726             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1727             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1728             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1729             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1730             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1731             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1732
1733             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1734             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1735             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1736             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1737             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1738             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1739             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1740             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1741             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1742             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1743
1744             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1745             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1746             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1747             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1748             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1749             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1750             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1751             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1752             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1753
1754             fjx0             = _mm256_setzero_pd();
1755             fjy0             = _mm256_setzero_pd();
1756             fjz0             = _mm256_setzero_pd();
1757             fjx1             = _mm256_setzero_pd();
1758             fjy1             = _mm256_setzero_pd();
1759             fjz1             = _mm256_setzero_pd();
1760             fjx2             = _mm256_setzero_pd();
1761             fjy2             = _mm256_setzero_pd();
1762             fjz2             = _mm256_setzero_pd();
1763             fjx3             = _mm256_setzero_pd();
1764             fjy3             = _mm256_setzero_pd();
1765             fjz3             = _mm256_setzero_pd();
1766
1767             /**************************
1768              * CALCULATE INTERACTIONS *
1769              **************************/
1770
1771             r00              = _mm256_mul_pd(rsq00,rinv00);
1772             r00              = _mm256_andnot_pd(dummy_mask,r00);
1773
1774             /* Calculate table index by multiplying r with table scale and truncate to integer */
1775             rt               = _mm256_mul_pd(r00,vftabscale);
1776             vfitab           = _mm256_cvttpd_epi32(rt);
1777             vfeps            = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1778             vfitab           = _mm_slli_epi32(vfitab,3);
1779
1780             /* CUBIC SPLINE TABLE DISPERSION */
1781             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1782             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1783             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1784             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1785             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1786             Heps             = _mm256_mul_pd(vfeps,H);
1787             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1788             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1789             fvdw6            = _mm256_mul_pd(c6_00,FF);
1790
1791             /* CUBIC SPLINE TABLE REPULSION */
1792             vfitab           = _mm_add_epi32(vfitab,ifour);
1793             Y                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1794             F                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1795             G                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1796             H                = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1797             GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1798             Heps             = _mm256_mul_pd(vfeps,H);
1799             Fp               = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1800             FF               = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1801             fvdw12           = _mm256_mul_pd(c12_00,FF);
1802             fvdw             = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1803
1804             fscal            = fvdw;
1805
1806             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1807
1808             /* Calculate temporary vectorial force */
1809             tx               = _mm256_mul_pd(fscal,dx00);
1810             ty               = _mm256_mul_pd(fscal,dy00);
1811             tz               = _mm256_mul_pd(fscal,dz00);
1812
1813             /* Update vectorial force */
1814             fix0             = _mm256_add_pd(fix0,tx);
1815             fiy0             = _mm256_add_pd(fiy0,ty);
1816             fiz0             = _mm256_add_pd(fiz0,tz);
1817
1818             fjx0             = _mm256_add_pd(fjx0,tx);
1819             fjy0             = _mm256_add_pd(fjy0,ty);
1820             fjz0             = _mm256_add_pd(fjz0,tz);
1821
1822             /**************************
1823              * CALCULATE INTERACTIONS *
1824              **************************/
1825
1826             /* REACTION-FIELD ELECTROSTATICS */
1827             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1828
1829             fscal            = felec;
1830
1831             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1832
1833             /* Calculate temporary vectorial force */
1834             tx               = _mm256_mul_pd(fscal,dx11);
1835             ty               = _mm256_mul_pd(fscal,dy11);
1836             tz               = _mm256_mul_pd(fscal,dz11);
1837
1838             /* Update vectorial force */
1839             fix1             = _mm256_add_pd(fix1,tx);
1840             fiy1             = _mm256_add_pd(fiy1,ty);
1841             fiz1             = _mm256_add_pd(fiz1,tz);
1842
1843             fjx1             = _mm256_add_pd(fjx1,tx);
1844             fjy1             = _mm256_add_pd(fjy1,ty);
1845             fjz1             = _mm256_add_pd(fjz1,tz);
1846
1847             /**************************
1848              * CALCULATE INTERACTIONS *
1849              **************************/
1850
1851             /* REACTION-FIELD ELECTROSTATICS */
1852             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1853
1854             fscal            = felec;
1855
1856             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1857
1858             /* Calculate temporary vectorial force */
1859             tx               = _mm256_mul_pd(fscal,dx12);
1860             ty               = _mm256_mul_pd(fscal,dy12);
1861             tz               = _mm256_mul_pd(fscal,dz12);
1862
1863             /* Update vectorial force */
1864             fix1             = _mm256_add_pd(fix1,tx);
1865             fiy1             = _mm256_add_pd(fiy1,ty);
1866             fiz1             = _mm256_add_pd(fiz1,tz);
1867
1868             fjx2             = _mm256_add_pd(fjx2,tx);
1869             fjy2             = _mm256_add_pd(fjy2,ty);
1870             fjz2             = _mm256_add_pd(fjz2,tz);
1871
1872             /**************************
1873              * CALCULATE INTERACTIONS *
1874              **************************/
1875
1876             /* REACTION-FIELD ELECTROSTATICS */
1877             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1878
1879             fscal            = felec;
1880
1881             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1882
1883             /* Calculate temporary vectorial force */
1884             tx               = _mm256_mul_pd(fscal,dx13);
1885             ty               = _mm256_mul_pd(fscal,dy13);
1886             tz               = _mm256_mul_pd(fscal,dz13);
1887
1888             /* Update vectorial force */
1889             fix1             = _mm256_add_pd(fix1,tx);
1890             fiy1             = _mm256_add_pd(fiy1,ty);
1891             fiz1             = _mm256_add_pd(fiz1,tz);
1892
1893             fjx3             = _mm256_add_pd(fjx3,tx);
1894             fjy3             = _mm256_add_pd(fjy3,ty);
1895             fjz3             = _mm256_add_pd(fjz3,tz);
1896
1897             /**************************
1898              * CALCULATE INTERACTIONS *
1899              **************************/
1900
1901             /* REACTION-FIELD ELECTROSTATICS */
1902             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1903
1904             fscal            = felec;
1905
1906             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1907
1908             /* Calculate temporary vectorial force */
1909             tx               = _mm256_mul_pd(fscal,dx21);
1910             ty               = _mm256_mul_pd(fscal,dy21);
1911             tz               = _mm256_mul_pd(fscal,dz21);
1912
1913             /* Update vectorial force */
1914             fix2             = _mm256_add_pd(fix2,tx);
1915             fiy2             = _mm256_add_pd(fiy2,ty);
1916             fiz2             = _mm256_add_pd(fiz2,tz);
1917
1918             fjx1             = _mm256_add_pd(fjx1,tx);
1919             fjy1             = _mm256_add_pd(fjy1,ty);
1920             fjz1             = _mm256_add_pd(fjz1,tz);
1921
1922             /**************************
1923              * CALCULATE INTERACTIONS *
1924              **************************/
1925
1926             /* REACTION-FIELD ELECTROSTATICS */
1927             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1928
1929             fscal            = felec;
1930
1931             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1932
1933             /* Calculate temporary vectorial force */
1934             tx               = _mm256_mul_pd(fscal,dx22);
1935             ty               = _mm256_mul_pd(fscal,dy22);
1936             tz               = _mm256_mul_pd(fscal,dz22);
1937
1938             /* Update vectorial force */
1939             fix2             = _mm256_add_pd(fix2,tx);
1940             fiy2             = _mm256_add_pd(fiy2,ty);
1941             fiz2             = _mm256_add_pd(fiz2,tz);
1942
1943             fjx2             = _mm256_add_pd(fjx2,tx);
1944             fjy2             = _mm256_add_pd(fjy2,ty);
1945             fjz2             = _mm256_add_pd(fjz2,tz);
1946
1947             /**************************
1948              * CALCULATE INTERACTIONS *
1949              **************************/
1950
1951             /* REACTION-FIELD ELECTROSTATICS */
1952             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1953
1954             fscal            = felec;
1955
1956             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1957
1958             /* Calculate temporary vectorial force */
1959             tx               = _mm256_mul_pd(fscal,dx23);
1960             ty               = _mm256_mul_pd(fscal,dy23);
1961             tz               = _mm256_mul_pd(fscal,dz23);
1962
1963             /* Update vectorial force */
1964             fix2             = _mm256_add_pd(fix2,tx);
1965             fiy2             = _mm256_add_pd(fiy2,ty);
1966             fiz2             = _mm256_add_pd(fiz2,tz);
1967
1968             fjx3             = _mm256_add_pd(fjx3,tx);
1969             fjy3             = _mm256_add_pd(fjy3,ty);
1970             fjz3             = _mm256_add_pd(fjz3,tz);
1971
1972             /**************************
1973              * CALCULATE INTERACTIONS *
1974              **************************/
1975
1976             /* REACTION-FIELD ELECTROSTATICS */
1977             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1978
1979             fscal            = felec;
1980
1981             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1982
1983             /* Calculate temporary vectorial force */
1984             tx               = _mm256_mul_pd(fscal,dx31);
1985             ty               = _mm256_mul_pd(fscal,dy31);
1986             tz               = _mm256_mul_pd(fscal,dz31);
1987
1988             /* Update vectorial force */
1989             fix3             = _mm256_add_pd(fix3,tx);
1990             fiy3             = _mm256_add_pd(fiy3,ty);
1991             fiz3             = _mm256_add_pd(fiz3,tz);
1992
1993             fjx1             = _mm256_add_pd(fjx1,tx);
1994             fjy1             = _mm256_add_pd(fjy1,ty);
1995             fjz1             = _mm256_add_pd(fjz1,tz);
1996
1997             /**************************
1998              * CALCULATE INTERACTIONS *
1999              **************************/
2000
2001             /* REACTION-FIELD ELECTROSTATICS */
2002             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
2003
2004             fscal            = felec;
2005
2006             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2007
2008             /* Calculate temporary vectorial force */
2009             tx               = _mm256_mul_pd(fscal,dx32);
2010             ty               = _mm256_mul_pd(fscal,dy32);
2011             tz               = _mm256_mul_pd(fscal,dz32);
2012
2013             /* Update vectorial force */
2014             fix3             = _mm256_add_pd(fix3,tx);
2015             fiy3             = _mm256_add_pd(fiy3,ty);
2016             fiz3             = _mm256_add_pd(fiz3,tz);
2017
2018             fjx2             = _mm256_add_pd(fjx2,tx);
2019             fjy2             = _mm256_add_pd(fjy2,ty);
2020             fjz2             = _mm256_add_pd(fjz2,tz);
2021
2022             /**************************
2023              * CALCULATE INTERACTIONS *
2024              **************************/
2025
2026             /* REACTION-FIELD ELECTROSTATICS */
2027             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
2028
2029             fscal            = felec;
2030
2031             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2032
2033             /* Calculate temporary vectorial force */
2034             tx               = _mm256_mul_pd(fscal,dx33);
2035             ty               = _mm256_mul_pd(fscal,dy33);
2036             tz               = _mm256_mul_pd(fscal,dz33);
2037
2038             /* Update vectorial force */
2039             fix3             = _mm256_add_pd(fix3,tx);
2040             fiy3             = _mm256_add_pd(fiy3,ty);
2041             fiz3             = _mm256_add_pd(fiz3,tz);
2042
2043             fjx3             = _mm256_add_pd(fjx3,tx);
2044             fjy3             = _mm256_add_pd(fjy3,ty);
2045             fjz3             = _mm256_add_pd(fjz3,tz);
2046
2047             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2048             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2049             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2050             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2051
2052             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2053                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2054                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2055
2056             /* Inner loop uses 295 flops */
2057         }
2058
2059         /* End of innermost loop */
2060
2061         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2062                                                  f+i_coord_offset,fshift+i_shift_offset);
2063
2064         /* Increment number of inner iterations */
2065         inneriter                  += j_index_end - j_index_start;
2066
2067         /* Outer loop uses 24 flops */
2068     }
2069
2070     /* Increment number of outer iterations */
2071     outeriter        += nri;
2072
2073     /* Update outer/inner flops */
2074
2075     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*295);
2076 }