1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
54  * Electrostatics interaction: ReactionField
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
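/*
 * Per pair, the reaction-field electrostatics evaluated below are
 *   velec = qq*(1/r + k_rf*r^2 - c_rf)      (potential)
 *   fscal = qq*(1/r^3 - 2*k_rf)             (force divided by r)
 * while the single Lennard-Jones 0-0 interaction is multiplied by a
 * fifth-order switch function between rvdw_switch and the cutoff.
 */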
59 void
60 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84     real             scratch[4*DIM];
85     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     real *           vdwioffsetptr3;
93     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
95     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
97     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
99     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
101     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
102     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
103     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
106     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
109     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
110     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
111     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
112     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
113     real             *charge;
114     int              nvdwtype;
115     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
116     int              *vdwtype;
117     real             *vdwparam;
118     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
119     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
120     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
121     real             rswitch_scalar,d_scalar;
122     __m256d          dummy_mask,cutoff_mask;
123     __m128           tmpmask0,tmpmask1;
124     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
125     __m256d          one     = _mm256_set1_pd(1.0);
126     __m256d          two     = _mm256_set1_pd(2.0);
127     x                = xx[0];
128     f                = ff[0];
129
130     nri              = nlist->nri;
131     iinr             = nlist->iinr;
132     jindex           = nlist->jindex;
133     jjnr             = nlist->jjnr;
134     shiftidx         = nlist->shift;
135     gid              = nlist->gid;
136     shiftvec         = fr->shift_vec[0];
137     fshift           = fr->fshift[0];
138     facel            = _mm256_set1_pd(fr->epsfac);
139     charge           = mdatoms->chargeA;
140     krf              = _mm256_set1_pd(fr->ic->k_rf);
141     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
142     crf              = _mm256_set1_pd(fr->ic->c_rf);
143     nvdwtype         = fr->ntype;
144     vdwparam         = fr->nbfp;
145     vdwtype          = mdatoms->typeA;
146
147     /* Setup water-specific parameters */
148     inr              = nlist->iinr[0];
149     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
150     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
151     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
152     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
153
154     jq1              = _mm256_set1_pd(charge[inr+1]);
155     jq2              = _mm256_set1_pd(charge[inr+2]);
156     jq3              = _mm256_set1_pd(charge[inr+3]);
157     vdwjidx0A        = 2*vdwtype[inr+0];
158     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
159     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
160     qq11             = _mm256_mul_pd(iq1,jq1);
161     qq12             = _mm256_mul_pd(iq1,jq2);
162     qq13             = _mm256_mul_pd(iq1,jq3);
163     qq21             = _mm256_mul_pd(iq2,jq1);
164     qq22             = _mm256_mul_pd(iq2,jq2);
165     qq23             = _mm256_mul_pd(iq2,jq3);
166     qq31             = _mm256_mul_pd(iq3,jq1);
167     qq32             = _mm256_mul_pd(iq3,jq2);
168     qq33             = _mm256_mul_pd(iq3,jq3);
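    /* For the water4 geometry only site 0 carries a VdW type, so a single
     * c6/c12 pair is broadcast once, while sites 1-3 carry the charges and
     * give the nine charge products qq11..qq33 used in the inner loops.
     */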
169
170     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
171     rcutoff_scalar   = fr->rcoulomb;
172     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
173     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
174
175     rswitch_scalar   = fr->rvdw_switch;
176     rswitch          = _mm256_set1_pd(rswitch_scalar);
177     /* Setup switch parameters */
178     d_scalar         = rcutoff_scalar-rswitch_scalar;
179     d                = _mm256_set1_pd(d_scalar);
180     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
181     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
182     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
183     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
184     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
185     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
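    /* With d = r - rswitch and D = rcutoff - rswitch this gives the switch
     *   sw(d)  = 1 - 10*(d/D)^3 + 15*(d/D)^4 - 6*(d/D)^5
     *   dsw(d) = sw'(d) = swF2*d^2 + swF3*d^3 + swF4*d^4
     * so sw goes from 1 at rswitch to 0 at the cutoff with vanishing first
     * and second derivatives at both ends.
     */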
186
187     /* Avoid stupid compiler warnings */
188     jnrA = jnrB = jnrC = jnrD = 0;
189     j_coord_offsetA = 0;
190     j_coord_offsetB = 0;
191     j_coord_offsetC = 0;
192     j_coord_offsetD = 0;
193
194     outeriter        = 0;
195     inneriter        = 0;
196
197     for(iidx=0;iidx<4*DIM;iidx++)
198     {
199         scratch[iidx] = 0.0;
200     }
201
202     /* Start outer loop over neighborlists */
203     for(iidx=0; iidx<nri; iidx++)
204     {
205         /* Load shift vector for this list */
206         i_shift_offset   = DIM*shiftidx[iidx];
207
208         /* Load limits for loop over neighbors */
209         j_index_start    = jindex[iidx];
210         j_index_end      = jindex[iidx+1];
211
212         /* Get outer coordinate index */
213         inr              = iinr[iidx];
214         i_coord_offset   = DIM*inr;
215
216         /* Load i particle coords and add shift vector */
217         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
218                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
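        /* The four sites of this i water are identical in all four SIMD lanes;
         * only the j coordinates differ per lane, so the shifted i data is
         * broadcast once per outer iteration.
         */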
219
220         fix0             = _mm256_setzero_pd();
221         fiy0             = _mm256_setzero_pd();
222         fiz0             = _mm256_setzero_pd();
223         fix1             = _mm256_setzero_pd();
224         fiy1             = _mm256_setzero_pd();
225         fiz1             = _mm256_setzero_pd();
226         fix2             = _mm256_setzero_pd();
227         fiy2             = _mm256_setzero_pd();
228         fiz2             = _mm256_setzero_pd();
229         fix3             = _mm256_setzero_pd();
230         fiy3             = _mm256_setzero_pd();
231         fiz3             = _mm256_setzero_pd();
232
233         /* Reset potential sums */
234         velecsum         = _mm256_setzero_pd();
235         vvdwsum          = _mm256_setzero_pd();
236
237         /* Start inner kernel loop */
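        /* Full quadruplets of j waters are processed here; the neighbor list is
         * padded with negative indices, so the loop stops at the first padded
         * entry and the remainder is handled by the masked epilogue below.
         */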
238         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
239         {
240
241             /* Get j neighbor index, and coordinate index */
242             jnrA             = jjnr[jidx];
243             jnrB             = jjnr[jidx+1];
244             jnrC             = jjnr[jidx+2];
245             jnrD             = jjnr[jidx+3];
246             j_coord_offsetA  = DIM*jnrA;
247             j_coord_offsetB  = DIM*jnrB;
248             j_coord_offsetC  = DIM*jnrC;
249             j_coord_offsetD  = DIM*jnrD;
250
251             /* load j atom coordinates */
252             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
253                                                  x+j_coord_offsetC,x+j_coord_offsetD,
254                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
255                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
256
257             /* Calculate displacement vector */
258             dx00             = _mm256_sub_pd(ix0,jx0);
259             dy00             = _mm256_sub_pd(iy0,jy0);
260             dz00             = _mm256_sub_pd(iz0,jz0);
261             dx11             = _mm256_sub_pd(ix1,jx1);
262             dy11             = _mm256_sub_pd(iy1,jy1);
263             dz11             = _mm256_sub_pd(iz1,jz1);
264             dx12             = _mm256_sub_pd(ix1,jx2);
265             dy12             = _mm256_sub_pd(iy1,jy2);
266             dz12             = _mm256_sub_pd(iz1,jz2);
267             dx13             = _mm256_sub_pd(ix1,jx3);
268             dy13             = _mm256_sub_pd(iy1,jy3);
269             dz13             = _mm256_sub_pd(iz1,jz3);
270             dx21             = _mm256_sub_pd(ix2,jx1);
271             dy21             = _mm256_sub_pd(iy2,jy1);
272             dz21             = _mm256_sub_pd(iz2,jz1);
273             dx22             = _mm256_sub_pd(ix2,jx2);
274             dy22             = _mm256_sub_pd(iy2,jy2);
275             dz22             = _mm256_sub_pd(iz2,jz2);
276             dx23             = _mm256_sub_pd(ix2,jx3);
277             dy23             = _mm256_sub_pd(iy2,jy3);
278             dz23             = _mm256_sub_pd(iz2,jz3);
279             dx31             = _mm256_sub_pd(ix3,jx1);
280             dy31             = _mm256_sub_pd(iy3,jy1);
281             dz31             = _mm256_sub_pd(iz3,jz1);
282             dx32             = _mm256_sub_pd(ix3,jx2);
283             dy32             = _mm256_sub_pd(iy3,jy2);
284             dz32             = _mm256_sub_pd(iz3,jz2);
285             dx33             = _mm256_sub_pd(ix3,jx3);
286             dy33             = _mm256_sub_pd(iy3,jy3);
287             dz33             = _mm256_sub_pd(iz3,jz3);
288
289             /* Calculate squared distance and things based on it */
290             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
291             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
292             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
293             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
294             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
295             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
296             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
297             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
298             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
299             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
300
301             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
302             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
303             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
304             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
305             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
306             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
307             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
308             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
309             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
310             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
311
312             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
313             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
314             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
315             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
316             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
317             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
318             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
319             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
320             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
321             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
322
323             fjx0             = _mm256_setzero_pd();
324             fjy0             = _mm256_setzero_pd();
325             fjz0             = _mm256_setzero_pd();
326             fjx1             = _mm256_setzero_pd();
327             fjy1             = _mm256_setzero_pd();
328             fjz1             = _mm256_setzero_pd();
329             fjx2             = _mm256_setzero_pd();
330             fjy2             = _mm256_setzero_pd();
331             fjz2             = _mm256_setzero_pd();
332             fjx3             = _mm256_setzero_pd();
333             fjy3             = _mm256_setzero_pd();
334             fjz3             = _mm256_setzero_pd();
335
336             /**************************
337              * CALCULATE INTERACTIONS *
338              **************************/
339
340             if (gmx_mm256_any_lt(rsq00,rcutoff2))
341             {
342
343             r00              = _mm256_mul_pd(rsq00,rinv00);
344
345             /* LENNARD-JONES DISPERSION/REPULSION */
346
347             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
348             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
349             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
350             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
351             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
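            /* Unswitched Lennard-Jones: vvdw = c12*rinv^12/12 - c6*rinv^6/6 and
             * fscal = (c12*rinv^12 - c6*rinv^6)/r^2; the c6/c12 parameters taken
             * from fr->nbfp are assumed to already include the factors 6 and 12.
             */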
352
353             d                = _mm256_sub_pd(r00,rswitch);
354             d                = _mm256_max_pd(d,_mm256_setzero_pd());
355             d2               = _mm256_mul_pd(d,d);
356             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
357
358             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
359
360             /* Evaluate switch function */
361             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw + v*dsw)/r = fscal*sw - v*dsw/r */
362             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
363             vvdw             = _mm256_mul_pd(vvdw,sw);
364             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
365
366             /* Update potential sum for this i atom from the interaction with this j atom. */
367             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
368             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
369
370             fscal            = fvdw;
371
372             fscal            = _mm256_and_pd(fscal,cutoff_mask);
373
374             /* Calculate temporary vectorial force */
375             tx               = _mm256_mul_pd(fscal,dx00);
376             ty               = _mm256_mul_pd(fscal,dy00);
377             tz               = _mm256_mul_pd(fscal,dz00);
378
379             /* Update vectorial force */
380             fix0             = _mm256_add_pd(fix0,tx);
381             fiy0             = _mm256_add_pd(fiy0,ty);
382             fiz0             = _mm256_add_pd(fiz0,tz);
383
384             fjx0             = _mm256_add_pd(fjx0,tx);
385             fjy0             = _mm256_add_pd(fjy0,ty);
386             fjz0             = _mm256_add_pd(fjz0,tz);
387
388             }
389
390             /**************************
391              * CALCULATE INTERACTIONS *
392              **************************/
393
394             if (gmx_mm256_any_lt(rsq11,rcutoff2))
395             {
396
397             /* REACTION-FIELD ELECTROSTATICS */
398             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
399             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
400
401             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
402
403             /* Update potential sum for this i atom from the interaction with this j atom. */
404             velec            = _mm256_and_pd(velec,cutoff_mask);
405             velecsum         = _mm256_add_pd(velecsum,velec);
406
407             fscal            = felec;
408
409             fscal            = _mm256_and_pd(fscal,cutoff_mask);
410
411             /* Calculate temporary vectorial force */
412             tx               = _mm256_mul_pd(fscal,dx11);
413             ty               = _mm256_mul_pd(fscal,dy11);
414             tz               = _mm256_mul_pd(fscal,dz11);
415
416             /* Update vectorial force */
417             fix1             = _mm256_add_pd(fix1,tx);
418             fiy1             = _mm256_add_pd(fiy1,ty);
419             fiz1             = _mm256_add_pd(fiz1,tz);
420
421             fjx1             = _mm256_add_pd(fjx1,tx);
422             fjy1             = _mm256_add_pd(fjy1,ty);
423             fjz1             = _mm256_add_pd(fjz1,tz);
424
425             }
426
427             /**************************
428              * CALCULATE INTERACTIONS *
429              **************************/
430
431             if (gmx_mm256_any_lt(rsq12,rcutoff2))
432             {
433
434             /* REACTION-FIELD ELECTROSTATICS */
435             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
436             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
437
438             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
439
440             /* Update potential sum for this i atom from the interaction with this j atom. */
441             velec            = _mm256_and_pd(velec,cutoff_mask);
442             velecsum         = _mm256_add_pd(velecsum,velec);
443
444             fscal            = felec;
445
446             fscal            = _mm256_and_pd(fscal,cutoff_mask);
447
448             /* Calculate temporary vectorial force */
449             tx               = _mm256_mul_pd(fscal,dx12);
450             ty               = _mm256_mul_pd(fscal,dy12);
451             tz               = _mm256_mul_pd(fscal,dz12);
452
453             /* Update vectorial force */
454             fix1             = _mm256_add_pd(fix1,tx);
455             fiy1             = _mm256_add_pd(fiy1,ty);
456             fiz1             = _mm256_add_pd(fiz1,tz);
457
458             fjx2             = _mm256_add_pd(fjx2,tx);
459             fjy2             = _mm256_add_pd(fjy2,ty);
460             fjz2             = _mm256_add_pd(fjz2,tz);
461
462             }
463
464             /**************************
465              * CALCULATE INTERACTIONS *
466              **************************/
467
468             if (gmx_mm256_any_lt(rsq13,rcutoff2))
469             {
470
471             /* REACTION-FIELD ELECTROSTATICS */
472             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
473             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
474
475             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
476
477             /* Update potential sum for this i atom from the interaction with this j atom. */
478             velec            = _mm256_and_pd(velec,cutoff_mask);
479             velecsum         = _mm256_add_pd(velecsum,velec);
480
481             fscal            = felec;
482
483             fscal            = _mm256_and_pd(fscal,cutoff_mask);
484
485             /* Calculate temporary vectorial force */
486             tx               = _mm256_mul_pd(fscal,dx13);
487             ty               = _mm256_mul_pd(fscal,dy13);
488             tz               = _mm256_mul_pd(fscal,dz13);
489
490             /* Update vectorial force */
491             fix1             = _mm256_add_pd(fix1,tx);
492             fiy1             = _mm256_add_pd(fiy1,ty);
493             fiz1             = _mm256_add_pd(fiz1,tz);
494
495             fjx3             = _mm256_add_pd(fjx3,tx);
496             fjy3             = _mm256_add_pd(fjy3,ty);
497             fjz3             = _mm256_add_pd(fjz3,tz);
498
499             }
500
501             /**************************
502              * CALCULATE INTERACTIONS *
503              **************************/
504
505             if (gmx_mm256_any_lt(rsq21,rcutoff2))
506             {
507
508             /* REACTION-FIELD ELECTROSTATICS */
509             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
510             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
511
512             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
513
514             /* Update potential sum for this i atom from the interaction with this j atom. */
515             velec            = _mm256_and_pd(velec,cutoff_mask);
516             velecsum         = _mm256_add_pd(velecsum,velec);
517
518             fscal            = felec;
519
520             fscal            = _mm256_and_pd(fscal,cutoff_mask);
521
522             /* Calculate temporary vectorial force */
523             tx               = _mm256_mul_pd(fscal,dx21);
524             ty               = _mm256_mul_pd(fscal,dy21);
525             tz               = _mm256_mul_pd(fscal,dz21);
526
527             /* Update vectorial force */
528             fix2             = _mm256_add_pd(fix2,tx);
529             fiy2             = _mm256_add_pd(fiy2,ty);
530             fiz2             = _mm256_add_pd(fiz2,tz);
531
532             fjx1             = _mm256_add_pd(fjx1,tx);
533             fjy1             = _mm256_add_pd(fjy1,ty);
534             fjz1             = _mm256_add_pd(fjz1,tz);
535
536             }
537
538             /**************************
539              * CALCULATE INTERACTIONS *
540              **************************/
541
542             if (gmx_mm256_any_lt(rsq22,rcutoff2))
543             {
544
545             /* REACTION-FIELD ELECTROSTATICS */
546             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
547             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
548
549             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
550
551             /* Update potential sum for this i atom from the interaction with this j atom. */
552             velec            = _mm256_and_pd(velec,cutoff_mask);
553             velecsum         = _mm256_add_pd(velecsum,velec);
554
555             fscal            = felec;
556
557             fscal            = _mm256_and_pd(fscal,cutoff_mask);
558
559             /* Calculate temporary vectorial force */
560             tx               = _mm256_mul_pd(fscal,dx22);
561             ty               = _mm256_mul_pd(fscal,dy22);
562             tz               = _mm256_mul_pd(fscal,dz22);
563
564             /* Update vectorial force */
565             fix2             = _mm256_add_pd(fix2,tx);
566             fiy2             = _mm256_add_pd(fiy2,ty);
567             fiz2             = _mm256_add_pd(fiz2,tz);
568
569             fjx2             = _mm256_add_pd(fjx2,tx);
570             fjy2             = _mm256_add_pd(fjy2,ty);
571             fjz2             = _mm256_add_pd(fjz2,tz);
572
573             }
574
575             /**************************
576              * CALCULATE INTERACTIONS *
577              **************************/
578
579             if (gmx_mm256_any_lt(rsq23,rcutoff2))
580             {
581
582             /* REACTION-FIELD ELECTROSTATICS */
583             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
584             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
585
586             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
587
588             /* Update potential sum for this i atom from the interaction with this j atom. */
589             velec            = _mm256_and_pd(velec,cutoff_mask);
590             velecsum         = _mm256_add_pd(velecsum,velec);
591
592             fscal            = felec;
593
594             fscal            = _mm256_and_pd(fscal,cutoff_mask);
595
596             /* Calculate temporary vectorial force */
597             tx               = _mm256_mul_pd(fscal,dx23);
598             ty               = _mm256_mul_pd(fscal,dy23);
599             tz               = _mm256_mul_pd(fscal,dz23);
600
601             /* Update vectorial force */
602             fix2             = _mm256_add_pd(fix2,tx);
603             fiy2             = _mm256_add_pd(fiy2,ty);
604             fiz2             = _mm256_add_pd(fiz2,tz);
605
606             fjx3             = _mm256_add_pd(fjx3,tx);
607             fjy3             = _mm256_add_pd(fjy3,ty);
608             fjz3             = _mm256_add_pd(fjz3,tz);
609
610             }
611
612             /**************************
613              * CALCULATE INTERACTIONS *
614              **************************/
615
616             if (gmx_mm256_any_lt(rsq31,rcutoff2))
617             {
618
619             /* REACTION-FIELD ELECTROSTATICS */
620             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
621             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
622
623             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
624
625             /* Update potential sum for this i atom from the interaction with this j atom. */
626             velec            = _mm256_and_pd(velec,cutoff_mask);
627             velecsum         = _mm256_add_pd(velecsum,velec);
628
629             fscal            = felec;
630
631             fscal            = _mm256_and_pd(fscal,cutoff_mask);
632
633             /* Calculate temporary vectorial force */
634             tx               = _mm256_mul_pd(fscal,dx31);
635             ty               = _mm256_mul_pd(fscal,dy31);
636             tz               = _mm256_mul_pd(fscal,dz31);
637
638             /* Update vectorial force */
639             fix3             = _mm256_add_pd(fix3,tx);
640             fiy3             = _mm256_add_pd(fiy3,ty);
641             fiz3             = _mm256_add_pd(fiz3,tz);
642
643             fjx1             = _mm256_add_pd(fjx1,tx);
644             fjy1             = _mm256_add_pd(fjy1,ty);
645             fjz1             = _mm256_add_pd(fjz1,tz);
646
647             }
648
649             /**************************
650              * CALCULATE INTERACTIONS *
651              **************************/
652
653             if (gmx_mm256_any_lt(rsq32,rcutoff2))
654             {
655
656             /* REACTION-FIELD ELECTROSTATICS */
657             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
658             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
659
660             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
661
662             /* Update potential sum for this i atom from the interaction with this j atom. */
663             velec            = _mm256_and_pd(velec,cutoff_mask);
664             velecsum         = _mm256_add_pd(velecsum,velec);
665
666             fscal            = felec;
667
668             fscal            = _mm256_and_pd(fscal,cutoff_mask);
669
670             /* Calculate temporary vectorial force */
671             tx               = _mm256_mul_pd(fscal,dx32);
672             ty               = _mm256_mul_pd(fscal,dy32);
673             tz               = _mm256_mul_pd(fscal,dz32);
674
675             /* Update vectorial force */
676             fix3             = _mm256_add_pd(fix3,tx);
677             fiy3             = _mm256_add_pd(fiy3,ty);
678             fiz3             = _mm256_add_pd(fiz3,tz);
679
680             fjx2             = _mm256_add_pd(fjx2,tx);
681             fjy2             = _mm256_add_pd(fjy2,ty);
682             fjz2             = _mm256_add_pd(fjz2,tz);
683
684             }
685
686             /**************************
687              * CALCULATE INTERACTIONS *
688              **************************/
689
690             if (gmx_mm256_any_lt(rsq33,rcutoff2))
691             {
692
693             /* REACTION-FIELD ELECTROSTATICS */
694             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
695             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
696
697             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
698
699             /* Update potential sum for this i atom from the interaction with this j atom. */
700             velec            = _mm256_and_pd(velec,cutoff_mask);
701             velecsum         = _mm256_add_pd(velecsum,velec);
702
703             fscal            = felec;
704
705             fscal            = _mm256_and_pd(fscal,cutoff_mask);
706
707             /* Calculate temporary vectorial force */
708             tx               = _mm256_mul_pd(fscal,dx33);
709             ty               = _mm256_mul_pd(fscal,dy33);
710             tz               = _mm256_mul_pd(fscal,dz33);
711
712             /* Update vectorial force */
713             fix3             = _mm256_add_pd(fix3,tx);
714             fiy3             = _mm256_add_pd(fiy3,ty);
715             fiz3             = _mm256_add_pd(fiz3,tz);
716
717             fjx3             = _mm256_add_pd(fjx3,tx);
718             fjy3             = _mm256_add_pd(fjy3,ty);
719             fjz3             = _mm256_add_pd(fjz3,tz);
720
721             }
722
723             fjptrA             = f+j_coord_offsetA;
724             fjptrB             = f+j_coord_offsetB;
725             fjptrC             = f+j_coord_offsetC;
726             fjptrD             = f+j_coord_offsetD;
727
728             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
729                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
730                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
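            /* Newton's third law: the per-j force sums accumulated above are
             * subtracted from the force arrays of the four j waters, while the
             * matching positive contributions stay in the fi accumulators.
             */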
731
732             /* Inner loop uses 386 flops */
733         }
734
735         if(jidx<j_index_end)
736         {
737
738             /* Get j neighbor index, and coordinate index */
739             jnrlistA         = jjnr[jidx];
740             jnrlistB         = jjnr[jidx+1];
741             jnrlistC         = jjnr[jidx+2];
742             jnrlistD         = jjnr[jidx+3];
743             /* Sign of each element will be negative for non-real atoms.
744              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
745              * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
746              */
747             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
748
749             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
750             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
751             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
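            /* Each 32-bit comparison result is duplicated into a 64-bit lane so
             * that dummy_mask lines up with the double-precision SIMD lanes and
             * can be used with _mm256_andnot_pd below.
             */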
752
753             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
754             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
755             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
756             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
757             j_coord_offsetA  = DIM*jnrA;
758             j_coord_offsetB  = DIM*jnrB;
759             j_coord_offsetC  = DIM*jnrC;
760             j_coord_offsetD  = DIM*jnrD;
761
762             /* load j atom coordinates */
763             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
764                                                  x+j_coord_offsetC,x+j_coord_offsetD,
765                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
766                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
767
768             /* Calculate displacement vector */
769             dx00             = _mm256_sub_pd(ix0,jx0);
770             dy00             = _mm256_sub_pd(iy0,jy0);
771             dz00             = _mm256_sub_pd(iz0,jz0);
772             dx11             = _mm256_sub_pd(ix1,jx1);
773             dy11             = _mm256_sub_pd(iy1,jy1);
774             dz11             = _mm256_sub_pd(iz1,jz1);
775             dx12             = _mm256_sub_pd(ix1,jx2);
776             dy12             = _mm256_sub_pd(iy1,jy2);
777             dz12             = _mm256_sub_pd(iz1,jz2);
778             dx13             = _mm256_sub_pd(ix1,jx3);
779             dy13             = _mm256_sub_pd(iy1,jy3);
780             dz13             = _mm256_sub_pd(iz1,jz3);
781             dx21             = _mm256_sub_pd(ix2,jx1);
782             dy21             = _mm256_sub_pd(iy2,jy1);
783             dz21             = _mm256_sub_pd(iz2,jz1);
784             dx22             = _mm256_sub_pd(ix2,jx2);
785             dy22             = _mm256_sub_pd(iy2,jy2);
786             dz22             = _mm256_sub_pd(iz2,jz2);
787             dx23             = _mm256_sub_pd(ix2,jx3);
788             dy23             = _mm256_sub_pd(iy2,jy3);
789             dz23             = _mm256_sub_pd(iz2,jz3);
790             dx31             = _mm256_sub_pd(ix3,jx1);
791             dy31             = _mm256_sub_pd(iy3,jy1);
792             dz31             = _mm256_sub_pd(iz3,jz1);
793             dx32             = _mm256_sub_pd(ix3,jx2);
794             dy32             = _mm256_sub_pd(iy3,jy2);
795             dz32             = _mm256_sub_pd(iz3,jz2);
796             dx33             = _mm256_sub_pd(ix3,jx3);
797             dy33             = _mm256_sub_pd(iy3,jy3);
798             dz33             = _mm256_sub_pd(iz3,jz3);
799
800             /* Calculate squared distance and things based on it */
801             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
802             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
803             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
804             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
805             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
806             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
807             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
808             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
809             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
810             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
811
812             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
813             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
814             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
815             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
816             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
817             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
818             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
819             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
820             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
821             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
822
823             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
824             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
825             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
826             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
827             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
828             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
829             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
830             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
831             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
832             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
833
834             fjx0             = _mm256_setzero_pd();
835             fjy0             = _mm256_setzero_pd();
836             fjz0             = _mm256_setzero_pd();
837             fjx1             = _mm256_setzero_pd();
838             fjy1             = _mm256_setzero_pd();
839             fjz1             = _mm256_setzero_pd();
840             fjx2             = _mm256_setzero_pd();
841             fjy2             = _mm256_setzero_pd();
842             fjz2             = _mm256_setzero_pd();
843             fjx3             = _mm256_setzero_pd();
844             fjy3             = _mm256_setzero_pd();
845             fjz3             = _mm256_setzero_pd();
846
847             /**************************
848              * CALCULATE INTERACTIONS *
849              **************************/
850
851             if (gmx_mm256_any_lt(rsq00,rcutoff2))
852             {
853
854             r00              = _mm256_mul_pd(rsq00,rinv00);
855             r00              = _mm256_andnot_pd(dummy_mask,r00);
856
857             /* LENNARD-JONES DISPERSION/REPULSION */
858
859             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
860             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
861             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
862             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
863             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
864
865             d                = _mm256_sub_pd(r00,rswitch);
866             d                = _mm256_max_pd(d,_mm256_setzero_pd());
867             d2               = _mm256_mul_pd(d,d);
868             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
869
870             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
871
872             /* Evaluate switch function */
873             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw + v*dsw)/r = fscal*sw - v*dsw/r */
874             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
875             vvdw             = _mm256_mul_pd(vvdw,sw);
876             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
877
878             /* Update potential sum for this i atom from the interaction with this j atom. */
879             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
880             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
881             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
882
883             fscal            = fvdw;
884
885             fscal            = _mm256_and_pd(fscal,cutoff_mask);
886
887             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
888
889             /* Calculate temporary vectorial force */
890             tx               = _mm256_mul_pd(fscal,dx00);
891             ty               = _mm256_mul_pd(fscal,dy00);
892             tz               = _mm256_mul_pd(fscal,dz00);
893
894             /* Update vectorial force */
895             fix0             = _mm256_add_pd(fix0,tx);
896             fiy0             = _mm256_add_pd(fiy0,ty);
897             fiz0             = _mm256_add_pd(fiz0,tz);
898
899             fjx0             = _mm256_add_pd(fjx0,tx);
900             fjy0             = _mm256_add_pd(fjy0,ty);
901             fjz0             = _mm256_add_pd(fjz0,tz);
902
903             }
904
905             /**************************
906              * CALCULATE INTERACTIONS *
907              **************************/
908
909             if (gmx_mm256_any_lt(rsq11,rcutoff2))
910             {
911
912             /* REACTION-FIELD ELECTROSTATICS */
913             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
914             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
915
916             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
917
918             /* Update potential sum for this i atom from the interaction with this j atom. */
919             velec            = _mm256_and_pd(velec,cutoff_mask);
920             velec            = _mm256_andnot_pd(dummy_mask,velec);
921             velecsum         = _mm256_add_pd(velecsum,velec);
922
923             fscal            = felec;
924
925             fscal            = _mm256_and_pd(fscal,cutoff_mask);
926
927             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
928
929             /* Calculate temporary vectorial force */
930             tx               = _mm256_mul_pd(fscal,dx11);
931             ty               = _mm256_mul_pd(fscal,dy11);
932             tz               = _mm256_mul_pd(fscal,dz11);
933
934             /* Update vectorial force */
935             fix1             = _mm256_add_pd(fix1,tx);
936             fiy1             = _mm256_add_pd(fiy1,ty);
937             fiz1             = _mm256_add_pd(fiz1,tz);
938
939             fjx1             = _mm256_add_pd(fjx1,tx);
940             fjy1             = _mm256_add_pd(fjy1,ty);
941             fjz1             = _mm256_add_pd(fjz1,tz);
942
943             }
944
945             /**************************
946              * CALCULATE INTERACTIONS *
947              **************************/
948
949             if (gmx_mm256_any_lt(rsq12,rcutoff2))
950             {
951
952             /* REACTION-FIELD ELECTROSTATICS */
953             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
954             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
955
956             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
957
958             /* Update potential sum for this i atom from the interaction with this j atom. */
959             velec            = _mm256_and_pd(velec,cutoff_mask);
960             velec            = _mm256_andnot_pd(dummy_mask,velec);
961             velecsum         = _mm256_add_pd(velecsum,velec);
962
963             fscal            = felec;
964
965             fscal            = _mm256_and_pd(fscal,cutoff_mask);
966
967             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
968
969             /* Calculate temporary vectorial force */
970             tx               = _mm256_mul_pd(fscal,dx12);
971             ty               = _mm256_mul_pd(fscal,dy12);
972             tz               = _mm256_mul_pd(fscal,dz12);
973
974             /* Update vectorial force */
975             fix1             = _mm256_add_pd(fix1,tx);
976             fiy1             = _mm256_add_pd(fiy1,ty);
977             fiz1             = _mm256_add_pd(fiz1,tz);
978
979             fjx2             = _mm256_add_pd(fjx2,tx);
980             fjy2             = _mm256_add_pd(fjy2,ty);
981             fjz2             = _mm256_add_pd(fjz2,tz);
982
983             }
984
985             /**************************
986              * CALCULATE INTERACTIONS *
987              **************************/
988
989             if (gmx_mm256_any_lt(rsq13,rcutoff2))
990             {
991
992             /* REACTION-FIELD ELECTROSTATICS */
993             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
994             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
995
996             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
997
998             /* Update potential sum for this i atom from the interaction with this j atom. */
999             velec            = _mm256_and_pd(velec,cutoff_mask);
1000             velec            = _mm256_andnot_pd(dummy_mask,velec);
1001             velecsum         = _mm256_add_pd(velecsum,velec);
1002
1003             fscal            = felec;
1004
1005             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1006
1007             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1008
1009             /* Calculate temporary vectorial force */
1010             tx               = _mm256_mul_pd(fscal,dx13);
1011             ty               = _mm256_mul_pd(fscal,dy13);
1012             tz               = _mm256_mul_pd(fscal,dz13);
1013
1014             /* Update vectorial force */
1015             fix1             = _mm256_add_pd(fix1,tx);
1016             fiy1             = _mm256_add_pd(fiy1,ty);
1017             fiz1             = _mm256_add_pd(fiz1,tz);
1018
1019             fjx3             = _mm256_add_pd(fjx3,tx);
1020             fjy3             = _mm256_add_pd(fjy3,ty);
1021             fjz3             = _mm256_add_pd(fjz3,tz);
1022
1023             }
1024
1025             /**************************
1026              * CALCULATE INTERACTIONS *
1027              **************************/
1028
1029             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1030             {
1031
1032             /* REACTION-FIELD ELECTROSTATICS */
1033             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1034             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1035
1036             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1037
1038             /* Update potential sum for this i atom from the interaction with this j atom. */
1039             velec            = _mm256_and_pd(velec,cutoff_mask);
1040             velec            = _mm256_andnot_pd(dummy_mask,velec);
1041             velecsum         = _mm256_add_pd(velecsum,velec);
1042
1043             fscal            = felec;
1044
1045             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1046
1047             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1048
1049             /* Calculate temporary vectorial force */
1050             tx               = _mm256_mul_pd(fscal,dx21);
1051             ty               = _mm256_mul_pd(fscal,dy21);
1052             tz               = _mm256_mul_pd(fscal,dz21);
1053
1054             /* Update vectorial force */
1055             fix2             = _mm256_add_pd(fix2,tx);
1056             fiy2             = _mm256_add_pd(fiy2,ty);
1057             fiz2             = _mm256_add_pd(fiz2,tz);
1058
1059             fjx1             = _mm256_add_pd(fjx1,tx);
1060             fjy1             = _mm256_add_pd(fjy1,ty);
1061             fjz1             = _mm256_add_pd(fjz1,tz);
1062
1063             }
1064
1065             /**************************
1066              * CALCULATE INTERACTIONS *
1067              **************************/
1068
1069             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1070             {
1071
1072             /* REACTION-FIELD ELECTROSTATICS */
1073             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1074             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1075
1076             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1077
1078             /* Update potential sum for this i atom from the interaction with this j atom. */
1079             velec            = _mm256_and_pd(velec,cutoff_mask);
1080             velec            = _mm256_andnot_pd(dummy_mask,velec);
1081             velecsum         = _mm256_add_pd(velecsum,velec);
1082
1083             fscal            = felec;
1084
1085             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1086
1087             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1088
1089             /* Calculate temporary vectorial force */
1090             tx               = _mm256_mul_pd(fscal,dx22);
1091             ty               = _mm256_mul_pd(fscal,dy22);
1092             tz               = _mm256_mul_pd(fscal,dz22);
1093
1094             /* Update vectorial force */
1095             fix2             = _mm256_add_pd(fix2,tx);
1096             fiy2             = _mm256_add_pd(fiy2,ty);
1097             fiz2             = _mm256_add_pd(fiz2,tz);
1098
1099             fjx2             = _mm256_add_pd(fjx2,tx);
1100             fjy2             = _mm256_add_pd(fjy2,ty);
1101             fjz2             = _mm256_add_pd(fjz2,tz);
1102
1103             }
1104
1105             /**************************
1106              * CALCULATE INTERACTIONS *
1107              **************************/
1108
1109             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1110             {
1111
1112             /* REACTION-FIELD ELECTROSTATICS */
1113             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
1114             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1115
1116             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1117
1118             /* Update potential sum for this i atom from the interaction with this j atom. */
1119             velec            = _mm256_and_pd(velec,cutoff_mask);
1120             velec            = _mm256_andnot_pd(dummy_mask,velec);
1121             velecsum         = _mm256_add_pd(velecsum,velec);
1122
1123             fscal            = felec;
1124
1125             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1126
1127             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1128
1129             /* Calculate temporary vectorial force */
1130             tx               = _mm256_mul_pd(fscal,dx23);
1131             ty               = _mm256_mul_pd(fscal,dy23);
1132             tz               = _mm256_mul_pd(fscal,dz23);
1133
1134             /* Update vectorial force */
1135             fix2             = _mm256_add_pd(fix2,tx);
1136             fiy2             = _mm256_add_pd(fiy2,ty);
1137             fiz2             = _mm256_add_pd(fiz2,tz);
1138
1139             fjx3             = _mm256_add_pd(fjx3,tx);
1140             fjy3             = _mm256_add_pd(fjy3,ty);
1141             fjz3             = _mm256_add_pd(fjz3,tz);
1142
1143             }
1144
1145             /**************************
1146              * CALCULATE INTERACTIONS *
1147              **************************/
1148
1149             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1150             {
1151
1152             /* REACTION-FIELD ELECTROSTATICS */
1153             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
1154             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1155
1156             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1157
1158             /* Update potential sum for this i atom from the interaction with this j atom. */
1159             velec            = _mm256_and_pd(velec,cutoff_mask);
1160             velec            = _mm256_andnot_pd(dummy_mask,velec);
1161             velecsum         = _mm256_add_pd(velecsum,velec);
1162
1163             fscal            = felec;
1164
1165             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1166
1167             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1168
1169             /* Calculate temporary vectorial force */
1170             tx               = _mm256_mul_pd(fscal,dx31);
1171             ty               = _mm256_mul_pd(fscal,dy31);
1172             tz               = _mm256_mul_pd(fscal,dz31);
1173
1174             /* Update vectorial force */
1175             fix3             = _mm256_add_pd(fix3,tx);
1176             fiy3             = _mm256_add_pd(fiy3,ty);
1177             fiz3             = _mm256_add_pd(fiz3,tz);
1178
1179             fjx1             = _mm256_add_pd(fjx1,tx);
1180             fjy1             = _mm256_add_pd(fjy1,ty);
1181             fjz1             = _mm256_add_pd(fjz1,tz);
1182
1183             }
1184
1185             /**************************
1186              * CALCULATE INTERACTIONS *
1187              **************************/
1188
1189             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1190             {
1191
1192             /* REACTION-FIELD ELECTROSTATICS */
1193             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
1194             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1195
1196             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1197
1198             /* Update potential sum for this i atom from the interaction with this j atom. */
1199             velec            = _mm256_and_pd(velec,cutoff_mask);
1200             velec            = _mm256_andnot_pd(dummy_mask,velec);
1201             velecsum         = _mm256_add_pd(velecsum,velec);
1202
1203             fscal            = felec;
1204
1205             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1206
1207             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1208
1209             /* Calculate temporary vectorial force */
1210             tx               = _mm256_mul_pd(fscal,dx32);
1211             ty               = _mm256_mul_pd(fscal,dy32);
1212             tz               = _mm256_mul_pd(fscal,dz32);
1213
1214             /* Update vectorial force */
1215             fix3             = _mm256_add_pd(fix3,tx);
1216             fiy3             = _mm256_add_pd(fiy3,ty);
1217             fiz3             = _mm256_add_pd(fiz3,tz);
1218
1219             fjx2             = _mm256_add_pd(fjx2,tx);
1220             fjy2             = _mm256_add_pd(fjy2,ty);
1221             fjz2             = _mm256_add_pd(fjz2,tz);
1222
1223             }
1224
1225             /**************************
1226              * CALCULATE INTERACTIONS *
1227              **************************/
1228
1229             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1230             {
1231
1232             /* REACTION-FIELD ELECTROSTATICS */
1233             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
1234             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1235
1236             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1237
1238             /* Update potential sum for this i atom from the interaction with this j atom. */
1239             velec            = _mm256_and_pd(velec,cutoff_mask);
1240             velec            = _mm256_andnot_pd(dummy_mask,velec);
1241             velecsum         = _mm256_add_pd(velecsum,velec);
1242
1243             fscal            = felec;
1244
1245             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1246
1247             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1248
1249             /* Calculate temporary vectorial force */
1250             tx               = _mm256_mul_pd(fscal,dx33);
1251             ty               = _mm256_mul_pd(fscal,dy33);
1252             tz               = _mm256_mul_pd(fscal,dz33);
1253
1254             /* Update vectorial force */
1255             fix3             = _mm256_add_pd(fix3,tx);
1256             fiy3             = _mm256_add_pd(fiy3,ty);
1257             fiz3             = _mm256_add_pd(fiz3,tz);
1258
1259             fjx3             = _mm256_add_pd(fjx3,tx);
1260             fjy3             = _mm256_add_pd(fjy3,ty);
1261             fjz3             = _mm256_add_pd(fjz3,tz);
1262
1263             }
1264
1265             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1266             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1267             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1268             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1269
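            /* Transpose the accumulated j-force registers back to per-atom layout
             * and subtract them from memory (Newton's third law). Forces belonging
             * to padding entries are routed to the local scratch buffer above, so
             * no real atom is touched.
             */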
1270             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1271                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1272                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1273
1274             /* Inner loop uses 387 flops */
1275         }
1276
1277         /* End of innermost loop */
1278
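        /* Reduce the four SIMD lanes of the i-atom force accumulators, add the
         * result to f[] for this water, and add the same sum to the shift-force
         * array used for the virial.
         */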
1279         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1280                                                  f+i_coord_offset,fshift+i_shift_offset);
1281
1282         ggid                        = gid[iidx];
1283         /* Update potential energies */
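        /* Each call sums the four lanes of the SIMD accumulator and adds the
         * result to the energy-group entry selected by ggid.
         */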
1284         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1285         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1286
1287         /* Increment number of inner iterations */
1288         inneriter                  += j_index_end - j_index_start;
1289
1290         /* Outer loop uses 26 flops */
1291     }
1292
1293     /* Increment number of outer iterations */
1294     outeriter        += nri;
1295
1296     /* Update outer/inner flops */
1297
1298     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*387);
1299 }
1300 /*
1301  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
1302  * Electrostatics interaction: ReactionField
1303  * VdW interaction:            LennardJones
1304  * Geometry:                   Water4-Water4
1305  * Calculate force/pot:        Force
1306  */
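/* Force-only variant of the kernel above: it evaluates the same interactions,
 * but skips the potential sums and energy-group updates, which is reflected in
 * the lower flop counts reported to nrnb.
 */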
1307 void
1308 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
1309                     (t_nblist                    * gmx_restrict       nlist,
1310                      rvec                        * gmx_restrict          xx,
1311                      rvec                        * gmx_restrict          ff,
1312                      t_forcerec                  * gmx_restrict          fr,
1313                      t_mdatoms                   * gmx_restrict     mdatoms,
1314                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1315                      t_nrnb                      * gmx_restrict        nrnb)
1316 {
1317     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1318      * just 0 for non-waters.
1319      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1320      * jnr indices corresponding to data put in the four positions in the SIMD register.
1321      */
1322     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1323     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1324     int              jnrA,jnrB,jnrC,jnrD;
1325     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1326     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1327     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1328     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1329     real             rcutoff_scalar;
1330     real             *shiftvec,*fshift,*x,*f;
1331     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1332     real             scratch[4*DIM];
1333     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1334     real *           vdwioffsetptr0;
1335     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1336     real *           vdwioffsetptr1;
1337     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1338     real *           vdwioffsetptr2;
1339     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1340     real *           vdwioffsetptr3;
1341     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1342     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1343     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1344     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1345     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1346     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1347     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1348     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1349     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1350     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1351     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1352     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1353     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1354     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1355     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1356     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1357     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1358     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1359     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1360     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1361     real             *charge;
1362     int              nvdwtype;
1363     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1364     int              *vdwtype;
1365     real             *vdwparam;
1366     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1367     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1368     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1369     real             rswitch_scalar,d_scalar;
1370     __m256d          dummy_mask,cutoff_mask;
1371     __m128           tmpmask0,tmpmask1;
1372     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1373     __m256d          one     = _mm256_set1_pd(1.0);
1374     __m256d          two     = _mm256_set1_pd(2.0);
1375     x                = xx[0];
1376     f                = ff[0];
1377
1378     nri              = nlist->nri;
1379     iinr             = nlist->iinr;
1380     jindex           = nlist->jindex;
1381     jjnr             = nlist->jjnr;
1382     shiftidx         = nlist->shift;
1383     gid              = nlist->gid;
1384     shiftvec         = fr->shift_vec[0];
1385     fshift           = fr->fshift[0];
1386     facel            = _mm256_set1_pd(fr->epsfac);
1387     charge           = mdatoms->chargeA;
1388     krf              = _mm256_set1_pd(fr->ic->k_rf);
1389     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
1390     crf              = _mm256_set1_pd(fr->ic->c_rf);
1391     nvdwtype         = fr->ntype;
1392     vdwparam         = fr->nbfp;
1393     vdwtype          = mdatoms->typeA;
1394
1395     /* Setup water-specific parameters */
1396     inr              = nlist->iinr[0];
1397     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1398     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1399     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1400     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1401
1402     jq1              = _mm256_set1_pd(charge[inr+1]);
1403     jq2              = _mm256_set1_pd(charge[inr+2]);
1404     jq3              = _mm256_set1_pd(charge[inr+3]);
1405     vdwjidx0A        = 2*vdwtype[inr+0];
1406     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1407     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1408     qq11             = _mm256_mul_pd(iq1,jq1);
1409     qq12             = _mm256_mul_pd(iq1,jq2);
1410     qq13             = _mm256_mul_pd(iq1,jq3);
1411     qq21             = _mm256_mul_pd(iq2,jq1);
1412     qq22             = _mm256_mul_pd(iq2,jq2);
1413     qq23             = _mm256_mul_pd(iq2,jq3);
1414     qq31             = _mm256_mul_pd(iq3,jq1);
1415     qq32             = _mm256_mul_pd(iq3,jq2);
1416     qq33             = _mm256_mul_pd(iq3,jq3);
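    /* All i and j molecules are identical 4-site waters, so the charge products
     * and the single Lennard-Jones pair (site 0 with site 0) are set up once
     * here instead of inside the loops.
     */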
1417
1418     /* With explicit cutoffs the value must be identical for electrostatics and VdW, so we use the electrostatics cutoff as the (arbitrary) choice */
1419     rcutoff_scalar   = fr->rcoulomb;
1420     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1421     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1422
1423     rswitch_scalar   = fr->rvdw_switch;
1424     rswitch          = _mm256_set1_pd(rswitch_scalar);
1425     /* Setup switch parameters */
1426     d_scalar         = rcutoff_scalar-rswitch_scalar;
1427     d                = _mm256_set1_pd(d_scalar);
1428     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1429     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1430     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1431     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1432     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1433     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
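    /* With d = r - rswitch and D = rcutoff - rswitch, the switch function
     * sw(d) = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5 runs smoothly from 1 at
     * rswitch to 0 at rcutoff, with vanishing first and second derivatives at
     * both ends; swF2..swF4 are the coefficients of its derivative
     * dsw = swF2*d^2 + swF3*d^3 + swF4*d^4 evaluated in the inner loop.
     */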
1434
1435     /* Avoid compiler warnings about possibly uninitialized variables */
1436     jnrA = jnrB = jnrC = jnrD = 0;
1437     j_coord_offsetA = 0;
1438     j_coord_offsetB = 0;
1439     j_coord_offsetC = 0;
1440     j_coord_offsetD = 0;
1441
1442     outeriter        = 0;
1443     inneriter        = 0;
1444
1445     for(iidx=0;iidx<4*DIM;iidx++)
1446     {
1447         scratch[iidx] = 0.0;
1448     }
1449
1450     /* Start outer loop over neighborlists */
1451     for(iidx=0; iidx<nri; iidx++)
1452     {
1453         /* Load shift vector for this list */
1454         i_shift_offset   = DIM*shiftidx[iidx];
1455
1456         /* Load limits for loop over neighbors */
1457         j_index_start    = jindex[iidx];
1458         j_index_end      = jindex[iidx+1];
1459
1460         /* Get outer coordinate index */
1461         inr              = iinr[iidx];
1462         i_coord_offset   = DIM*inr;
1463
1464         /* Load i particle coords and add shift vector */
1465         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1466                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
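        /* The helper above loads the four sites of the i water, adds the
         * periodic shift vector, and broadcasts each coordinate component
         * across all four SIMD lanes.
         */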
1467
1468         fix0             = _mm256_setzero_pd();
1469         fiy0             = _mm256_setzero_pd();
1470         fiz0             = _mm256_setzero_pd();
1471         fix1             = _mm256_setzero_pd();
1472         fiy1             = _mm256_setzero_pd();
1473         fiz1             = _mm256_setzero_pd();
1474         fix2             = _mm256_setzero_pd();
1475         fiy2             = _mm256_setzero_pd();
1476         fiz2             = _mm256_setzero_pd();
1477         fix3             = _mm256_setzero_pd();
1478         fiy3             = _mm256_setzero_pd();
1479         fiz3             = _mm256_setzero_pd();
1480
1481         /* Start inner kernel loop */
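        /* The main loop consumes full quadruplets of real j entries; the
         * neighbor list is padded with negative indices, so any final partial
         * quadruplet falls through to the masked epilogue below.
         */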
1482         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1483         {
1484
1485             /* Get j neighbor index, and coordinate index */
1486             jnrA             = jjnr[jidx];
1487             jnrB             = jjnr[jidx+1];
1488             jnrC             = jjnr[jidx+2];
1489             jnrD             = jjnr[jidx+3];
1490             j_coord_offsetA  = DIM*jnrA;
1491             j_coord_offsetB  = DIM*jnrB;
1492             j_coord_offsetC  = DIM*jnrC;
1493             j_coord_offsetD  = DIM*jnrD;
1494
1495             /* load j atom coordinates */
1496             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1497                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1498                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1499                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
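            /* The gather above transposes the four j waters so that each
             * __m256d holds one coordinate component of the same site for all
             * four j molecules.
             */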
1500
1501             /* Calculate displacement vector */
1502             dx00             = _mm256_sub_pd(ix0,jx0);
1503             dy00             = _mm256_sub_pd(iy0,jy0);
1504             dz00             = _mm256_sub_pd(iz0,jz0);
1505             dx11             = _mm256_sub_pd(ix1,jx1);
1506             dy11             = _mm256_sub_pd(iy1,jy1);
1507             dz11             = _mm256_sub_pd(iz1,jz1);
1508             dx12             = _mm256_sub_pd(ix1,jx2);
1509             dy12             = _mm256_sub_pd(iy1,jy2);
1510             dz12             = _mm256_sub_pd(iz1,jz2);
1511             dx13             = _mm256_sub_pd(ix1,jx3);
1512             dy13             = _mm256_sub_pd(iy1,jy3);
1513             dz13             = _mm256_sub_pd(iz1,jz3);
1514             dx21             = _mm256_sub_pd(ix2,jx1);
1515             dy21             = _mm256_sub_pd(iy2,jy1);
1516             dz21             = _mm256_sub_pd(iz2,jz1);
1517             dx22             = _mm256_sub_pd(ix2,jx2);
1518             dy22             = _mm256_sub_pd(iy2,jy2);
1519             dz22             = _mm256_sub_pd(iz2,jz2);
1520             dx23             = _mm256_sub_pd(ix2,jx3);
1521             dy23             = _mm256_sub_pd(iy2,jy3);
1522             dz23             = _mm256_sub_pd(iz2,jz3);
1523             dx31             = _mm256_sub_pd(ix3,jx1);
1524             dy31             = _mm256_sub_pd(iy3,jy1);
1525             dz31             = _mm256_sub_pd(iz3,jz1);
1526             dx32             = _mm256_sub_pd(ix3,jx2);
1527             dy32             = _mm256_sub_pd(iy3,jy2);
1528             dz32             = _mm256_sub_pd(iz3,jz2);
1529             dx33             = _mm256_sub_pd(ix3,jx3);
1530             dy33             = _mm256_sub_pd(iy3,jy3);
1531             dz33             = _mm256_sub_pd(iz3,jz3);
1532
1533             /* Calculate squared distance and things based on it */
1534             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1535             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1536             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1537             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1538             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1539             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1540             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1541             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1542             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1543             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1544
1545             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1546             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1547             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1548             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1549             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1550             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1551             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1552             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1553             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1554             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1555
1556             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1557             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1558             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1559             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1560             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1561             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1562             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1563             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1564             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1565             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1566
1567             fjx0             = _mm256_setzero_pd();
1568             fjy0             = _mm256_setzero_pd();
1569             fjz0             = _mm256_setzero_pd();
1570             fjx1             = _mm256_setzero_pd();
1571             fjy1             = _mm256_setzero_pd();
1572             fjz1             = _mm256_setzero_pd();
1573             fjx2             = _mm256_setzero_pd();
1574             fjy2             = _mm256_setzero_pd();
1575             fjz2             = _mm256_setzero_pd();
1576             fjx3             = _mm256_setzero_pd();
1577             fjy3             = _mm256_setzero_pd();
1578             fjz3             = _mm256_setzero_pd();
1579
1580             /**************************
1581              * CALCULATE INTERACTIONS *
1582              **************************/
1583
1584             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1585             {
1586
1587             r00              = _mm256_mul_pd(rsq00,rinv00);
1588
1589             /* LENNARD-JONES DISPERSION/REPULSION */
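            /* With rinvsix = 1/r^6: vvdw6 = c6_00/r^6, vvdw12 = c12_00/r^12,
             * the unswitched potential is vvdw = vvdw12/12 - vvdw6/6 and the
             * scaled force is fvdw = (vvdw12 - vvdw6)/r^2.
             */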
1590
1591             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1592             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1593             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1594             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1595             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1596
1597             d                = _mm256_sub_pd(r00,rswitch);
1598             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1599             d2               = _mm256_mul_pd(d,d);
1600             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1601
1602             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1603
1604             /* Evaluate switch function */
1605             /* Switched scaled force: f_sw/r = -(V*sw)'/r = -(V'*sw + V*dsw)/r = (f/r)*sw - V*dsw/r */
1606             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1607             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1608
1609             fscal            = fvdw;
1610
1611             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1612
1613             /* Calculate temporary vectorial force */
1614             tx               = _mm256_mul_pd(fscal,dx00);
1615             ty               = _mm256_mul_pd(fscal,dy00);
1616             tz               = _mm256_mul_pd(fscal,dz00);
1617
1618             /* Update vectorial force */
1619             fix0             = _mm256_add_pd(fix0,tx);
1620             fiy0             = _mm256_add_pd(fiy0,ty);
1621             fiz0             = _mm256_add_pd(fiz0,tz);
1622
1623             fjx0             = _mm256_add_pd(fjx0,tx);
1624             fjy0             = _mm256_add_pd(fjy0,ty);
1625             fjz0             = _mm256_add_pd(fjz0,tz);
1626
1627             }
1628
1629             /**************************
1630              * CALCULATE INTERACTIONS *
1631              **************************/
1632
1633             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1634             {
1635
1636             /* REACTION-FIELD ELECTROSTATICS */
1637             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1638
1639             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1640
1641             fscal            = felec;
1642
1643             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1644
1645             /* Calculate temporary vectorial force */
1646             tx               = _mm256_mul_pd(fscal,dx11);
1647             ty               = _mm256_mul_pd(fscal,dy11);
1648             tz               = _mm256_mul_pd(fscal,dz11);
1649
1650             /* Update vectorial force */
1651             fix1             = _mm256_add_pd(fix1,tx);
1652             fiy1             = _mm256_add_pd(fiy1,ty);
1653             fiz1             = _mm256_add_pd(fiz1,tz);
1654
1655             fjx1             = _mm256_add_pd(fjx1,tx);
1656             fjy1             = _mm256_add_pd(fjy1,ty);
1657             fjz1             = _mm256_add_pd(fjz1,tz);
1658
1659             }
1660
1661             /**************************
1662              * CALCULATE INTERACTIONS *
1663              **************************/
1664
1665             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1666             {
1667
1668             /* REACTION-FIELD ELECTROSTATICS */
1669             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1670
1671             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1672
1673             fscal            = felec;
1674
1675             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1676
1677             /* Calculate temporary vectorial force */
1678             tx               = _mm256_mul_pd(fscal,dx12);
1679             ty               = _mm256_mul_pd(fscal,dy12);
1680             tz               = _mm256_mul_pd(fscal,dz12);
1681
1682             /* Update vectorial force */
1683             fix1             = _mm256_add_pd(fix1,tx);
1684             fiy1             = _mm256_add_pd(fiy1,ty);
1685             fiz1             = _mm256_add_pd(fiz1,tz);
1686
1687             fjx2             = _mm256_add_pd(fjx2,tx);
1688             fjy2             = _mm256_add_pd(fjy2,ty);
1689             fjz2             = _mm256_add_pd(fjz2,tz);
1690
1691             }
1692
1693             /**************************
1694              * CALCULATE INTERACTIONS *
1695              **************************/
1696
1697             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1698             {
1699
1700             /* REACTION-FIELD ELECTROSTATICS */
1701             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1702
1703             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1704
1705             fscal            = felec;
1706
1707             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1708
1709             /* Calculate temporary vectorial force */
1710             tx               = _mm256_mul_pd(fscal,dx13);
1711             ty               = _mm256_mul_pd(fscal,dy13);
1712             tz               = _mm256_mul_pd(fscal,dz13);
1713
1714             /* Update vectorial force */
1715             fix1             = _mm256_add_pd(fix1,tx);
1716             fiy1             = _mm256_add_pd(fiy1,ty);
1717             fiz1             = _mm256_add_pd(fiz1,tz);
1718
1719             fjx3             = _mm256_add_pd(fjx3,tx);
1720             fjy3             = _mm256_add_pd(fjy3,ty);
1721             fjz3             = _mm256_add_pd(fjz3,tz);
1722
1723             }
1724
1725             /**************************
1726              * CALCULATE INTERACTIONS *
1727              **************************/
1728
1729             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1730             {
1731
1732             /* REACTION-FIELD ELECTROSTATICS */
1733             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1734
1735             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1736
1737             fscal            = felec;
1738
1739             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1740
1741             /* Calculate temporary vectorial force */
1742             tx               = _mm256_mul_pd(fscal,dx21);
1743             ty               = _mm256_mul_pd(fscal,dy21);
1744             tz               = _mm256_mul_pd(fscal,dz21);
1745
1746             /* Update vectorial force */
1747             fix2             = _mm256_add_pd(fix2,tx);
1748             fiy2             = _mm256_add_pd(fiy2,ty);
1749             fiz2             = _mm256_add_pd(fiz2,tz);
1750
1751             fjx1             = _mm256_add_pd(fjx1,tx);
1752             fjy1             = _mm256_add_pd(fjy1,ty);
1753             fjz1             = _mm256_add_pd(fjz1,tz);
1754
1755             }
1756
1757             /**************************
1758              * CALCULATE INTERACTIONS *
1759              **************************/
1760
1761             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1762             {
1763
1764             /* REACTION-FIELD ELECTROSTATICS */
1765             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1766
1767             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1768
1769             fscal            = felec;
1770
1771             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1772
1773             /* Calculate temporary vectorial force */
1774             tx               = _mm256_mul_pd(fscal,dx22);
1775             ty               = _mm256_mul_pd(fscal,dy22);
1776             tz               = _mm256_mul_pd(fscal,dz22);
1777
1778             /* Update vectorial force */
1779             fix2             = _mm256_add_pd(fix2,tx);
1780             fiy2             = _mm256_add_pd(fiy2,ty);
1781             fiz2             = _mm256_add_pd(fiz2,tz);
1782
1783             fjx2             = _mm256_add_pd(fjx2,tx);
1784             fjy2             = _mm256_add_pd(fjy2,ty);
1785             fjz2             = _mm256_add_pd(fjz2,tz);
1786
1787             }
1788
1789             /**************************
1790              * CALCULATE INTERACTIONS *
1791              **************************/
1792
1793             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1794             {
1795
1796             /* REACTION-FIELD ELECTROSTATICS */
1797             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1798
1799             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1800
1801             fscal            = felec;
1802
1803             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1804
1805             /* Calculate temporary vectorial force */
1806             tx               = _mm256_mul_pd(fscal,dx23);
1807             ty               = _mm256_mul_pd(fscal,dy23);
1808             tz               = _mm256_mul_pd(fscal,dz23);
1809
1810             /* Update vectorial force */
1811             fix2             = _mm256_add_pd(fix2,tx);
1812             fiy2             = _mm256_add_pd(fiy2,ty);
1813             fiz2             = _mm256_add_pd(fiz2,tz);
1814
1815             fjx3             = _mm256_add_pd(fjx3,tx);
1816             fjy3             = _mm256_add_pd(fjy3,ty);
1817             fjz3             = _mm256_add_pd(fjz3,tz);
1818
1819             }
1820
1821             /**************************
1822              * CALCULATE INTERACTIONS *
1823              **************************/
1824
1825             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1826             {
1827
1828             /* REACTION-FIELD ELECTROSTATICS */
1829             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1830
1831             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1832
1833             fscal            = felec;
1834
1835             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1836
1837             /* Calculate temporary vectorial force */
1838             tx               = _mm256_mul_pd(fscal,dx31);
1839             ty               = _mm256_mul_pd(fscal,dy31);
1840             tz               = _mm256_mul_pd(fscal,dz31);
1841
1842             /* Update vectorial force */
1843             fix3             = _mm256_add_pd(fix3,tx);
1844             fiy3             = _mm256_add_pd(fiy3,ty);
1845             fiz3             = _mm256_add_pd(fiz3,tz);
1846
1847             fjx1             = _mm256_add_pd(fjx1,tx);
1848             fjy1             = _mm256_add_pd(fjy1,ty);
1849             fjz1             = _mm256_add_pd(fjz1,tz);
1850
1851             }
1852
1853             /**************************
1854              * CALCULATE INTERACTIONS *
1855              **************************/
1856
1857             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1858             {
1859
1860             /* REACTION-FIELD ELECTROSTATICS */
1861             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1862
1863             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1864
1865             fscal            = felec;
1866
1867             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1868
1869             /* Calculate temporary vectorial force */
1870             tx               = _mm256_mul_pd(fscal,dx32);
1871             ty               = _mm256_mul_pd(fscal,dy32);
1872             tz               = _mm256_mul_pd(fscal,dz32);
1873
1874             /* Update vectorial force */
1875             fix3             = _mm256_add_pd(fix3,tx);
1876             fiy3             = _mm256_add_pd(fiy3,ty);
1877             fiz3             = _mm256_add_pd(fiz3,tz);
1878
1879             fjx2             = _mm256_add_pd(fjx2,tx);
1880             fjy2             = _mm256_add_pd(fjy2,ty);
1881             fjz2             = _mm256_add_pd(fjz2,tz);
1882
1883             }
1884
1885             /**************************
1886              * CALCULATE INTERACTIONS *
1887              **************************/
1888
1889             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1890             {
1891
1892             /* REACTION-FIELD ELECTROSTATICS */
1893             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1894
1895             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1896
1897             fscal            = felec;
1898
1899             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1900
1901             /* Calculate temporary vectorial force */
1902             tx               = _mm256_mul_pd(fscal,dx33);
1903             ty               = _mm256_mul_pd(fscal,dy33);
1904             tz               = _mm256_mul_pd(fscal,dz33);
1905
1906             /* Update vectorial force */
1907             fix3             = _mm256_add_pd(fix3,tx);
1908             fiy3             = _mm256_add_pd(fiy3,ty);
1909             fiz3             = _mm256_add_pd(fiz3,tz);
1910
1911             fjx3             = _mm256_add_pd(fjx3,tx);
1912             fjy3             = _mm256_add_pd(fjy3,ty);
1913             fjz3             = _mm256_add_pd(fjz3,tz);
1914
1915             }
1916
1917             fjptrA             = f+j_coord_offsetA;
1918             fjptrB             = f+j_coord_offsetB;
1919             fjptrC             = f+j_coord_offsetC;
1920             fjptrD             = f+j_coord_offsetD;
1921
1922             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1923                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1924                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1925
1926             /* Inner loop uses 329 flops */
1927         }
1928
1929         if(jidx<j_index_end)
1930         {
1931
1932             /* Get j neighbor index, and coordinate index */
1933             jnrlistA         = jjnr[jidx];
1934             jnrlistB         = jjnr[jidx+1];
1935             jnrlistC         = jjnr[jidx+2];
1936             jnrlistD         = jjnr[jidx+3];
1937             /* The j indices of non-real (padding) atoms are negative.
1938              * The resulting mask is all ones for dummy entries and all zeros for real ones,
1939              * so it is applied as val = _mm256_andnot_pd(dummy_mask,val) to clear dummy entries.
1940              */
1941             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1942
1943             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1944             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1945             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
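            /* The 32-bit compare flags negative (padding) indices; the two
             * permutes duplicate each 32-bit flag into both halves of a 64-bit
             * lane, giving a full-width mask that can be applied to doubles.
             */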
1946
1947             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1948             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1949             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1950             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1951             j_coord_offsetA  = DIM*jnrA;
1952             j_coord_offsetB  = DIM*jnrB;
1953             j_coord_offsetC  = DIM*jnrC;
1954             j_coord_offsetD  = DIM*jnrD;
1955
1956             /* load j atom coordinates */
1957             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1958                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1959                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1960                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1961
1962             /* Calculate displacement vector */
1963             dx00             = _mm256_sub_pd(ix0,jx0);
1964             dy00             = _mm256_sub_pd(iy0,jy0);
1965             dz00             = _mm256_sub_pd(iz0,jz0);
1966             dx11             = _mm256_sub_pd(ix1,jx1);
1967             dy11             = _mm256_sub_pd(iy1,jy1);
1968             dz11             = _mm256_sub_pd(iz1,jz1);
1969             dx12             = _mm256_sub_pd(ix1,jx2);
1970             dy12             = _mm256_sub_pd(iy1,jy2);
1971             dz12             = _mm256_sub_pd(iz1,jz2);
1972             dx13             = _mm256_sub_pd(ix1,jx3);
1973             dy13             = _mm256_sub_pd(iy1,jy3);
1974             dz13             = _mm256_sub_pd(iz1,jz3);
1975             dx21             = _mm256_sub_pd(ix2,jx1);
1976             dy21             = _mm256_sub_pd(iy2,jy1);
1977             dz21             = _mm256_sub_pd(iz2,jz1);
1978             dx22             = _mm256_sub_pd(ix2,jx2);
1979             dy22             = _mm256_sub_pd(iy2,jy2);
1980             dz22             = _mm256_sub_pd(iz2,jz2);
1981             dx23             = _mm256_sub_pd(ix2,jx3);
1982             dy23             = _mm256_sub_pd(iy2,jy3);
1983             dz23             = _mm256_sub_pd(iz2,jz3);
1984             dx31             = _mm256_sub_pd(ix3,jx1);
1985             dy31             = _mm256_sub_pd(iy3,jy1);
1986             dz31             = _mm256_sub_pd(iz3,jz1);
1987             dx32             = _mm256_sub_pd(ix3,jx2);
1988             dy32             = _mm256_sub_pd(iy3,jy2);
1989             dz32             = _mm256_sub_pd(iz3,jz2);
1990             dx33             = _mm256_sub_pd(ix3,jx3);
1991             dy33             = _mm256_sub_pd(iy3,jy3);
1992             dz33             = _mm256_sub_pd(iz3,jz3);
1993
1994             /* Calculate squared distance and things based on it */
1995             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1996             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1997             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1998             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1999             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2000             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2001             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2002             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2003             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2004             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2005
2006             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
2007             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2008             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2009             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
2010             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2011             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2012             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
2013             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
2014             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
2015             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
2016
2017             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2018             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2019             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2020             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2021             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2022             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2023             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2024             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2025             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2026             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
2027
2028             fjx0             = _mm256_setzero_pd();
2029             fjy0             = _mm256_setzero_pd();
2030             fjz0             = _mm256_setzero_pd();
2031             fjx1             = _mm256_setzero_pd();
2032             fjy1             = _mm256_setzero_pd();
2033             fjz1             = _mm256_setzero_pd();
2034             fjx2             = _mm256_setzero_pd();
2035             fjy2             = _mm256_setzero_pd();
2036             fjz2             = _mm256_setzero_pd();
2037             fjx3             = _mm256_setzero_pd();
2038             fjy3             = _mm256_setzero_pd();
2039             fjz3             = _mm256_setzero_pd();
2040
2041             /**************************
2042              * CALCULATE INTERACTIONS *
2043              **************************/
2044
2045             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2046             {
2047
2048             r00              = _mm256_mul_pd(rsq00,rinv00);
2049             r00              = _mm256_andnot_pd(dummy_mask,r00);
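            /* Clearing r in the padding lanes keeps the switch polynomial below
             * well-behaved; their contributions are discarded anyway when fscal
             * is masked with dummy_mask.
             */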
2050
2051             /* LENNARD-JONES DISPERSION/REPULSION */
2052
2053             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2054             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
2055             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2056             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2057             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2058
2059             d                = _mm256_sub_pd(r00,rswitch);
2060             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2061             d2               = _mm256_mul_pd(d,d);
2062             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2063
2064             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2065
2066             /* Evaluate switch function */
2067             /* Switched scaled force: f_sw/r = -(V*sw)'/r = -(V'*sw + V*dsw)/r = (f/r)*sw - V*dsw/r */
2068             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2069             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2070
2071             fscal            = fvdw;
2072
2073             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2074
2075             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2076
2077             /* Calculate temporary vectorial force */
2078             tx               = _mm256_mul_pd(fscal,dx00);
2079             ty               = _mm256_mul_pd(fscal,dy00);
2080             tz               = _mm256_mul_pd(fscal,dz00);
2081
2082             /* Update vectorial force */
2083             fix0             = _mm256_add_pd(fix0,tx);
2084             fiy0             = _mm256_add_pd(fiy0,ty);
2085             fiz0             = _mm256_add_pd(fiz0,tz);
2086
2087             fjx0             = _mm256_add_pd(fjx0,tx);
2088             fjy0             = _mm256_add_pd(fjy0,ty);
2089             fjz0             = _mm256_add_pd(fjz0,tz);
2090
2091             }
2092
2093             /**************************
2094              * CALCULATE INTERACTIONS *
2095              **************************/
2096
2097             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2098             {
2099
2100             /* REACTION-FIELD ELECTROSTATICS */
2101             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2102
2103             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2104
2105             fscal            = felec;
2106
2107             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2108
2109             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2110
2111             /* Calculate temporary vectorial force */
2112             tx               = _mm256_mul_pd(fscal,dx11);
2113             ty               = _mm256_mul_pd(fscal,dy11);
2114             tz               = _mm256_mul_pd(fscal,dz11);
2115
2116             /* Update vectorial force */
2117             fix1             = _mm256_add_pd(fix1,tx);
2118             fiy1             = _mm256_add_pd(fiy1,ty);
2119             fiz1             = _mm256_add_pd(fiz1,tz);
2120
2121             fjx1             = _mm256_add_pd(fjx1,tx);
2122             fjy1             = _mm256_add_pd(fjy1,ty);
2123             fjz1             = _mm256_add_pd(fjz1,tz);
2124
2125             }
2126
2127             /**************************
2128              * CALCULATE INTERACTIONS *
2129              **************************/
2130
2131             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2132             {
2133
2134             /* REACTION-FIELD ELECTROSTATICS */
2135             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2136
2137             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2138
2139             fscal            = felec;
2140
2141             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2142
2143             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2144
2145             /* Calculate temporary vectorial force */
2146             tx               = _mm256_mul_pd(fscal,dx12);
2147             ty               = _mm256_mul_pd(fscal,dy12);
2148             tz               = _mm256_mul_pd(fscal,dz12);
2149
2150             /* Update vectorial force */
2151             fix1             = _mm256_add_pd(fix1,tx);
2152             fiy1             = _mm256_add_pd(fiy1,ty);
2153             fiz1             = _mm256_add_pd(fiz1,tz);
2154
2155             fjx2             = _mm256_add_pd(fjx2,tx);
2156             fjy2             = _mm256_add_pd(fjy2,ty);
2157             fjz2             = _mm256_add_pd(fjz2,tz);
2158
2159             }
2160
2161             /**************************
2162              * CALCULATE INTERACTIONS *
2163              **************************/
2164
2165             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2166             {
2167
2168             /* REACTION-FIELD ELECTROSTATICS */
2169             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
2170
2171             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2172
2173             fscal            = felec;
2174
2175             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2176
2177             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2178
2179             /* Calculate temporary vectorial force */
2180             tx               = _mm256_mul_pd(fscal,dx13);
2181             ty               = _mm256_mul_pd(fscal,dy13);
2182             tz               = _mm256_mul_pd(fscal,dz13);
2183
2184             /* Update vectorial force */
2185             fix1             = _mm256_add_pd(fix1,tx);
2186             fiy1             = _mm256_add_pd(fiy1,ty);
2187             fiz1             = _mm256_add_pd(fiz1,tz);
2188
2189             fjx3             = _mm256_add_pd(fjx3,tx);
2190             fjy3             = _mm256_add_pd(fjy3,ty);
2191             fjz3             = _mm256_add_pd(fjz3,tz);
2192
2193             }
2194
2195             /**************************
2196              * CALCULATE INTERACTIONS *
2197              **************************/
2198
2199             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2200             {
2201
2202             /* REACTION-FIELD ELECTROSTATICS */
2203             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2204
2205             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2206
2207             fscal            = felec;
2208
2209             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2210
2211             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2212
2213             /* Calculate temporary vectorial force */
2214             tx               = _mm256_mul_pd(fscal,dx21);
2215             ty               = _mm256_mul_pd(fscal,dy21);
2216             tz               = _mm256_mul_pd(fscal,dz21);
2217
2218             /* Update vectorial force */
2219             fix2             = _mm256_add_pd(fix2,tx);
2220             fiy2             = _mm256_add_pd(fiy2,ty);
2221             fiz2             = _mm256_add_pd(fiz2,tz);
2222
2223             fjx1             = _mm256_add_pd(fjx1,tx);
2224             fjy1             = _mm256_add_pd(fjy1,ty);
2225             fjz1             = _mm256_add_pd(fjz1,tz);
2226
2227             }
2228
2229             /**************************
2230              * CALCULATE INTERACTIONS *
2231              **************************/
2232
2233             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2234             {
2235
2236             /* REACTION-FIELD ELECTROSTATICS */
2237             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2238
2239             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2240
2241             fscal            = felec;
2242
2243             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2244
2245             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2246
2247             /* Calculate temporary vectorial force */
2248             tx               = _mm256_mul_pd(fscal,dx22);
2249             ty               = _mm256_mul_pd(fscal,dy22);
2250             tz               = _mm256_mul_pd(fscal,dz22);
2251
2252             /* Update vectorial force */
2253             fix2             = _mm256_add_pd(fix2,tx);
2254             fiy2             = _mm256_add_pd(fiy2,ty);
2255             fiz2             = _mm256_add_pd(fiz2,tz);
2256
2257             fjx2             = _mm256_add_pd(fjx2,tx);
2258             fjy2             = _mm256_add_pd(fjy2,ty);
2259             fjz2             = _mm256_add_pd(fjz2,tz);
2260
2261             }
2262
2263             /**************************
2264              * CALCULATE INTERACTIONS *
2265              **************************/
2266
2267             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2268             {
2269
2270             /* REACTION-FIELD ELECTROSTATICS */
2271             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
2272
2273             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2274
2275             fscal            = felec;
2276
2277             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2278
2279             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2280
2281             /* Calculate temporary vectorial force */
2282             tx               = _mm256_mul_pd(fscal,dx23);
2283             ty               = _mm256_mul_pd(fscal,dy23);
2284             tz               = _mm256_mul_pd(fscal,dz23);
2285
2286             /* Update vectorial force */
2287             fix2             = _mm256_add_pd(fix2,tx);
2288             fiy2             = _mm256_add_pd(fiy2,ty);
2289             fiz2             = _mm256_add_pd(fiz2,tz);
2290
2291             fjx3             = _mm256_add_pd(fjx3,tx);
2292             fjy3             = _mm256_add_pd(fjy3,ty);
2293             fjz3             = _mm256_add_pd(fjz3,tz);
2294
2295             }
2296
2297             /**************************
2298              * CALCULATE INTERACTIONS *
2299              **************************/
2300
2301             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2302             {
2303
2304             /* REACTION-FIELD ELECTROSTATICS */
2305             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
2306
2307             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2308
2309             fscal            = felec;
2310
2311             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2312
2313             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2314
2315             /* Calculate temporary vectorial force */
2316             tx               = _mm256_mul_pd(fscal,dx31);
2317             ty               = _mm256_mul_pd(fscal,dy31);
2318             tz               = _mm256_mul_pd(fscal,dz31);
2319
2320             /* Update vectorial force */
2321             fix3             = _mm256_add_pd(fix3,tx);
2322             fiy3             = _mm256_add_pd(fiy3,ty);
2323             fiz3             = _mm256_add_pd(fiz3,tz);
2324
2325             fjx1             = _mm256_add_pd(fjx1,tx);
2326             fjy1             = _mm256_add_pd(fjy1,ty);
2327             fjz1             = _mm256_add_pd(fjz1,tz);
2328
2329             }
2330
2331             /**************************
2332              * CALCULATE INTERACTIONS *
2333              **************************/
2334
2335             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2336             {
2337
2338             /* REACTION-FIELD ELECTROSTATICS */
2339             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
2340
2341             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2342
2343             fscal            = felec;
2344
2345             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2346
2347             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2348
2349             /* Calculate temporary vectorial force */
2350             tx               = _mm256_mul_pd(fscal,dx32);
2351             ty               = _mm256_mul_pd(fscal,dy32);
2352             tz               = _mm256_mul_pd(fscal,dz32);
2353
2354             /* Update vectorial force */
2355             fix3             = _mm256_add_pd(fix3,tx);
2356             fiy3             = _mm256_add_pd(fiy3,ty);
2357             fiz3             = _mm256_add_pd(fiz3,tz);
2358
2359             fjx2             = _mm256_add_pd(fjx2,tx);
2360             fjy2             = _mm256_add_pd(fjy2,ty);
2361             fjz2             = _mm256_add_pd(fjz2,tz);
2362
2363             }
2364
2365             /**************************
2366              * CALCULATE INTERACTIONS *
2367              **************************/
2368
2369             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2370             {
2371
2372             /* REACTION-FIELD ELECTROSTATICS */
2373             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
2374
2375             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2376
2377             fscal            = felec;
2378
2379             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2380
2381             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2382
2383             /* Calculate temporary vectorial force */
2384             tx               = _mm256_mul_pd(fscal,dx33);
2385             ty               = _mm256_mul_pd(fscal,dy33);
2386             tz               = _mm256_mul_pd(fscal,dz33);
2387
2388             /* Update vectorial force */
2389             fix3             = _mm256_add_pd(fix3,tx);
2390             fiy3             = _mm256_add_pd(fiy3,ty);
2391             fiz3             = _mm256_add_pd(fiz3,tz);
2392
2393             fjx3             = _mm256_add_pd(fjx3,tx);
2394             fjy3             = _mm256_add_pd(fjy3,ty);
2395             fjz3             = _mm256_add_pd(fjz3,tz);
2396
2397             }
2398
2399             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2400             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2401             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2402             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2403
2404             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2405                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2406                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2407
2408             /* Inner loop uses 330 flops */
2409         }
2410
2411         /* End of innermost loop */
2412
2413         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2414                                                  f+i_coord_offset,fshift+i_shift_offset);
2415
2416         /* Increment number of inner iterations */
2417         inneriter                  += j_index_end - j_index_start;
2418
2419         /* Outer loop uses 24 flops */
2420     }
2421
2422     /* Increment number of outer iterations */
2423     outeriter        += nri;
2424
2425     /* Update outer/inner flops */
2426
2427     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*330);
2428 }