Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
52  * Electrostatics interaction: ReactionField
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82     real             scratch[4*DIM];
83     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84     real *           vdwioffsetptr0;
85     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86     real *           vdwioffsetptr1;
87     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88     real *           vdwioffsetptr2;
89     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90     real *           vdwioffsetptr3;
91     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
92     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
99     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
100     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
104     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
107     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
108     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
109     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
110     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
111     real             *charge;
112     int              nvdwtype;
113     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
114     int              *vdwtype;
115     real             *vdwparam;
116     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
117     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
118     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
119     real             rswitch_scalar,d_scalar;
120     __m256d          dummy_mask,cutoff_mask;
121     __m128           tmpmask0,tmpmask1;
122     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
123     __m256d          one     = _mm256_set1_pd(1.0);
124     __m256d          two     = _mm256_set1_pd(2.0);
125     x                = xx[0];
126     f                = ff[0];
127
128     nri              = nlist->nri;
129     iinr             = nlist->iinr;
130     jindex           = nlist->jindex;
131     jjnr             = nlist->jjnr;
132     shiftidx         = nlist->shift;
133     gid              = nlist->gid;
134     shiftvec         = fr->shift_vec[0];
135     fshift           = fr->fshift[0];
136     facel            = _mm256_set1_pd(fr->epsfac);
137     charge           = mdatoms->chargeA;
138     krf              = _mm256_set1_pd(fr->ic->k_rf);
139     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
140     crf              = _mm256_set1_pd(fr->ic->c_rf);
141     nvdwtype         = fr->ntype;
142     vdwparam         = fr->nbfp;
143     vdwtype          = mdatoms->typeA;
144
145     /* Setup water-specific parameters */
146     inr              = nlist->iinr[0];
147     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
148     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
149     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
150     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
151
152     jq1              = _mm256_set1_pd(charge[inr+1]);
153     jq2              = _mm256_set1_pd(charge[inr+2]);
154     jq3              = _mm256_set1_pd(charge[inr+3]);
155     vdwjidx0A        = 2*vdwtype[inr+0];
156     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
157     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
158     qq11             = _mm256_mul_pd(iq1,jq1);
159     qq12             = _mm256_mul_pd(iq1,jq2);
160     qq13             = _mm256_mul_pd(iq1,jq3);
161     qq21             = _mm256_mul_pd(iq2,jq1);
162     qq22             = _mm256_mul_pd(iq2,jq2);
163     qq23             = _mm256_mul_pd(iq2,jq3);
164     qq31             = _mm256_mul_pd(iq3,jq1);
165     qq32             = _mm256_mul_pd(iq3,jq2);
166     qq33             = _mm256_mul_pd(iq3,jq3);
167
168     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169     rcutoff_scalar   = fr->rcoulomb;
170     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
171     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
172
173     rswitch_scalar   = fr->rvdw_switch;
174     rswitch          = _mm256_set1_pd(rswitch_scalar);
175     /* Setup switch parameters */
176     d_scalar         = rcutoff_scalar-rswitch_scalar;
177     d                = _mm256_set1_pd(d_scalar);
178     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
179     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
182     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
183     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
184
185     /* Avoid stupid compiler warnings */
186     jnrA = jnrB = jnrC = jnrD = 0;
187     j_coord_offsetA = 0;
188     j_coord_offsetB = 0;
189     j_coord_offsetC = 0;
190     j_coord_offsetD = 0;
191
192     outeriter        = 0;
193     inneriter        = 0;
194
195     for(iidx=0;iidx<4*DIM;iidx++)
196     {
197         scratch[iidx] = 0.0;
198     }
199
200     /* Start outer loop over neighborlists */
201     for(iidx=0; iidx<nri; iidx++)
202     {
203         /* Load shift vector for this list */
204         i_shift_offset   = DIM*shiftidx[iidx];
205
206         /* Load limits for loop over neighbors */
207         j_index_start    = jindex[iidx];
208         j_index_end      = jindex[iidx+1];
209
210         /* Get outer coordinate index */
211         inr              = iinr[iidx];
212         i_coord_offset   = DIM*inr;
213
214         /* Load i particle coords and add shift vector */
215         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
216                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
217
218         fix0             = _mm256_setzero_pd();
219         fiy0             = _mm256_setzero_pd();
220         fiz0             = _mm256_setzero_pd();
221         fix1             = _mm256_setzero_pd();
222         fiy1             = _mm256_setzero_pd();
223         fiz1             = _mm256_setzero_pd();
224         fix2             = _mm256_setzero_pd();
225         fiy2             = _mm256_setzero_pd();
226         fiz2             = _mm256_setzero_pd();
227         fix3             = _mm256_setzero_pd();
228         fiy3             = _mm256_setzero_pd();
229         fiz3             = _mm256_setzero_pd();
230
231         /* Reset potential sums */
232         velecsum         = _mm256_setzero_pd();
233         vvdwsum          = _mm256_setzero_pd();
234
235         /* Start inner kernel loop */
236         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
237         {
238
239             /* Get j neighbor index, and coordinate index */
240             jnrA             = jjnr[jidx];
241             jnrB             = jjnr[jidx+1];
242             jnrC             = jjnr[jidx+2];
243             jnrD             = jjnr[jidx+3];
244             j_coord_offsetA  = DIM*jnrA;
245             j_coord_offsetB  = DIM*jnrB;
246             j_coord_offsetC  = DIM*jnrC;
247             j_coord_offsetD  = DIM*jnrD;
248
249             /* load j atom coordinates */
250             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
251                                                  x+j_coord_offsetC,x+j_coord_offsetD,
252                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
253                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
254
255             /* Calculate displacement vector */
256             dx00             = _mm256_sub_pd(ix0,jx0);
257             dy00             = _mm256_sub_pd(iy0,jy0);
258             dz00             = _mm256_sub_pd(iz0,jz0);
259             dx11             = _mm256_sub_pd(ix1,jx1);
260             dy11             = _mm256_sub_pd(iy1,jy1);
261             dz11             = _mm256_sub_pd(iz1,jz1);
262             dx12             = _mm256_sub_pd(ix1,jx2);
263             dy12             = _mm256_sub_pd(iy1,jy2);
264             dz12             = _mm256_sub_pd(iz1,jz2);
265             dx13             = _mm256_sub_pd(ix1,jx3);
266             dy13             = _mm256_sub_pd(iy1,jy3);
267             dz13             = _mm256_sub_pd(iz1,jz3);
268             dx21             = _mm256_sub_pd(ix2,jx1);
269             dy21             = _mm256_sub_pd(iy2,jy1);
270             dz21             = _mm256_sub_pd(iz2,jz1);
271             dx22             = _mm256_sub_pd(ix2,jx2);
272             dy22             = _mm256_sub_pd(iy2,jy2);
273             dz22             = _mm256_sub_pd(iz2,jz2);
274             dx23             = _mm256_sub_pd(ix2,jx3);
275             dy23             = _mm256_sub_pd(iy2,jy3);
276             dz23             = _mm256_sub_pd(iz2,jz3);
277             dx31             = _mm256_sub_pd(ix3,jx1);
278             dy31             = _mm256_sub_pd(iy3,jy1);
279             dz31             = _mm256_sub_pd(iz3,jz1);
280             dx32             = _mm256_sub_pd(ix3,jx2);
281             dy32             = _mm256_sub_pd(iy3,jy2);
282             dz32             = _mm256_sub_pd(iz3,jz2);
283             dx33             = _mm256_sub_pd(ix3,jx3);
284             dy33             = _mm256_sub_pd(iy3,jy3);
285             dz33             = _mm256_sub_pd(iz3,jz3);
286
287             /* Calculate squared distance and things based on it */
288             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
289             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
290             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
291             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
292             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
293             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
294             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
295             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
296             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
297             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
298
299             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
300             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
301             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
302             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
303             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
304             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
305             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
306             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
307             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
308             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
309
310             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
311             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
312             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
313             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
314             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
315             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
316             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
317             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
318             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
319             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
320
321             fjx0             = _mm256_setzero_pd();
322             fjy0             = _mm256_setzero_pd();
323             fjz0             = _mm256_setzero_pd();
324             fjx1             = _mm256_setzero_pd();
325             fjy1             = _mm256_setzero_pd();
326             fjz1             = _mm256_setzero_pd();
327             fjx2             = _mm256_setzero_pd();
328             fjy2             = _mm256_setzero_pd();
329             fjz2             = _mm256_setzero_pd();
330             fjx3             = _mm256_setzero_pd();
331             fjy3             = _mm256_setzero_pd();
332             fjz3             = _mm256_setzero_pd();
333
334             /**************************
335              * CALCULATE INTERACTIONS *
336              **************************/
337
338             if (gmx_mm256_any_lt(rsq00,rcutoff2))
339             {
340
341             r00              = _mm256_mul_pd(rsq00,rinv00);
342
343             /* LENNARD-JONES DISPERSION/REPULSION */
344
345             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
346             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
347             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
348             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
349             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
350
351             d                = _mm256_sub_pd(r00,rswitch);
352             d                = _mm256_max_pd(d,_mm256_setzero_pd());
353             d2               = _mm256_mul_pd(d,d);
354             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
355
356             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
357
358             /* Evaluate switch function */
359             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
360             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
361             vvdw             = _mm256_mul_pd(vvdw,sw);
362             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
363
364             /* Update potential sum for this i atom from the interaction with this j atom. */
365             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
366             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
367
368             fscal            = fvdw;
369
370             fscal            = _mm256_and_pd(fscal,cutoff_mask);
371
372             /* Calculate temporary vectorial force */
373             tx               = _mm256_mul_pd(fscal,dx00);
374             ty               = _mm256_mul_pd(fscal,dy00);
375             tz               = _mm256_mul_pd(fscal,dz00);
376
377             /* Update vectorial force */
378             fix0             = _mm256_add_pd(fix0,tx);
379             fiy0             = _mm256_add_pd(fiy0,ty);
380             fiz0             = _mm256_add_pd(fiz0,tz);
381
382             fjx0             = _mm256_add_pd(fjx0,tx);
383             fjy0             = _mm256_add_pd(fjy0,ty);
384             fjz0             = _mm256_add_pd(fjz0,tz);
385
386             }
387
388             /**************************
389              * CALCULATE INTERACTIONS *
390              **************************/
391
392             if (gmx_mm256_any_lt(rsq11,rcutoff2))
393             {
394
395             /* REACTION-FIELD ELECTROSTATICS */
396             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
397             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
398
399             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
400
401             /* Update potential sum for this i atom from the interaction with this j atom. */
402             velec            = _mm256_and_pd(velec,cutoff_mask);
403             velecsum         = _mm256_add_pd(velecsum,velec);
404
405             fscal            = felec;
406
407             fscal            = _mm256_and_pd(fscal,cutoff_mask);
408
409             /* Calculate temporary vectorial force */
410             tx               = _mm256_mul_pd(fscal,dx11);
411             ty               = _mm256_mul_pd(fscal,dy11);
412             tz               = _mm256_mul_pd(fscal,dz11);
413
414             /* Update vectorial force */
415             fix1             = _mm256_add_pd(fix1,tx);
416             fiy1             = _mm256_add_pd(fiy1,ty);
417             fiz1             = _mm256_add_pd(fiz1,tz);
418
419             fjx1             = _mm256_add_pd(fjx1,tx);
420             fjy1             = _mm256_add_pd(fjy1,ty);
421             fjz1             = _mm256_add_pd(fjz1,tz);
422
423             }
424
425             /**************************
426              * CALCULATE INTERACTIONS *
427              **************************/
428
429             if (gmx_mm256_any_lt(rsq12,rcutoff2))
430             {
431
432             /* REACTION-FIELD ELECTROSTATICS */
433             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
434             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
435
436             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
437
438             /* Update potential sum for this i atom from the interaction with this j atom. */
439             velec            = _mm256_and_pd(velec,cutoff_mask);
440             velecsum         = _mm256_add_pd(velecsum,velec);
441
442             fscal            = felec;
443
444             fscal            = _mm256_and_pd(fscal,cutoff_mask);
445
446             /* Calculate temporary vectorial force */
447             tx               = _mm256_mul_pd(fscal,dx12);
448             ty               = _mm256_mul_pd(fscal,dy12);
449             tz               = _mm256_mul_pd(fscal,dz12);
450
451             /* Update vectorial force */
452             fix1             = _mm256_add_pd(fix1,tx);
453             fiy1             = _mm256_add_pd(fiy1,ty);
454             fiz1             = _mm256_add_pd(fiz1,tz);
455
456             fjx2             = _mm256_add_pd(fjx2,tx);
457             fjy2             = _mm256_add_pd(fjy2,ty);
458             fjz2             = _mm256_add_pd(fjz2,tz);
459
460             }
461
462             /**************************
463              * CALCULATE INTERACTIONS *
464              **************************/
465
466             if (gmx_mm256_any_lt(rsq13,rcutoff2))
467             {
468
469             /* REACTION-FIELD ELECTROSTATICS */
470             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
471             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
472
473             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
474
475             /* Update potential sum for this i atom from the interaction with this j atom. */
476             velec            = _mm256_and_pd(velec,cutoff_mask);
477             velecsum         = _mm256_add_pd(velecsum,velec);
478
479             fscal            = felec;
480
481             fscal            = _mm256_and_pd(fscal,cutoff_mask);
482
483             /* Calculate temporary vectorial force */
484             tx               = _mm256_mul_pd(fscal,dx13);
485             ty               = _mm256_mul_pd(fscal,dy13);
486             tz               = _mm256_mul_pd(fscal,dz13);
487
488             /* Update vectorial force */
489             fix1             = _mm256_add_pd(fix1,tx);
490             fiy1             = _mm256_add_pd(fiy1,ty);
491             fiz1             = _mm256_add_pd(fiz1,tz);
492
493             fjx3             = _mm256_add_pd(fjx3,tx);
494             fjy3             = _mm256_add_pd(fjy3,ty);
495             fjz3             = _mm256_add_pd(fjz3,tz);
496
497             }
498
499             /**************************
500              * CALCULATE INTERACTIONS *
501              **************************/
502
503             if (gmx_mm256_any_lt(rsq21,rcutoff2))
504             {
505
506             /* REACTION-FIELD ELECTROSTATICS */
507             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
508             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
509
510             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
511
512             /* Update potential sum for this i atom from the interaction with this j atom. */
513             velec            = _mm256_and_pd(velec,cutoff_mask);
514             velecsum         = _mm256_add_pd(velecsum,velec);
515
516             fscal            = felec;
517
518             fscal            = _mm256_and_pd(fscal,cutoff_mask);
519
520             /* Calculate temporary vectorial force */
521             tx               = _mm256_mul_pd(fscal,dx21);
522             ty               = _mm256_mul_pd(fscal,dy21);
523             tz               = _mm256_mul_pd(fscal,dz21);
524
525             /* Update vectorial force */
526             fix2             = _mm256_add_pd(fix2,tx);
527             fiy2             = _mm256_add_pd(fiy2,ty);
528             fiz2             = _mm256_add_pd(fiz2,tz);
529
530             fjx1             = _mm256_add_pd(fjx1,tx);
531             fjy1             = _mm256_add_pd(fjy1,ty);
532             fjz1             = _mm256_add_pd(fjz1,tz);
533
534             }
535
536             /**************************
537              * CALCULATE INTERACTIONS *
538              **************************/
539
540             if (gmx_mm256_any_lt(rsq22,rcutoff2))
541             {
542
543             /* REACTION-FIELD ELECTROSTATICS */
544             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
545             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
546
547             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
548
549             /* Update potential sum for this i atom from the interaction with this j atom. */
550             velec            = _mm256_and_pd(velec,cutoff_mask);
551             velecsum         = _mm256_add_pd(velecsum,velec);
552
553             fscal            = felec;
554
555             fscal            = _mm256_and_pd(fscal,cutoff_mask);
556
557             /* Calculate temporary vectorial force */
558             tx               = _mm256_mul_pd(fscal,dx22);
559             ty               = _mm256_mul_pd(fscal,dy22);
560             tz               = _mm256_mul_pd(fscal,dz22);
561
562             /* Update vectorial force */
563             fix2             = _mm256_add_pd(fix2,tx);
564             fiy2             = _mm256_add_pd(fiy2,ty);
565             fiz2             = _mm256_add_pd(fiz2,tz);
566
567             fjx2             = _mm256_add_pd(fjx2,tx);
568             fjy2             = _mm256_add_pd(fjy2,ty);
569             fjz2             = _mm256_add_pd(fjz2,tz);
570
571             }
572
573             /**************************
574              * CALCULATE INTERACTIONS *
575              **************************/
576
577             if (gmx_mm256_any_lt(rsq23,rcutoff2))
578             {
579
580             /* REACTION-FIELD ELECTROSTATICS */
581             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
582             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
583
584             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
585
586             /* Update potential sum for this i atom from the interaction with this j atom. */
587             velec            = _mm256_and_pd(velec,cutoff_mask);
588             velecsum         = _mm256_add_pd(velecsum,velec);
589
590             fscal            = felec;
591
592             fscal            = _mm256_and_pd(fscal,cutoff_mask);
593
594             /* Calculate temporary vectorial force */
595             tx               = _mm256_mul_pd(fscal,dx23);
596             ty               = _mm256_mul_pd(fscal,dy23);
597             tz               = _mm256_mul_pd(fscal,dz23);
598
599             /* Update vectorial force */
600             fix2             = _mm256_add_pd(fix2,tx);
601             fiy2             = _mm256_add_pd(fiy2,ty);
602             fiz2             = _mm256_add_pd(fiz2,tz);
603
604             fjx3             = _mm256_add_pd(fjx3,tx);
605             fjy3             = _mm256_add_pd(fjy3,ty);
606             fjz3             = _mm256_add_pd(fjz3,tz);
607
608             }
609
610             /**************************
611              * CALCULATE INTERACTIONS *
612              **************************/
613
614             if (gmx_mm256_any_lt(rsq31,rcutoff2))
615             {
616
617             /* REACTION-FIELD ELECTROSTATICS */
618             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
619             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
620
621             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
622
623             /* Update potential sum for this i atom from the interaction with this j atom. */
624             velec            = _mm256_and_pd(velec,cutoff_mask);
625             velecsum         = _mm256_add_pd(velecsum,velec);
626
627             fscal            = felec;
628
629             fscal            = _mm256_and_pd(fscal,cutoff_mask);
630
631             /* Calculate temporary vectorial force */
632             tx               = _mm256_mul_pd(fscal,dx31);
633             ty               = _mm256_mul_pd(fscal,dy31);
634             tz               = _mm256_mul_pd(fscal,dz31);
635
636             /* Update vectorial force */
637             fix3             = _mm256_add_pd(fix3,tx);
638             fiy3             = _mm256_add_pd(fiy3,ty);
639             fiz3             = _mm256_add_pd(fiz3,tz);
640
641             fjx1             = _mm256_add_pd(fjx1,tx);
642             fjy1             = _mm256_add_pd(fjy1,ty);
643             fjz1             = _mm256_add_pd(fjz1,tz);
644
645             }
646
647             /**************************
648              * CALCULATE INTERACTIONS *
649              **************************/
650
651             if (gmx_mm256_any_lt(rsq32,rcutoff2))
652             {
653
654             /* REACTION-FIELD ELECTROSTATICS */
655             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
656             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
657
658             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
659
660             /* Update potential sum for this i atom from the interaction with this j atom. */
661             velec            = _mm256_and_pd(velec,cutoff_mask);
662             velecsum         = _mm256_add_pd(velecsum,velec);
663
664             fscal            = felec;
665
666             fscal            = _mm256_and_pd(fscal,cutoff_mask);
667
668             /* Calculate temporary vectorial force */
669             tx               = _mm256_mul_pd(fscal,dx32);
670             ty               = _mm256_mul_pd(fscal,dy32);
671             tz               = _mm256_mul_pd(fscal,dz32);
672
673             /* Update vectorial force */
674             fix3             = _mm256_add_pd(fix3,tx);
675             fiy3             = _mm256_add_pd(fiy3,ty);
676             fiz3             = _mm256_add_pd(fiz3,tz);
677
678             fjx2             = _mm256_add_pd(fjx2,tx);
679             fjy2             = _mm256_add_pd(fjy2,ty);
680             fjz2             = _mm256_add_pd(fjz2,tz);
681
682             }
683
684             /**************************
685              * CALCULATE INTERACTIONS *
686              **************************/
687
688             if (gmx_mm256_any_lt(rsq33,rcutoff2))
689             {
690
691             /* REACTION-FIELD ELECTROSTATICS */
692             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
693             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
694
695             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
696
697             /* Update potential sum for this i atom from the interaction with this j atom. */
698             velec            = _mm256_and_pd(velec,cutoff_mask);
699             velecsum         = _mm256_add_pd(velecsum,velec);
700
701             fscal            = felec;
702
703             fscal            = _mm256_and_pd(fscal,cutoff_mask);
704
705             /* Calculate temporary vectorial force */
706             tx               = _mm256_mul_pd(fscal,dx33);
707             ty               = _mm256_mul_pd(fscal,dy33);
708             tz               = _mm256_mul_pd(fscal,dz33);
709
710             /* Update vectorial force */
711             fix3             = _mm256_add_pd(fix3,tx);
712             fiy3             = _mm256_add_pd(fiy3,ty);
713             fiz3             = _mm256_add_pd(fiz3,tz);
714
715             fjx3             = _mm256_add_pd(fjx3,tx);
716             fjy3             = _mm256_add_pd(fjy3,ty);
717             fjz3             = _mm256_add_pd(fjz3,tz);
718
719             }
720
721             fjptrA             = f+j_coord_offsetA;
722             fjptrB             = f+j_coord_offsetB;
723             fjptrC             = f+j_coord_offsetC;
724             fjptrD             = f+j_coord_offsetD;
725
726             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
727                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
728                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
729
730             /* Inner loop uses 386 flops */
731         }
732
733         if(jidx<j_index_end)
734         {
735
736             /* Get j neighbor index, and coordinate index */
737             jnrlistA         = jjnr[jidx];
738             jnrlistB         = jjnr[jidx+1];
739             jnrlistC         = jjnr[jidx+2];
740             jnrlistD         = jjnr[jidx+3];
741             /* Sign of each element will be negative for non-real atoms.
742              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
743              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
744              */
745             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
746
747             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
748             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
749             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
750
751             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
752             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
753             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
754             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
755             j_coord_offsetA  = DIM*jnrA;
756             j_coord_offsetB  = DIM*jnrB;
757             j_coord_offsetC  = DIM*jnrC;
758             j_coord_offsetD  = DIM*jnrD;
759
760             /* load j atom coordinates */
761             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
762                                                  x+j_coord_offsetC,x+j_coord_offsetD,
763                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
764                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
765
766             /* Calculate displacement vector */
767             dx00             = _mm256_sub_pd(ix0,jx0);
768             dy00             = _mm256_sub_pd(iy0,jy0);
769             dz00             = _mm256_sub_pd(iz0,jz0);
770             dx11             = _mm256_sub_pd(ix1,jx1);
771             dy11             = _mm256_sub_pd(iy1,jy1);
772             dz11             = _mm256_sub_pd(iz1,jz1);
773             dx12             = _mm256_sub_pd(ix1,jx2);
774             dy12             = _mm256_sub_pd(iy1,jy2);
775             dz12             = _mm256_sub_pd(iz1,jz2);
776             dx13             = _mm256_sub_pd(ix1,jx3);
777             dy13             = _mm256_sub_pd(iy1,jy3);
778             dz13             = _mm256_sub_pd(iz1,jz3);
779             dx21             = _mm256_sub_pd(ix2,jx1);
780             dy21             = _mm256_sub_pd(iy2,jy1);
781             dz21             = _mm256_sub_pd(iz2,jz1);
782             dx22             = _mm256_sub_pd(ix2,jx2);
783             dy22             = _mm256_sub_pd(iy2,jy2);
784             dz22             = _mm256_sub_pd(iz2,jz2);
785             dx23             = _mm256_sub_pd(ix2,jx3);
786             dy23             = _mm256_sub_pd(iy2,jy3);
787             dz23             = _mm256_sub_pd(iz2,jz3);
788             dx31             = _mm256_sub_pd(ix3,jx1);
789             dy31             = _mm256_sub_pd(iy3,jy1);
790             dz31             = _mm256_sub_pd(iz3,jz1);
791             dx32             = _mm256_sub_pd(ix3,jx2);
792             dy32             = _mm256_sub_pd(iy3,jy2);
793             dz32             = _mm256_sub_pd(iz3,jz2);
794             dx33             = _mm256_sub_pd(ix3,jx3);
795             dy33             = _mm256_sub_pd(iy3,jy3);
796             dz33             = _mm256_sub_pd(iz3,jz3);
797
798             /* Calculate squared distance and things based on it */
799             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
800             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
801             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
802             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
803             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
804             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
805             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
806             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
807             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
808             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
809
810             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
811             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
812             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
813             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
814             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
815             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
816             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
817             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
818             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
819             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
820
821             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
822             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
823             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
824             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
825             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
826             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
827             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
828             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
829             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
830             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
831
832             fjx0             = _mm256_setzero_pd();
833             fjy0             = _mm256_setzero_pd();
834             fjz0             = _mm256_setzero_pd();
835             fjx1             = _mm256_setzero_pd();
836             fjy1             = _mm256_setzero_pd();
837             fjz1             = _mm256_setzero_pd();
838             fjx2             = _mm256_setzero_pd();
839             fjy2             = _mm256_setzero_pd();
840             fjz2             = _mm256_setzero_pd();
841             fjx3             = _mm256_setzero_pd();
842             fjy3             = _mm256_setzero_pd();
843             fjz3             = _mm256_setzero_pd();
844
845             /**************************
846              * CALCULATE INTERACTIONS *
847              **************************/
848
849             if (gmx_mm256_any_lt(rsq00,rcutoff2))
850             {
851
852             r00              = _mm256_mul_pd(rsq00,rinv00);
853             r00              = _mm256_andnot_pd(dummy_mask,r00);
854
855             /* LENNARD-JONES DISPERSION/REPULSION */
856
857             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
858             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
859             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
860             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
861             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
862
863             d                = _mm256_sub_pd(r00,rswitch);
864             d                = _mm256_max_pd(d,_mm256_setzero_pd());
865             d2               = _mm256_mul_pd(d,d);
866             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
867
868             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
869
870             /* Evaluate switch function */
871             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
872             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
873             vvdw             = _mm256_mul_pd(vvdw,sw);
874             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
875
876             /* Update potential sum for this i atom from the interaction with this j atom. */
877             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
878             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
879             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
880
881             fscal            = fvdw;
882
883             fscal            = _mm256_and_pd(fscal,cutoff_mask);
884
885             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
886
887             /* Calculate temporary vectorial force */
888             tx               = _mm256_mul_pd(fscal,dx00);
889             ty               = _mm256_mul_pd(fscal,dy00);
890             tz               = _mm256_mul_pd(fscal,dz00);
891
892             /* Update vectorial force */
893             fix0             = _mm256_add_pd(fix0,tx);
894             fiy0             = _mm256_add_pd(fiy0,ty);
895             fiz0             = _mm256_add_pd(fiz0,tz);
896
897             fjx0             = _mm256_add_pd(fjx0,tx);
898             fjy0             = _mm256_add_pd(fjy0,ty);
899             fjz0             = _mm256_add_pd(fjz0,tz);
900
901             }
902
903             /**************************
904              * CALCULATE INTERACTIONS *
905              **************************/
906
907             if (gmx_mm256_any_lt(rsq11,rcutoff2))
908             {
909
910             /* REACTION-FIELD ELECTROSTATICS */
911             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
912             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
913
914             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
915
916             /* Update potential sum for this i atom from the interaction with this j atom. */
917             velec            = _mm256_and_pd(velec,cutoff_mask);
918             velec            = _mm256_andnot_pd(dummy_mask,velec);
919             velecsum         = _mm256_add_pd(velecsum,velec);
920
921             fscal            = felec;
922
923             fscal            = _mm256_and_pd(fscal,cutoff_mask);
924
925             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
926
927             /* Calculate temporary vectorial force */
928             tx               = _mm256_mul_pd(fscal,dx11);
929             ty               = _mm256_mul_pd(fscal,dy11);
930             tz               = _mm256_mul_pd(fscal,dz11);
931
932             /* Update vectorial force */
933             fix1             = _mm256_add_pd(fix1,tx);
934             fiy1             = _mm256_add_pd(fiy1,ty);
935             fiz1             = _mm256_add_pd(fiz1,tz);
936
937             fjx1             = _mm256_add_pd(fjx1,tx);
938             fjy1             = _mm256_add_pd(fjy1,ty);
939             fjz1             = _mm256_add_pd(fjz1,tz);
940
941             }
942
943             /**************************
944              * CALCULATE INTERACTIONS *
945              **************************/
946
947             if (gmx_mm256_any_lt(rsq12,rcutoff2))
948             {
949
950             /* REACTION-FIELD ELECTROSTATICS */
951             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
952             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
953
954             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
955
956             /* Update potential sum for this i atom from the interaction with this j atom. */
957             velec            = _mm256_and_pd(velec,cutoff_mask);
958             velec            = _mm256_andnot_pd(dummy_mask,velec);
959             velecsum         = _mm256_add_pd(velecsum,velec);
960
961             fscal            = felec;
962
963             fscal            = _mm256_and_pd(fscal,cutoff_mask);
964
965             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
966
967             /* Calculate temporary vectorial force */
968             tx               = _mm256_mul_pd(fscal,dx12);
969             ty               = _mm256_mul_pd(fscal,dy12);
970             tz               = _mm256_mul_pd(fscal,dz12);
971
972             /* Update vectorial force */
973             fix1             = _mm256_add_pd(fix1,tx);
974             fiy1             = _mm256_add_pd(fiy1,ty);
975             fiz1             = _mm256_add_pd(fiz1,tz);
976
977             fjx2             = _mm256_add_pd(fjx2,tx);
978             fjy2             = _mm256_add_pd(fjy2,ty);
979             fjz2             = _mm256_add_pd(fjz2,tz);
980
981             }
982
983             /**************************
984              * CALCULATE INTERACTIONS *
985              **************************/
986
987             if (gmx_mm256_any_lt(rsq13,rcutoff2))
988             {
989
990             /* REACTION-FIELD ELECTROSTATICS */
991             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
992             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
993
994             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
995
996             /* Update potential sum for this i atom from the interaction with this j atom. */
997             velec            = _mm256_and_pd(velec,cutoff_mask);
998             velec            = _mm256_andnot_pd(dummy_mask,velec);
999             velecsum         = _mm256_add_pd(velecsum,velec);
1000
1001             fscal            = felec;
1002
1003             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1004
1005             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1006
1007             /* Calculate temporary vectorial force */
1008             tx               = _mm256_mul_pd(fscal,dx13);
1009             ty               = _mm256_mul_pd(fscal,dy13);
1010             tz               = _mm256_mul_pd(fscal,dz13);
1011
1012             /* Update vectorial force */
1013             fix1             = _mm256_add_pd(fix1,tx);
1014             fiy1             = _mm256_add_pd(fiy1,ty);
1015             fiz1             = _mm256_add_pd(fiz1,tz);
1016
1017             fjx3             = _mm256_add_pd(fjx3,tx);
1018             fjy3             = _mm256_add_pd(fjy3,ty);
1019             fjz3             = _mm256_add_pd(fjz3,tz);
1020
1021             }
1022
1023             /**************************
1024              * CALCULATE INTERACTIONS *
1025              **************************/
1026
1027             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1028             {
1029
1030             /* REACTION-FIELD ELECTROSTATICS */
1031             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1032             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1033
1034             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1035
1036             /* Update potential sum for this i atom from the interaction with this j atom. */
1037             velec            = _mm256_and_pd(velec,cutoff_mask);
1038             velec            = _mm256_andnot_pd(dummy_mask,velec);
1039             velecsum         = _mm256_add_pd(velecsum,velec);
1040
1041             fscal            = felec;
1042
1043             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1044
1045             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1046
1047             /* Calculate temporary vectorial force */
1048             tx               = _mm256_mul_pd(fscal,dx21);
1049             ty               = _mm256_mul_pd(fscal,dy21);
1050             tz               = _mm256_mul_pd(fscal,dz21);
1051
1052             /* Update vectorial force */
1053             fix2             = _mm256_add_pd(fix2,tx);
1054             fiy2             = _mm256_add_pd(fiy2,ty);
1055             fiz2             = _mm256_add_pd(fiz2,tz);
1056
1057             fjx1             = _mm256_add_pd(fjx1,tx);
1058             fjy1             = _mm256_add_pd(fjy1,ty);
1059             fjz1             = _mm256_add_pd(fjz1,tz);
1060
1061             }
1062
1063             /**************************
1064              * CALCULATE INTERACTIONS *
1065              **************************/
1066
1067             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1068             {
1069
1070             /* REACTION-FIELD ELECTROSTATICS */
1071             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1072             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1073
1074             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1075
1076             /* Update potential sum for this i atom from the interaction with this j atom. */
1077             velec            = _mm256_and_pd(velec,cutoff_mask);
1078             velec            = _mm256_andnot_pd(dummy_mask,velec);
1079             velecsum         = _mm256_add_pd(velecsum,velec);
1080
1081             fscal            = felec;
1082
1083             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1084
1085             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1086
1087             /* Calculate temporary vectorial force */
1088             tx               = _mm256_mul_pd(fscal,dx22);
1089             ty               = _mm256_mul_pd(fscal,dy22);
1090             tz               = _mm256_mul_pd(fscal,dz22);
1091
1092             /* Update vectorial force */
1093             fix2             = _mm256_add_pd(fix2,tx);
1094             fiy2             = _mm256_add_pd(fiy2,ty);
1095             fiz2             = _mm256_add_pd(fiz2,tz);
1096
1097             fjx2             = _mm256_add_pd(fjx2,tx);
1098             fjy2             = _mm256_add_pd(fjy2,ty);
1099             fjz2             = _mm256_add_pd(fjz2,tz);
1100
1101             }
1102
1103             /**************************
1104              * CALCULATE INTERACTIONS *
1105              **************************/
1106
1107             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1108             {
1109
1110             /* REACTION-FIELD ELECTROSTATICS */
1111             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
1112             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1113
1114             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1115
1116             /* Update potential sum for this i atom from the interaction with this j atom. */
1117             velec            = _mm256_and_pd(velec,cutoff_mask);
1118             velec            = _mm256_andnot_pd(dummy_mask,velec);
1119             velecsum         = _mm256_add_pd(velecsum,velec);
1120
1121             fscal            = felec;
1122
1123             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1124
1125             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1126
1127             /* Calculate temporary vectorial force */
1128             tx               = _mm256_mul_pd(fscal,dx23);
1129             ty               = _mm256_mul_pd(fscal,dy23);
1130             tz               = _mm256_mul_pd(fscal,dz23);
1131
1132             /* Update vectorial force */
1133             fix2             = _mm256_add_pd(fix2,tx);
1134             fiy2             = _mm256_add_pd(fiy2,ty);
1135             fiz2             = _mm256_add_pd(fiz2,tz);
1136
1137             fjx3             = _mm256_add_pd(fjx3,tx);
1138             fjy3             = _mm256_add_pd(fjy3,ty);
1139             fjz3             = _mm256_add_pd(fjz3,tz);
1140
1141             }
1142
1143             /**************************
1144              * CALCULATE INTERACTIONS *
1145              **************************/
1146
1147             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1148             {
1149
1150             /* REACTION-FIELD ELECTROSTATICS */
1151             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
1152             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1153
1154             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1155
1156             /* Update potential sum for this i atom from the interaction with this j atom. */
1157             velec            = _mm256_and_pd(velec,cutoff_mask);
1158             velec            = _mm256_andnot_pd(dummy_mask,velec);
1159             velecsum         = _mm256_add_pd(velecsum,velec);
1160
1161             fscal            = felec;
1162
1163             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1164
1165             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1166
1167             /* Calculate temporary vectorial force */
1168             tx               = _mm256_mul_pd(fscal,dx31);
1169             ty               = _mm256_mul_pd(fscal,dy31);
1170             tz               = _mm256_mul_pd(fscal,dz31);
1171
1172             /* Update vectorial force */
1173             fix3             = _mm256_add_pd(fix3,tx);
1174             fiy3             = _mm256_add_pd(fiy3,ty);
1175             fiz3             = _mm256_add_pd(fiz3,tz);
1176
1177             fjx1             = _mm256_add_pd(fjx1,tx);
1178             fjy1             = _mm256_add_pd(fjy1,ty);
1179             fjz1             = _mm256_add_pd(fjz1,tz);
1180
1181             }
1182
1183             /**************************
1184              * CALCULATE INTERACTIONS *
1185              **************************/
1186
1187             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1188             {
1189
1190             /* REACTION-FIELD ELECTROSTATICS */
1191             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
1192             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1193
1194             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1195
1196             /* Update potential sum for this i atom from the interaction with this j atom. */
1197             velec            = _mm256_and_pd(velec,cutoff_mask);
1198             velec            = _mm256_andnot_pd(dummy_mask,velec);
1199             velecsum         = _mm256_add_pd(velecsum,velec);
1200
1201             fscal            = felec;
1202
1203             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1204
1205             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1206
1207             /* Calculate temporary vectorial force */
1208             tx               = _mm256_mul_pd(fscal,dx32);
1209             ty               = _mm256_mul_pd(fscal,dy32);
1210             tz               = _mm256_mul_pd(fscal,dz32);
1211
1212             /* Update vectorial force */
1213             fix3             = _mm256_add_pd(fix3,tx);
1214             fiy3             = _mm256_add_pd(fiy3,ty);
1215             fiz3             = _mm256_add_pd(fiz3,tz);
1216
1217             fjx2             = _mm256_add_pd(fjx2,tx);
1218             fjy2             = _mm256_add_pd(fjy2,ty);
1219             fjz2             = _mm256_add_pd(fjz2,tz);
1220
1221             }
1222
1223             /**************************
1224              * CALCULATE INTERACTIONS *
1225              **************************/
1226
1227             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1228             {
1229
1230             /* REACTION-FIELD ELECTROSTATICS */
1231             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
1232             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1233
1234             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1235
1236             /* Update potential sum for this i atom from the interaction with this j atom. */
1237             velec            = _mm256_and_pd(velec,cutoff_mask);
1238             velec            = _mm256_andnot_pd(dummy_mask,velec);
1239             velecsum         = _mm256_add_pd(velecsum,velec);
1240
1241             fscal            = felec;
1242
1243             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1244
1245             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1246
1247             /* Calculate temporary vectorial force */
1248             tx               = _mm256_mul_pd(fscal,dx33);
1249             ty               = _mm256_mul_pd(fscal,dy33);
1250             tz               = _mm256_mul_pd(fscal,dz33);
1251
1252             /* Update vectorial force */
1253             fix3             = _mm256_add_pd(fix3,tx);
1254             fiy3             = _mm256_add_pd(fiy3,ty);
1255             fiz3             = _mm256_add_pd(fiz3,tz);
1256
1257             fjx3             = _mm256_add_pd(fjx3,tx);
1258             fjy3             = _mm256_add_pd(fjy3,ty);
1259             fjz3             = _mm256_add_pd(fjz3,tz);
1260
1261             }
1262
1263             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1264             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1265             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1266             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1267
1268             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1269                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1270                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1271
1272             /* Inner loop uses 387 flops */
1273         }
1274
1275         /* End of innermost loop */
1276
1277         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1278                                                  f+i_coord_offset,fshift+i_shift_offset);
1279
1280         ggid                        = gid[iidx];
1281         /* Update potential energies */
1282         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1283         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1284
1285         /* Increment number of inner iterations */
1286         inneriter                  += j_index_end - j_index_start;
1287
1288         /* Outer loop uses 26 flops */
1289     }
1290
1291     /* Increment number of outer iterations */
1292     outeriter        += nri;
1293
1294     /* Update outer/inner flops */
1295
1296     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*387);
1297 }
1298 /*
1299  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
1300  * Electrostatics interaction: ReactionField
1301  * VdW interaction:            LennardJones
1302  * Geometry:                   Water4-Water4
1303  * Calculate force/pot:        Force
1304  */
1305 void
1306 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
1307                     (t_nblist                    * gmx_restrict       nlist,
1308                      rvec                        * gmx_restrict          xx,
1309                      rvec                        * gmx_restrict          ff,
1310                      t_forcerec                  * gmx_restrict          fr,
1311                      t_mdatoms                   * gmx_restrict     mdatoms,
1312                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1313                      t_nrnb                      * gmx_restrict        nrnb)
1314 {
1315     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1316      * just 0 for non-waters.
1317      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1318      * jnr indices corresponding to data put in the four positions in the SIMD register.
1319      */
1320     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1321     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1322     int              jnrA,jnrB,jnrC,jnrD;
1323     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1324     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1325     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1326     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1327     real             rcutoff_scalar;
1328     real             *shiftvec,*fshift,*x,*f;
1329     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1330     real             scratch[4*DIM];
1331     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1332     real *           vdwioffsetptr0;
1333     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1334     real *           vdwioffsetptr1;
1335     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1336     real *           vdwioffsetptr2;
1337     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1338     real *           vdwioffsetptr3;
1339     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1340     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1341     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1342     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1343     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1344     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1345     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1346     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1347     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1348     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1349     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1350     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1351     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1352     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1353     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1354     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1355     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1356     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1357     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1358     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1359     real             *charge;
1360     int              nvdwtype;
1361     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1362     int              *vdwtype;
1363     real             *vdwparam;
1364     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1365     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1366     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1367     real             rswitch_scalar,d_scalar;
1368     __m256d          dummy_mask,cutoff_mask;
1369     __m128           tmpmask0,tmpmask1;
1370     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1371     __m256d          one     = _mm256_set1_pd(1.0);
1372     __m256d          two     = _mm256_set1_pd(2.0);
1373     x                = xx[0];
1374     f                = ff[0];
1375
1376     nri              = nlist->nri;
1377     iinr             = nlist->iinr;
1378     jindex           = nlist->jindex;
1379     jjnr             = nlist->jjnr;
1380     shiftidx         = nlist->shift;
1381     gid              = nlist->gid;
1382     shiftvec         = fr->shift_vec[0];
1383     fshift           = fr->fshift[0];
1384     facel            = _mm256_set1_pd(fr->epsfac);
1385     charge           = mdatoms->chargeA;
1386     krf              = _mm256_set1_pd(fr->ic->k_rf);
1387     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
1388     crf              = _mm256_set1_pd(fr->ic->c_rf);
1389     nvdwtype         = fr->ntype;
1390     vdwparam         = fr->nbfp;
1391     vdwtype          = mdatoms->typeA;
1392
1393     /* Setup water-specific parameters */
1394     inr              = nlist->iinr[0];
1395     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1396     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1397     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1398     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1399
1400     jq1              = _mm256_set1_pd(charge[inr+1]);
1401     jq2              = _mm256_set1_pd(charge[inr+2]);
1402     jq3              = _mm256_set1_pd(charge[inr+3]);
1403     vdwjidx0A        = 2*vdwtype[inr+0];
1404     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1405     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1406     qq11             = _mm256_mul_pd(iq1,jq1);
1407     qq12             = _mm256_mul_pd(iq1,jq2);
1408     qq13             = _mm256_mul_pd(iq1,jq3);
1409     qq21             = _mm256_mul_pd(iq2,jq1);
1410     qq22             = _mm256_mul_pd(iq2,jq2);
1411     qq23             = _mm256_mul_pd(iq2,jq3);
1412     qq31             = _mm256_mul_pd(iq3,jq1);
1413     qq32             = _mm256_mul_pd(iq3,jq2);
1414     qq33             = _mm256_mul_pd(iq3,jq3);
1415
1416     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1417     rcutoff_scalar   = fr->rcoulomb;
1418     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1419     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1420
1421     rswitch_scalar   = fr->rvdw_switch;
1422     rswitch          = _mm256_set1_pd(rswitch_scalar);
1423     /* Setup switch parameters */
1424     d_scalar         = rcutoff_scalar-rswitch_scalar;
1425     d                = _mm256_set1_pd(d_scalar);
1426     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1427     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1428     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1429     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1430     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1431     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1432
1433     /* Avoid stupid compiler warnings */
1434     jnrA = jnrB = jnrC = jnrD = 0;
1435     j_coord_offsetA = 0;
1436     j_coord_offsetB = 0;
1437     j_coord_offsetC = 0;
1438     j_coord_offsetD = 0;
1439
1440     outeriter        = 0;
1441     inneriter        = 0;
1442
1443     for(iidx=0;iidx<4*DIM;iidx++)
1444     {
1445         scratch[iidx] = 0.0;
1446     }
1447
1448     /* Start outer loop over neighborlists */
1449     for(iidx=0; iidx<nri; iidx++)
1450     {
1451         /* Load shift vector for this list */
1452         i_shift_offset   = DIM*shiftidx[iidx];
1453
1454         /* Load limits for loop over neighbors */
1455         j_index_start    = jindex[iidx];
1456         j_index_end      = jindex[iidx+1];
1457
1458         /* Get outer coordinate index */
1459         inr              = iinr[iidx];
1460         i_coord_offset   = DIM*inr;
1461
1462         /* Load i particle coords and add shift vector */
1463         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1464                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1465
1466         fix0             = _mm256_setzero_pd();
1467         fiy0             = _mm256_setzero_pd();
1468         fiz0             = _mm256_setzero_pd();
1469         fix1             = _mm256_setzero_pd();
1470         fiy1             = _mm256_setzero_pd();
1471         fiz1             = _mm256_setzero_pd();
1472         fix2             = _mm256_setzero_pd();
1473         fiy2             = _mm256_setzero_pd();
1474         fiz2             = _mm256_setzero_pd();
1475         fix3             = _mm256_setzero_pd();
1476         fiy3             = _mm256_setzero_pd();
1477         fiz3             = _mm256_setzero_pd();
1478
1479         /* Start inner kernel loop */
1480         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1481         {
1482
1483             /* Get j neighbor index, and coordinate index */
1484             jnrA             = jjnr[jidx];
1485             jnrB             = jjnr[jidx+1];
1486             jnrC             = jjnr[jidx+2];
1487             jnrD             = jjnr[jidx+3];
1488             j_coord_offsetA  = DIM*jnrA;
1489             j_coord_offsetB  = DIM*jnrB;
1490             j_coord_offsetC  = DIM*jnrC;
1491             j_coord_offsetD  = DIM*jnrD;
1492
1493             /* load j atom coordinates */
1494             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1495                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1496                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1497                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1498
1499             /* Calculate displacement vector */
1500             dx00             = _mm256_sub_pd(ix0,jx0);
1501             dy00             = _mm256_sub_pd(iy0,jy0);
1502             dz00             = _mm256_sub_pd(iz0,jz0);
1503             dx11             = _mm256_sub_pd(ix1,jx1);
1504             dy11             = _mm256_sub_pd(iy1,jy1);
1505             dz11             = _mm256_sub_pd(iz1,jz1);
1506             dx12             = _mm256_sub_pd(ix1,jx2);
1507             dy12             = _mm256_sub_pd(iy1,jy2);
1508             dz12             = _mm256_sub_pd(iz1,jz2);
1509             dx13             = _mm256_sub_pd(ix1,jx3);
1510             dy13             = _mm256_sub_pd(iy1,jy3);
1511             dz13             = _mm256_sub_pd(iz1,jz3);
1512             dx21             = _mm256_sub_pd(ix2,jx1);
1513             dy21             = _mm256_sub_pd(iy2,jy1);
1514             dz21             = _mm256_sub_pd(iz2,jz1);
1515             dx22             = _mm256_sub_pd(ix2,jx2);
1516             dy22             = _mm256_sub_pd(iy2,jy2);
1517             dz22             = _mm256_sub_pd(iz2,jz2);
1518             dx23             = _mm256_sub_pd(ix2,jx3);
1519             dy23             = _mm256_sub_pd(iy2,jy3);
1520             dz23             = _mm256_sub_pd(iz2,jz3);
1521             dx31             = _mm256_sub_pd(ix3,jx1);
1522             dy31             = _mm256_sub_pd(iy3,jy1);
1523             dz31             = _mm256_sub_pd(iz3,jz1);
1524             dx32             = _mm256_sub_pd(ix3,jx2);
1525             dy32             = _mm256_sub_pd(iy3,jy2);
1526             dz32             = _mm256_sub_pd(iz3,jz2);
1527             dx33             = _mm256_sub_pd(ix3,jx3);
1528             dy33             = _mm256_sub_pd(iy3,jy3);
1529             dz33             = _mm256_sub_pd(iz3,jz3);
1530
1531             /* Calculate squared distance and things based on it */
1532             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1533             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1534             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1535             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1536             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1537             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1538             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1539             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1540             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1541             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1542
1543             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1544             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1545             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1546             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1547             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1548             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1549             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1550             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1551             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1552             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1553
1554             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1555             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1556             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1557             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1558             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1559             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1560             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1561             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1562             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1563             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1564
1565             fjx0             = _mm256_setzero_pd();
1566             fjy0             = _mm256_setzero_pd();
1567             fjz0             = _mm256_setzero_pd();
1568             fjx1             = _mm256_setzero_pd();
1569             fjy1             = _mm256_setzero_pd();
1570             fjz1             = _mm256_setzero_pd();
1571             fjx2             = _mm256_setzero_pd();
1572             fjy2             = _mm256_setzero_pd();
1573             fjz2             = _mm256_setzero_pd();
1574             fjx3             = _mm256_setzero_pd();
1575             fjy3             = _mm256_setzero_pd();
1576             fjz3             = _mm256_setzero_pd();
1577
1578             /**************************
1579              * CALCULATE INTERACTIONS *
1580              **************************/
1581
1582             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1583             {
1584
1585             r00              = _mm256_mul_pd(rsq00,rinv00);
1586
1587             /* LENNARD-JONES DISPERSION/REPULSION */
1588
1589             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1590             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1591             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1592             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1593             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1594
1595             d                = _mm256_sub_pd(r00,rswitch);
1596             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1597             d2               = _mm256_mul_pd(d,d);
1598             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1599
1600             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1601
1602             /* Evaluate switch function */
1603             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1604             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1605             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1606
1607             fscal            = fvdw;
1608
1609             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1610
1611             /* Calculate temporary vectorial force */
1612             tx               = _mm256_mul_pd(fscal,dx00);
1613             ty               = _mm256_mul_pd(fscal,dy00);
1614             tz               = _mm256_mul_pd(fscal,dz00);
1615
1616             /* Update vectorial force */
1617             fix0             = _mm256_add_pd(fix0,tx);
1618             fiy0             = _mm256_add_pd(fiy0,ty);
1619             fiz0             = _mm256_add_pd(fiz0,tz);
1620
1621             fjx0             = _mm256_add_pd(fjx0,tx);
1622             fjy0             = _mm256_add_pd(fjy0,ty);
1623             fjz0             = _mm256_add_pd(fjz0,tz);
1624
1625             }
1626
1627             /**************************
1628              * CALCULATE INTERACTIONS *
1629              **************************/
1630
1631             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1632             {
1633
1634             /* REACTION-FIELD ELECTROSTATICS */
1635             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1636
1637             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1638
1639             fscal            = felec;
1640
1641             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1642
1643             /* Calculate temporary vectorial force */
1644             tx               = _mm256_mul_pd(fscal,dx11);
1645             ty               = _mm256_mul_pd(fscal,dy11);
1646             tz               = _mm256_mul_pd(fscal,dz11);
1647
1648             /* Update vectorial force */
1649             fix1             = _mm256_add_pd(fix1,tx);
1650             fiy1             = _mm256_add_pd(fiy1,ty);
1651             fiz1             = _mm256_add_pd(fiz1,tz);
1652
1653             fjx1             = _mm256_add_pd(fjx1,tx);
1654             fjy1             = _mm256_add_pd(fjy1,ty);
1655             fjz1             = _mm256_add_pd(fjz1,tz);
1656
1657             }
1658
1659             /**************************
1660              * CALCULATE INTERACTIONS *
1661              **************************/
1662
1663             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1664             {
1665
1666             /* REACTION-FIELD ELECTROSTATICS */
1667             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1668
1669             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1670
1671             fscal            = felec;
1672
1673             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1674
1675             /* Calculate temporary vectorial force */
1676             tx               = _mm256_mul_pd(fscal,dx12);
1677             ty               = _mm256_mul_pd(fscal,dy12);
1678             tz               = _mm256_mul_pd(fscal,dz12);
1679
1680             /* Update vectorial force */
1681             fix1             = _mm256_add_pd(fix1,tx);
1682             fiy1             = _mm256_add_pd(fiy1,ty);
1683             fiz1             = _mm256_add_pd(fiz1,tz);
1684
1685             fjx2             = _mm256_add_pd(fjx2,tx);
1686             fjy2             = _mm256_add_pd(fjy2,ty);
1687             fjz2             = _mm256_add_pd(fjz2,tz);
1688
1689             }
1690
1691             /**************************
1692              * CALCULATE INTERACTIONS *
1693              **************************/
1694
1695             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1696             {
1697
1698             /* REACTION-FIELD ELECTROSTATICS */
1699             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1700
1701             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1702
1703             fscal            = felec;
1704
1705             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1706
1707             /* Calculate temporary vectorial force */
1708             tx               = _mm256_mul_pd(fscal,dx13);
1709             ty               = _mm256_mul_pd(fscal,dy13);
1710             tz               = _mm256_mul_pd(fscal,dz13);
1711
1712             /* Update vectorial force */
1713             fix1             = _mm256_add_pd(fix1,tx);
1714             fiy1             = _mm256_add_pd(fiy1,ty);
1715             fiz1             = _mm256_add_pd(fiz1,tz);
1716
1717             fjx3             = _mm256_add_pd(fjx3,tx);
1718             fjy3             = _mm256_add_pd(fjy3,ty);
1719             fjz3             = _mm256_add_pd(fjz3,tz);
1720
1721             }
1722
1723             /**************************
1724              * CALCULATE INTERACTIONS *
1725              **************************/
1726
1727             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1728             {
1729
1730             /* REACTION-FIELD ELECTROSTATICS */
1731             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1732
1733             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1734
1735             fscal            = felec;
1736
1737             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1738
1739             /* Calculate temporary vectorial force */
1740             tx               = _mm256_mul_pd(fscal,dx21);
1741             ty               = _mm256_mul_pd(fscal,dy21);
1742             tz               = _mm256_mul_pd(fscal,dz21);
1743
1744             /* Update vectorial force */
1745             fix2             = _mm256_add_pd(fix2,tx);
1746             fiy2             = _mm256_add_pd(fiy2,ty);
1747             fiz2             = _mm256_add_pd(fiz2,tz);
1748
1749             fjx1             = _mm256_add_pd(fjx1,tx);
1750             fjy1             = _mm256_add_pd(fjy1,ty);
1751             fjz1             = _mm256_add_pd(fjz1,tz);
1752
1753             }
1754
1755             /**************************
1756              * CALCULATE INTERACTIONS *
1757              **************************/
1758
1759             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1760             {
1761
1762             /* REACTION-FIELD ELECTROSTATICS */
1763             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1764
1765             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1766
1767             fscal            = felec;
1768
1769             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1770
1771             /* Calculate temporary vectorial force */
1772             tx               = _mm256_mul_pd(fscal,dx22);
1773             ty               = _mm256_mul_pd(fscal,dy22);
1774             tz               = _mm256_mul_pd(fscal,dz22);
1775
1776             /* Update vectorial force */
1777             fix2             = _mm256_add_pd(fix2,tx);
1778             fiy2             = _mm256_add_pd(fiy2,ty);
1779             fiz2             = _mm256_add_pd(fiz2,tz);
1780
1781             fjx2             = _mm256_add_pd(fjx2,tx);
1782             fjy2             = _mm256_add_pd(fjy2,ty);
1783             fjz2             = _mm256_add_pd(fjz2,tz);
1784
1785             }
1786
1787             /**************************
1788              * CALCULATE INTERACTIONS *
1789              **************************/
1790
1791             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1792             {
1793
1794             /* REACTION-FIELD ELECTROSTATICS */
1795             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1796
1797             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1798
1799             fscal            = felec;
1800
1801             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1802
1803             /* Calculate temporary vectorial force */
1804             tx               = _mm256_mul_pd(fscal,dx23);
1805             ty               = _mm256_mul_pd(fscal,dy23);
1806             tz               = _mm256_mul_pd(fscal,dz23);
1807
1808             /* Update vectorial force */
1809             fix2             = _mm256_add_pd(fix2,tx);
1810             fiy2             = _mm256_add_pd(fiy2,ty);
1811             fiz2             = _mm256_add_pd(fiz2,tz);
1812
1813             fjx3             = _mm256_add_pd(fjx3,tx);
1814             fjy3             = _mm256_add_pd(fjy3,ty);
1815             fjz3             = _mm256_add_pd(fjz3,tz);
1816
1817             }
1818
1819             /**************************
1820              * CALCULATE INTERACTIONS *
1821              **************************/
1822
1823             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1824             {
1825
1826             /* REACTION-FIELD ELECTROSTATICS */
1827             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1828
1829             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1830
1831             fscal            = felec;
1832
1833             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1834
1835             /* Calculate temporary vectorial force */
1836             tx               = _mm256_mul_pd(fscal,dx31);
1837             ty               = _mm256_mul_pd(fscal,dy31);
1838             tz               = _mm256_mul_pd(fscal,dz31);
1839
1840             /* Update vectorial force */
1841             fix3             = _mm256_add_pd(fix3,tx);
1842             fiy3             = _mm256_add_pd(fiy3,ty);
1843             fiz3             = _mm256_add_pd(fiz3,tz);
1844
1845             fjx1             = _mm256_add_pd(fjx1,tx);
1846             fjy1             = _mm256_add_pd(fjy1,ty);
1847             fjz1             = _mm256_add_pd(fjz1,tz);
1848
1849             }
1850
1851             /**************************
1852              * CALCULATE INTERACTIONS *
1853              **************************/
1854
1855             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1856             {
1857
1858             /* REACTION-FIELD ELECTROSTATICS */
1859             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1860
1861             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1862
1863             fscal            = felec;
1864
1865             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1866
1867             /* Calculate temporary vectorial force */
1868             tx               = _mm256_mul_pd(fscal,dx32);
1869             ty               = _mm256_mul_pd(fscal,dy32);
1870             tz               = _mm256_mul_pd(fscal,dz32);
1871
1872             /* Update vectorial force */
1873             fix3             = _mm256_add_pd(fix3,tx);
1874             fiy3             = _mm256_add_pd(fiy3,ty);
1875             fiz3             = _mm256_add_pd(fiz3,tz);
1876
1877             fjx2             = _mm256_add_pd(fjx2,tx);
1878             fjy2             = _mm256_add_pd(fjy2,ty);
1879             fjz2             = _mm256_add_pd(fjz2,tz);
1880
1881             }
1882
1883             /**************************
1884              * CALCULATE INTERACTIONS *
1885              **************************/
1886
1887             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1888             {
1889
1890             /* REACTION-FIELD ELECTROSTATICS */
1891             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1892
1893             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1894
1895             fscal            = felec;
1896
1897             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1898
1899             /* Calculate temporary vectorial force */
1900             tx               = _mm256_mul_pd(fscal,dx33);
1901             ty               = _mm256_mul_pd(fscal,dy33);
1902             tz               = _mm256_mul_pd(fscal,dz33);
1903
1904             /* Update vectorial force */
1905             fix3             = _mm256_add_pd(fix3,tx);
1906             fiy3             = _mm256_add_pd(fiy3,ty);
1907             fiz3             = _mm256_add_pd(fiz3,tz);
1908
1909             fjx3             = _mm256_add_pd(fjx3,tx);
1910             fjy3             = _mm256_add_pd(fjy3,ty);
1911             fjz3             = _mm256_add_pd(fjz3,tz);
1912
1913             }
1914
1915             fjptrA             = f+j_coord_offsetA;
1916             fjptrB             = f+j_coord_offsetB;
1917             fjptrC             = f+j_coord_offsetC;
1918             fjptrD             = f+j_coord_offsetD;
1919
1920             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1921                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1922                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1923
1924             /* Inner loop uses 329 flops */
1925         }
1926
1927         if(jidx<j_index_end)
1928         {
1929
1930             /* Get j neighbor index, and coordinate index */
1931             jnrlistA         = jjnr[jidx];
1932             jnrlistB         = jjnr[jidx+1];
1933             jnrlistC         = jjnr[jidx+2];
1934             jnrlistD         = jjnr[jidx+3];
1935             /* Sign of each element will be negative for non-real atoms.
1936              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1937              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1938              */
1939             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1940
1941             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1942             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1943             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1944
1945             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1946             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1947             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1948             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1949             j_coord_offsetA  = DIM*jnrA;
1950             j_coord_offsetB  = DIM*jnrB;
1951             j_coord_offsetC  = DIM*jnrC;
1952             j_coord_offsetD  = DIM*jnrD;
1953
1954             /* load j atom coordinates */
1955             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1956                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1957                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1958                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1959
1960             /* Calculate displacement vector */
1961             dx00             = _mm256_sub_pd(ix0,jx0);
1962             dy00             = _mm256_sub_pd(iy0,jy0);
1963             dz00             = _mm256_sub_pd(iz0,jz0);
1964             dx11             = _mm256_sub_pd(ix1,jx1);
1965             dy11             = _mm256_sub_pd(iy1,jy1);
1966             dz11             = _mm256_sub_pd(iz1,jz1);
1967             dx12             = _mm256_sub_pd(ix1,jx2);
1968             dy12             = _mm256_sub_pd(iy1,jy2);
1969             dz12             = _mm256_sub_pd(iz1,jz2);
1970             dx13             = _mm256_sub_pd(ix1,jx3);
1971             dy13             = _mm256_sub_pd(iy1,jy3);
1972             dz13             = _mm256_sub_pd(iz1,jz3);
1973             dx21             = _mm256_sub_pd(ix2,jx1);
1974             dy21             = _mm256_sub_pd(iy2,jy1);
1975             dz21             = _mm256_sub_pd(iz2,jz1);
1976             dx22             = _mm256_sub_pd(ix2,jx2);
1977             dy22             = _mm256_sub_pd(iy2,jy2);
1978             dz22             = _mm256_sub_pd(iz2,jz2);
1979             dx23             = _mm256_sub_pd(ix2,jx3);
1980             dy23             = _mm256_sub_pd(iy2,jy3);
1981             dz23             = _mm256_sub_pd(iz2,jz3);
1982             dx31             = _mm256_sub_pd(ix3,jx1);
1983             dy31             = _mm256_sub_pd(iy3,jy1);
1984             dz31             = _mm256_sub_pd(iz3,jz1);
1985             dx32             = _mm256_sub_pd(ix3,jx2);
1986             dy32             = _mm256_sub_pd(iy3,jy2);
1987             dz32             = _mm256_sub_pd(iz3,jz2);
1988             dx33             = _mm256_sub_pd(ix3,jx3);
1989             dy33             = _mm256_sub_pd(iy3,jy3);
1990             dz33             = _mm256_sub_pd(iz3,jz3);
1991
1992             /* Calculate squared distance and things based on it */
1993             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1994             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1995             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1996             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1997             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1998             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1999             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2000             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2001             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2002             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2003
2004             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
2005             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
2006             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
2007             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
2008             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
2009             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
2010             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
2011             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
2012             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
2013             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
2014
2015             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
2016             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
2017             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
2018             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
2019             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
2020             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
2021             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
2022             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
2023             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
2024             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
2025
2026             fjx0             = _mm256_setzero_pd();
2027             fjy0             = _mm256_setzero_pd();
2028             fjz0             = _mm256_setzero_pd();
2029             fjx1             = _mm256_setzero_pd();
2030             fjy1             = _mm256_setzero_pd();
2031             fjz1             = _mm256_setzero_pd();
2032             fjx2             = _mm256_setzero_pd();
2033             fjy2             = _mm256_setzero_pd();
2034             fjz2             = _mm256_setzero_pd();
2035             fjx3             = _mm256_setzero_pd();
2036             fjy3             = _mm256_setzero_pd();
2037             fjz3             = _mm256_setzero_pd();
2038
2039             /**************************
2040              * CALCULATE INTERACTIONS *
2041              **************************/
2042
2043             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2044             {
2045
2046             r00              = _mm256_mul_pd(rsq00,rinv00);
2047             r00              = _mm256_andnot_pd(dummy_mask,r00);
2048
2049             /* LENNARD-JONES DISPERSION/REPULSION */
2050
2051             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2052             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
2053             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2054             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2055             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2056
2057             d                = _mm256_sub_pd(r00,rswitch);
2058             d                = _mm256_max_pd(d,_mm256_setzero_pd());
2059             d2               = _mm256_mul_pd(d,d);
2060             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2061
2062             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2063
2064             /* Evaluate switch function */
2065             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2066             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2067             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2068
2069             fscal            = fvdw;
2070
2071             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2072
2073             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2074
2075             /* Calculate temporary vectorial force */
2076             tx               = _mm256_mul_pd(fscal,dx00);
2077             ty               = _mm256_mul_pd(fscal,dy00);
2078             tz               = _mm256_mul_pd(fscal,dz00);
2079
2080             /* Update vectorial force */
2081             fix0             = _mm256_add_pd(fix0,tx);
2082             fiy0             = _mm256_add_pd(fiy0,ty);
2083             fiz0             = _mm256_add_pd(fiz0,tz);
2084
2085             fjx0             = _mm256_add_pd(fjx0,tx);
2086             fjy0             = _mm256_add_pd(fjy0,ty);
2087             fjz0             = _mm256_add_pd(fjz0,tz);
2088
2089             }
2090
2091             /**************************
2092              * CALCULATE INTERACTIONS *
2093              **************************/
2094
2095             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2096             {
2097
2098             /* REACTION-FIELD ELECTROSTATICS */
2099             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2100
2101             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2102
2103             fscal            = felec;
2104
2105             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2106
2107             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2108
2109             /* Calculate temporary vectorial force */
2110             tx               = _mm256_mul_pd(fscal,dx11);
2111             ty               = _mm256_mul_pd(fscal,dy11);
2112             tz               = _mm256_mul_pd(fscal,dz11);
2113
2114             /* Update vectorial force */
2115             fix1             = _mm256_add_pd(fix1,tx);
2116             fiy1             = _mm256_add_pd(fiy1,ty);
2117             fiz1             = _mm256_add_pd(fiz1,tz);
2118
2119             fjx1             = _mm256_add_pd(fjx1,tx);
2120             fjy1             = _mm256_add_pd(fjy1,ty);
2121             fjz1             = _mm256_add_pd(fjz1,tz);
2122
2123             }
2124
2125             /**************************
2126              * CALCULATE INTERACTIONS *
2127              **************************/
2128
2129             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2130             {
2131
2132             /* REACTION-FIELD ELECTROSTATICS */
2133             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2134
2135             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2136
2137             fscal            = felec;
2138
2139             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2140
2141             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2142
2143             /* Calculate temporary vectorial force */
2144             tx               = _mm256_mul_pd(fscal,dx12);
2145             ty               = _mm256_mul_pd(fscal,dy12);
2146             tz               = _mm256_mul_pd(fscal,dz12);
2147
2148             /* Update vectorial force */
2149             fix1             = _mm256_add_pd(fix1,tx);
2150             fiy1             = _mm256_add_pd(fiy1,ty);
2151             fiz1             = _mm256_add_pd(fiz1,tz);
2152
2153             fjx2             = _mm256_add_pd(fjx2,tx);
2154             fjy2             = _mm256_add_pd(fjy2,ty);
2155             fjz2             = _mm256_add_pd(fjz2,tz);
2156
2157             }
2158
2159             /**************************
2160              * CALCULATE INTERACTIONS *
2161              **************************/
2162
2163             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2164             {
2165
2166             /* REACTION-FIELD ELECTROSTATICS */
2167             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
2168
2169             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2170
2171             fscal            = felec;
2172
2173             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2174
2175             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2176
2177             /* Calculate temporary vectorial force */
2178             tx               = _mm256_mul_pd(fscal,dx13);
2179             ty               = _mm256_mul_pd(fscal,dy13);
2180             tz               = _mm256_mul_pd(fscal,dz13);
2181
2182             /* Update vectorial force */
2183             fix1             = _mm256_add_pd(fix1,tx);
2184             fiy1             = _mm256_add_pd(fiy1,ty);
2185             fiz1             = _mm256_add_pd(fiz1,tz);
2186
2187             fjx3             = _mm256_add_pd(fjx3,tx);
2188             fjy3             = _mm256_add_pd(fjy3,ty);
2189             fjz3             = _mm256_add_pd(fjz3,tz);
2190
2191             }
2192
2193             /**************************
2194              * CALCULATE INTERACTIONS *
2195              **************************/
2196
2197             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2198             {
2199
2200             /* REACTION-FIELD ELECTROSTATICS */
2201             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2202
2203             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2204
2205             fscal            = felec;
2206
2207             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2208
2209             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2210
2211             /* Calculate temporary vectorial force */
2212             tx               = _mm256_mul_pd(fscal,dx21);
2213             ty               = _mm256_mul_pd(fscal,dy21);
2214             tz               = _mm256_mul_pd(fscal,dz21);
2215
2216             /* Update vectorial force */
2217             fix2             = _mm256_add_pd(fix2,tx);
2218             fiy2             = _mm256_add_pd(fiy2,ty);
2219             fiz2             = _mm256_add_pd(fiz2,tz);
2220
2221             fjx1             = _mm256_add_pd(fjx1,tx);
2222             fjy1             = _mm256_add_pd(fjy1,ty);
2223             fjz1             = _mm256_add_pd(fjz1,tz);
2224
2225             }
2226
2227             /**************************
2228              * CALCULATE INTERACTIONS *
2229              **************************/
2230
2231             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2232             {
2233
2234             /* REACTION-FIELD ELECTROSTATICS */
2235             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2236
2237             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2238
2239             fscal            = felec;
2240
2241             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2242
2243             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2244
2245             /* Calculate temporary vectorial force */
2246             tx               = _mm256_mul_pd(fscal,dx22);
2247             ty               = _mm256_mul_pd(fscal,dy22);
2248             tz               = _mm256_mul_pd(fscal,dz22);
2249
2250             /* Update vectorial force */
2251             fix2             = _mm256_add_pd(fix2,tx);
2252             fiy2             = _mm256_add_pd(fiy2,ty);
2253             fiz2             = _mm256_add_pd(fiz2,tz);
2254
2255             fjx2             = _mm256_add_pd(fjx2,tx);
2256             fjy2             = _mm256_add_pd(fjy2,ty);
2257             fjz2             = _mm256_add_pd(fjz2,tz);
2258
2259             }
2260
2261             /**************************
2262              * CALCULATE INTERACTIONS *
2263              **************************/
2264
2265             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2266             {
2267
2268             /* REACTION-FIELD ELECTROSTATICS */
2269             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
2270
2271             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2272
2273             fscal            = felec;
2274
2275             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2276
2277             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2278
2279             /* Calculate temporary vectorial force */
2280             tx               = _mm256_mul_pd(fscal,dx23);
2281             ty               = _mm256_mul_pd(fscal,dy23);
2282             tz               = _mm256_mul_pd(fscal,dz23);
2283
2284             /* Update vectorial force */
2285             fix2             = _mm256_add_pd(fix2,tx);
2286             fiy2             = _mm256_add_pd(fiy2,ty);
2287             fiz2             = _mm256_add_pd(fiz2,tz);
2288
2289             fjx3             = _mm256_add_pd(fjx3,tx);
2290             fjy3             = _mm256_add_pd(fjy3,ty);
2291             fjz3             = _mm256_add_pd(fjz3,tz);
2292
2293             }
2294
2295             /**************************
2296              * CALCULATE INTERACTIONS *
2297              **************************/
2298
2299             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2300             {
2301
2302             /* REACTION-FIELD ELECTROSTATICS */
2303             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
2304
2305             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2306
2307             fscal            = felec;
2308
2309             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2310
2311             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2312
2313             /* Calculate temporary vectorial force */
2314             tx               = _mm256_mul_pd(fscal,dx31);
2315             ty               = _mm256_mul_pd(fscal,dy31);
2316             tz               = _mm256_mul_pd(fscal,dz31);
2317
2318             /* Update vectorial force */
2319             fix3             = _mm256_add_pd(fix3,tx);
2320             fiy3             = _mm256_add_pd(fiy3,ty);
2321             fiz3             = _mm256_add_pd(fiz3,tz);
2322
2323             fjx1             = _mm256_add_pd(fjx1,tx);
2324             fjy1             = _mm256_add_pd(fjy1,ty);
2325             fjz1             = _mm256_add_pd(fjz1,tz);
2326
2327             }
2328
2329             /**************************
2330              * CALCULATE INTERACTIONS *
2331              **************************/
2332
2333             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2334             {
2335
2336             /* REACTION-FIELD ELECTROSTATICS */
2337             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
2338
2339             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2340
2341             fscal            = felec;
2342
2343             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2344
2345             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2346
2347             /* Calculate temporary vectorial force */
2348             tx               = _mm256_mul_pd(fscal,dx32);
2349             ty               = _mm256_mul_pd(fscal,dy32);
2350             tz               = _mm256_mul_pd(fscal,dz32);
2351
2352             /* Update vectorial force */
2353             fix3             = _mm256_add_pd(fix3,tx);
2354             fiy3             = _mm256_add_pd(fiy3,ty);
2355             fiz3             = _mm256_add_pd(fiz3,tz);
2356
2357             fjx2             = _mm256_add_pd(fjx2,tx);
2358             fjy2             = _mm256_add_pd(fjy2,ty);
2359             fjz2             = _mm256_add_pd(fjz2,tz);
2360
2361             }
2362
2363             /**************************
2364              * CALCULATE INTERACTIONS *
2365              **************************/
2366
2367             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2368             {
2369
2370             /* REACTION-FIELD ELECTROSTATICS */
2371             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
2372
2373             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2374
2375             fscal            = felec;
2376
2377             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2378
2379             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2380
2381             /* Calculate temporary vectorial force */
2382             tx               = _mm256_mul_pd(fscal,dx33);
2383             ty               = _mm256_mul_pd(fscal,dy33);
2384             tz               = _mm256_mul_pd(fscal,dz33);
2385
2386             /* Update vectorial force */
2387             fix3             = _mm256_add_pd(fix3,tx);
2388             fiy3             = _mm256_add_pd(fiy3,ty);
2389             fiz3             = _mm256_add_pd(fiz3,tz);
2390
2391             fjx3             = _mm256_add_pd(fjx3,tx);
2392             fjy3             = _mm256_add_pd(fjy3,ty);
2393             fjz3             = _mm256_add_pd(fjz3,tz);
2394
2395             }
2396
2397             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2398             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2399             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2400             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2401
2402             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2403                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2404                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2405
2406             /* Inner loop uses 330 flops */
2407         }
2408
2409         /* End of innermost loop */
2410
2411         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2412                                                  f+i_coord_offset,fshift+i_shift_offset);
2413
2414         /* Increment number of inner iterations */
2415         inneriter                  += j_index_end - j_index_start;
2416
2417         /* Outer loop uses 24 flops */
2418     }
2419
2420     /* Increment number of outer iterations */
2421     outeriter        += nri;
2422
2423     /* Update outer/inner flops */
2424
2425     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*330);
2426 }