Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_single
52  * Electrostatics interaction: ReactionField
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrE,jnrF,jnrG,jnrH;
76     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84     real             scratch[4*DIM];
85     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
93     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
108     real             *charge;
109     int              nvdwtype;
110     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111     int              *vdwtype;
112     real             *vdwparam;
113     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
114     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
115     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
116     real             rswitch_scalar,d_scalar;
117     __m256           dummy_mask,cutoff_mask;
118     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
119     __m256           one     = _mm256_set1_ps(1.0);
120     __m256           two     = _mm256_set1_ps(2.0);
121     x                = xx[0];
122     f                = ff[0];
123
124     nri              = nlist->nri;
125     iinr             = nlist->iinr;
126     jindex           = nlist->jindex;
127     jjnr             = nlist->jjnr;
128     shiftidx         = nlist->shift;
129     gid              = nlist->gid;
130     shiftvec         = fr->shift_vec[0];
131     fshift           = fr->fshift[0];
132     facel            = _mm256_set1_ps(fr->epsfac);
133     charge           = mdatoms->chargeA;
134     krf              = _mm256_set1_ps(fr->ic->k_rf);
135     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
136     crf              = _mm256_set1_ps(fr->ic->c_rf);
137     nvdwtype         = fr->ntype;
138     vdwparam         = fr->nbfp;
139     vdwtype          = mdatoms->typeA;
140
141     /* Setup water-specific parameters */
142     inr              = nlist->iinr[0];
143     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
144     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
145     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
146     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
147
148     jq0              = _mm256_set1_ps(charge[inr+0]);
149     jq1              = _mm256_set1_ps(charge[inr+1]);
150     jq2              = _mm256_set1_ps(charge[inr+2]);
151     vdwjidx0A        = 2*vdwtype[inr+0];
152     qq00             = _mm256_mul_ps(iq0,jq0);
153     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
154     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
155     qq01             = _mm256_mul_ps(iq0,jq1);
156     qq02             = _mm256_mul_ps(iq0,jq2);
157     qq10             = _mm256_mul_ps(iq1,jq0);
158     qq11             = _mm256_mul_ps(iq1,jq1);
159     qq12             = _mm256_mul_ps(iq1,jq2);
160     qq20             = _mm256_mul_ps(iq2,jq0);
161     qq21             = _mm256_mul_ps(iq2,jq1);
162     qq22             = _mm256_mul_ps(iq2,jq2);
163
164     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
165     rcutoff_scalar   = fr->rcoulomb;
166     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
167     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
168
169     rswitch_scalar   = fr->rvdw_switch;
170     rswitch          = _mm256_set1_ps(rswitch_scalar);
171     /* Setup switch parameters */
172     d_scalar         = rcutoff_scalar-rswitch_scalar;
173     d                = _mm256_set1_ps(d_scalar);
174     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
175     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
176     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
177     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
178     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
179     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
180
181     /* Avoid stupid compiler warnings */
182     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
183     j_coord_offsetA = 0;
184     j_coord_offsetB = 0;
185     j_coord_offsetC = 0;
186     j_coord_offsetD = 0;
187     j_coord_offsetE = 0;
188     j_coord_offsetF = 0;
189     j_coord_offsetG = 0;
190     j_coord_offsetH = 0;
191
192     outeriter        = 0;
193     inneriter        = 0;
194
195     for(iidx=0;iidx<4*DIM;iidx++)
196     {
197         scratch[iidx] = 0.0;
198     }
199
200     /* Start outer loop over neighborlists */
201     for(iidx=0; iidx<nri; iidx++)
202     {
203         /* Load shift vector for this list */
204         i_shift_offset   = DIM*shiftidx[iidx];
205
206         /* Load limits for loop over neighbors */
207         j_index_start    = jindex[iidx];
208         j_index_end      = jindex[iidx+1];
209
210         /* Get outer coordinate index */
211         inr              = iinr[iidx];
212         i_coord_offset   = DIM*inr;
213
214         /* Load i particle coords and add shift vector */
215         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
216                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
217
218         fix0             = _mm256_setzero_ps();
219         fiy0             = _mm256_setzero_ps();
220         fiz0             = _mm256_setzero_ps();
221         fix1             = _mm256_setzero_ps();
222         fiy1             = _mm256_setzero_ps();
223         fiz1             = _mm256_setzero_ps();
224         fix2             = _mm256_setzero_ps();
225         fiy2             = _mm256_setzero_ps();
226         fiz2             = _mm256_setzero_ps();
227
228         /* Reset potential sums */
229         velecsum         = _mm256_setzero_ps();
230         vvdwsum          = _mm256_setzero_ps();
231
232         /* Start inner kernel loop */
233         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
234         {
235
236             /* Get j neighbor index, and coordinate index */
237             jnrA             = jjnr[jidx];
238             jnrB             = jjnr[jidx+1];
239             jnrC             = jjnr[jidx+2];
240             jnrD             = jjnr[jidx+3];
241             jnrE             = jjnr[jidx+4];
242             jnrF             = jjnr[jidx+5];
243             jnrG             = jjnr[jidx+6];
244             jnrH             = jjnr[jidx+7];
245             j_coord_offsetA  = DIM*jnrA;
246             j_coord_offsetB  = DIM*jnrB;
247             j_coord_offsetC  = DIM*jnrC;
248             j_coord_offsetD  = DIM*jnrD;
249             j_coord_offsetE  = DIM*jnrE;
250             j_coord_offsetF  = DIM*jnrF;
251             j_coord_offsetG  = DIM*jnrG;
252             j_coord_offsetH  = DIM*jnrH;
253
254             /* load j atom coordinates */
255             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
256                                                  x+j_coord_offsetC,x+j_coord_offsetD,
257                                                  x+j_coord_offsetE,x+j_coord_offsetF,
258                                                  x+j_coord_offsetG,x+j_coord_offsetH,
259                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
260
261             /* Calculate displacement vector */
262             dx00             = _mm256_sub_ps(ix0,jx0);
263             dy00             = _mm256_sub_ps(iy0,jy0);
264             dz00             = _mm256_sub_ps(iz0,jz0);
265             dx01             = _mm256_sub_ps(ix0,jx1);
266             dy01             = _mm256_sub_ps(iy0,jy1);
267             dz01             = _mm256_sub_ps(iz0,jz1);
268             dx02             = _mm256_sub_ps(ix0,jx2);
269             dy02             = _mm256_sub_ps(iy0,jy2);
270             dz02             = _mm256_sub_ps(iz0,jz2);
271             dx10             = _mm256_sub_ps(ix1,jx0);
272             dy10             = _mm256_sub_ps(iy1,jy0);
273             dz10             = _mm256_sub_ps(iz1,jz0);
274             dx11             = _mm256_sub_ps(ix1,jx1);
275             dy11             = _mm256_sub_ps(iy1,jy1);
276             dz11             = _mm256_sub_ps(iz1,jz1);
277             dx12             = _mm256_sub_ps(ix1,jx2);
278             dy12             = _mm256_sub_ps(iy1,jy2);
279             dz12             = _mm256_sub_ps(iz1,jz2);
280             dx20             = _mm256_sub_ps(ix2,jx0);
281             dy20             = _mm256_sub_ps(iy2,jy0);
282             dz20             = _mm256_sub_ps(iz2,jz0);
283             dx21             = _mm256_sub_ps(ix2,jx1);
284             dy21             = _mm256_sub_ps(iy2,jy1);
285             dz21             = _mm256_sub_ps(iz2,jz1);
286             dx22             = _mm256_sub_ps(ix2,jx2);
287             dy22             = _mm256_sub_ps(iy2,jy2);
288             dz22             = _mm256_sub_ps(iz2,jz2);
289
290             /* Calculate squared distance and things based on it */
291             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
292             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
293             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
294             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
295             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
296             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
297             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
298             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
299             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
300
301             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
302             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
303             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
304             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
305             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
306             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
307             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
308             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
309             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
310
311             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
312             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
313             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
314             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
315             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
316             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
317             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
318             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
319             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
320
321             fjx0             = _mm256_setzero_ps();
322             fjy0             = _mm256_setzero_ps();
323             fjz0             = _mm256_setzero_ps();
324             fjx1             = _mm256_setzero_ps();
325             fjy1             = _mm256_setzero_ps();
326             fjz1             = _mm256_setzero_ps();
327             fjx2             = _mm256_setzero_ps();
328             fjy2             = _mm256_setzero_ps();
329             fjz2             = _mm256_setzero_ps();
330
331             /**************************
332              * CALCULATE INTERACTIONS *
333              **************************/
334
335             if (gmx_mm256_any_lt(rsq00,rcutoff2))
336             {
337
338             r00              = _mm256_mul_ps(rsq00,rinv00);
339
340             /* REACTION-FIELD ELECTROSTATICS */
341             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
342             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
343
344             /* LENNARD-JONES DISPERSION/REPULSION */
345
346             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
347             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
348             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
349             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
350             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
351
352             d                = _mm256_sub_ps(r00,rswitch);
353             d                = _mm256_max_ps(d,_mm256_setzero_ps());
354             d2               = _mm256_mul_ps(d,d);
355             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
356
357             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
358
359             /* Evaluate switch function */
360             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
361             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
362             vvdw             = _mm256_mul_ps(vvdw,sw);
363             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
364
365             /* Update potential sum for this i atom from the interaction with this j atom. */
366             velec            = _mm256_and_ps(velec,cutoff_mask);
367             velecsum         = _mm256_add_ps(velecsum,velec);
368             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
369             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
370
371             fscal            = _mm256_add_ps(felec,fvdw);
372
373             fscal            = _mm256_and_ps(fscal,cutoff_mask);
374
375             /* Calculate temporary vectorial force */
376             tx               = _mm256_mul_ps(fscal,dx00);
377             ty               = _mm256_mul_ps(fscal,dy00);
378             tz               = _mm256_mul_ps(fscal,dz00);
379
380             /* Update vectorial force */
381             fix0             = _mm256_add_ps(fix0,tx);
382             fiy0             = _mm256_add_ps(fiy0,ty);
383             fiz0             = _mm256_add_ps(fiz0,tz);
384
385             fjx0             = _mm256_add_ps(fjx0,tx);
386             fjy0             = _mm256_add_ps(fjy0,ty);
387             fjz0             = _mm256_add_ps(fjz0,tz);
388
389             }
390
391             /**************************
392              * CALCULATE INTERACTIONS *
393              **************************/
394
395             if (gmx_mm256_any_lt(rsq01,rcutoff2))
396             {
397
398             /* REACTION-FIELD ELECTROSTATICS */
399             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
400             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
401
402             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
403
404             /* Update potential sum for this i atom from the interaction with this j atom. */
405             velec            = _mm256_and_ps(velec,cutoff_mask);
406             velecsum         = _mm256_add_ps(velecsum,velec);
407
408             fscal            = felec;
409
410             fscal            = _mm256_and_ps(fscal,cutoff_mask);
411
412             /* Calculate temporary vectorial force */
413             tx               = _mm256_mul_ps(fscal,dx01);
414             ty               = _mm256_mul_ps(fscal,dy01);
415             tz               = _mm256_mul_ps(fscal,dz01);
416
417             /* Update vectorial force */
418             fix0             = _mm256_add_ps(fix0,tx);
419             fiy0             = _mm256_add_ps(fiy0,ty);
420             fiz0             = _mm256_add_ps(fiz0,tz);
421
422             fjx1             = _mm256_add_ps(fjx1,tx);
423             fjy1             = _mm256_add_ps(fjy1,ty);
424             fjz1             = _mm256_add_ps(fjz1,tz);
425
426             }
427
428             /**************************
429              * CALCULATE INTERACTIONS *
430              **************************/
431
432             if (gmx_mm256_any_lt(rsq02,rcutoff2))
433             {
434
435             /* REACTION-FIELD ELECTROSTATICS */
436             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
437             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
438
439             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
440
441             /* Update potential sum for this i atom from the interaction with this j atom. */
442             velec            = _mm256_and_ps(velec,cutoff_mask);
443             velecsum         = _mm256_add_ps(velecsum,velec);
444
445             fscal            = felec;
446
447             fscal            = _mm256_and_ps(fscal,cutoff_mask);
448
449             /* Calculate temporary vectorial force */
450             tx               = _mm256_mul_ps(fscal,dx02);
451             ty               = _mm256_mul_ps(fscal,dy02);
452             tz               = _mm256_mul_ps(fscal,dz02);
453
454             /* Update vectorial force */
455             fix0             = _mm256_add_ps(fix0,tx);
456             fiy0             = _mm256_add_ps(fiy0,ty);
457             fiz0             = _mm256_add_ps(fiz0,tz);
458
459             fjx2             = _mm256_add_ps(fjx2,tx);
460             fjy2             = _mm256_add_ps(fjy2,ty);
461             fjz2             = _mm256_add_ps(fjz2,tz);
462
463             }
464
465             /**************************
466              * CALCULATE INTERACTIONS *
467              **************************/
468
469             if (gmx_mm256_any_lt(rsq10,rcutoff2))
470             {
471
472             /* REACTION-FIELD ELECTROSTATICS */
473             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
474             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
475
476             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
477
478             /* Update potential sum for this i atom from the interaction with this j atom. */
479             velec            = _mm256_and_ps(velec,cutoff_mask);
480             velecsum         = _mm256_add_ps(velecsum,velec);
481
482             fscal            = felec;
483
484             fscal            = _mm256_and_ps(fscal,cutoff_mask);
485
486             /* Calculate temporary vectorial force */
487             tx               = _mm256_mul_ps(fscal,dx10);
488             ty               = _mm256_mul_ps(fscal,dy10);
489             tz               = _mm256_mul_ps(fscal,dz10);
490
491             /* Update vectorial force */
492             fix1             = _mm256_add_ps(fix1,tx);
493             fiy1             = _mm256_add_ps(fiy1,ty);
494             fiz1             = _mm256_add_ps(fiz1,tz);
495
496             fjx0             = _mm256_add_ps(fjx0,tx);
497             fjy0             = _mm256_add_ps(fjy0,ty);
498             fjz0             = _mm256_add_ps(fjz0,tz);
499
500             }
501
502             /**************************
503              * CALCULATE INTERACTIONS *
504              **************************/
505
506             if (gmx_mm256_any_lt(rsq11,rcutoff2))
507             {
508
509             /* REACTION-FIELD ELECTROSTATICS */
510             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
511             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
512
513             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
514
515             /* Update potential sum for this i atom from the interaction with this j atom. */
516             velec            = _mm256_and_ps(velec,cutoff_mask);
517             velecsum         = _mm256_add_ps(velecsum,velec);
518
519             fscal            = felec;
520
521             fscal            = _mm256_and_ps(fscal,cutoff_mask);
522
523             /* Calculate temporary vectorial force */
524             tx               = _mm256_mul_ps(fscal,dx11);
525             ty               = _mm256_mul_ps(fscal,dy11);
526             tz               = _mm256_mul_ps(fscal,dz11);
527
528             /* Update vectorial force */
529             fix1             = _mm256_add_ps(fix1,tx);
530             fiy1             = _mm256_add_ps(fiy1,ty);
531             fiz1             = _mm256_add_ps(fiz1,tz);
532
533             fjx1             = _mm256_add_ps(fjx1,tx);
534             fjy1             = _mm256_add_ps(fjy1,ty);
535             fjz1             = _mm256_add_ps(fjz1,tz);
536
537             }
538
539             /**************************
540              * CALCULATE INTERACTIONS *
541              **************************/
542
543             if (gmx_mm256_any_lt(rsq12,rcutoff2))
544             {
545
546             /* REACTION-FIELD ELECTROSTATICS */
547             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
548             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
549
550             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
551
552             /* Update potential sum for this i atom from the interaction with this j atom. */
553             velec            = _mm256_and_ps(velec,cutoff_mask);
554             velecsum         = _mm256_add_ps(velecsum,velec);
555
556             fscal            = felec;
557
558             fscal            = _mm256_and_ps(fscal,cutoff_mask);
559
560             /* Calculate temporary vectorial force */
561             tx               = _mm256_mul_ps(fscal,dx12);
562             ty               = _mm256_mul_ps(fscal,dy12);
563             tz               = _mm256_mul_ps(fscal,dz12);
564
565             /* Update vectorial force */
566             fix1             = _mm256_add_ps(fix1,tx);
567             fiy1             = _mm256_add_ps(fiy1,ty);
568             fiz1             = _mm256_add_ps(fiz1,tz);
569
570             fjx2             = _mm256_add_ps(fjx2,tx);
571             fjy2             = _mm256_add_ps(fjy2,ty);
572             fjz2             = _mm256_add_ps(fjz2,tz);
573
574             }
575
576             /**************************
577              * CALCULATE INTERACTIONS *
578              **************************/
579
580             if (gmx_mm256_any_lt(rsq20,rcutoff2))
581             {
582
583             /* REACTION-FIELD ELECTROSTATICS */
584             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
585             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
586
587             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
588
589             /* Update potential sum for this i atom from the interaction with this j atom. */
590             velec            = _mm256_and_ps(velec,cutoff_mask);
591             velecsum         = _mm256_add_ps(velecsum,velec);
592
593             fscal            = felec;
594
595             fscal            = _mm256_and_ps(fscal,cutoff_mask);
596
597             /* Calculate temporary vectorial force */
598             tx               = _mm256_mul_ps(fscal,dx20);
599             ty               = _mm256_mul_ps(fscal,dy20);
600             tz               = _mm256_mul_ps(fscal,dz20);
601
602             /* Update vectorial force */
603             fix2             = _mm256_add_ps(fix2,tx);
604             fiy2             = _mm256_add_ps(fiy2,ty);
605             fiz2             = _mm256_add_ps(fiz2,tz);
606
607             fjx0             = _mm256_add_ps(fjx0,tx);
608             fjy0             = _mm256_add_ps(fjy0,ty);
609             fjz0             = _mm256_add_ps(fjz0,tz);
610
611             }
612
613             /**************************
614              * CALCULATE INTERACTIONS *
615              **************************/
616
617             if (gmx_mm256_any_lt(rsq21,rcutoff2))
618             {
619
620             /* REACTION-FIELD ELECTROSTATICS */
621             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
622             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
623
624             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
625
626             /* Update potential sum for this i atom from the interaction with this j atom. */
627             velec            = _mm256_and_ps(velec,cutoff_mask);
628             velecsum         = _mm256_add_ps(velecsum,velec);
629
630             fscal            = felec;
631
632             fscal            = _mm256_and_ps(fscal,cutoff_mask);
633
634             /* Calculate temporary vectorial force */
635             tx               = _mm256_mul_ps(fscal,dx21);
636             ty               = _mm256_mul_ps(fscal,dy21);
637             tz               = _mm256_mul_ps(fscal,dz21);
638
639             /* Update vectorial force */
640             fix2             = _mm256_add_ps(fix2,tx);
641             fiy2             = _mm256_add_ps(fiy2,ty);
642             fiz2             = _mm256_add_ps(fiz2,tz);
643
644             fjx1             = _mm256_add_ps(fjx1,tx);
645             fjy1             = _mm256_add_ps(fjy1,ty);
646             fjz1             = _mm256_add_ps(fjz1,tz);
647
648             }
649
650             /**************************
651              * CALCULATE INTERACTIONS *
652              **************************/
653
654             if (gmx_mm256_any_lt(rsq22,rcutoff2))
655             {
656
657             /* REACTION-FIELD ELECTROSTATICS */
658             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
659             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
660
661             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
662
663             /* Update potential sum for this i atom from the interaction with this j atom. */
664             velec            = _mm256_and_ps(velec,cutoff_mask);
665             velecsum         = _mm256_add_ps(velecsum,velec);
666
667             fscal            = felec;
668
669             fscal            = _mm256_and_ps(fscal,cutoff_mask);
670
671             /* Calculate temporary vectorial force */
672             tx               = _mm256_mul_ps(fscal,dx22);
673             ty               = _mm256_mul_ps(fscal,dy22);
674             tz               = _mm256_mul_ps(fscal,dz22);
675
676             /* Update vectorial force */
677             fix2             = _mm256_add_ps(fix2,tx);
678             fiy2             = _mm256_add_ps(fiy2,ty);
679             fiz2             = _mm256_add_ps(fiz2,tz);
680
681             fjx2             = _mm256_add_ps(fjx2,tx);
682             fjy2             = _mm256_add_ps(fjy2,ty);
683             fjz2             = _mm256_add_ps(fjz2,tz);
684
685             }
686
687             fjptrA             = f+j_coord_offsetA;
688             fjptrB             = f+j_coord_offsetB;
689             fjptrC             = f+j_coord_offsetC;
690             fjptrD             = f+j_coord_offsetD;
691             fjptrE             = f+j_coord_offsetE;
692             fjptrF             = f+j_coord_offsetF;
693             fjptrG             = f+j_coord_offsetG;
694             fjptrH             = f+j_coord_offsetH;
695
696             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
697                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
698
699             /* Inner loop uses 358 flops */
700         }
701
702         if(jidx<j_index_end)
703         {
704
705             /* Get j neighbor index, and coordinate index */
706             jnrlistA         = jjnr[jidx];
707             jnrlistB         = jjnr[jidx+1];
708             jnrlistC         = jjnr[jidx+2];
709             jnrlistD         = jjnr[jidx+3];
710             jnrlistE         = jjnr[jidx+4];
711             jnrlistF         = jjnr[jidx+5];
712             jnrlistG         = jjnr[jidx+6];
713             jnrlistH         = jjnr[jidx+7];
714             /* Sign of each element will be negative for non-real atoms.
715              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
716              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
717              */
718             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
719                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
720                                             
721             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
722             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
723             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
724             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
725             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
726             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
727             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
728             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
729             j_coord_offsetA  = DIM*jnrA;
730             j_coord_offsetB  = DIM*jnrB;
731             j_coord_offsetC  = DIM*jnrC;
732             j_coord_offsetD  = DIM*jnrD;
733             j_coord_offsetE  = DIM*jnrE;
734             j_coord_offsetF  = DIM*jnrF;
735             j_coord_offsetG  = DIM*jnrG;
736             j_coord_offsetH  = DIM*jnrH;
737
738             /* load j atom coordinates */
739             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
740                                                  x+j_coord_offsetC,x+j_coord_offsetD,
741                                                  x+j_coord_offsetE,x+j_coord_offsetF,
742                                                  x+j_coord_offsetG,x+j_coord_offsetH,
743                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
744
745             /* Calculate displacement vector */
746             dx00             = _mm256_sub_ps(ix0,jx0);
747             dy00             = _mm256_sub_ps(iy0,jy0);
748             dz00             = _mm256_sub_ps(iz0,jz0);
749             dx01             = _mm256_sub_ps(ix0,jx1);
750             dy01             = _mm256_sub_ps(iy0,jy1);
751             dz01             = _mm256_sub_ps(iz0,jz1);
752             dx02             = _mm256_sub_ps(ix0,jx2);
753             dy02             = _mm256_sub_ps(iy0,jy2);
754             dz02             = _mm256_sub_ps(iz0,jz2);
755             dx10             = _mm256_sub_ps(ix1,jx0);
756             dy10             = _mm256_sub_ps(iy1,jy0);
757             dz10             = _mm256_sub_ps(iz1,jz0);
758             dx11             = _mm256_sub_ps(ix1,jx1);
759             dy11             = _mm256_sub_ps(iy1,jy1);
760             dz11             = _mm256_sub_ps(iz1,jz1);
761             dx12             = _mm256_sub_ps(ix1,jx2);
762             dy12             = _mm256_sub_ps(iy1,jy2);
763             dz12             = _mm256_sub_ps(iz1,jz2);
764             dx20             = _mm256_sub_ps(ix2,jx0);
765             dy20             = _mm256_sub_ps(iy2,jy0);
766             dz20             = _mm256_sub_ps(iz2,jz0);
767             dx21             = _mm256_sub_ps(ix2,jx1);
768             dy21             = _mm256_sub_ps(iy2,jy1);
769             dz21             = _mm256_sub_ps(iz2,jz1);
770             dx22             = _mm256_sub_ps(ix2,jx2);
771             dy22             = _mm256_sub_ps(iy2,jy2);
772             dz22             = _mm256_sub_ps(iz2,jz2);
773
774             /* Calculate squared distance and things based on it */
775             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
776             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
777             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
778             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
779             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
780             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
781             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
782             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
783             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
784
785             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
786             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
787             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
788             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
789             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
790             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
791             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
792             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
793             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
794
795             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
796             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
797             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
798             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
799             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
800             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
801             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
802             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
803             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
804
805             fjx0             = _mm256_setzero_ps();
806             fjy0             = _mm256_setzero_ps();
807             fjz0             = _mm256_setzero_ps();
808             fjx1             = _mm256_setzero_ps();
809             fjy1             = _mm256_setzero_ps();
810             fjz1             = _mm256_setzero_ps();
811             fjx2             = _mm256_setzero_ps();
812             fjy2             = _mm256_setzero_ps();
813             fjz2             = _mm256_setzero_ps();
814
815             /**************************
816              * CALCULATE INTERACTIONS *
817              **************************/
818
819             if (gmx_mm256_any_lt(rsq00,rcutoff2))
820             {
821
822             r00              = _mm256_mul_ps(rsq00,rinv00);
823             r00              = _mm256_andnot_ps(dummy_mask,r00);
824
825             /* REACTION-FIELD ELECTROSTATICS */
826             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
827             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
828
829             /* LENNARD-JONES DISPERSION/REPULSION */
830
831             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
832             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
833             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
834             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
835             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
836
837             d                = _mm256_sub_ps(r00,rswitch);
838             d                = _mm256_max_ps(d,_mm256_setzero_ps());
839             d2               = _mm256_mul_ps(d,d);
840             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
841
842             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
843
844             /* Evaluate switch function */
845             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
846             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
847             vvdw             = _mm256_mul_ps(vvdw,sw);
848             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
849
850             /* Update potential sum for this i atom from the interaction with this j atom. */
851             velec            = _mm256_and_ps(velec,cutoff_mask);
852             velec            = _mm256_andnot_ps(dummy_mask,velec);
853             velecsum         = _mm256_add_ps(velecsum,velec);
854             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
855             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
856             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
857
858             fscal            = _mm256_add_ps(felec,fvdw);
859
860             fscal            = _mm256_and_ps(fscal,cutoff_mask);
861
862             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
863
864             /* Calculate temporary vectorial force */
865             tx               = _mm256_mul_ps(fscal,dx00);
866             ty               = _mm256_mul_ps(fscal,dy00);
867             tz               = _mm256_mul_ps(fscal,dz00);
868
869             /* Update vectorial force */
870             fix0             = _mm256_add_ps(fix0,tx);
871             fiy0             = _mm256_add_ps(fiy0,ty);
872             fiz0             = _mm256_add_ps(fiz0,tz);
873
874             fjx0             = _mm256_add_ps(fjx0,tx);
875             fjy0             = _mm256_add_ps(fjy0,ty);
876             fjz0             = _mm256_add_ps(fjz0,tz);
877
878             }
879
880             /**************************
881              * CALCULATE INTERACTIONS *
882              **************************/
883
884             if (gmx_mm256_any_lt(rsq01,rcutoff2))
885             {
886
887             /* REACTION-FIELD ELECTROSTATICS */
888             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
889             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
890
891             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
892
893             /* Update potential sum for this i atom from the interaction with this j atom. */
894             velec            = _mm256_and_ps(velec,cutoff_mask);
895             velec            = _mm256_andnot_ps(dummy_mask,velec);
896             velecsum         = _mm256_add_ps(velecsum,velec);
897
898             fscal            = felec;
899
900             fscal            = _mm256_and_ps(fscal,cutoff_mask);
901
902             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
903
904             /* Calculate temporary vectorial force */
905             tx               = _mm256_mul_ps(fscal,dx01);
906             ty               = _mm256_mul_ps(fscal,dy01);
907             tz               = _mm256_mul_ps(fscal,dz01);
908
909             /* Update vectorial force */
910             fix0             = _mm256_add_ps(fix0,tx);
911             fiy0             = _mm256_add_ps(fiy0,ty);
912             fiz0             = _mm256_add_ps(fiz0,tz);
913
914             fjx1             = _mm256_add_ps(fjx1,tx);
915             fjy1             = _mm256_add_ps(fjy1,ty);
916             fjz1             = _mm256_add_ps(fjz1,tz);
917
918             }
919
920             /**************************
921              * CALCULATE INTERACTIONS *
922              **************************/
923
924             if (gmx_mm256_any_lt(rsq02,rcutoff2))
925             {
926
927             /* REACTION-FIELD ELECTROSTATICS */
928             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
929             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
930
931             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
932
933             /* Update potential sum for this i atom from the interaction with this j atom. */
934             velec            = _mm256_and_ps(velec,cutoff_mask);
935             velec            = _mm256_andnot_ps(dummy_mask,velec);
936             velecsum         = _mm256_add_ps(velecsum,velec);
937
938             fscal            = felec;
939
940             fscal            = _mm256_and_ps(fscal,cutoff_mask);
941
942             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
943
944             /* Calculate temporary vectorial force */
945             tx               = _mm256_mul_ps(fscal,dx02);
946             ty               = _mm256_mul_ps(fscal,dy02);
947             tz               = _mm256_mul_ps(fscal,dz02);
948
949             /* Update vectorial force */
950             fix0             = _mm256_add_ps(fix0,tx);
951             fiy0             = _mm256_add_ps(fiy0,ty);
952             fiz0             = _mm256_add_ps(fiz0,tz);
953
954             fjx2             = _mm256_add_ps(fjx2,tx);
955             fjy2             = _mm256_add_ps(fjy2,ty);
956             fjz2             = _mm256_add_ps(fjz2,tz);
957
958             }
959
960             /**************************
961              * CALCULATE INTERACTIONS *
962              **************************/
963
964             if (gmx_mm256_any_lt(rsq10,rcutoff2))
965             {
966
967             /* REACTION-FIELD ELECTROSTATICS */
968             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
969             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
970
971             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
972
973             /* Update potential sum for this i atom from the interaction with this j atom. */
974             velec            = _mm256_and_ps(velec,cutoff_mask);
975             velec            = _mm256_andnot_ps(dummy_mask,velec);
976             velecsum         = _mm256_add_ps(velecsum,velec);
977
978             fscal            = felec;
979
980             fscal            = _mm256_and_ps(fscal,cutoff_mask);
981
982             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
983
984             /* Calculate temporary vectorial force */
985             tx               = _mm256_mul_ps(fscal,dx10);
986             ty               = _mm256_mul_ps(fscal,dy10);
987             tz               = _mm256_mul_ps(fscal,dz10);
988
989             /* Update vectorial force */
990             fix1             = _mm256_add_ps(fix1,tx);
991             fiy1             = _mm256_add_ps(fiy1,ty);
992             fiz1             = _mm256_add_ps(fiz1,tz);
993
994             fjx0             = _mm256_add_ps(fjx0,tx);
995             fjy0             = _mm256_add_ps(fjy0,ty);
996             fjz0             = _mm256_add_ps(fjz0,tz);
997
998             }
999
1000             /**************************
1001              * CALCULATE INTERACTIONS *
1002              **************************/
1003
1004             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1005             {
1006
1007             /* REACTION-FIELD ELECTROSTATICS */
1008             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
1009             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1010
1011             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1012
1013             /* Update potential sum for this i atom from the interaction with this j atom. */
1014             velec            = _mm256_and_ps(velec,cutoff_mask);
1015             velec            = _mm256_andnot_ps(dummy_mask,velec);
1016             velecsum         = _mm256_add_ps(velecsum,velec);
1017
1018             fscal            = felec;
1019
1020             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1021
1022             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1023
1024             /* Calculate temporary vectorial force */
1025             tx               = _mm256_mul_ps(fscal,dx11);
1026             ty               = _mm256_mul_ps(fscal,dy11);
1027             tz               = _mm256_mul_ps(fscal,dz11);
1028
1029             /* Update vectorial force */
1030             fix1             = _mm256_add_ps(fix1,tx);
1031             fiy1             = _mm256_add_ps(fiy1,ty);
1032             fiz1             = _mm256_add_ps(fiz1,tz);
1033
1034             fjx1             = _mm256_add_ps(fjx1,tx);
1035             fjy1             = _mm256_add_ps(fjy1,ty);
1036             fjz1             = _mm256_add_ps(fjz1,tz);
1037
1038             }
1039
1040             /**************************
1041              * CALCULATE INTERACTIONS *
1042              **************************/
1043
1044             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1045             {
1046
1047             /* REACTION-FIELD ELECTROSTATICS */
1048             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
1049             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1050
1051             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1052
1053             /* Update potential sum for this i atom from the interaction with this j atom. */
1054             velec            = _mm256_and_ps(velec,cutoff_mask);
1055             velec            = _mm256_andnot_ps(dummy_mask,velec);
1056             velecsum         = _mm256_add_ps(velecsum,velec);
1057
1058             fscal            = felec;
1059
1060             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1061
1062             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1063
1064             /* Calculate temporary vectorial force */
1065             tx               = _mm256_mul_ps(fscal,dx12);
1066             ty               = _mm256_mul_ps(fscal,dy12);
1067             tz               = _mm256_mul_ps(fscal,dz12);
1068
1069             /* Update vectorial force */
1070             fix1             = _mm256_add_ps(fix1,tx);
1071             fiy1             = _mm256_add_ps(fiy1,ty);
1072             fiz1             = _mm256_add_ps(fiz1,tz);
1073
1074             fjx2             = _mm256_add_ps(fjx2,tx);
1075             fjy2             = _mm256_add_ps(fjy2,ty);
1076             fjz2             = _mm256_add_ps(fjz2,tz);
1077
1078             }
1079
1080             /**************************
1081              * CALCULATE INTERACTIONS *
1082              **************************/
1083
1084             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1085             {
1086
1087             /* REACTION-FIELD ELECTROSTATICS */
1088             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
1089             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1090
1091             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1092
1093             /* Update potential sum for this i atom from the interaction with this j atom. */
1094             velec            = _mm256_and_ps(velec,cutoff_mask);
1095             velec            = _mm256_andnot_ps(dummy_mask,velec);
1096             velecsum         = _mm256_add_ps(velecsum,velec);
1097
1098             fscal            = felec;
1099
1100             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1101
1102             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1103
1104             /* Calculate temporary vectorial force */
1105             tx               = _mm256_mul_ps(fscal,dx20);
1106             ty               = _mm256_mul_ps(fscal,dy20);
1107             tz               = _mm256_mul_ps(fscal,dz20);
1108
1109             /* Update vectorial force */
1110             fix2             = _mm256_add_ps(fix2,tx);
1111             fiy2             = _mm256_add_ps(fiy2,ty);
1112             fiz2             = _mm256_add_ps(fiz2,tz);
1113
1114             fjx0             = _mm256_add_ps(fjx0,tx);
1115             fjy0             = _mm256_add_ps(fjy0,ty);
1116             fjz0             = _mm256_add_ps(fjz0,tz);
1117
1118             }
1119
1120             /**************************
1121              * CALCULATE INTERACTIONS *
1122              **************************/
1123
1124             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1125             {
1126
1127             /* REACTION-FIELD ELECTROSTATICS */
1128             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
1129             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1130
1131             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1132
1133             /* Update potential sum for this i atom from the interaction with this j atom. */
1134             velec            = _mm256_and_ps(velec,cutoff_mask);
1135             velec            = _mm256_andnot_ps(dummy_mask,velec);
1136             velecsum         = _mm256_add_ps(velecsum,velec);
1137
1138             fscal            = felec;
1139
1140             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1141
1142             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1143
1144             /* Calculate temporary vectorial force */
1145             tx               = _mm256_mul_ps(fscal,dx21);
1146             ty               = _mm256_mul_ps(fscal,dy21);
1147             tz               = _mm256_mul_ps(fscal,dz21);
1148
1149             /* Update vectorial force */
1150             fix2             = _mm256_add_ps(fix2,tx);
1151             fiy2             = _mm256_add_ps(fiy2,ty);
1152             fiz2             = _mm256_add_ps(fiz2,tz);
1153
1154             fjx1             = _mm256_add_ps(fjx1,tx);
1155             fjy1             = _mm256_add_ps(fjy1,ty);
1156             fjz1             = _mm256_add_ps(fjz1,tz);
1157
1158             }
1159
1160             /**************************
1161              * CALCULATE INTERACTIONS *
1162              **************************/
1163
1164             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1165             {
1166
1167             /* REACTION-FIELD ELECTROSTATICS */
1168             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
1169             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1170
1171             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1172
1173             /* Update potential sum for this i atom from the interaction with this j atom. */
1174             velec            = _mm256_and_ps(velec,cutoff_mask);
1175             velec            = _mm256_andnot_ps(dummy_mask,velec);
1176             velecsum         = _mm256_add_ps(velecsum,velec);
1177
1178             fscal            = felec;
1179
1180             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1181
1182             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1183
1184             /* Calculate temporary vectorial force */
1185             tx               = _mm256_mul_ps(fscal,dx22);
1186             ty               = _mm256_mul_ps(fscal,dy22);
1187             tz               = _mm256_mul_ps(fscal,dz22);
1188
1189             /* Update vectorial force */
1190             fix2             = _mm256_add_ps(fix2,tx);
1191             fiy2             = _mm256_add_ps(fiy2,ty);
1192             fiz2             = _mm256_add_ps(fiz2,tz);
1193
1194             fjx2             = _mm256_add_ps(fjx2,tx);
1195             fjy2             = _mm256_add_ps(fjy2,ty);
1196             fjz2             = _mm256_add_ps(fjz2,tz);
1197
1198             }
1199
1200             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1201             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1202             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1203             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1204             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1205             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1206             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1207             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1208
1209             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1210                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1211
1212             /* Inner loop uses 359 flops */
1213         }
1214
1215         /* End of innermost loop */
1216
1217         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1218                                                  f+i_coord_offset,fshift+i_shift_offset);
1219
1220         ggid                        = gid[iidx];
1221         /* Update potential energies */
1222         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1223         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1224
1225         /* Increment number of inner iterations */
1226         inneriter                  += j_index_end - j_index_start;
1227
1228         /* Outer loop uses 20 flops */
1229     }
1230
1231     /* Increment number of outer iterations */
1232     outeriter        += nri;
1233
1234     /* Update outer/inner flops */
1235
1236     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*359);
1237 }
1238 /*
1239  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_single
1240  * Electrostatics interaction: ReactionField
1241  * VdW interaction:            LennardJones
1242  * Geometry:                   Water3-Water3
1243  * Calculate force/pot:        Force
1244  */
1245 void
1246 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_single
1247                     (t_nblist                    * gmx_restrict       nlist,
1248                      rvec                        * gmx_restrict          xx,
1249                      rvec                        * gmx_restrict          ff,
1250                      t_forcerec                  * gmx_restrict          fr,
1251                      t_mdatoms                   * gmx_restrict     mdatoms,
1252                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1253                      t_nrnb                      * gmx_restrict        nrnb)
1254 {
1255     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1256      * just 0 for non-waters.
1257      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1258      * jnr indices corresponding to data put in the four positions in the SIMD register.
1259      */
1260     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1261     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1262     int              jnrA,jnrB,jnrC,jnrD;
1263     int              jnrE,jnrF,jnrG,jnrH;
1264     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1265     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1266     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1267     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1268     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1269     real             rcutoff_scalar;
1270     real             *shiftvec,*fshift,*x,*f;
1271     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1272     real             scratch[4*DIM];
1273     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1274     real *           vdwioffsetptr0;
1275     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1276     real *           vdwioffsetptr1;
1277     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1278     real *           vdwioffsetptr2;
1279     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1280     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1281     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1282     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1283     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1284     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1285     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1286     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1287     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1288     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1289     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1290     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1291     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1292     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1293     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1294     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1295     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1296     real             *charge;
1297     int              nvdwtype;
1298     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1299     int              *vdwtype;
1300     real             *vdwparam;
1301     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1302     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1303     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1304     real             rswitch_scalar,d_scalar;
1305     __m256           dummy_mask,cutoff_mask;
1306     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1307     __m256           one     = _mm256_set1_ps(1.0);
1308     __m256           two     = _mm256_set1_ps(2.0);
1309     x                = xx[0];
1310     f                = ff[0];
1311
1312     nri              = nlist->nri;
1313     iinr             = nlist->iinr;
1314     jindex           = nlist->jindex;
1315     jjnr             = nlist->jjnr;
1316     shiftidx         = nlist->shift;
1317     gid              = nlist->gid;
1318     shiftvec         = fr->shift_vec[0];
1319     fshift           = fr->fshift[0];
1320     facel            = _mm256_set1_ps(fr->epsfac);
1321     charge           = mdatoms->chargeA;
1322     krf              = _mm256_set1_ps(fr->ic->k_rf);
1323     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1324     crf              = _mm256_set1_ps(fr->ic->c_rf);
1325     nvdwtype         = fr->ntype;
1326     vdwparam         = fr->nbfp;
1327     vdwtype          = mdatoms->typeA;
1328
1329     /* Setup water-specific parameters */
1330     inr              = nlist->iinr[0];
1331     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1332     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1333     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1334     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1335
1336     jq0              = _mm256_set1_ps(charge[inr+0]);
1337     jq1              = _mm256_set1_ps(charge[inr+1]);
1338     jq2              = _mm256_set1_ps(charge[inr+2]);
1339     vdwjidx0A        = 2*vdwtype[inr+0];
1340     qq00             = _mm256_mul_ps(iq0,jq0);
1341     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1342     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1343     qq01             = _mm256_mul_ps(iq0,jq1);
1344     qq02             = _mm256_mul_ps(iq0,jq2);
1345     qq10             = _mm256_mul_ps(iq1,jq0);
1346     qq11             = _mm256_mul_ps(iq1,jq1);
1347     qq12             = _mm256_mul_ps(iq1,jq2);
1348     qq20             = _mm256_mul_ps(iq2,jq0);
1349     qq21             = _mm256_mul_ps(iq2,jq1);
1350     qq22             = _mm256_mul_ps(iq2,jq2);
1351
1352     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1353     rcutoff_scalar   = fr->rcoulomb;
1354     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1355     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1356
1357     rswitch_scalar   = fr->rvdw_switch;
1358     rswitch          = _mm256_set1_ps(rswitch_scalar);
1359     /* Setup switch parameters */
1360     d_scalar         = rcutoff_scalar-rswitch_scalar;
1361     d                = _mm256_set1_ps(d_scalar);
1362     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1363     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1364     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1365     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1366     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1367     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1368
1369     /* Avoid stupid compiler warnings */
1370     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1371     j_coord_offsetA = 0;
1372     j_coord_offsetB = 0;
1373     j_coord_offsetC = 0;
1374     j_coord_offsetD = 0;
1375     j_coord_offsetE = 0;
1376     j_coord_offsetF = 0;
1377     j_coord_offsetG = 0;
1378     j_coord_offsetH = 0;
1379
1380     outeriter        = 0;
1381     inneriter        = 0;
1382
1383     for(iidx=0;iidx<4*DIM;iidx++)
1384     {
1385         scratch[iidx] = 0.0;
1386     }
1387
1388     /* Start outer loop over neighborlists */
1389     for(iidx=0; iidx<nri; iidx++)
1390     {
1391         /* Load shift vector for this list */
1392         i_shift_offset   = DIM*shiftidx[iidx];
1393
1394         /* Load limits for loop over neighbors */
1395         j_index_start    = jindex[iidx];
1396         j_index_end      = jindex[iidx+1];
1397
1398         /* Get outer coordinate index */
1399         inr              = iinr[iidx];
1400         i_coord_offset   = DIM*inr;
1401
1402         /* Load i particle coords and add shift vector */
1403         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1404                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1405
1406         fix0             = _mm256_setzero_ps();
1407         fiy0             = _mm256_setzero_ps();
1408         fiz0             = _mm256_setzero_ps();
1409         fix1             = _mm256_setzero_ps();
1410         fiy1             = _mm256_setzero_ps();
1411         fiz1             = _mm256_setzero_ps();
1412         fix2             = _mm256_setzero_ps();
1413         fiy2             = _mm256_setzero_ps();
1414         fiz2             = _mm256_setzero_ps();
1415
1416         /* Start inner kernel loop */
1417         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1418         {
1419
1420             /* Get j neighbor index, and coordinate index */
1421             jnrA             = jjnr[jidx];
1422             jnrB             = jjnr[jidx+1];
1423             jnrC             = jjnr[jidx+2];
1424             jnrD             = jjnr[jidx+3];
1425             jnrE             = jjnr[jidx+4];
1426             jnrF             = jjnr[jidx+5];
1427             jnrG             = jjnr[jidx+6];
1428             jnrH             = jjnr[jidx+7];
1429             j_coord_offsetA  = DIM*jnrA;
1430             j_coord_offsetB  = DIM*jnrB;
1431             j_coord_offsetC  = DIM*jnrC;
1432             j_coord_offsetD  = DIM*jnrD;
1433             j_coord_offsetE  = DIM*jnrE;
1434             j_coord_offsetF  = DIM*jnrF;
1435             j_coord_offsetG  = DIM*jnrG;
1436             j_coord_offsetH  = DIM*jnrH;
1437
1438             /* load j atom coordinates */
1439             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1440                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1441                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1442                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1443                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1444
1445             /* Calculate displacement vector */
1446             dx00             = _mm256_sub_ps(ix0,jx0);
1447             dy00             = _mm256_sub_ps(iy0,jy0);
1448             dz00             = _mm256_sub_ps(iz0,jz0);
1449             dx01             = _mm256_sub_ps(ix0,jx1);
1450             dy01             = _mm256_sub_ps(iy0,jy1);
1451             dz01             = _mm256_sub_ps(iz0,jz1);
1452             dx02             = _mm256_sub_ps(ix0,jx2);
1453             dy02             = _mm256_sub_ps(iy0,jy2);
1454             dz02             = _mm256_sub_ps(iz0,jz2);
1455             dx10             = _mm256_sub_ps(ix1,jx0);
1456             dy10             = _mm256_sub_ps(iy1,jy0);
1457             dz10             = _mm256_sub_ps(iz1,jz0);
1458             dx11             = _mm256_sub_ps(ix1,jx1);
1459             dy11             = _mm256_sub_ps(iy1,jy1);
1460             dz11             = _mm256_sub_ps(iz1,jz1);
1461             dx12             = _mm256_sub_ps(ix1,jx2);
1462             dy12             = _mm256_sub_ps(iy1,jy2);
1463             dz12             = _mm256_sub_ps(iz1,jz2);
1464             dx20             = _mm256_sub_ps(ix2,jx0);
1465             dy20             = _mm256_sub_ps(iy2,jy0);
1466             dz20             = _mm256_sub_ps(iz2,jz0);
1467             dx21             = _mm256_sub_ps(ix2,jx1);
1468             dy21             = _mm256_sub_ps(iy2,jy1);
1469             dz21             = _mm256_sub_ps(iz2,jz1);
1470             dx22             = _mm256_sub_ps(ix2,jx2);
1471             dy22             = _mm256_sub_ps(iy2,jy2);
1472             dz22             = _mm256_sub_ps(iz2,jz2);
1473
1474             /* Calculate squared distance and things based on it */
1475             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1476             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1477             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1478             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1479             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1480             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1481             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1482             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1483             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1484
1485             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1486             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1487             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1488             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1489             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1490             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1491             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1492             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1493             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1494
1495             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1496             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1497             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1498             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1499             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1500             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1501             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1502             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1503             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1504
1505             fjx0             = _mm256_setzero_ps();
1506             fjy0             = _mm256_setzero_ps();
1507             fjz0             = _mm256_setzero_ps();
1508             fjx1             = _mm256_setzero_ps();
1509             fjy1             = _mm256_setzero_ps();
1510             fjz1             = _mm256_setzero_ps();
1511             fjx2             = _mm256_setzero_ps();
1512             fjy2             = _mm256_setzero_ps();
1513             fjz2             = _mm256_setzero_ps();
1514
1515             /**************************
1516              * CALCULATE INTERACTIONS *
1517              **************************/
1518
1519             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1520             {
1521
1522             r00              = _mm256_mul_ps(rsq00,rinv00);
1523
1524             /* REACTION-FIELD ELECTROSTATICS */
1525             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1526
1527             /* LENNARD-JONES DISPERSION/REPULSION */
1528
1529             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1530             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
1531             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1532             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1533             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1534
1535             d                = _mm256_sub_ps(r00,rswitch);
1536             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1537             d2               = _mm256_mul_ps(d,d);
1538             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1539
1540             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1541
1542             /* Evaluate switch function */
1543             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1544             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1545             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1546
1547             fscal            = _mm256_add_ps(felec,fvdw);
1548
1549             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1550
1551             /* Calculate temporary vectorial force */
1552             tx               = _mm256_mul_ps(fscal,dx00);
1553             ty               = _mm256_mul_ps(fscal,dy00);
1554             tz               = _mm256_mul_ps(fscal,dz00);
1555
1556             /* Update vectorial force */
1557             fix0             = _mm256_add_ps(fix0,tx);
1558             fiy0             = _mm256_add_ps(fiy0,ty);
1559             fiz0             = _mm256_add_ps(fiz0,tz);
1560
1561             fjx0             = _mm256_add_ps(fjx0,tx);
1562             fjy0             = _mm256_add_ps(fjy0,ty);
1563             fjz0             = _mm256_add_ps(fjz0,tz);
1564
1565             }
1566
1567             /**************************
1568              * CALCULATE INTERACTIONS *
1569              **************************/
1570
1571             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1572             {
1573
1574             /* REACTION-FIELD ELECTROSTATICS */
1575             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1576
1577             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1578
1579             fscal            = felec;
1580
1581             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1582
1583             /* Calculate temporary vectorial force */
1584             tx               = _mm256_mul_ps(fscal,dx01);
1585             ty               = _mm256_mul_ps(fscal,dy01);
1586             tz               = _mm256_mul_ps(fscal,dz01);
1587
1588             /* Update vectorial force */
1589             fix0             = _mm256_add_ps(fix0,tx);
1590             fiy0             = _mm256_add_ps(fiy0,ty);
1591             fiz0             = _mm256_add_ps(fiz0,tz);
1592
1593             fjx1             = _mm256_add_ps(fjx1,tx);
1594             fjy1             = _mm256_add_ps(fjy1,ty);
1595             fjz1             = _mm256_add_ps(fjz1,tz);
1596
1597             }
1598
1599             /**************************
1600              * CALCULATE INTERACTIONS *
1601              **************************/
1602
1603             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1604             {
1605
1606             /* REACTION-FIELD ELECTROSTATICS */
1607             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1608
1609             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1610
1611             fscal            = felec;
1612
1613             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1614
1615             /* Calculate temporary vectorial force */
1616             tx               = _mm256_mul_ps(fscal,dx02);
1617             ty               = _mm256_mul_ps(fscal,dy02);
1618             tz               = _mm256_mul_ps(fscal,dz02);
1619
1620             /* Update vectorial force */
1621             fix0             = _mm256_add_ps(fix0,tx);
1622             fiy0             = _mm256_add_ps(fiy0,ty);
1623             fiz0             = _mm256_add_ps(fiz0,tz);
1624
1625             fjx2             = _mm256_add_ps(fjx2,tx);
1626             fjy2             = _mm256_add_ps(fjy2,ty);
1627             fjz2             = _mm256_add_ps(fjz2,tz);
1628
1629             }
1630
1631             /**************************
1632              * CALCULATE INTERACTIONS *
1633              **************************/
1634
1635             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1636             {
1637
1638             /* REACTION-FIELD ELECTROSTATICS */
1639             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1640
1641             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1642
1643             fscal            = felec;
1644
1645             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1646
1647             /* Calculate temporary vectorial force */
1648             tx               = _mm256_mul_ps(fscal,dx10);
1649             ty               = _mm256_mul_ps(fscal,dy10);
1650             tz               = _mm256_mul_ps(fscal,dz10);
1651
1652             /* Update vectorial force */
1653             fix1             = _mm256_add_ps(fix1,tx);
1654             fiy1             = _mm256_add_ps(fiy1,ty);
1655             fiz1             = _mm256_add_ps(fiz1,tz);
1656
1657             fjx0             = _mm256_add_ps(fjx0,tx);
1658             fjy0             = _mm256_add_ps(fjy0,ty);
1659             fjz0             = _mm256_add_ps(fjz0,tz);
1660
1661             }
1662
1663             /**************************
1664              * CALCULATE INTERACTIONS *
1665              **************************/
1666
1667             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1668             {
1669
1670             /* REACTION-FIELD ELECTROSTATICS */
1671             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1672
1673             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1674
1675             fscal            = felec;
1676
1677             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1678
1679             /* Calculate temporary vectorial force */
1680             tx               = _mm256_mul_ps(fscal,dx11);
1681             ty               = _mm256_mul_ps(fscal,dy11);
1682             tz               = _mm256_mul_ps(fscal,dz11);
1683
1684             /* Update vectorial force */
1685             fix1             = _mm256_add_ps(fix1,tx);
1686             fiy1             = _mm256_add_ps(fiy1,ty);
1687             fiz1             = _mm256_add_ps(fiz1,tz);
1688
1689             fjx1             = _mm256_add_ps(fjx1,tx);
1690             fjy1             = _mm256_add_ps(fjy1,ty);
1691             fjz1             = _mm256_add_ps(fjz1,tz);
1692
1693             }
1694
1695             /**************************
1696              * CALCULATE INTERACTIONS *
1697              **************************/
1698
1699             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1700             {
1701
1702             /* REACTION-FIELD ELECTROSTATICS */
1703             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1704
1705             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1706
1707             fscal            = felec;
1708
1709             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1710
1711             /* Calculate temporary vectorial force */
1712             tx               = _mm256_mul_ps(fscal,dx12);
1713             ty               = _mm256_mul_ps(fscal,dy12);
1714             tz               = _mm256_mul_ps(fscal,dz12);
1715
1716             /* Update vectorial force */
1717             fix1             = _mm256_add_ps(fix1,tx);
1718             fiy1             = _mm256_add_ps(fiy1,ty);
1719             fiz1             = _mm256_add_ps(fiz1,tz);
1720
1721             fjx2             = _mm256_add_ps(fjx2,tx);
1722             fjy2             = _mm256_add_ps(fjy2,ty);
1723             fjz2             = _mm256_add_ps(fjz2,tz);
1724
1725             }
1726
1727             /**************************
1728              * CALCULATE INTERACTIONS *
1729              **************************/
1730
1731             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1732             {
1733
1734             /* REACTION-FIELD ELECTROSTATICS */
1735             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1736
1737             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1738
1739             fscal            = felec;
1740
1741             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1742
1743             /* Calculate temporary vectorial force */
1744             tx               = _mm256_mul_ps(fscal,dx20);
1745             ty               = _mm256_mul_ps(fscal,dy20);
1746             tz               = _mm256_mul_ps(fscal,dz20);
1747
1748             /* Update vectorial force */
1749             fix2             = _mm256_add_ps(fix2,tx);
1750             fiy2             = _mm256_add_ps(fiy2,ty);
1751             fiz2             = _mm256_add_ps(fiz2,tz);
1752
1753             fjx0             = _mm256_add_ps(fjx0,tx);
1754             fjy0             = _mm256_add_ps(fjy0,ty);
1755             fjz0             = _mm256_add_ps(fjz0,tz);
1756
1757             }
1758
1759             /**************************
1760              * CALCULATE INTERACTIONS *
1761              **************************/
1762
1763             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1764             {
1765
1766             /* REACTION-FIELD ELECTROSTATICS */
1767             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1768
1769             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1770
1771             fscal            = felec;
1772
1773             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1774
1775             /* Calculate temporary vectorial force */
1776             tx               = _mm256_mul_ps(fscal,dx21);
1777             ty               = _mm256_mul_ps(fscal,dy21);
1778             tz               = _mm256_mul_ps(fscal,dz21);
1779
1780             /* Update vectorial force */
1781             fix2             = _mm256_add_ps(fix2,tx);
1782             fiy2             = _mm256_add_ps(fiy2,ty);
1783             fiz2             = _mm256_add_ps(fiz2,tz);
1784
1785             fjx1             = _mm256_add_ps(fjx1,tx);
1786             fjy1             = _mm256_add_ps(fjy1,ty);
1787             fjz1             = _mm256_add_ps(fjz1,tz);
1788
1789             }
1790
1791             /**************************
1792              * CALCULATE INTERACTIONS *
1793              **************************/
1794
1795             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1796             {
1797
1798             /* REACTION-FIELD ELECTROSTATICS */
1799             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1800
1801             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1802
1803             fscal            = felec;
1804
1805             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1806
1807             /* Calculate temporary vectorial force */
1808             tx               = _mm256_mul_ps(fscal,dx22);
1809             ty               = _mm256_mul_ps(fscal,dy22);
1810             tz               = _mm256_mul_ps(fscal,dz22);
1811
1812             /* Update vectorial force */
1813             fix2             = _mm256_add_ps(fix2,tx);
1814             fiy2             = _mm256_add_ps(fiy2,ty);
1815             fiz2             = _mm256_add_ps(fiz2,tz);
1816
1817             fjx2             = _mm256_add_ps(fjx2,tx);
1818             fjy2             = _mm256_add_ps(fjy2,ty);
1819             fjz2             = _mm256_add_ps(fjz2,tz);
1820
1821             }
1822
1823             fjptrA             = f+j_coord_offsetA;
1824             fjptrB             = f+j_coord_offsetB;
1825             fjptrC             = f+j_coord_offsetC;
1826             fjptrD             = f+j_coord_offsetD;
1827             fjptrE             = f+j_coord_offsetE;
1828             fjptrF             = f+j_coord_offsetF;
1829             fjptrG             = f+j_coord_offsetG;
1830             fjptrH             = f+j_coord_offsetH;
1831
1832             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1833                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1834
1835             /* Inner loop uses 301 flops */
1836         }
1837
1838         if(jidx<j_index_end)
1839         {
1840
1841             /* Get j neighbor index, and coordinate index */
1842             jnrlistA         = jjnr[jidx];
1843             jnrlistB         = jjnr[jidx+1];
1844             jnrlistC         = jjnr[jidx+2];
1845             jnrlistD         = jjnr[jidx+3];
1846             jnrlistE         = jjnr[jidx+4];
1847             jnrlistF         = jjnr[jidx+5];
1848             jnrlistG         = jjnr[jidx+6];
1849             jnrlistH         = jjnr[jidx+7];
1850             /* Sign of each element will be negative for non-real atoms.
1851              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1852              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1853              */
1854             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1855                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1856                                             
1857             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1858             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1859             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1860             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1861             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1862             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1863             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1864             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1865             j_coord_offsetA  = DIM*jnrA;
1866             j_coord_offsetB  = DIM*jnrB;
1867             j_coord_offsetC  = DIM*jnrC;
1868             j_coord_offsetD  = DIM*jnrD;
1869             j_coord_offsetE  = DIM*jnrE;
1870             j_coord_offsetF  = DIM*jnrF;
1871             j_coord_offsetG  = DIM*jnrG;
1872             j_coord_offsetH  = DIM*jnrH;
1873
1874             /* load j atom coordinates */
1875             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1876                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1877                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1878                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1879                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1880
1881             /* Calculate displacement vector */
1882             dx00             = _mm256_sub_ps(ix0,jx0);
1883             dy00             = _mm256_sub_ps(iy0,jy0);
1884             dz00             = _mm256_sub_ps(iz0,jz0);
1885             dx01             = _mm256_sub_ps(ix0,jx1);
1886             dy01             = _mm256_sub_ps(iy0,jy1);
1887             dz01             = _mm256_sub_ps(iz0,jz1);
1888             dx02             = _mm256_sub_ps(ix0,jx2);
1889             dy02             = _mm256_sub_ps(iy0,jy2);
1890             dz02             = _mm256_sub_ps(iz0,jz2);
1891             dx10             = _mm256_sub_ps(ix1,jx0);
1892             dy10             = _mm256_sub_ps(iy1,jy0);
1893             dz10             = _mm256_sub_ps(iz1,jz0);
1894             dx11             = _mm256_sub_ps(ix1,jx1);
1895             dy11             = _mm256_sub_ps(iy1,jy1);
1896             dz11             = _mm256_sub_ps(iz1,jz1);
1897             dx12             = _mm256_sub_ps(ix1,jx2);
1898             dy12             = _mm256_sub_ps(iy1,jy2);
1899             dz12             = _mm256_sub_ps(iz1,jz2);
1900             dx20             = _mm256_sub_ps(ix2,jx0);
1901             dy20             = _mm256_sub_ps(iy2,jy0);
1902             dz20             = _mm256_sub_ps(iz2,jz0);
1903             dx21             = _mm256_sub_ps(ix2,jx1);
1904             dy21             = _mm256_sub_ps(iy2,jy1);
1905             dz21             = _mm256_sub_ps(iz2,jz1);
1906             dx22             = _mm256_sub_ps(ix2,jx2);
1907             dy22             = _mm256_sub_ps(iy2,jy2);
1908             dz22             = _mm256_sub_ps(iz2,jz2);
1909
1910             /* Calculate squared distance and things based on it */
1911             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1912             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1913             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1914             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1915             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1916             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1917             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1918             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1919             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1920
1921             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1922             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1923             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1924             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1925             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1926             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1927             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1928             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1929             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1930
1931             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1932             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1933             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1934             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1935             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1936             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1937             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1938             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1939             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1940
1941             fjx0             = _mm256_setzero_ps();
1942             fjy0             = _mm256_setzero_ps();
1943             fjz0             = _mm256_setzero_ps();
1944             fjx1             = _mm256_setzero_ps();
1945             fjy1             = _mm256_setzero_ps();
1946             fjz1             = _mm256_setzero_ps();
1947             fjx2             = _mm256_setzero_ps();
1948             fjy2             = _mm256_setzero_ps();
1949             fjz2             = _mm256_setzero_ps();
1950
1951             /**************************
1952              * CALCULATE INTERACTIONS *
1953              **************************/
1954
1955             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1956             {
1957
1958             r00              = _mm256_mul_ps(rsq00,rinv00);
1959             r00              = _mm256_andnot_ps(dummy_mask,r00);
1960
1961             /* REACTION-FIELD ELECTROSTATICS */
1962             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1963
1964             /* LENNARD-JONES DISPERSION/REPULSION */
1965
1966             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1967             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
1968             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1969             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1970             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1971
1972             d                = _mm256_sub_ps(r00,rswitch);
1973             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1974             d2               = _mm256_mul_ps(d,d);
1975             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1976
1977             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1978
1979             /* Evaluate switch function */
1980             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1981             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1982             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1983
1984             fscal            = _mm256_add_ps(felec,fvdw);
1985
1986             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1987
1988             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1989
1990             /* Calculate temporary vectorial force */
1991             tx               = _mm256_mul_ps(fscal,dx00);
1992             ty               = _mm256_mul_ps(fscal,dy00);
1993             tz               = _mm256_mul_ps(fscal,dz00);
1994
1995             /* Update vectorial force */
1996             fix0             = _mm256_add_ps(fix0,tx);
1997             fiy0             = _mm256_add_ps(fiy0,ty);
1998             fiz0             = _mm256_add_ps(fiz0,tz);
1999
2000             fjx0             = _mm256_add_ps(fjx0,tx);
2001             fjy0             = _mm256_add_ps(fjy0,ty);
2002             fjz0             = _mm256_add_ps(fjz0,tz);
2003
2004             }
2005
2006             /**************************
2007              * CALCULATE INTERACTIONS *
2008              **************************/
2009
2010             if (gmx_mm256_any_lt(rsq01,rcutoff2))
2011             {
2012
2013             /* REACTION-FIELD ELECTROSTATICS */
2014             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
2015
2016             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2017
2018             fscal            = felec;
2019
2020             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2021
2022             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2023
2024             /* Calculate temporary vectorial force */
2025             tx               = _mm256_mul_ps(fscal,dx01);
2026             ty               = _mm256_mul_ps(fscal,dy01);
2027             tz               = _mm256_mul_ps(fscal,dz01);
2028
2029             /* Update vectorial force */
2030             fix0             = _mm256_add_ps(fix0,tx);
2031             fiy0             = _mm256_add_ps(fiy0,ty);
2032             fiz0             = _mm256_add_ps(fiz0,tz);
2033
2034             fjx1             = _mm256_add_ps(fjx1,tx);
2035             fjy1             = _mm256_add_ps(fjy1,ty);
2036             fjz1             = _mm256_add_ps(fjz1,tz);
2037
2038             }
2039
2040             /**************************
2041              * CALCULATE INTERACTIONS *
2042              **************************/
2043
2044             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2045             {
2046
2047             /* REACTION-FIELD ELECTROSTATICS */
2048             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
2049
2050             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2051
2052             fscal            = felec;
2053
2054             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2055
2056             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2057
2058             /* Calculate temporary vectorial force */
2059             tx               = _mm256_mul_ps(fscal,dx02);
2060             ty               = _mm256_mul_ps(fscal,dy02);
2061             tz               = _mm256_mul_ps(fscal,dz02);
2062
2063             /* Update vectorial force */
2064             fix0             = _mm256_add_ps(fix0,tx);
2065             fiy0             = _mm256_add_ps(fiy0,ty);
2066             fiz0             = _mm256_add_ps(fiz0,tz);
2067
2068             fjx2             = _mm256_add_ps(fjx2,tx);
2069             fjy2             = _mm256_add_ps(fjy2,ty);
2070             fjz2             = _mm256_add_ps(fjz2,tz);
2071
2072             }
2073
2074             /**************************
2075              * CALCULATE INTERACTIONS *
2076              **************************/
2077
2078             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2079             {
2080
2081             /* REACTION-FIELD ELECTROSTATICS */
2082             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
2083
2084             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2085
2086             fscal            = felec;
2087
2088             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2089
2090             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2091
2092             /* Calculate temporary vectorial force */
2093             tx               = _mm256_mul_ps(fscal,dx10);
2094             ty               = _mm256_mul_ps(fscal,dy10);
2095             tz               = _mm256_mul_ps(fscal,dz10);
2096
2097             /* Update vectorial force */
2098             fix1             = _mm256_add_ps(fix1,tx);
2099             fiy1             = _mm256_add_ps(fiy1,ty);
2100             fiz1             = _mm256_add_ps(fiz1,tz);
2101
2102             fjx0             = _mm256_add_ps(fjx0,tx);
2103             fjy0             = _mm256_add_ps(fjy0,ty);
2104             fjz0             = _mm256_add_ps(fjz0,tz);
2105
2106             }
2107
2108             /**************************
2109              * CALCULATE INTERACTIONS *
2110              **************************/
2111
2112             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2113             {
2114
2115             /* REACTION-FIELD ELECTROSTATICS */
2116             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
2117
2118             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2119
2120             fscal            = felec;
2121
2122             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2123
2124             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2125
2126             /* Calculate temporary vectorial force */
2127             tx               = _mm256_mul_ps(fscal,dx11);
2128             ty               = _mm256_mul_ps(fscal,dy11);
2129             tz               = _mm256_mul_ps(fscal,dz11);
2130
2131             /* Update vectorial force */
2132             fix1             = _mm256_add_ps(fix1,tx);
2133             fiy1             = _mm256_add_ps(fiy1,ty);
2134             fiz1             = _mm256_add_ps(fiz1,tz);
2135
2136             fjx1             = _mm256_add_ps(fjx1,tx);
2137             fjy1             = _mm256_add_ps(fjy1,ty);
2138             fjz1             = _mm256_add_ps(fjz1,tz);
2139
2140             }
2141
2142             /**************************
2143              * CALCULATE INTERACTIONS *
2144              **************************/
2145
2146             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2147             {
2148
2149             /* REACTION-FIELD ELECTROSTATICS */
2150             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
2151
2152             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2153
2154             fscal            = felec;
2155
2156             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2157
2158             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2159
2160             /* Calculate temporary vectorial force */
2161             tx               = _mm256_mul_ps(fscal,dx12);
2162             ty               = _mm256_mul_ps(fscal,dy12);
2163             tz               = _mm256_mul_ps(fscal,dz12);
2164
2165             /* Update vectorial force */
2166             fix1             = _mm256_add_ps(fix1,tx);
2167             fiy1             = _mm256_add_ps(fiy1,ty);
2168             fiz1             = _mm256_add_ps(fiz1,tz);
2169
2170             fjx2             = _mm256_add_ps(fjx2,tx);
2171             fjy2             = _mm256_add_ps(fjy2,ty);
2172             fjz2             = _mm256_add_ps(fjz2,tz);
2173
2174             }
2175
2176             /**************************
2177              * CALCULATE INTERACTIONS *
2178              **************************/
2179
2180             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2181             {
2182
2183             /* REACTION-FIELD ELECTROSTATICS */
2184             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
2185
2186             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2187
2188             fscal            = felec;
2189
2190             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2191
2192             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2193
2194             /* Calculate temporary vectorial force */
2195             tx               = _mm256_mul_ps(fscal,dx20);
2196             ty               = _mm256_mul_ps(fscal,dy20);
2197             tz               = _mm256_mul_ps(fscal,dz20);
2198
2199             /* Update vectorial force */
2200             fix2             = _mm256_add_ps(fix2,tx);
2201             fiy2             = _mm256_add_ps(fiy2,ty);
2202             fiz2             = _mm256_add_ps(fiz2,tz);
2203
2204             fjx0             = _mm256_add_ps(fjx0,tx);
2205             fjy0             = _mm256_add_ps(fjy0,ty);
2206             fjz0             = _mm256_add_ps(fjz0,tz);
2207
2208             }
2209
2210             /**************************
2211              * CALCULATE INTERACTIONS *
2212              **************************/
2213
2214             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2215             {
2216
2217             /* REACTION-FIELD ELECTROSTATICS */
2218             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2219
2220             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2221
2222             fscal            = felec;
2223
2224             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2225
2226             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2227
2228             /* Calculate temporary vectorial force */
2229             tx               = _mm256_mul_ps(fscal,dx21);
2230             ty               = _mm256_mul_ps(fscal,dy21);
2231             tz               = _mm256_mul_ps(fscal,dz21);
2232
2233             /* Update vectorial force */
2234             fix2             = _mm256_add_ps(fix2,tx);
2235             fiy2             = _mm256_add_ps(fiy2,ty);
2236             fiz2             = _mm256_add_ps(fiz2,tz);
2237
2238             fjx1             = _mm256_add_ps(fjx1,tx);
2239             fjy1             = _mm256_add_ps(fjy1,ty);
2240             fjz1             = _mm256_add_ps(fjz1,tz);
2241
2242             }
2243
2244             /**************************
2245              * CALCULATE INTERACTIONS *
2246              **************************/
2247
2248             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2249             {
2250
2251             /* REACTION-FIELD ELECTROSTATICS */
2252             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2253
2254             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2255
2256             fscal            = felec;
2257
2258             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2259
2260             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2261
2262             /* Calculate temporary vectorial force */
2263             tx               = _mm256_mul_ps(fscal,dx22);
2264             ty               = _mm256_mul_ps(fscal,dy22);
2265             tz               = _mm256_mul_ps(fscal,dz22);
2266
2267             /* Update vectorial force */
2268             fix2             = _mm256_add_ps(fix2,tx);
2269             fiy2             = _mm256_add_ps(fiy2,ty);
2270             fiz2             = _mm256_add_ps(fiz2,tz);
2271
2272             fjx2             = _mm256_add_ps(fjx2,tx);
2273             fjy2             = _mm256_add_ps(fjy2,ty);
2274             fjz2             = _mm256_add_ps(fjz2,tz);
2275
2276             }
2277
2278             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2279             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2280             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2281             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2282             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2283             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2284             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2285             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2286
2287             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2288                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2289
2290             /* Inner loop uses 302 flops */
2291         }
2292
2293         /* End of innermost loop */
2294
2295         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2296                                                  f+i_coord_offset,fshift+i_shift_offset);
2297
2298         /* Increment number of inner iterations */
2299         inneriter                  += j_index_end - j_index_start;
2300
2301         /* Outer loop uses 18 flops */
2302     }
2303
2304     /* Increment number of outer iterations */
2305     outeriter        += nri;
2306
2307     /* Update outer/inner flops */
2308
2309     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*302);
2310 }