Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_avx_256_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_single
51  * Electrostatics interaction: ReactionField
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrE,jnrF,jnrG,jnrH;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
83     real             scratch[4*DIM];
84     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     real *           vdwioffsetptr0;
86     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     real *           vdwioffsetptr1;
88     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     real *           vdwioffsetptr2;
90     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
92     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
94     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
96     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
113     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
114     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
115     real             rswitch_scalar,d_scalar;
116     __m256           dummy_mask,cutoff_mask;
117     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
118     __m256           one     = _mm256_set1_ps(1.0);
119     __m256           two     = _mm256_set1_ps(2.0);
120     x                = xx[0];
121     f                = ff[0];
122
123     nri              = nlist->nri;
124     iinr             = nlist->iinr;
125     jindex           = nlist->jindex;
126     jjnr             = nlist->jjnr;
127     shiftidx         = nlist->shift;
128     gid              = nlist->gid;
129     shiftvec         = fr->shift_vec[0];
130     fshift           = fr->fshift[0];
131     facel            = _mm256_set1_ps(fr->ic->epsfac);
132     charge           = mdatoms->chargeA;
133     krf              = _mm256_set1_ps(fr->ic->k_rf);
134     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
135     crf              = _mm256_set1_ps(fr->ic->c_rf);
136     nvdwtype         = fr->ntype;
137     vdwparam         = fr->nbfp;
138     vdwtype          = mdatoms->typeA;
139
140     /* Setup water-specific parameters */
141     inr              = nlist->iinr[0];
142     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
143     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
144     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
145     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
146
147     jq0              = _mm256_set1_ps(charge[inr+0]);
148     jq1              = _mm256_set1_ps(charge[inr+1]);
149     jq2              = _mm256_set1_ps(charge[inr+2]);
150     vdwjidx0A        = 2*vdwtype[inr+0];
151     qq00             = _mm256_mul_ps(iq0,jq0);
152     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
153     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
154     qq01             = _mm256_mul_ps(iq0,jq1);
155     qq02             = _mm256_mul_ps(iq0,jq2);
156     qq10             = _mm256_mul_ps(iq1,jq0);
157     qq11             = _mm256_mul_ps(iq1,jq1);
158     qq12             = _mm256_mul_ps(iq1,jq2);
159     qq20             = _mm256_mul_ps(iq2,jq0);
160     qq21             = _mm256_mul_ps(iq2,jq1);
161     qq22             = _mm256_mul_ps(iq2,jq2);
162
163     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
164     rcutoff_scalar   = fr->ic->rcoulomb;
165     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
166     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
167
168     rswitch_scalar   = fr->ic->rvdw_switch;
169     rswitch          = _mm256_set1_ps(rswitch_scalar);
170     /* Setup switch parameters */
171     d_scalar         = rcutoff_scalar-rswitch_scalar;
172     d                = _mm256_set1_ps(d_scalar);
173     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
174     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
175     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
176     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
177     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
178     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
179
180     /* Avoid stupid compiler warnings */
181     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
182     j_coord_offsetA = 0;
183     j_coord_offsetB = 0;
184     j_coord_offsetC = 0;
185     j_coord_offsetD = 0;
186     j_coord_offsetE = 0;
187     j_coord_offsetF = 0;
188     j_coord_offsetG = 0;
189     j_coord_offsetH = 0;
190
191     outeriter        = 0;
192     inneriter        = 0;
193
194     for(iidx=0;iidx<4*DIM;iidx++)
195     {
196         scratch[iidx] = 0.0;
197     }
198
199     /* Start outer loop over neighborlists */
200     for(iidx=0; iidx<nri; iidx++)
201     {
202         /* Load shift vector for this list */
203         i_shift_offset   = DIM*shiftidx[iidx];
204
205         /* Load limits for loop over neighbors */
206         j_index_start    = jindex[iidx];
207         j_index_end      = jindex[iidx+1];
208
209         /* Get outer coordinate index */
210         inr              = iinr[iidx];
211         i_coord_offset   = DIM*inr;
212
213         /* Load i particle coords and add shift vector */
214         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
215                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
216
217         fix0             = _mm256_setzero_ps();
218         fiy0             = _mm256_setzero_ps();
219         fiz0             = _mm256_setzero_ps();
220         fix1             = _mm256_setzero_ps();
221         fiy1             = _mm256_setzero_ps();
222         fiz1             = _mm256_setzero_ps();
223         fix2             = _mm256_setzero_ps();
224         fiy2             = _mm256_setzero_ps();
225         fiz2             = _mm256_setzero_ps();
226
227         /* Reset potential sums */
228         velecsum         = _mm256_setzero_ps();
229         vvdwsum          = _mm256_setzero_ps();
230
231         /* Start inner kernel loop */
232         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
233         {
234
235             /* Get j neighbor index, and coordinate index */
236             jnrA             = jjnr[jidx];
237             jnrB             = jjnr[jidx+1];
238             jnrC             = jjnr[jidx+2];
239             jnrD             = jjnr[jidx+3];
240             jnrE             = jjnr[jidx+4];
241             jnrF             = jjnr[jidx+5];
242             jnrG             = jjnr[jidx+6];
243             jnrH             = jjnr[jidx+7];
244             j_coord_offsetA  = DIM*jnrA;
245             j_coord_offsetB  = DIM*jnrB;
246             j_coord_offsetC  = DIM*jnrC;
247             j_coord_offsetD  = DIM*jnrD;
248             j_coord_offsetE  = DIM*jnrE;
249             j_coord_offsetF  = DIM*jnrF;
250             j_coord_offsetG  = DIM*jnrG;
251             j_coord_offsetH  = DIM*jnrH;
252
253             /* load j atom coordinates */
254             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
255                                                  x+j_coord_offsetC,x+j_coord_offsetD,
256                                                  x+j_coord_offsetE,x+j_coord_offsetF,
257                                                  x+j_coord_offsetG,x+j_coord_offsetH,
258                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
259
260             /* Calculate displacement vector */
261             dx00             = _mm256_sub_ps(ix0,jx0);
262             dy00             = _mm256_sub_ps(iy0,jy0);
263             dz00             = _mm256_sub_ps(iz0,jz0);
264             dx01             = _mm256_sub_ps(ix0,jx1);
265             dy01             = _mm256_sub_ps(iy0,jy1);
266             dz01             = _mm256_sub_ps(iz0,jz1);
267             dx02             = _mm256_sub_ps(ix0,jx2);
268             dy02             = _mm256_sub_ps(iy0,jy2);
269             dz02             = _mm256_sub_ps(iz0,jz2);
270             dx10             = _mm256_sub_ps(ix1,jx0);
271             dy10             = _mm256_sub_ps(iy1,jy0);
272             dz10             = _mm256_sub_ps(iz1,jz0);
273             dx11             = _mm256_sub_ps(ix1,jx1);
274             dy11             = _mm256_sub_ps(iy1,jy1);
275             dz11             = _mm256_sub_ps(iz1,jz1);
276             dx12             = _mm256_sub_ps(ix1,jx2);
277             dy12             = _mm256_sub_ps(iy1,jy2);
278             dz12             = _mm256_sub_ps(iz1,jz2);
279             dx20             = _mm256_sub_ps(ix2,jx0);
280             dy20             = _mm256_sub_ps(iy2,jy0);
281             dz20             = _mm256_sub_ps(iz2,jz0);
282             dx21             = _mm256_sub_ps(ix2,jx1);
283             dy21             = _mm256_sub_ps(iy2,jy1);
284             dz21             = _mm256_sub_ps(iz2,jz1);
285             dx22             = _mm256_sub_ps(ix2,jx2);
286             dy22             = _mm256_sub_ps(iy2,jy2);
287             dz22             = _mm256_sub_ps(iz2,jz2);
288
289             /* Calculate squared distance and things based on it */
290             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
291             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
292             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
293             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
294             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
295             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
296             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
297             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
298             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
299
300             rinv00           = avx256_invsqrt_f(rsq00);
301             rinv01           = avx256_invsqrt_f(rsq01);
302             rinv02           = avx256_invsqrt_f(rsq02);
303             rinv10           = avx256_invsqrt_f(rsq10);
304             rinv11           = avx256_invsqrt_f(rsq11);
305             rinv12           = avx256_invsqrt_f(rsq12);
306             rinv20           = avx256_invsqrt_f(rsq20);
307             rinv21           = avx256_invsqrt_f(rsq21);
308             rinv22           = avx256_invsqrt_f(rsq22);
309
310             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
311             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
312             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
313             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
314             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
315             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
316             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
317             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
318             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
319
320             fjx0             = _mm256_setzero_ps();
321             fjy0             = _mm256_setzero_ps();
322             fjz0             = _mm256_setzero_ps();
323             fjx1             = _mm256_setzero_ps();
324             fjy1             = _mm256_setzero_ps();
325             fjz1             = _mm256_setzero_ps();
326             fjx2             = _mm256_setzero_ps();
327             fjy2             = _mm256_setzero_ps();
328             fjz2             = _mm256_setzero_ps();
329
330             /**************************
331              * CALCULATE INTERACTIONS *
332              **************************/
333
334             if (gmx_mm256_any_lt(rsq00,rcutoff2))
335             {
336
337             r00              = _mm256_mul_ps(rsq00,rinv00);
338
339             /* REACTION-FIELD ELECTROSTATICS */
340             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
341             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
342
343             /* LENNARD-JONES DISPERSION/REPULSION */
344
345             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
346             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
347             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
348             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
349             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
350
351             d                = _mm256_sub_ps(r00,rswitch);
352             d                = _mm256_max_ps(d,_mm256_setzero_ps());
353             d2               = _mm256_mul_ps(d,d);
354             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
355
356             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
357
358             /* Evaluate switch function */
359             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
360             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
361             vvdw             = _mm256_mul_ps(vvdw,sw);
362             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
363
364             /* Update potential sum for this i atom from the interaction with this j atom. */
365             velec            = _mm256_and_ps(velec,cutoff_mask);
366             velecsum         = _mm256_add_ps(velecsum,velec);
367             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
368             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
369
370             fscal            = _mm256_add_ps(felec,fvdw);
371
372             fscal            = _mm256_and_ps(fscal,cutoff_mask);
373
374             /* Calculate temporary vectorial force */
375             tx               = _mm256_mul_ps(fscal,dx00);
376             ty               = _mm256_mul_ps(fscal,dy00);
377             tz               = _mm256_mul_ps(fscal,dz00);
378
379             /* Update vectorial force */
380             fix0             = _mm256_add_ps(fix0,tx);
381             fiy0             = _mm256_add_ps(fiy0,ty);
382             fiz0             = _mm256_add_ps(fiz0,tz);
383
384             fjx0             = _mm256_add_ps(fjx0,tx);
385             fjy0             = _mm256_add_ps(fjy0,ty);
386             fjz0             = _mm256_add_ps(fjz0,tz);
387
388             }
389
390             /**************************
391              * CALCULATE INTERACTIONS *
392              **************************/
393
394             if (gmx_mm256_any_lt(rsq01,rcutoff2))
395             {
396
397             /* REACTION-FIELD ELECTROSTATICS */
398             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
399             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
400
401             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
402
403             /* Update potential sum for this i atom from the interaction with this j atom. */
404             velec            = _mm256_and_ps(velec,cutoff_mask);
405             velecsum         = _mm256_add_ps(velecsum,velec);
406
407             fscal            = felec;
408
409             fscal            = _mm256_and_ps(fscal,cutoff_mask);
410
411             /* Calculate temporary vectorial force */
412             tx               = _mm256_mul_ps(fscal,dx01);
413             ty               = _mm256_mul_ps(fscal,dy01);
414             tz               = _mm256_mul_ps(fscal,dz01);
415
416             /* Update vectorial force */
417             fix0             = _mm256_add_ps(fix0,tx);
418             fiy0             = _mm256_add_ps(fiy0,ty);
419             fiz0             = _mm256_add_ps(fiz0,tz);
420
421             fjx1             = _mm256_add_ps(fjx1,tx);
422             fjy1             = _mm256_add_ps(fjy1,ty);
423             fjz1             = _mm256_add_ps(fjz1,tz);
424
425             }
426
427             /**************************
428              * CALCULATE INTERACTIONS *
429              **************************/
430
431             if (gmx_mm256_any_lt(rsq02,rcutoff2))
432             {
433
434             /* REACTION-FIELD ELECTROSTATICS */
435             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
436             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
437
438             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
439
440             /* Update potential sum for this i atom from the interaction with this j atom. */
441             velec            = _mm256_and_ps(velec,cutoff_mask);
442             velecsum         = _mm256_add_ps(velecsum,velec);
443
444             fscal            = felec;
445
446             fscal            = _mm256_and_ps(fscal,cutoff_mask);
447
448             /* Calculate temporary vectorial force */
449             tx               = _mm256_mul_ps(fscal,dx02);
450             ty               = _mm256_mul_ps(fscal,dy02);
451             tz               = _mm256_mul_ps(fscal,dz02);
452
453             /* Update vectorial force */
454             fix0             = _mm256_add_ps(fix0,tx);
455             fiy0             = _mm256_add_ps(fiy0,ty);
456             fiz0             = _mm256_add_ps(fiz0,tz);
457
458             fjx2             = _mm256_add_ps(fjx2,tx);
459             fjy2             = _mm256_add_ps(fjy2,ty);
460             fjz2             = _mm256_add_ps(fjz2,tz);
461
462             }
463
464             /**************************
465              * CALCULATE INTERACTIONS *
466              **************************/
467
468             if (gmx_mm256_any_lt(rsq10,rcutoff2))
469             {
470
471             /* REACTION-FIELD ELECTROSTATICS */
472             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
473             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
474
475             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
476
477             /* Update potential sum for this i atom from the interaction with this j atom. */
478             velec            = _mm256_and_ps(velec,cutoff_mask);
479             velecsum         = _mm256_add_ps(velecsum,velec);
480
481             fscal            = felec;
482
483             fscal            = _mm256_and_ps(fscal,cutoff_mask);
484
485             /* Calculate temporary vectorial force */
486             tx               = _mm256_mul_ps(fscal,dx10);
487             ty               = _mm256_mul_ps(fscal,dy10);
488             tz               = _mm256_mul_ps(fscal,dz10);
489
490             /* Update vectorial force */
491             fix1             = _mm256_add_ps(fix1,tx);
492             fiy1             = _mm256_add_ps(fiy1,ty);
493             fiz1             = _mm256_add_ps(fiz1,tz);
494
495             fjx0             = _mm256_add_ps(fjx0,tx);
496             fjy0             = _mm256_add_ps(fjy0,ty);
497             fjz0             = _mm256_add_ps(fjz0,tz);
498
499             }
500
501             /**************************
502              * CALCULATE INTERACTIONS *
503              **************************/
504
505             if (gmx_mm256_any_lt(rsq11,rcutoff2))
506             {
507
508             /* REACTION-FIELD ELECTROSTATICS */
509             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
510             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
511
512             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
513
514             /* Update potential sum for this i atom from the interaction with this j atom. */
515             velec            = _mm256_and_ps(velec,cutoff_mask);
516             velecsum         = _mm256_add_ps(velecsum,velec);
517
518             fscal            = felec;
519
520             fscal            = _mm256_and_ps(fscal,cutoff_mask);
521
522             /* Calculate temporary vectorial force */
523             tx               = _mm256_mul_ps(fscal,dx11);
524             ty               = _mm256_mul_ps(fscal,dy11);
525             tz               = _mm256_mul_ps(fscal,dz11);
526
527             /* Update vectorial force */
528             fix1             = _mm256_add_ps(fix1,tx);
529             fiy1             = _mm256_add_ps(fiy1,ty);
530             fiz1             = _mm256_add_ps(fiz1,tz);
531
532             fjx1             = _mm256_add_ps(fjx1,tx);
533             fjy1             = _mm256_add_ps(fjy1,ty);
534             fjz1             = _mm256_add_ps(fjz1,tz);
535
536             }
537
538             /**************************
539              * CALCULATE INTERACTIONS *
540              **************************/
541
542             if (gmx_mm256_any_lt(rsq12,rcutoff2))
543             {
544
545             /* REACTION-FIELD ELECTROSTATICS */
546             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
547             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
548
549             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
550
551             /* Update potential sum for this i atom from the interaction with this j atom. */
552             velec            = _mm256_and_ps(velec,cutoff_mask);
553             velecsum         = _mm256_add_ps(velecsum,velec);
554
555             fscal            = felec;
556
557             fscal            = _mm256_and_ps(fscal,cutoff_mask);
558
559             /* Calculate temporary vectorial force */
560             tx               = _mm256_mul_ps(fscal,dx12);
561             ty               = _mm256_mul_ps(fscal,dy12);
562             tz               = _mm256_mul_ps(fscal,dz12);
563
564             /* Update vectorial force */
565             fix1             = _mm256_add_ps(fix1,tx);
566             fiy1             = _mm256_add_ps(fiy1,ty);
567             fiz1             = _mm256_add_ps(fiz1,tz);
568
569             fjx2             = _mm256_add_ps(fjx2,tx);
570             fjy2             = _mm256_add_ps(fjy2,ty);
571             fjz2             = _mm256_add_ps(fjz2,tz);
572
573             }
574
575             /**************************
576              * CALCULATE INTERACTIONS *
577              **************************/
578
579             if (gmx_mm256_any_lt(rsq20,rcutoff2))
580             {
581
582             /* REACTION-FIELD ELECTROSTATICS */
583             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
584             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
585
586             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
587
588             /* Update potential sum for this i atom from the interaction with this j atom. */
589             velec            = _mm256_and_ps(velec,cutoff_mask);
590             velecsum         = _mm256_add_ps(velecsum,velec);
591
592             fscal            = felec;
593
594             fscal            = _mm256_and_ps(fscal,cutoff_mask);
595
596             /* Calculate temporary vectorial force */
597             tx               = _mm256_mul_ps(fscal,dx20);
598             ty               = _mm256_mul_ps(fscal,dy20);
599             tz               = _mm256_mul_ps(fscal,dz20);
600
601             /* Update vectorial force */
602             fix2             = _mm256_add_ps(fix2,tx);
603             fiy2             = _mm256_add_ps(fiy2,ty);
604             fiz2             = _mm256_add_ps(fiz2,tz);
605
606             fjx0             = _mm256_add_ps(fjx0,tx);
607             fjy0             = _mm256_add_ps(fjy0,ty);
608             fjz0             = _mm256_add_ps(fjz0,tz);
609
610             }
611
612             /**************************
613              * CALCULATE INTERACTIONS *
614              **************************/
615
616             if (gmx_mm256_any_lt(rsq21,rcutoff2))
617             {
618
619             /* REACTION-FIELD ELECTROSTATICS */
620             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
621             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
622
623             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
624
625             /* Update potential sum for this i atom from the interaction with this j atom. */
626             velec            = _mm256_and_ps(velec,cutoff_mask);
627             velecsum         = _mm256_add_ps(velecsum,velec);
628
629             fscal            = felec;
630
631             fscal            = _mm256_and_ps(fscal,cutoff_mask);
632
633             /* Calculate temporary vectorial force */
634             tx               = _mm256_mul_ps(fscal,dx21);
635             ty               = _mm256_mul_ps(fscal,dy21);
636             tz               = _mm256_mul_ps(fscal,dz21);
637
638             /* Update vectorial force */
639             fix2             = _mm256_add_ps(fix2,tx);
640             fiy2             = _mm256_add_ps(fiy2,ty);
641             fiz2             = _mm256_add_ps(fiz2,tz);
642
643             fjx1             = _mm256_add_ps(fjx1,tx);
644             fjy1             = _mm256_add_ps(fjy1,ty);
645             fjz1             = _mm256_add_ps(fjz1,tz);
646
647             }
648
649             /**************************
650              * CALCULATE INTERACTIONS *
651              **************************/
652
653             if (gmx_mm256_any_lt(rsq22,rcutoff2))
654             {
655
656             /* REACTION-FIELD ELECTROSTATICS */
657             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
658             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
659
660             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
661
662             /* Update potential sum for this i atom from the interaction with this j atom. */
663             velec            = _mm256_and_ps(velec,cutoff_mask);
664             velecsum         = _mm256_add_ps(velecsum,velec);
665
666             fscal            = felec;
667
668             fscal            = _mm256_and_ps(fscal,cutoff_mask);
669
670             /* Calculate temporary vectorial force */
671             tx               = _mm256_mul_ps(fscal,dx22);
672             ty               = _mm256_mul_ps(fscal,dy22);
673             tz               = _mm256_mul_ps(fscal,dz22);
674
675             /* Update vectorial force */
676             fix2             = _mm256_add_ps(fix2,tx);
677             fiy2             = _mm256_add_ps(fiy2,ty);
678             fiz2             = _mm256_add_ps(fiz2,tz);
679
680             fjx2             = _mm256_add_ps(fjx2,tx);
681             fjy2             = _mm256_add_ps(fjy2,ty);
682             fjz2             = _mm256_add_ps(fjz2,tz);
683
684             }
685
686             fjptrA             = f+j_coord_offsetA;
687             fjptrB             = f+j_coord_offsetB;
688             fjptrC             = f+j_coord_offsetC;
689             fjptrD             = f+j_coord_offsetD;
690             fjptrE             = f+j_coord_offsetE;
691             fjptrF             = f+j_coord_offsetF;
692             fjptrG             = f+j_coord_offsetG;
693             fjptrH             = f+j_coord_offsetH;
694
695             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
696                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
697
698             /* Inner loop uses 358 flops */
699         }
700
701         if(jidx<j_index_end)
702         {
703
704             /* Get j neighbor index, and coordinate index */
705             jnrlistA         = jjnr[jidx];
706             jnrlistB         = jjnr[jidx+1];
707             jnrlistC         = jjnr[jidx+2];
708             jnrlistD         = jjnr[jidx+3];
709             jnrlistE         = jjnr[jidx+4];
710             jnrlistF         = jjnr[jidx+5];
711             jnrlistG         = jjnr[jidx+6];
712             jnrlistH         = jjnr[jidx+7];
713             /* Sign of each element will be negative for non-real atoms.
714              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
715              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
716              */
717             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
718                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
719                                             
720             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
721             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
722             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
723             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
724             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
725             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
726             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
727             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
728             j_coord_offsetA  = DIM*jnrA;
729             j_coord_offsetB  = DIM*jnrB;
730             j_coord_offsetC  = DIM*jnrC;
731             j_coord_offsetD  = DIM*jnrD;
732             j_coord_offsetE  = DIM*jnrE;
733             j_coord_offsetF  = DIM*jnrF;
734             j_coord_offsetG  = DIM*jnrG;
735             j_coord_offsetH  = DIM*jnrH;
736
737             /* load j atom coordinates */
738             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
739                                                  x+j_coord_offsetC,x+j_coord_offsetD,
740                                                  x+j_coord_offsetE,x+j_coord_offsetF,
741                                                  x+j_coord_offsetG,x+j_coord_offsetH,
742                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
743
744             /* Calculate displacement vector */
745             dx00             = _mm256_sub_ps(ix0,jx0);
746             dy00             = _mm256_sub_ps(iy0,jy0);
747             dz00             = _mm256_sub_ps(iz0,jz0);
748             dx01             = _mm256_sub_ps(ix0,jx1);
749             dy01             = _mm256_sub_ps(iy0,jy1);
750             dz01             = _mm256_sub_ps(iz0,jz1);
751             dx02             = _mm256_sub_ps(ix0,jx2);
752             dy02             = _mm256_sub_ps(iy0,jy2);
753             dz02             = _mm256_sub_ps(iz0,jz2);
754             dx10             = _mm256_sub_ps(ix1,jx0);
755             dy10             = _mm256_sub_ps(iy1,jy0);
756             dz10             = _mm256_sub_ps(iz1,jz0);
757             dx11             = _mm256_sub_ps(ix1,jx1);
758             dy11             = _mm256_sub_ps(iy1,jy1);
759             dz11             = _mm256_sub_ps(iz1,jz1);
760             dx12             = _mm256_sub_ps(ix1,jx2);
761             dy12             = _mm256_sub_ps(iy1,jy2);
762             dz12             = _mm256_sub_ps(iz1,jz2);
763             dx20             = _mm256_sub_ps(ix2,jx0);
764             dy20             = _mm256_sub_ps(iy2,jy0);
765             dz20             = _mm256_sub_ps(iz2,jz0);
766             dx21             = _mm256_sub_ps(ix2,jx1);
767             dy21             = _mm256_sub_ps(iy2,jy1);
768             dz21             = _mm256_sub_ps(iz2,jz1);
769             dx22             = _mm256_sub_ps(ix2,jx2);
770             dy22             = _mm256_sub_ps(iy2,jy2);
771             dz22             = _mm256_sub_ps(iz2,jz2);
772
773             /* Calculate squared distance and things based on it */
774             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
775             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
776             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
777             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
778             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
779             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
780             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
781             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
782             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
783
784             rinv00           = avx256_invsqrt_f(rsq00);
785             rinv01           = avx256_invsqrt_f(rsq01);
786             rinv02           = avx256_invsqrt_f(rsq02);
787             rinv10           = avx256_invsqrt_f(rsq10);
788             rinv11           = avx256_invsqrt_f(rsq11);
789             rinv12           = avx256_invsqrt_f(rsq12);
790             rinv20           = avx256_invsqrt_f(rsq20);
791             rinv21           = avx256_invsqrt_f(rsq21);
792             rinv22           = avx256_invsqrt_f(rsq22);
793
794             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
795             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
796             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
797             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
798             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
799             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
800             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
801             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
802             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
803
804             fjx0             = _mm256_setzero_ps();
805             fjy0             = _mm256_setzero_ps();
806             fjz0             = _mm256_setzero_ps();
807             fjx1             = _mm256_setzero_ps();
808             fjy1             = _mm256_setzero_ps();
809             fjz1             = _mm256_setzero_ps();
810             fjx2             = _mm256_setzero_ps();
811             fjy2             = _mm256_setzero_ps();
812             fjz2             = _mm256_setzero_ps();
813
814             /**************************
815              * CALCULATE INTERACTIONS *
816              **************************/
817
818             if (gmx_mm256_any_lt(rsq00,rcutoff2))
819             {
820
821             r00              = _mm256_mul_ps(rsq00,rinv00);
822             r00              = _mm256_andnot_ps(dummy_mask,r00);
823
824             /* REACTION-FIELD ELECTROSTATICS */
825             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
826             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
827
828             /* LENNARD-JONES DISPERSION/REPULSION */
829
830             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
831             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
832             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
833             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
834             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
835
836             d                = _mm256_sub_ps(r00,rswitch);
837             d                = _mm256_max_ps(d,_mm256_setzero_ps());
838             d2               = _mm256_mul_ps(d,d);
839             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
840
841             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
842
843             /* Evaluate switch function */
844             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
845             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
846             vvdw             = _mm256_mul_ps(vvdw,sw);
847             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
848
849             /* Update potential sum for this i atom from the interaction with this j atom. */
850             velec            = _mm256_and_ps(velec,cutoff_mask);
851             velec            = _mm256_andnot_ps(dummy_mask,velec);
852             velecsum         = _mm256_add_ps(velecsum,velec);
853             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
854             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
855             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
856
857             fscal            = _mm256_add_ps(felec,fvdw);
858
859             fscal            = _mm256_and_ps(fscal,cutoff_mask);
860
861             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
862
863             /* Calculate temporary vectorial force */
864             tx               = _mm256_mul_ps(fscal,dx00);
865             ty               = _mm256_mul_ps(fscal,dy00);
866             tz               = _mm256_mul_ps(fscal,dz00);
867
868             /* Update vectorial force */
869             fix0             = _mm256_add_ps(fix0,tx);
870             fiy0             = _mm256_add_ps(fiy0,ty);
871             fiz0             = _mm256_add_ps(fiz0,tz);
872
873             fjx0             = _mm256_add_ps(fjx0,tx);
874             fjy0             = _mm256_add_ps(fjy0,ty);
875             fjz0             = _mm256_add_ps(fjz0,tz);
876
877             }
878
879             /**************************
880              * CALCULATE INTERACTIONS *
881              **************************/
882
883             if (gmx_mm256_any_lt(rsq01,rcutoff2))
884             {
885
886             /* REACTION-FIELD ELECTROSTATICS */
887             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
888             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
889
890             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
891
892             /* Update potential sum for this i atom from the interaction with this j atom. */
893             velec            = _mm256_and_ps(velec,cutoff_mask);
894             velec            = _mm256_andnot_ps(dummy_mask,velec);
895             velecsum         = _mm256_add_ps(velecsum,velec);
896
897             fscal            = felec;
898
899             fscal            = _mm256_and_ps(fscal,cutoff_mask);
900
901             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
902
903             /* Calculate temporary vectorial force */
904             tx               = _mm256_mul_ps(fscal,dx01);
905             ty               = _mm256_mul_ps(fscal,dy01);
906             tz               = _mm256_mul_ps(fscal,dz01);
907
908             /* Update vectorial force */
909             fix0             = _mm256_add_ps(fix0,tx);
910             fiy0             = _mm256_add_ps(fiy0,ty);
911             fiz0             = _mm256_add_ps(fiz0,tz);
912
913             fjx1             = _mm256_add_ps(fjx1,tx);
914             fjy1             = _mm256_add_ps(fjy1,ty);
915             fjz1             = _mm256_add_ps(fjz1,tz);
916
917             }
918
919             /**************************
920              * CALCULATE INTERACTIONS *
921              **************************/
922
923             if (gmx_mm256_any_lt(rsq02,rcutoff2))
924             {
925
926             /* REACTION-FIELD ELECTROSTATICS */
927             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
928             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
929
930             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
931
932             /* Update potential sum for this i atom from the interaction with this j atom. */
933             velec            = _mm256_and_ps(velec,cutoff_mask);
934             velec            = _mm256_andnot_ps(dummy_mask,velec);
935             velecsum         = _mm256_add_ps(velecsum,velec);
936
937             fscal            = felec;
938
939             fscal            = _mm256_and_ps(fscal,cutoff_mask);
940
941             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
942
943             /* Calculate temporary vectorial force */
944             tx               = _mm256_mul_ps(fscal,dx02);
945             ty               = _mm256_mul_ps(fscal,dy02);
946             tz               = _mm256_mul_ps(fscal,dz02);
947
948             /* Update vectorial force */
949             fix0             = _mm256_add_ps(fix0,tx);
950             fiy0             = _mm256_add_ps(fiy0,ty);
951             fiz0             = _mm256_add_ps(fiz0,tz);
952
953             fjx2             = _mm256_add_ps(fjx2,tx);
954             fjy2             = _mm256_add_ps(fjy2,ty);
955             fjz2             = _mm256_add_ps(fjz2,tz);
956
957             }
958
959             /**************************
960              * CALCULATE INTERACTIONS *
961              **************************/
962
963             if (gmx_mm256_any_lt(rsq10,rcutoff2))
964             {
965
966             /* REACTION-FIELD ELECTROSTATICS */
967             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
968             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
969
970             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
971
972             /* Update potential sum for this i atom from the interaction with this j atom. */
973             velec            = _mm256_and_ps(velec,cutoff_mask);
974             velec            = _mm256_andnot_ps(dummy_mask,velec);
975             velecsum         = _mm256_add_ps(velecsum,velec);
976
977             fscal            = felec;
978
979             fscal            = _mm256_and_ps(fscal,cutoff_mask);
980
981             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
982
983             /* Calculate temporary vectorial force */
984             tx               = _mm256_mul_ps(fscal,dx10);
985             ty               = _mm256_mul_ps(fscal,dy10);
986             tz               = _mm256_mul_ps(fscal,dz10);
987
988             /* Update vectorial force */
989             fix1             = _mm256_add_ps(fix1,tx);
990             fiy1             = _mm256_add_ps(fiy1,ty);
991             fiz1             = _mm256_add_ps(fiz1,tz);
992
993             fjx0             = _mm256_add_ps(fjx0,tx);
994             fjy0             = _mm256_add_ps(fjy0,ty);
995             fjz0             = _mm256_add_ps(fjz0,tz);
996
997             }
998
999             /**************************
1000              * CALCULATE INTERACTIONS *
1001              **************************/
1002
1003             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1004             {
1005
1006             /* REACTION-FIELD ELECTROSTATICS */
1007             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
1008             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1009
1010             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1011
1012             /* Update potential sum for this i atom from the interaction with this j atom. */
1013             velec            = _mm256_and_ps(velec,cutoff_mask);
1014             velec            = _mm256_andnot_ps(dummy_mask,velec);
1015             velecsum         = _mm256_add_ps(velecsum,velec);
1016
1017             fscal            = felec;
1018
1019             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1020
1021             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1022
1023             /* Calculate temporary vectorial force */
1024             tx               = _mm256_mul_ps(fscal,dx11);
1025             ty               = _mm256_mul_ps(fscal,dy11);
1026             tz               = _mm256_mul_ps(fscal,dz11);
1027
1028             /* Update vectorial force */
1029             fix1             = _mm256_add_ps(fix1,tx);
1030             fiy1             = _mm256_add_ps(fiy1,ty);
1031             fiz1             = _mm256_add_ps(fiz1,tz);
1032
1033             fjx1             = _mm256_add_ps(fjx1,tx);
1034             fjy1             = _mm256_add_ps(fjy1,ty);
1035             fjz1             = _mm256_add_ps(fjz1,tz);
1036
1037             }
1038
1039             /**************************
1040              * CALCULATE INTERACTIONS *
1041              **************************/
1042
1043             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1044             {
1045
1046             /* REACTION-FIELD ELECTROSTATICS */
1047             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
1048             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1049
1050             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1051
1052             /* Update potential sum for this i atom from the interaction with this j atom. */
1053             velec            = _mm256_and_ps(velec,cutoff_mask);
1054             velec            = _mm256_andnot_ps(dummy_mask,velec);
1055             velecsum         = _mm256_add_ps(velecsum,velec);
1056
1057             fscal            = felec;
1058
1059             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1060
1061             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1062
1063             /* Calculate temporary vectorial force */
1064             tx               = _mm256_mul_ps(fscal,dx12);
1065             ty               = _mm256_mul_ps(fscal,dy12);
1066             tz               = _mm256_mul_ps(fscal,dz12);
1067
1068             /* Update vectorial force */
1069             fix1             = _mm256_add_ps(fix1,tx);
1070             fiy1             = _mm256_add_ps(fiy1,ty);
1071             fiz1             = _mm256_add_ps(fiz1,tz);
1072
1073             fjx2             = _mm256_add_ps(fjx2,tx);
1074             fjy2             = _mm256_add_ps(fjy2,ty);
1075             fjz2             = _mm256_add_ps(fjz2,tz);
1076
1077             }
1078
1079             /**************************
1080              * CALCULATE INTERACTIONS *
1081              **************************/
1082
1083             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1084             {
1085
1086             /* REACTION-FIELD ELECTROSTATICS */
1087             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
1088             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1089
1090             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1091
1092             /* Update potential sum for this i atom from the interaction with this j atom. */
1093             velec            = _mm256_and_ps(velec,cutoff_mask);
1094             velec            = _mm256_andnot_ps(dummy_mask,velec);
1095             velecsum         = _mm256_add_ps(velecsum,velec);
1096
1097             fscal            = felec;
1098
1099             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1100
1101             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1102
1103             /* Calculate temporary vectorial force */
1104             tx               = _mm256_mul_ps(fscal,dx20);
1105             ty               = _mm256_mul_ps(fscal,dy20);
1106             tz               = _mm256_mul_ps(fscal,dz20);
1107
1108             /* Update vectorial force */
1109             fix2             = _mm256_add_ps(fix2,tx);
1110             fiy2             = _mm256_add_ps(fiy2,ty);
1111             fiz2             = _mm256_add_ps(fiz2,tz);
1112
1113             fjx0             = _mm256_add_ps(fjx0,tx);
1114             fjy0             = _mm256_add_ps(fjy0,ty);
1115             fjz0             = _mm256_add_ps(fjz0,tz);
1116
1117             }
1118
1119             /**************************
1120              * CALCULATE INTERACTIONS *
1121              **************************/
1122
1123             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1124             {
1125
1126             /* REACTION-FIELD ELECTROSTATICS */
1127             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
1128             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1129
1130             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1131
1132             /* Update potential sum for this i atom from the interaction with this j atom. */
1133             velec            = _mm256_and_ps(velec,cutoff_mask);
1134             velec            = _mm256_andnot_ps(dummy_mask,velec);
1135             velecsum         = _mm256_add_ps(velecsum,velec);
1136
1137             fscal            = felec;
1138
1139             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1140
1141             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1142
1143             /* Calculate temporary vectorial force */
1144             tx               = _mm256_mul_ps(fscal,dx21);
1145             ty               = _mm256_mul_ps(fscal,dy21);
1146             tz               = _mm256_mul_ps(fscal,dz21);
1147
1148             /* Update vectorial force */
1149             fix2             = _mm256_add_ps(fix2,tx);
1150             fiy2             = _mm256_add_ps(fiy2,ty);
1151             fiz2             = _mm256_add_ps(fiz2,tz);
1152
1153             fjx1             = _mm256_add_ps(fjx1,tx);
1154             fjy1             = _mm256_add_ps(fjy1,ty);
1155             fjz1             = _mm256_add_ps(fjz1,tz);
1156
1157             }
1158
1159             /**************************
1160              * CALCULATE INTERACTIONS *
1161              **************************/
1162
1163             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1164             {
1165
1166             /* REACTION-FIELD ELECTROSTATICS */
1167             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
1168             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1169
1170             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1171
1172             /* Update potential sum for this i atom from the interaction with this j atom. */
1173             velec            = _mm256_and_ps(velec,cutoff_mask);
1174             velec            = _mm256_andnot_ps(dummy_mask,velec);
1175             velecsum         = _mm256_add_ps(velecsum,velec);
1176
1177             fscal            = felec;
1178
1179             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1180
1181             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1182
1183             /* Calculate temporary vectorial force */
1184             tx               = _mm256_mul_ps(fscal,dx22);
1185             ty               = _mm256_mul_ps(fscal,dy22);
1186             tz               = _mm256_mul_ps(fscal,dz22);
1187
1188             /* Update vectorial force */
1189             fix2             = _mm256_add_ps(fix2,tx);
1190             fiy2             = _mm256_add_ps(fiy2,ty);
1191             fiz2             = _mm256_add_ps(fiz2,tz);
1192
1193             fjx2             = _mm256_add_ps(fjx2,tx);
1194             fjy2             = _mm256_add_ps(fjy2,ty);
1195             fjz2             = _mm256_add_ps(fjz2,tz);
1196
1197             }
1198
1199             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1200             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1201             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1202             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1203             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1204             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1205             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1206             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1207
1208             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1209                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1210
1211             /* Inner loop uses 359 flops */
1212         }
1213
1214         /* End of innermost loop */
1215
1216         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1217                                                  f+i_coord_offset,fshift+i_shift_offset);
1218
1219         ggid                        = gid[iidx];
1220         /* Update potential energies */
1221         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1222         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1223
1224         /* Increment number of inner iterations */
1225         inneriter                  += j_index_end - j_index_start;
1226
1227         /* Outer loop uses 20 flops */
1228     }
1229
1230     /* Increment number of outer iterations */
1231     outeriter        += nri;
1232
1233     /* Update outer/inner flops */
1234
1235     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*359);
1236 }
1237 /*
1238  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_single
1239  * Electrostatics interaction: ReactionField
1240  * VdW interaction:            LennardJones
1241  * Geometry:                   Water3-Water3
1242  * Calculate force/pot:        Force
1243  */
1244 void
1245 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_single
1246                     (t_nblist                    * gmx_restrict       nlist,
1247                      rvec                        * gmx_restrict          xx,
1248                      rvec                        * gmx_restrict          ff,
1249                      struct t_forcerec           * gmx_restrict          fr,
1250                      t_mdatoms                   * gmx_restrict     mdatoms,
1251                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1252                      t_nrnb                      * gmx_restrict        nrnb)
1253 {
1254     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1255      * just 0 for non-waters.
1256      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1257      * jnr indices corresponding to data put in the four positions in the SIMD register.
1258      */
1259     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1260     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1261     int              jnrA,jnrB,jnrC,jnrD;
1262     int              jnrE,jnrF,jnrG,jnrH;
1263     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1264     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1265     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1266     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1267     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1268     real             rcutoff_scalar;
1269     real             *shiftvec,*fshift,*x,*f;
1270     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1271     real             scratch[4*DIM];
1272     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1273     real *           vdwioffsetptr0;
1274     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1275     real *           vdwioffsetptr1;
1276     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1277     real *           vdwioffsetptr2;
1278     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1279     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1280     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1281     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1282     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1283     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1284     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1285     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1286     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1287     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1288     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1289     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1290     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1291     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1292     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1293     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1294     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1295     real             *charge;
1296     int              nvdwtype;
1297     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1298     int              *vdwtype;
1299     real             *vdwparam;
1300     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1301     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1302     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1303     real             rswitch_scalar,d_scalar;
1304     __m256           dummy_mask,cutoff_mask;
1305     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1306     __m256           one     = _mm256_set1_ps(1.0);
1307     __m256           two     = _mm256_set1_ps(2.0);
1308     x                = xx[0];
1309     f                = ff[0];
1310
1311     nri              = nlist->nri;
1312     iinr             = nlist->iinr;
1313     jindex           = nlist->jindex;
1314     jjnr             = nlist->jjnr;
1315     shiftidx         = nlist->shift;
1316     gid              = nlist->gid;
1317     shiftvec         = fr->shift_vec[0];
1318     fshift           = fr->fshift[0];
1319     facel            = _mm256_set1_ps(fr->ic->epsfac);
1320     charge           = mdatoms->chargeA;
1321     krf              = _mm256_set1_ps(fr->ic->k_rf);
1322     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1323     crf              = _mm256_set1_ps(fr->ic->c_rf);
1324     nvdwtype         = fr->ntype;
1325     vdwparam         = fr->nbfp;
1326     vdwtype          = mdatoms->typeA;
1327
1328     /* Setup water-specific parameters */
1329     inr              = nlist->iinr[0];
1330     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1331     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1332     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1333     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1334
1335     jq0              = _mm256_set1_ps(charge[inr+0]);
1336     jq1              = _mm256_set1_ps(charge[inr+1]);
1337     jq2              = _mm256_set1_ps(charge[inr+2]);
1338     vdwjidx0A        = 2*vdwtype[inr+0];
1339     qq00             = _mm256_mul_ps(iq0,jq0);
1340     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1341     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1342     qq01             = _mm256_mul_ps(iq0,jq1);
1343     qq02             = _mm256_mul_ps(iq0,jq2);
1344     qq10             = _mm256_mul_ps(iq1,jq0);
1345     qq11             = _mm256_mul_ps(iq1,jq1);
1346     qq12             = _mm256_mul_ps(iq1,jq2);
1347     qq20             = _mm256_mul_ps(iq2,jq0);
1348     qq21             = _mm256_mul_ps(iq2,jq1);
1349     qq22             = _mm256_mul_ps(iq2,jq2);
1350
1351     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1352     rcutoff_scalar   = fr->ic->rcoulomb;
1353     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1354     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1355
1356     rswitch_scalar   = fr->ic->rvdw_switch;
1357     rswitch          = _mm256_set1_ps(rswitch_scalar);
1358     /* Setup switch parameters */
1359     d_scalar         = rcutoff_scalar-rswitch_scalar;
1360     d                = _mm256_set1_ps(d_scalar);
1361     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1362     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1363     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1364     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1365     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1366     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1367
1368     /* Avoid stupid compiler warnings */
1369     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1370     j_coord_offsetA = 0;
1371     j_coord_offsetB = 0;
1372     j_coord_offsetC = 0;
1373     j_coord_offsetD = 0;
1374     j_coord_offsetE = 0;
1375     j_coord_offsetF = 0;
1376     j_coord_offsetG = 0;
1377     j_coord_offsetH = 0;
1378
1379     outeriter        = 0;
1380     inneriter        = 0;
1381
1382     for(iidx=0;iidx<4*DIM;iidx++)
1383     {
1384         scratch[iidx] = 0.0;
1385     }
1386
1387     /* Start outer loop over neighborlists */
1388     for(iidx=0; iidx<nri; iidx++)
1389     {
1390         /* Load shift vector for this list */
1391         i_shift_offset   = DIM*shiftidx[iidx];
1392
1393         /* Load limits for loop over neighbors */
1394         j_index_start    = jindex[iidx];
1395         j_index_end      = jindex[iidx+1];
1396
1397         /* Get outer coordinate index */
1398         inr              = iinr[iidx];
1399         i_coord_offset   = DIM*inr;
1400
1401         /* Load i particle coords and add shift vector */
1402         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1403                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1404
1405         fix0             = _mm256_setzero_ps();
1406         fiy0             = _mm256_setzero_ps();
1407         fiz0             = _mm256_setzero_ps();
1408         fix1             = _mm256_setzero_ps();
1409         fiy1             = _mm256_setzero_ps();
1410         fiz1             = _mm256_setzero_ps();
1411         fix2             = _mm256_setzero_ps();
1412         fiy2             = _mm256_setzero_ps();
1413         fiz2             = _mm256_setzero_ps();
1414
1415         /* Start inner kernel loop */
1416         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1417         {
1418
1419             /* Get j neighbor index, and coordinate index */
1420             jnrA             = jjnr[jidx];
1421             jnrB             = jjnr[jidx+1];
1422             jnrC             = jjnr[jidx+2];
1423             jnrD             = jjnr[jidx+3];
1424             jnrE             = jjnr[jidx+4];
1425             jnrF             = jjnr[jidx+5];
1426             jnrG             = jjnr[jidx+6];
1427             jnrH             = jjnr[jidx+7];
1428             j_coord_offsetA  = DIM*jnrA;
1429             j_coord_offsetB  = DIM*jnrB;
1430             j_coord_offsetC  = DIM*jnrC;
1431             j_coord_offsetD  = DIM*jnrD;
1432             j_coord_offsetE  = DIM*jnrE;
1433             j_coord_offsetF  = DIM*jnrF;
1434             j_coord_offsetG  = DIM*jnrG;
1435             j_coord_offsetH  = DIM*jnrH;
1436
1437             /* load j atom coordinates */
1438             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1439                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1440                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1441                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1442                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1443
1444             /* Calculate displacement vector */
1445             dx00             = _mm256_sub_ps(ix0,jx0);
1446             dy00             = _mm256_sub_ps(iy0,jy0);
1447             dz00             = _mm256_sub_ps(iz0,jz0);
1448             dx01             = _mm256_sub_ps(ix0,jx1);
1449             dy01             = _mm256_sub_ps(iy0,jy1);
1450             dz01             = _mm256_sub_ps(iz0,jz1);
1451             dx02             = _mm256_sub_ps(ix0,jx2);
1452             dy02             = _mm256_sub_ps(iy0,jy2);
1453             dz02             = _mm256_sub_ps(iz0,jz2);
1454             dx10             = _mm256_sub_ps(ix1,jx0);
1455             dy10             = _mm256_sub_ps(iy1,jy0);
1456             dz10             = _mm256_sub_ps(iz1,jz0);
1457             dx11             = _mm256_sub_ps(ix1,jx1);
1458             dy11             = _mm256_sub_ps(iy1,jy1);
1459             dz11             = _mm256_sub_ps(iz1,jz1);
1460             dx12             = _mm256_sub_ps(ix1,jx2);
1461             dy12             = _mm256_sub_ps(iy1,jy2);
1462             dz12             = _mm256_sub_ps(iz1,jz2);
1463             dx20             = _mm256_sub_ps(ix2,jx0);
1464             dy20             = _mm256_sub_ps(iy2,jy0);
1465             dz20             = _mm256_sub_ps(iz2,jz0);
1466             dx21             = _mm256_sub_ps(ix2,jx1);
1467             dy21             = _mm256_sub_ps(iy2,jy1);
1468             dz21             = _mm256_sub_ps(iz2,jz1);
1469             dx22             = _mm256_sub_ps(ix2,jx2);
1470             dy22             = _mm256_sub_ps(iy2,jy2);
1471             dz22             = _mm256_sub_ps(iz2,jz2);
1472
1473             /* Calculate squared distance and things based on it */
1474             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1475             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1476             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1477             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1478             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1479             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1480             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1481             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1482             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1483
1484             rinv00           = avx256_invsqrt_f(rsq00);
1485             rinv01           = avx256_invsqrt_f(rsq01);
1486             rinv02           = avx256_invsqrt_f(rsq02);
1487             rinv10           = avx256_invsqrt_f(rsq10);
1488             rinv11           = avx256_invsqrt_f(rsq11);
1489             rinv12           = avx256_invsqrt_f(rsq12);
1490             rinv20           = avx256_invsqrt_f(rsq20);
1491             rinv21           = avx256_invsqrt_f(rsq21);
1492             rinv22           = avx256_invsqrt_f(rsq22);
1493
1494             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1495             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1496             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1497             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1498             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1499             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1500             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1501             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1502             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1503
1504             fjx0             = _mm256_setzero_ps();
1505             fjy0             = _mm256_setzero_ps();
1506             fjz0             = _mm256_setzero_ps();
1507             fjx1             = _mm256_setzero_ps();
1508             fjy1             = _mm256_setzero_ps();
1509             fjz1             = _mm256_setzero_ps();
1510             fjx2             = _mm256_setzero_ps();
1511             fjy2             = _mm256_setzero_ps();
1512             fjz2             = _mm256_setzero_ps();
1513
1514             /**************************
1515              * CALCULATE INTERACTIONS *
1516              **************************/
1517
1518             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1519             {
1520
1521             r00              = _mm256_mul_ps(rsq00,rinv00);
1522
1523             /* REACTION-FIELD ELECTROSTATICS */
1524             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1525
1526             /* LENNARD-JONES DISPERSION/REPULSION */
1527
1528             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1529             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
1530             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1531             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1532             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1533
1534             d                = _mm256_sub_ps(r00,rswitch);
1535             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1536             d2               = _mm256_mul_ps(d,d);
1537             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1538
1539             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1540
1541             /* Evaluate switch function */
1542             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1543             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1544             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1545
1546             fscal            = _mm256_add_ps(felec,fvdw);
1547
1548             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1549
1550             /* Calculate temporary vectorial force */
1551             tx               = _mm256_mul_ps(fscal,dx00);
1552             ty               = _mm256_mul_ps(fscal,dy00);
1553             tz               = _mm256_mul_ps(fscal,dz00);
1554
1555             /* Update vectorial force */
1556             fix0             = _mm256_add_ps(fix0,tx);
1557             fiy0             = _mm256_add_ps(fiy0,ty);
1558             fiz0             = _mm256_add_ps(fiz0,tz);
1559
1560             fjx0             = _mm256_add_ps(fjx0,tx);
1561             fjy0             = _mm256_add_ps(fjy0,ty);
1562             fjz0             = _mm256_add_ps(fjz0,tz);
1563
1564             }
1565
1566             /**************************
1567              * CALCULATE INTERACTIONS *
1568              **************************/
1569
1570             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1571             {
1572
1573             /* REACTION-FIELD ELECTROSTATICS */
1574             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1575
1576             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1577
1578             fscal            = felec;
1579
1580             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1581
1582             /* Calculate temporary vectorial force */
1583             tx               = _mm256_mul_ps(fscal,dx01);
1584             ty               = _mm256_mul_ps(fscal,dy01);
1585             tz               = _mm256_mul_ps(fscal,dz01);
1586
1587             /* Update vectorial force */
1588             fix0             = _mm256_add_ps(fix0,tx);
1589             fiy0             = _mm256_add_ps(fiy0,ty);
1590             fiz0             = _mm256_add_ps(fiz0,tz);
1591
1592             fjx1             = _mm256_add_ps(fjx1,tx);
1593             fjy1             = _mm256_add_ps(fjy1,ty);
1594             fjz1             = _mm256_add_ps(fjz1,tz);
1595
1596             }
1597
1598             /**************************
1599              * CALCULATE INTERACTIONS *
1600              **************************/
1601
1602             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1603             {
1604
1605             /* REACTION-FIELD ELECTROSTATICS */
1606             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1607
1608             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1609
1610             fscal            = felec;
1611
1612             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1613
1614             /* Calculate temporary vectorial force */
1615             tx               = _mm256_mul_ps(fscal,dx02);
1616             ty               = _mm256_mul_ps(fscal,dy02);
1617             tz               = _mm256_mul_ps(fscal,dz02);
1618
1619             /* Update vectorial force */
1620             fix0             = _mm256_add_ps(fix0,tx);
1621             fiy0             = _mm256_add_ps(fiy0,ty);
1622             fiz0             = _mm256_add_ps(fiz0,tz);
1623
1624             fjx2             = _mm256_add_ps(fjx2,tx);
1625             fjy2             = _mm256_add_ps(fjy2,ty);
1626             fjz2             = _mm256_add_ps(fjz2,tz);
1627
1628             }
1629
1630             /**************************
1631              * CALCULATE INTERACTIONS *
1632              **************************/
1633
1634             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1635             {
1636
1637             /* REACTION-FIELD ELECTROSTATICS */
1638             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1639
1640             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1641
1642             fscal            = felec;
1643
1644             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1645
1646             /* Calculate temporary vectorial force */
1647             tx               = _mm256_mul_ps(fscal,dx10);
1648             ty               = _mm256_mul_ps(fscal,dy10);
1649             tz               = _mm256_mul_ps(fscal,dz10);
1650
1651             /* Update vectorial force */
1652             fix1             = _mm256_add_ps(fix1,tx);
1653             fiy1             = _mm256_add_ps(fiy1,ty);
1654             fiz1             = _mm256_add_ps(fiz1,tz);
1655
1656             fjx0             = _mm256_add_ps(fjx0,tx);
1657             fjy0             = _mm256_add_ps(fjy0,ty);
1658             fjz0             = _mm256_add_ps(fjz0,tz);
1659
1660             }
1661
1662             /**************************
1663              * CALCULATE INTERACTIONS *
1664              **************************/
1665
1666             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1667             {
1668
1669             /* REACTION-FIELD ELECTROSTATICS */
1670             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1671
1672             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1673
1674             fscal            = felec;
1675
1676             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1677
1678             /* Calculate temporary vectorial force */
1679             tx               = _mm256_mul_ps(fscal,dx11);
1680             ty               = _mm256_mul_ps(fscal,dy11);
1681             tz               = _mm256_mul_ps(fscal,dz11);
1682
1683             /* Update vectorial force */
1684             fix1             = _mm256_add_ps(fix1,tx);
1685             fiy1             = _mm256_add_ps(fiy1,ty);
1686             fiz1             = _mm256_add_ps(fiz1,tz);
1687
1688             fjx1             = _mm256_add_ps(fjx1,tx);
1689             fjy1             = _mm256_add_ps(fjy1,ty);
1690             fjz1             = _mm256_add_ps(fjz1,tz);
1691
1692             }
1693
1694             /**************************
1695              * CALCULATE INTERACTIONS *
1696              **************************/
1697
1698             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1699             {
1700
1701             /* REACTION-FIELD ELECTROSTATICS */
1702             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1703
1704             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1705
1706             fscal            = felec;
1707
1708             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1709
1710             /* Calculate temporary vectorial force */
1711             tx               = _mm256_mul_ps(fscal,dx12);
1712             ty               = _mm256_mul_ps(fscal,dy12);
1713             tz               = _mm256_mul_ps(fscal,dz12);
1714
1715             /* Update vectorial force */
1716             fix1             = _mm256_add_ps(fix1,tx);
1717             fiy1             = _mm256_add_ps(fiy1,ty);
1718             fiz1             = _mm256_add_ps(fiz1,tz);
1719
1720             fjx2             = _mm256_add_ps(fjx2,tx);
1721             fjy2             = _mm256_add_ps(fjy2,ty);
1722             fjz2             = _mm256_add_ps(fjz2,tz);
1723
1724             }
1725
1726             /**************************
1727              * CALCULATE INTERACTIONS *
1728              **************************/
1729
1730             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1731             {
1732
1733             /* REACTION-FIELD ELECTROSTATICS */
1734             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1735
1736             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1737
1738             fscal            = felec;
1739
1740             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1741
1742             /* Calculate temporary vectorial force */
1743             tx               = _mm256_mul_ps(fscal,dx20);
1744             ty               = _mm256_mul_ps(fscal,dy20);
1745             tz               = _mm256_mul_ps(fscal,dz20);
1746
1747             /* Update vectorial force */
1748             fix2             = _mm256_add_ps(fix2,tx);
1749             fiy2             = _mm256_add_ps(fiy2,ty);
1750             fiz2             = _mm256_add_ps(fiz2,tz);
1751
1752             fjx0             = _mm256_add_ps(fjx0,tx);
1753             fjy0             = _mm256_add_ps(fjy0,ty);
1754             fjz0             = _mm256_add_ps(fjz0,tz);
1755
1756             }
1757
1758             /**************************
1759              * CALCULATE INTERACTIONS *
1760              **************************/
1761
1762             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1763             {
1764
1765             /* REACTION-FIELD ELECTROSTATICS */
1766             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1767
1768             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1769
1770             fscal            = felec;
1771
1772             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1773
1774             /* Calculate temporary vectorial force */
1775             tx               = _mm256_mul_ps(fscal,dx21);
1776             ty               = _mm256_mul_ps(fscal,dy21);
1777             tz               = _mm256_mul_ps(fscal,dz21);
1778
1779             /* Update vectorial force */
1780             fix2             = _mm256_add_ps(fix2,tx);
1781             fiy2             = _mm256_add_ps(fiy2,ty);
1782             fiz2             = _mm256_add_ps(fiz2,tz);
1783
1784             fjx1             = _mm256_add_ps(fjx1,tx);
1785             fjy1             = _mm256_add_ps(fjy1,ty);
1786             fjz1             = _mm256_add_ps(fjz1,tz);
1787
1788             }
1789
1790             /**************************
1791              * CALCULATE INTERACTIONS *
1792              **************************/
1793
1794             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1795             {
1796
1797             /* REACTION-FIELD ELECTROSTATICS */
1798             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1799
1800             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1801
1802             fscal            = felec;
1803
1804             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1805
1806             /* Calculate temporary vectorial force */
1807             tx               = _mm256_mul_ps(fscal,dx22);
1808             ty               = _mm256_mul_ps(fscal,dy22);
1809             tz               = _mm256_mul_ps(fscal,dz22);
1810
1811             /* Update vectorial force */
1812             fix2             = _mm256_add_ps(fix2,tx);
1813             fiy2             = _mm256_add_ps(fiy2,ty);
1814             fiz2             = _mm256_add_ps(fiz2,tz);
1815
1816             fjx2             = _mm256_add_ps(fjx2,tx);
1817             fjy2             = _mm256_add_ps(fjy2,ty);
1818             fjz2             = _mm256_add_ps(fjz2,tz);
1819
1820             }
1821
1822             fjptrA             = f+j_coord_offsetA;
1823             fjptrB             = f+j_coord_offsetB;
1824             fjptrC             = f+j_coord_offsetC;
1825             fjptrD             = f+j_coord_offsetD;
1826             fjptrE             = f+j_coord_offsetE;
1827             fjptrF             = f+j_coord_offsetF;
1828             fjptrG             = f+j_coord_offsetG;
1829             fjptrH             = f+j_coord_offsetH;
1830
1831             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1832                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1833
1834             /* Inner loop uses 301 flops */
1835         }
1836
1837         if(jidx<j_index_end)
1838         {
1839
1840             /* Get j neighbor index, and coordinate index */
1841             jnrlistA         = jjnr[jidx];
1842             jnrlistB         = jjnr[jidx+1];
1843             jnrlistC         = jjnr[jidx+2];
1844             jnrlistD         = jjnr[jidx+3];
1845             jnrlistE         = jjnr[jidx+4];
1846             jnrlistF         = jjnr[jidx+5];
1847             jnrlistG         = jjnr[jidx+6];
1848             jnrlistH         = jjnr[jidx+7];
1849             /* Sign of each element will be negative for non-real atoms.
1850              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1851              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1852              */
1853             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1854                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1855                                             
1856             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1857             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1858             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1859             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1860             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1861             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1862             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1863             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1864             j_coord_offsetA  = DIM*jnrA;
1865             j_coord_offsetB  = DIM*jnrB;
1866             j_coord_offsetC  = DIM*jnrC;
1867             j_coord_offsetD  = DIM*jnrD;
1868             j_coord_offsetE  = DIM*jnrE;
1869             j_coord_offsetF  = DIM*jnrF;
1870             j_coord_offsetG  = DIM*jnrG;
1871             j_coord_offsetH  = DIM*jnrH;
1872
1873             /* load j atom coordinates */
1874             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1875                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1876                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1877                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1878                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1879
1880             /* Calculate displacement vector */
1881             dx00             = _mm256_sub_ps(ix0,jx0);
1882             dy00             = _mm256_sub_ps(iy0,jy0);
1883             dz00             = _mm256_sub_ps(iz0,jz0);
1884             dx01             = _mm256_sub_ps(ix0,jx1);
1885             dy01             = _mm256_sub_ps(iy0,jy1);
1886             dz01             = _mm256_sub_ps(iz0,jz1);
1887             dx02             = _mm256_sub_ps(ix0,jx2);
1888             dy02             = _mm256_sub_ps(iy0,jy2);
1889             dz02             = _mm256_sub_ps(iz0,jz2);
1890             dx10             = _mm256_sub_ps(ix1,jx0);
1891             dy10             = _mm256_sub_ps(iy1,jy0);
1892             dz10             = _mm256_sub_ps(iz1,jz0);
1893             dx11             = _mm256_sub_ps(ix1,jx1);
1894             dy11             = _mm256_sub_ps(iy1,jy1);
1895             dz11             = _mm256_sub_ps(iz1,jz1);
1896             dx12             = _mm256_sub_ps(ix1,jx2);
1897             dy12             = _mm256_sub_ps(iy1,jy2);
1898             dz12             = _mm256_sub_ps(iz1,jz2);
1899             dx20             = _mm256_sub_ps(ix2,jx0);
1900             dy20             = _mm256_sub_ps(iy2,jy0);
1901             dz20             = _mm256_sub_ps(iz2,jz0);
1902             dx21             = _mm256_sub_ps(ix2,jx1);
1903             dy21             = _mm256_sub_ps(iy2,jy1);
1904             dz21             = _mm256_sub_ps(iz2,jz1);
1905             dx22             = _mm256_sub_ps(ix2,jx2);
1906             dy22             = _mm256_sub_ps(iy2,jy2);
1907             dz22             = _mm256_sub_ps(iz2,jz2);
1908
1909             /* Calculate squared distance and things based on it */
1910             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1911             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1912             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1913             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1914             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1915             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1916             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1917             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1918             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1919
1920             rinv00           = avx256_invsqrt_f(rsq00);
1921             rinv01           = avx256_invsqrt_f(rsq01);
1922             rinv02           = avx256_invsqrt_f(rsq02);
1923             rinv10           = avx256_invsqrt_f(rsq10);
1924             rinv11           = avx256_invsqrt_f(rsq11);
1925             rinv12           = avx256_invsqrt_f(rsq12);
1926             rinv20           = avx256_invsqrt_f(rsq20);
1927             rinv21           = avx256_invsqrt_f(rsq21);
1928             rinv22           = avx256_invsqrt_f(rsq22);
1929
1930             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1931             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1932             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1933             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1934             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1935             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1936             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1937             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1938             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1939
1940             fjx0             = _mm256_setzero_ps();
1941             fjy0             = _mm256_setzero_ps();
1942             fjz0             = _mm256_setzero_ps();
1943             fjx1             = _mm256_setzero_ps();
1944             fjy1             = _mm256_setzero_ps();
1945             fjz1             = _mm256_setzero_ps();
1946             fjx2             = _mm256_setzero_ps();
1947             fjy2             = _mm256_setzero_ps();
1948             fjz2             = _mm256_setzero_ps();
1949
1950             /**************************
1951              * CALCULATE INTERACTIONS *
1952              **************************/
1953
1954             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1955             {
1956
1957             r00              = _mm256_mul_ps(rsq00,rinv00);
1958             r00              = _mm256_andnot_ps(dummy_mask,r00);
1959
1960             /* REACTION-FIELD ELECTROSTATICS */
1961             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1962
1963             /* LENNARD-JONES DISPERSION/REPULSION */
1964
1965             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1966             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
1967             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1968             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1969             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1970
1971             d                = _mm256_sub_ps(r00,rswitch);
1972             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1973             d2               = _mm256_mul_ps(d,d);
1974             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1975
1976             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1977
1978             /* Evaluate switch function */
1979             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1980             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1981             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1982
1983             fscal            = _mm256_add_ps(felec,fvdw);
1984
1985             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1986
1987             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1988
1989             /* Calculate temporary vectorial force */
1990             tx               = _mm256_mul_ps(fscal,dx00);
1991             ty               = _mm256_mul_ps(fscal,dy00);
1992             tz               = _mm256_mul_ps(fscal,dz00);
1993
1994             /* Update vectorial force */
1995             fix0             = _mm256_add_ps(fix0,tx);
1996             fiy0             = _mm256_add_ps(fiy0,ty);
1997             fiz0             = _mm256_add_ps(fiz0,tz);
1998
1999             fjx0             = _mm256_add_ps(fjx0,tx);
2000             fjy0             = _mm256_add_ps(fjy0,ty);
2001             fjz0             = _mm256_add_ps(fjz0,tz);
2002
2003             }
2004
2005             /**************************
2006              * CALCULATE INTERACTIONS *
2007              **************************/
2008
2009             if (gmx_mm256_any_lt(rsq01,rcutoff2))
2010             {
2011
2012             /* REACTION-FIELD ELECTROSTATICS */
2013             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
2014
2015             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2016
2017             fscal            = felec;
2018
2019             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2020
2021             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2022
2023             /* Calculate temporary vectorial force */
2024             tx               = _mm256_mul_ps(fscal,dx01);
2025             ty               = _mm256_mul_ps(fscal,dy01);
2026             tz               = _mm256_mul_ps(fscal,dz01);
2027
2028             /* Update vectorial force */
2029             fix0             = _mm256_add_ps(fix0,tx);
2030             fiy0             = _mm256_add_ps(fiy0,ty);
2031             fiz0             = _mm256_add_ps(fiz0,tz);
2032
2033             fjx1             = _mm256_add_ps(fjx1,tx);
2034             fjy1             = _mm256_add_ps(fjy1,ty);
2035             fjz1             = _mm256_add_ps(fjz1,tz);
2036
2037             }
2038
2039             /**************************
2040              * CALCULATE INTERACTIONS *
2041              **************************/
2042
2043             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2044             {
2045
2046             /* REACTION-FIELD ELECTROSTATICS */
2047             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
2048
2049             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2050
2051             fscal            = felec;
2052
2053             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2054
2055             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2056
2057             /* Calculate temporary vectorial force */
2058             tx               = _mm256_mul_ps(fscal,dx02);
2059             ty               = _mm256_mul_ps(fscal,dy02);
2060             tz               = _mm256_mul_ps(fscal,dz02);
2061
2062             /* Update vectorial force */
2063             fix0             = _mm256_add_ps(fix0,tx);
2064             fiy0             = _mm256_add_ps(fiy0,ty);
2065             fiz0             = _mm256_add_ps(fiz0,tz);
2066
2067             fjx2             = _mm256_add_ps(fjx2,tx);
2068             fjy2             = _mm256_add_ps(fjy2,ty);
2069             fjz2             = _mm256_add_ps(fjz2,tz);
2070
2071             }
2072
2073             /**************************
2074              * CALCULATE INTERACTIONS *
2075              **************************/
2076
2077             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2078             {
2079
2080             /* REACTION-FIELD ELECTROSTATICS */
2081             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
2082
2083             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2084
2085             fscal            = felec;
2086
2087             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2088
2089             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2090
2091             /* Calculate temporary vectorial force */
2092             tx               = _mm256_mul_ps(fscal,dx10);
2093             ty               = _mm256_mul_ps(fscal,dy10);
2094             tz               = _mm256_mul_ps(fscal,dz10);
2095
2096             /* Update vectorial force */
2097             fix1             = _mm256_add_ps(fix1,tx);
2098             fiy1             = _mm256_add_ps(fiy1,ty);
2099             fiz1             = _mm256_add_ps(fiz1,tz);
2100
2101             fjx0             = _mm256_add_ps(fjx0,tx);
2102             fjy0             = _mm256_add_ps(fjy0,ty);
2103             fjz0             = _mm256_add_ps(fjz0,tz);
2104
2105             }
2106
2107             /**************************
2108              * CALCULATE INTERACTIONS *
2109              **************************/
2110
2111             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2112             {
2113
2114             /* REACTION-FIELD ELECTROSTATICS */
2115             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
2116
2117             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2118
2119             fscal            = felec;
2120
2121             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2122
2123             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2124
2125             /* Calculate temporary vectorial force */
2126             tx               = _mm256_mul_ps(fscal,dx11);
2127             ty               = _mm256_mul_ps(fscal,dy11);
2128             tz               = _mm256_mul_ps(fscal,dz11);
2129
2130             /* Update vectorial force */
2131             fix1             = _mm256_add_ps(fix1,tx);
2132             fiy1             = _mm256_add_ps(fiy1,ty);
2133             fiz1             = _mm256_add_ps(fiz1,tz);
2134
2135             fjx1             = _mm256_add_ps(fjx1,tx);
2136             fjy1             = _mm256_add_ps(fjy1,ty);
2137             fjz1             = _mm256_add_ps(fjz1,tz);
2138
2139             }
2140
2141             /**************************
2142              * CALCULATE INTERACTIONS *
2143              **************************/
2144
2145             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2146             {
2147
2148             /* REACTION-FIELD ELECTROSTATICS */
2149             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
2150
2151             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2152
2153             fscal            = felec;
2154
2155             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2156
2157             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2158
2159             /* Calculate temporary vectorial force */
2160             tx               = _mm256_mul_ps(fscal,dx12);
2161             ty               = _mm256_mul_ps(fscal,dy12);
2162             tz               = _mm256_mul_ps(fscal,dz12);
2163
2164             /* Update vectorial force */
2165             fix1             = _mm256_add_ps(fix1,tx);
2166             fiy1             = _mm256_add_ps(fiy1,ty);
2167             fiz1             = _mm256_add_ps(fiz1,tz);
2168
2169             fjx2             = _mm256_add_ps(fjx2,tx);
2170             fjy2             = _mm256_add_ps(fjy2,ty);
2171             fjz2             = _mm256_add_ps(fjz2,tz);
2172
2173             }
2174
2175             /**************************
2176              * CALCULATE INTERACTIONS *
2177              **************************/
2178
2179             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2180             {
2181
2182             /* REACTION-FIELD ELECTROSTATICS */
2183             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
2184
2185             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2186
2187             fscal            = felec;
2188
2189             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2190
2191             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2192
2193             /* Calculate temporary vectorial force */
2194             tx               = _mm256_mul_ps(fscal,dx20);
2195             ty               = _mm256_mul_ps(fscal,dy20);
2196             tz               = _mm256_mul_ps(fscal,dz20);
2197
2198             /* Update vectorial force */
2199             fix2             = _mm256_add_ps(fix2,tx);
2200             fiy2             = _mm256_add_ps(fiy2,ty);
2201             fiz2             = _mm256_add_ps(fiz2,tz);
2202
2203             fjx0             = _mm256_add_ps(fjx0,tx);
2204             fjy0             = _mm256_add_ps(fjy0,ty);
2205             fjz0             = _mm256_add_ps(fjz0,tz);
2206
2207             }
2208
2209             /**************************
2210              * CALCULATE INTERACTIONS *
2211              **************************/
2212
2213             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2214             {
2215
2216             /* REACTION-FIELD ELECTROSTATICS */
2217             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2218
2219             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2220
2221             fscal            = felec;
2222
2223             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2224
2225             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2226
2227             /* Calculate temporary vectorial force */
2228             tx               = _mm256_mul_ps(fscal,dx21);
2229             ty               = _mm256_mul_ps(fscal,dy21);
2230             tz               = _mm256_mul_ps(fscal,dz21);
2231
2232             /* Update vectorial force */
2233             fix2             = _mm256_add_ps(fix2,tx);
2234             fiy2             = _mm256_add_ps(fiy2,ty);
2235             fiz2             = _mm256_add_ps(fiz2,tz);
2236
2237             fjx1             = _mm256_add_ps(fjx1,tx);
2238             fjy1             = _mm256_add_ps(fjy1,ty);
2239             fjz1             = _mm256_add_ps(fjz1,tz);
2240
2241             }
2242
2243             /**************************
2244              * CALCULATE INTERACTIONS *
2245              **************************/
2246
2247             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2248             {
2249
2250             /* REACTION-FIELD ELECTROSTATICS */
2251             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2252
2253             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2254
2255             fscal            = felec;
2256
2257             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2258
2259             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2260
2261             /* Calculate temporary vectorial force */
2262             tx               = _mm256_mul_ps(fscal,dx22);
2263             ty               = _mm256_mul_ps(fscal,dy22);
2264             tz               = _mm256_mul_ps(fscal,dz22);
2265
2266             /* Update vectorial force */
2267             fix2             = _mm256_add_ps(fix2,tx);
2268             fiy2             = _mm256_add_ps(fiy2,ty);
2269             fiz2             = _mm256_add_ps(fiz2,tz);
2270
2271             fjx2             = _mm256_add_ps(fjx2,tx);
2272             fjy2             = _mm256_add_ps(fjy2,ty);
2273             fjz2             = _mm256_add_ps(fjz2,tz);
2274
2275             }
2276
2277             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2278             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2279             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2280             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2281             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2282             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2283             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2284             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2285
2286             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2287                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2288
2289             /* Inner loop uses 302 flops */
2290         }
2291
2292         /* End of innermost loop */
2293
2294         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2295                                                  f+i_coord_offset,fshift+i_shift_offset);
2296
2297         /* Increment number of inner iterations */
2298         inneriter                  += j_index_end - j_index_start;
2299
2300         /* Outer loop uses 18 flops */
2301     }
2302
2303     /* Increment number of outer iterations */
2304     outeriter        += nri;
2305
2306     /* Update outer/inner flops */
2307
2308     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*302);
2309 }