1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double
51  * Electrostatics interaction: ReactionField
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
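/*
 * Summary of the per-pair quantities evaluated in the loops below, expressed
 * with the kernel's own variables (qq = charge product, c6/c12 from fr->nbfp,
 * r = pair distance, rc = the common cutoff):
 *
 *   velec = qq * (1/r + k_rf*r^2 - c_rf)                         reaction-field potential
 *   felec = qq * (1/r^3 - 2*k_rf)                                scalar force factor (multiplied onto dx,dy,dz)
 *
 *   vvdw  = (c12/r^12 - c12/rc^12)/12 - (c6/r^6 - c6/rc^6)/6     potential-shifted Lennard-Jones
 *   fvdw  = (c12/r^12 - c6/r^6) / r^2
 *
 * Both terms are zeroed beyond the cutoff via cutoff_mask; the site-0/site-0
 * pair contributes only the LJ term, while pairs among sites 1-3 contribute
 * only the electrostatic term.
 */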
56 void
57 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     real *           vdwioffsetptr0;
84     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     real *           vdwioffsetptr1;
86     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     real *           vdwioffsetptr2;
88     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     real *           vdwioffsetptr3;
90     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
116     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
117     __m256d          dummy_mask,cutoff_mask;
118     __m128           tmpmask0,tmpmask1;
119     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
120     __m256d          one     = _mm256_set1_pd(1.0);
121     __m256d          two     = _mm256_set1_pd(2.0);
122     x                = xx[0];
123     f                = ff[0];
124
125     nri              = nlist->nri;
126     iinr             = nlist->iinr;
127     jindex           = nlist->jindex;
128     jjnr             = nlist->jjnr;
129     shiftidx         = nlist->shift;
130     gid              = nlist->gid;
131     shiftvec         = fr->shift_vec[0];
132     fshift           = fr->fshift[0];
133     facel            = _mm256_set1_pd(fr->ic->epsfac);
134     charge           = mdatoms->chargeA;
135     krf              = _mm256_set1_pd(fr->ic->k_rf);
136     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
137     crf              = _mm256_set1_pd(fr->ic->c_rf);
138     nvdwtype         = fr->ntype;
139     vdwparam         = fr->nbfp;
140     vdwtype          = mdatoms->typeA;
141
142     /* Setup water-specific parameters */
143     inr              = nlist->iinr[0];
144     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
145     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
146     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
147     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
148
149     jq1              = _mm256_set1_pd(charge[inr+1]);
150     jq2              = _mm256_set1_pd(charge[inr+2]);
151     jq3              = _mm256_set1_pd(charge[inr+3]);
152     vdwjidx0A        = 2*vdwtype[inr+0];
153     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
154     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
155     qq11             = _mm256_mul_pd(iq1,jq1);
156     qq12             = _mm256_mul_pd(iq1,jq2);
157     qq13             = _mm256_mul_pd(iq1,jq3);
158     qq21             = _mm256_mul_pd(iq2,jq1);
159     qq22             = _mm256_mul_pd(iq2,jq2);
160     qq23             = _mm256_mul_pd(iq2,jq3);
161     qq31             = _mm256_mul_pd(iq3,jq1);
162     qq32             = _mm256_mul_pd(iq3,jq2);
163     qq33             = _mm256_mul_pd(iq3,jq3);
164
165     /* When we use explicit cutoffs the cutoff value must be identical for elec and VdW, so we use the electrostatics cutoff as the (arbitrary) common choice */
166     rcutoff_scalar   = fr->ic->rcoulomb;
167     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
168     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
169
170     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
171     rvdw             = _mm256_set1_pd(fr->ic->rvdw);
172
173     /* Avoid stupid compiler warnings */
174     jnrA = jnrB = jnrC = jnrD = 0;
175     j_coord_offsetA = 0;
176     j_coord_offsetB = 0;
177     j_coord_offsetC = 0;
178     j_coord_offsetD = 0;
179
180     outeriter        = 0;
181     inneriter        = 0;
182
183     for(iidx=0;iidx<4*DIM;iidx++)
184     {
185         scratch[iidx] = 0.0;
186     }
187
188     /* Start outer loop over neighborlists */
189     for(iidx=0; iidx<nri; iidx++)
190     {
191         /* Load shift vector for this list */
192         i_shift_offset   = DIM*shiftidx[iidx];
193
194         /* Load limits for loop over neighbors */
195         j_index_start    = jindex[iidx];
196         j_index_end      = jindex[iidx+1];
197
198         /* Get outer coordinate index */
199         inr              = iinr[iidx];
200         i_coord_offset   = DIM*inr;
201
202         /* Load i particle coords and add shift vector */
203         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
204                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
205
206         fix0             = _mm256_setzero_pd();
207         fiy0             = _mm256_setzero_pd();
208         fiz0             = _mm256_setzero_pd();
209         fix1             = _mm256_setzero_pd();
210         fiy1             = _mm256_setzero_pd();
211         fiz1             = _mm256_setzero_pd();
212         fix2             = _mm256_setzero_pd();
213         fiy2             = _mm256_setzero_pd();
214         fiz2             = _mm256_setzero_pd();
215         fix3             = _mm256_setzero_pd();
216         fiy3             = _mm256_setzero_pd();
217         fiz3             = _mm256_setzero_pd();
218
219         /* Reset potential sums */
220         velecsum         = _mm256_setzero_pd();
221         vvdwsum          = _mm256_setzero_pd();
222
223         /* Start inner kernel loop */
224         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
225         {
226
227             /* Get j neighbor index, and coordinate index */
228             jnrA             = jjnr[jidx];
229             jnrB             = jjnr[jidx+1];
230             jnrC             = jjnr[jidx+2];
231             jnrD             = jjnr[jidx+3];
232             j_coord_offsetA  = DIM*jnrA;
233             j_coord_offsetB  = DIM*jnrB;
234             j_coord_offsetC  = DIM*jnrC;
235             j_coord_offsetD  = DIM*jnrD;
236
237             /* load j atom coordinates */
238             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
239                                                  x+j_coord_offsetC,x+j_coord_offsetD,
240                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
241                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
242
243             /* Calculate displacement vector */
244             dx00             = _mm256_sub_pd(ix0,jx0);
245             dy00             = _mm256_sub_pd(iy0,jy0);
246             dz00             = _mm256_sub_pd(iz0,jz0);
247             dx11             = _mm256_sub_pd(ix1,jx1);
248             dy11             = _mm256_sub_pd(iy1,jy1);
249             dz11             = _mm256_sub_pd(iz1,jz1);
250             dx12             = _mm256_sub_pd(ix1,jx2);
251             dy12             = _mm256_sub_pd(iy1,jy2);
252             dz12             = _mm256_sub_pd(iz1,jz2);
253             dx13             = _mm256_sub_pd(ix1,jx3);
254             dy13             = _mm256_sub_pd(iy1,jy3);
255             dz13             = _mm256_sub_pd(iz1,jz3);
256             dx21             = _mm256_sub_pd(ix2,jx1);
257             dy21             = _mm256_sub_pd(iy2,jy1);
258             dz21             = _mm256_sub_pd(iz2,jz1);
259             dx22             = _mm256_sub_pd(ix2,jx2);
260             dy22             = _mm256_sub_pd(iy2,jy2);
261             dz22             = _mm256_sub_pd(iz2,jz2);
262             dx23             = _mm256_sub_pd(ix2,jx3);
263             dy23             = _mm256_sub_pd(iy2,jy3);
264             dz23             = _mm256_sub_pd(iz2,jz3);
265             dx31             = _mm256_sub_pd(ix3,jx1);
266             dy31             = _mm256_sub_pd(iy3,jy1);
267             dz31             = _mm256_sub_pd(iz3,jz1);
268             dx32             = _mm256_sub_pd(ix3,jx2);
269             dy32             = _mm256_sub_pd(iy3,jy2);
270             dz32             = _mm256_sub_pd(iz3,jz2);
271             dx33             = _mm256_sub_pd(ix3,jx3);
272             dy33             = _mm256_sub_pd(iy3,jy3);
273             dz33             = _mm256_sub_pd(iz3,jz3);
274
275             /* Calculate squared distance and things based on it */
276             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
277             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
278             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
279             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
280             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
281             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
282             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
283             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
284             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
285             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
286
287             rinv11           = avx256_invsqrt_d(rsq11);
288             rinv12           = avx256_invsqrt_d(rsq12);
289             rinv13           = avx256_invsqrt_d(rsq13);
290             rinv21           = avx256_invsqrt_d(rsq21);
291             rinv22           = avx256_invsqrt_d(rsq22);
292             rinv23           = avx256_invsqrt_d(rsq23);
293             rinv31           = avx256_invsqrt_d(rsq31);
294             rinv32           = avx256_invsqrt_d(rsq32);
295             rinv33           = avx256_invsqrt_d(rsq33);
296
297             rinvsq00         = avx256_inv_d(rsq00);
298             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
299             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
300             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
301             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
302             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
303             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
304             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
305             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
306             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
307
308             fjx0             = _mm256_setzero_pd();
309             fjy0             = _mm256_setzero_pd();
310             fjz0             = _mm256_setzero_pd();
311             fjx1             = _mm256_setzero_pd();
312             fjy1             = _mm256_setzero_pd();
313             fjz1             = _mm256_setzero_pd();
314             fjx2             = _mm256_setzero_pd();
315             fjy2             = _mm256_setzero_pd();
316             fjz2             = _mm256_setzero_pd();
317             fjx3             = _mm256_setzero_pd();
318             fjy3             = _mm256_setzero_pd();
319             fjz3             = _mm256_setzero_pd();
320
321             /**************************
322              * CALCULATE INTERACTIONS *
323              **************************/
324
325             if (gmx_mm256_any_lt(rsq00,rcutoff2))
326             {
327
328             /* LENNARD-JONES DISPERSION/REPULSION */
329
330             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
331             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
332             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
333             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
334                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
335             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
336
337             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
338
339             /* Update potential sum for this i atom from the interaction with this j atom. */
340             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
341             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
342
343             fscal            = fvdw;
344
345             fscal            = _mm256_and_pd(fscal,cutoff_mask);
346
347             /* Calculate temporary vectorial force */
348             tx               = _mm256_mul_pd(fscal,dx00);
349             ty               = _mm256_mul_pd(fscal,dy00);
350             tz               = _mm256_mul_pd(fscal,dz00);
351
352             /* Update vectorial force */
353             fix0             = _mm256_add_pd(fix0,tx);
354             fiy0             = _mm256_add_pd(fiy0,ty);
355             fiz0             = _mm256_add_pd(fiz0,tz);
356
357             fjx0             = _mm256_add_pd(fjx0,tx);
358             fjy0             = _mm256_add_pd(fjy0,ty);
359             fjz0             = _mm256_add_pd(fjz0,tz);
360
361             }
362
363             /**************************
364              * CALCULATE INTERACTIONS *
365              **************************/
366
367             if (gmx_mm256_any_lt(rsq11,rcutoff2))
368             {
369
370             /* REACTION-FIELD ELECTROSTATICS */
371             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
372             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
373
374             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
375
376             /* Update potential sum for this i atom from the interaction with this j atom. */
377             velec            = _mm256_and_pd(velec,cutoff_mask);
378             velecsum         = _mm256_add_pd(velecsum,velec);
379
380             fscal            = felec;
381
382             fscal            = _mm256_and_pd(fscal,cutoff_mask);
383
384             /* Calculate temporary vectorial force */
385             tx               = _mm256_mul_pd(fscal,dx11);
386             ty               = _mm256_mul_pd(fscal,dy11);
387             tz               = _mm256_mul_pd(fscal,dz11);
388
389             /* Update vectorial force */
390             fix1             = _mm256_add_pd(fix1,tx);
391             fiy1             = _mm256_add_pd(fiy1,ty);
392             fiz1             = _mm256_add_pd(fiz1,tz);
393
394             fjx1             = _mm256_add_pd(fjx1,tx);
395             fjy1             = _mm256_add_pd(fjy1,ty);
396             fjz1             = _mm256_add_pd(fjz1,tz);
397
398             }
399
400             /**************************
401              * CALCULATE INTERACTIONS *
402              **************************/
403
404             if (gmx_mm256_any_lt(rsq12,rcutoff2))
405             {
406
407             /* REACTION-FIELD ELECTROSTATICS */
408             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
409             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
410
411             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
412
413             /* Update potential sum for this i atom from the interaction with this j atom. */
414             velec            = _mm256_and_pd(velec,cutoff_mask);
415             velecsum         = _mm256_add_pd(velecsum,velec);
416
417             fscal            = felec;
418
419             fscal            = _mm256_and_pd(fscal,cutoff_mask);
420
421             /* Calculate temporary vectorial force */
422             tx               = _mm256_mul_pd(fscal,dx12);
423             ty               = _mm256_mul_pd(fscal,dy12);
424             tz               = _mm256_mul_pd(fscal,dz12);
425
426             /* Update vectorial force */
427             fix1             = _mm256_add_pd(fix1,tx);
428             fiy1             = _mm256_add_pd(fiy1,ty);
429             fiz1             = _mm256_add_pd(fiz1,tz);
430
431             fjx2             = _mm256_add_pd(fjx2,tx);
432             fjy2             = _mm256_add_pd(fjy2,ty);
433             fjz2             = _mm256_add_pd(fjz2,tz);
434
435             }
436
437             /**************************
438              * CALCULATE INTERACTIONS *
439              **************************/
440
441             if (gmx_mm256_any_lt(rsq13,rcutoff2))
442             {
443
444             /* REACTION-FIELD ELECTROSTATICS */
445             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
446             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
447
448             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
449
450             /* Update potential sum for this i atom from the interaction with this j atom. */
451             velec            = _mm256_and_pd(velec,cutoff_mask);
452             velecsum         = _mm256_add_pd(velecsum,velec);
453
454             fscal            = felec;
455
456             fscal            = _mm256_and_pd(fscal,cutoff_mask);
457
458             /* Calculate temporary vectorial force */
459             tx               = _mm256_mul_pd(fscal,dx13);
460             ty               = _mm256_mul_pd(fscal,dy13);
461             tz               = _mm256_mul_pd(fscal,dz13);
462
463             /* Update vectorial force */
464             fix1             = _mm256_add_pd(fix1,tx);
465             fiy1             = _mm256_add_pd(fiy1,ty);
466             fiz1             = _mm256_add_pd(fiz1,tz);
467
468             fjx3             = _mm256_add_pd(fjx3,tx);
469             fjy3             = _mm256_add_pd(fjy3,ty);
470             fjz3             = _mm256_add_pd(fjz3,tz);
471
472             }
473
474             /**************************
475              * CALCULATE INTERACTIONS *
476              **************************/
477
478             if (gmx_mm256_any_lt(rsq21,rcutoff2))
479             {
480
481             /* REACTION-FIELD ELECTROSTATICS */
482             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
483             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
484
485             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
486
487             /* Update potential sum for this i atom from the interaction with this j atom. */
488             velec            = _mm256_and_pd(velec,cutoff_mask);
489             velecsum         = _mm256_add_pd(velecsum,velec);
490
491             fscal            = felec;
492
493             fscal            = _mm256_and_pd(fscal,cutoff_mask);
494
495             /* Calculate temporary vectorial force */
496             tx               = _mm256_mul_pd(fscal,dx21);
497             ty               = _mm256_mul_pd(fscal,dy21);
498             tz               = _mm256_mul_pd(fscal,dz21);
499
500             /* Update vectorial force */
501             fix2             = _mm256_add_pd(fix2,tx);
502             fiy2             = _mm256_add_pd(fiy2,ty);
503             fiz2             = _mm256_add_pd(fiz2,tz);
504
505             fjx1             = _mm256_add_pd(fjx1,tx);
506             fjy1             = _mm256_add_pd(fjy1,ty);
507             fjz1             = _mm256_add_pd(fjz1,tz);
508
509             }
510
511             /**************************
512              * CALCULATE INTERACTIONS *
513              **************************/
514
515             if (gmx_mm256_any_lt(rsq22,rcutoff2))
516             {
517
518             /* REACTION-FIELD ELECTROSTATICS */
519             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
520             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
521
522             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
523
524             /* Update potential sum for this i atom from the interaction with this j atom. */
525             velec            = _mm256_and_pd(velec,cutoff_mask);
526             velecsum         = _mm256_add_pd(velecsum,velec);
527
528             fscal            = felec;
529
530             fscal            = _mm256_and_pd(fscal,cutoff_mask);
531
532             /* Calculate temporary vectorial force */
533             tx               = _mm256_mul_pd(fscal,dx22);
534             ty               = _mm256_mul_pd(fscal,dy22);
535             tz               = _mm256_mul_pd(fscal,dz22);
536
537             /* Update vectorial force */
538             fix2             = _mm256_add_pd(fix2,tx);
539             fiy2             = _mm256_add_pd(fiy2,ty);
540             fiz2             = _mm256_add_pd(fiz2,tz);
541
542             fjx2             = _mm256_add_pd(fjx2,tx);
543             fjy2             = _mm256_add_pd(fjy2,ty);
544             fjz2             = _mm256_add_pd(fjz2,tz);
545
546             }
547
548             /**************************
549              * CALCULATE INTERACTIONS *
550              **************************/
551
552             if (gmx_mm256_any_lt(rsq23,rcutoff2))
553             {
554
555             /* REACTION-FIELD ELECTROSTATICS */
556             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
557             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
558
559             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
560
561             /* Update potential sum for this i atom from the interaction with this j atom. */
562             velec            = _mm256_and_pd(velec,cutoff_mask);
563             velecsum         = _mm256_add_pd(velecsum,velec);
564
565             fscal            = felec;
566
567             fscal            = _mm256_and_pd(fscal,cutoff_mask);
568
569             /* Calculate temporary vectorial force */
570             tx               = _mm256_mul_pd(fscal,dx23);
571             ty               = _mm256_mul_pd(fscal,dy23);
572             tz               = _mm256_mul_pd(fscal,dz23);
573
574             /* Update vectorial force */
575             fix2             = _mm256_add_pd(fix2,tx);
576             fiy2             = _mm256_add_pd(fiy2,ty);
577             fiz2             = _mm256_add_pd(fiz2,tz);
578
579             fjx3             = _mm256_add_pd(fjx3,tx);
580             fjy3             = _mm256_add_pd(fjy3,ty);
581             fjz3             = _mm256_add_pd(fjz3,tz);
582
583             }
584
585             /**************************
586              * CALCULATE INTERACTIONS *
587              **************************/
588
589             if (gmx_mm256_any_lt(rsq31,rcutoff2))
590             {
591
592             /* REACTION-FIELD ELECTROSTATICS */
593             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
594             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
595
596             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
597
598             /* Update potential sum for this i atom from the interaction with this j atom. */
599             velec            = _mm256_and_pd(velec,cutoff_mask);
600             velecsum         = _mm256_add_pd(velecsum,velec);
601
602             fscal            = felec;
603
604             fscal            = _mm256_and_pd(fscal,cutoff_mask);
605
606             /* Calculate temporary vectorial force */
607             tx               = _mm256_mul_pd(fscal,dx31);
608             ty               = _mm256_mul_pd(fscal,dy31);
609             tz               = _mm256_mul_pd(fscal,dz31);
610
611             /* Update vectorial force */
612             fix3             = _mm256_add_pd(fix3,tx);
613             fiy3             = _mm256_add_pd(fiy3,ty);
614             fiz3             = _mm256_add_pd(fiz3,tz);
615
616             fjx1             = _mm256_add_pd(fjx1,tx);
617             fjy1             = _mm256_add_pd(fjy1,ty);
618             fjz1             = _mm256_add_pd(fjz1,tz);
619
620             }
621
622             /**************************
623              * CALCULATE INTERACTIONS *
624              **************************/
625
626             if (gmx_mm256_any_lt(rsq32,rcutoff2))
627             {
628
629             /* REACTION-FIELD ELECTROSTATICS */
630             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
631             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
632
633             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
634
635             /* Update potential sum for this i atom from the interaction with this j atom. */
636             velec            = _mm256_and_pd(velec,cutoff_mask);
637             velecsum         = _mm256_add_pd(velecsum,velec);
638
639             fscal            = felec;
640
641             fscal            = _mm256_and_pd(fscal,cutoff_mask);
642
643             /* Calculate temporary vectorial force */
644             tx               = _mm256_mul_pd(fscal,dx32);
645             ty               = _mm256_mul_pd(fscal,dy32);
646             tz               = _mm256_mul_pd(fscal,dz32);
647
648             /* Update vectorial force */
649             fix3             = _mm256_add_pd(fix3,tx);
650             fiy3             = _mm256_add_pd(fiy3,ty);
651             fiz3             = _mm256_add_pd(fiz3,tz);
652
653             fjx2             = _mm256_add_pd(fjx2,tx);
654             fjy2             = _mm256_add_pd(fjy2,ty);
655             fjz2             = _mm256_add_pd(fjz2,tz);
656
657             }
658
659             /**************************
660              * CALCULATE INTERACTIONS *
661              **************************/
662
663             if (gmx_mm256_any_lt(rsq33,rcutoff2))
664             {
665
666             /* REACTION-FIELD ELECTROSTATICS */
667             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
668             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
669
670             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
671
672             /* Update potential sum for this i atom from the interaction with this j atom. */
673             velec            = _mm256_and_pd(velec,cutoff_mask);
674             velecsum         = _mm256_add_pd(velecsum,velec);
675
676             fscal            = felec;
677
678             fscal            = _mm256_and_pd(fscal,cutoff_mask);
679
680             /* Calculate temporary vectorial force */
681             tx               = _mm256_mul_pd(fscal,dx33);
682             ty               = _mm256_mul_pd(fscal,dy33);
683             tz               = _mm256_mul_pd(fscal,dz33);
684
685             /* Update vectorial force */
686             fix3             = _mm256_add_pd(fix3,tx);
687             fiy3             = _mm256_add_pd(fiy3,ty);
688             fiz3             = _mm256_add_pd(fiz3,tz);
689
690             fjx3             = _mm256_add_pd(fjx3,tx);
691             fjy3             = _mm256_add_pd(fjy3,ty);
692             fjz3             = _mm256_add_pd(fjz3,tz);
693
694             }
695
696             fjptrA             = f+j_coord_offsetA;
697             fjptrB             = f+j_coord_offsetB;
698             fjptrC             = f+j_coord_offsetC;
699             fjptrD             = f+j_coord_offsetD;
700
701             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
702                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
703                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
704
705             /* Inner loop uses 368 flops */
706         }
707
708         if(jidx<j_index_end)
709         {
710
711             /* Get j neighbor index, and coordinate index */
712             jnrlistA         = jjnr[jidx];
713             jnrlistB         = jjnr[jidx+1];
714             jnrlistC         = jjnr[jidx+2];
715             jnrlistD         = jjnr[jidx+3];
716             /* Sign of each element will be negative for non-real atoms.
717              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
718              * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
719              */
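            /* Illustrative example (indices hypothetical): if only three real j atoms
             * remain, jjnr[jidx..jidx+3] holds {a, b, c, <negative padding>}; dummy_mask
             * is then all-ones in lane D only, the _mm256_andnot_pd() calls below zero
             * lane D's energy and force contributions, and fjptrD is pointed at the
             * local scratch buffer so nothing is written back to f[] for that lane.
             */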
720             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
721
722             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
723             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
724             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
725
726             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
727             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
728             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
729             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
730             j_coord_offsetA  = DIM*jnrA;
731             j_coord_offsetB  = DIM*jnrB;
732             j_coord_offsetC  = DIM*jnrC;
733             j_coord_offsetD  = DIM*jnrD;
734
735             /* load j atom coordinates */
736             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
737                                                  x+j_coord_offsetC,x+j_coord_offsetD,
738                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
739                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
740
741             /* Calculate displacement vector */
742             dx00             = _mm256_sub_pd(ix0,jx0);
743             dy00             = _mm256_sub_pd(iy0,jy0);
744             dz00             = _mm256_sub_pd(iz0,jz0);
745             dx11             = _mm256_sub_pd(ix1,jx1);
746             dy11             = _mm256_sub_pd(iy1,jy1);
747             dz11             = _mm256_sub_pd(iz1,jz1);
748             dx12             = _mm256_sub_pd(ix1,jx2);
749             dy12             = _mm256_sub_pd(iy1,jy2);
750             dz12             = _mm256_sub_pd(iz1,jz2);
751             dx13             = _mm256_sub_pd(ix1,jx3);
752             dy13             = _mm256_sub_pd(iy1,jy3);
753             dz13             = _mm256_sub_pd(iz1,jz3);
754             dx21             = _mm256_sub_pd(ix2,jx1);
755             dy21             = _mm256_sub_pd(iy2,jy1);
756             dz21             = _mm256_sub_pd(iz2,jz1);
757             dx22             = _mm256_sub_pd(ix2,jx2);
758             dy22             = _mm256_sub_pd(iy2,jy2);
759             dz22             = _mm256_sub_pd(iz2,jz2);
760             dx23             = _mm256_sub_pd(ix2,jx3);
761             dy23             = _mm256_sub_pd(iy2,jy3);
762             dz23             = _mm256_sub_pd(iz2,jz3);
763             dx31             = _mm256_sub_pd(ix3,jx1);
764             dy31             = _mm256_sub_pd(iy3,jy1);
765             dz31             = _mm256_sub_pd(iz3,jz1);
766             dx32             = _mm256_sub_pd(ix3,jx2);
767             dy32             = _mm256_sub_pd(iy3,jy2);
768             dz32             = _mm256_sub_pd(iz3,jz2);
769             dx33             = _mm256_sub_pd(ix3,jx3);
770             dy33             = _mm256_sub_pd(iy3,jy3);
771             dz33             = _mm256_sub_pd(iz3,jz3);
772
773             /* Calculate squared distance and things based on it */
774             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
775             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
776             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
777             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
778             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
779             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
780             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
781             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
782             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
783             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
784
785             rinv11           = avx256_invsqrt_d(rsq11);
786             rinv12           = avx256_invsqrt_d(rsq12);
787             rinv13           = avx256_invsqrt_d(rsq13);
788             rinv21           = avx256_invsqrt_d(rsq21);
789             rinv22           = avx256_invsqrt_d(rsq22);
790             rinv23           = avx256_invsqrt_d(rsq23);
791             rinv31           = avx256_invsqrt_d(rsq31);
792             rinv32           = avx256_invsqrt_d(rsq32);
793             rinv33           = avx256_invsqrt_d(rsq33);
794
795             rinvsq00         = avx256_inv_d(rsq00);
796             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
797             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
798             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
799             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
800             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
801             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
802             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
803             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
804             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
805
806             fjx0             = _mm256_setzero_pd();
807             fjy0             = _mm256_setzero_pd();
808             fjz0             = _mm256_setzero_pd();
809             fjx1             = _mm256_setzero_pd();
810             fjy1             = _mm256_setzero_pd();
811             fjz1             = _mm256_setzero_pd();
812             fjx2             = _mm256_setzero_pd();
813             fjy2             = _mm256_setzero_pd();
814             fjz2             = _mm256_setzero_pd();
815             fjx3             = _mm256_setzero_pd();
816             fjy3             = _mm256_setzero_pd();
817             fjz3             = _mm256_setzero_pd();
818
819             /**************************
820              * CALCULATE INTERACTIONS *
821              **************************/
822
823             if (gmx_mm256_any_lt(rsq00,rcutoff2))
824             {
825
826             /* LENNARD-JONES DISPERSION/REPULSION */
827
828             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
829             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
830             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
831             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
832                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
833             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
834
835             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
836
837             /* Update potential sum for this i atom from the interaction with this j atom. */
838             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
839             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
840             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
841
842             fscal            = fvdw;
843
844             fscal            = _mm256_and_pd(fscal,cutoff_mask);
845
846             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
847
848             /* Calculate temporary vectorial force */
849             tx               = _mm256_mul_pd(fscal,dx00);
850             ty               = _mm256_mul_pd(fscal,dy00);
851             tz               = _mm256_mul_pd(fscal,dz00);
852
853             /* Update vectorial force */
854             fix0             = _mm256_add_pd(fix0,tx);
855             fiy0             = _mm256_add_pd(fiy0,ty);
856             fiz0             = _mm256_add_pd(fiz0,tz);
857
858             fjx0             = _mm256_add_pd(fjx0,tx);
859             fjy0             = _mm256_add_pd(fjy0,ty);
860             fjz0             = _mm256_add_pd(fjz0,tz);
861
862             }
863
864             /**************************
865              * CALCULATE INTERACTIONS *
866              **************************/
867
868             if (gmx_mm256_any_lt(rsq11,rcutoff2))
869             {
870
871             /* REACTION-FIELD ELECTROSTATICS */
872             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
873             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
874
875             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
876
877             /* Update potential sum for this i atom from the interaction with this j atom. */
878             velec            = _mm256_and_pd(velec,cutoff_mask);
879             velec            = _mm256_andnot_pd(dummy_mask,velec);
880             velecsum         = _mm256_add_pd(velecsum,velec);
881
882             fscal            = felec;
883
884             fscal            = _mm256_and_pd(fscal,cutoff_mask);
885
886             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
887
888             /* Calculate temporary vectorial force */
889             tx               = _mm256_mul_pd(fscal,dx11);
890             ty               = _mm256_mul_pd(fscal,dy11);
891             tz               = _mm256_mul_pd(fscal,dz11);
892
893             /* Update vectorial force */
894             fix1             = _mm256_add_pd(fix1,tx);
895             fiy1             = _mm256_add_pd(fiy1,ty);
896             fiz1             = _mm256_add_pd(fiz1,tz);
897
898             fjx1             = _mm256_add_pd(fjx1,tx);
899             fjy1             = _mm256_add_pd(fjy1,ty);
900             fjz1             = _mm256_add_pd(fjz1,tz);
901
902             }
903
904             /**************************
905              * CALCULATE INTERACTIONS *
906              **************************/
907
908             if (gmx_mm256_any_lt(rsq12,rcutoff2))
909             {
910
911             /* REACTION-FIELD ELECTROSTATICS */
912             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
913             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
914
915             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
916
917             /* Update potential sum for this i atom from the interaction with this j atom. */
918             velec            = _mm256_and_pd(velec,cutoff_mask);
919             velec            = _mm256_andnot_pd(dummy_mask,velec);
920             velecsum         = _mm256_add_pd(velecsum,velec);
921
922             fscal            = felec;
923
924             fscal            = _mm256_and_pd(fscal,cutoff_mask);
925
926             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
927
928             /* Calculate temporary vectorial force */
929             tx               = _mm256_mul_pd(fscal,dx12);
930             ty               = _mm256_mul_pd(fscal,dy12);
931             tz               = _mm256_mul_pd(fscal,dz12);
932
933             /* Update vectorial force */
934             fix1             = _mm256_add_pd(fix1,tx);
935             fiy1             = _mm256_add_pd(fiy1,ty);
936             fiz1             = _mm256_add_pd(fiz1,tz);
937
938             fjx2             = _mm256_add_pd(fjx2,tx);
939             fjy2             = _mm256_add_pd(fjy2,ty);
940             fjz2             = _mm256_add_pd(fjz2,tz);
941
942             }
943
944             /**************************
945              * CALCULATE INTERACTIONS *
946              **************************/
947
948             if (gmx_mm256_any_lt(rsq13,rcutoff2))
949             {
950
951             /* REACTION-FIELD ELECTROSTATICS */
952             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
953             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
954
955             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
956
957             /* Update potential sum for this i atom from the interaction with this j atom. */
958             velec            = _mm256_and_pd(velec,cutoff_mask);
959             velec            = _mm256_andnot_pd(dummy_mask,velec);
960             velecsum         = _mm256_add_pd(velecsum,velec);
961
962             fscal            = felec;
963
964             fscal            = _mm256_and_pd(fscal,cutoff_mask);
965
966             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
967
968             /* Calculate temporary vectorial force */
969             tx               = _mm256_mul_pd(fscal,dx13);
970             ty               = _mm256_mul_pd(fscal,dy13);
971             tz               = _mm256_mul_pd(fscal,dz13);
972
973             /* Update vectorial force */
974             fix1             = _mm256_add_pd(fix1,tx);
975             fiy1             = _mm256_add_pd(fiy1,ty);
976             fiz1             = _mm256_add_pd(fiz1,tz);
977
978             fjx3             = _mm256_add_pd(fjx3,tx);
979             fjy3             = _mm256_add_pd(fjy3,ty);
980             fjz3             = _mm256_add_pd(fjz3,tz);
981
982             }
983
984             /**************************
985              * CALCULATE INTERACTIONS *
986              **************************/
987
988             if (gmx_mm256_any_lt(rsq21,rcutoff2))
989             {
990
991             /* REACTION-FIELD ELECTROSTATICS */
992             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
993             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
994
995             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
996
997             /* Update potential sum for this i atom from the interaction with this j atom. */
998             velec            = _mm256_and_pd(velec,cutoff_mask);
999             velec            = _mm256_andnot_pd(dummy_mask,velec);
1000             velecsum         = _mm256_add_pd(velecsum,velec);
1001
1002             fscal            = felec;
1003
1004             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1005
1006             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1007
1008             /* Calculate temporary vectorial force */
1009             tx               = _mm256_mul_pd(fscal,dx21);
1010             ty               = _mm256_mul_pd(fscal,dy21);
1011             tz               = _mm256_mul_pd(fscal,dz21);
1012
1013             /* Update vectorial force */
1014             fix2             = _mm256_add_pd(fix2,tx);
1015             fiy2             = _mm256_add_pd(fiy2,ty);
1016             fiz2             = _mm256_add_pd(fiz2,tz);
1017
1018             fjx1             = _mm256_add_pd(fjx1,tx);
1019             fjy1             = _mm256_add_pd(fjy1,ty);
1020             fjz1             = _mm256_add_pd(fjz1,tz);
1021
1022             }
1023
1024             /**************************
1025              * CALCULATE INTERACTIONS *
1026              **************************/
1027
1028             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1029             {
1030
1031             /* REACTION-FIELD ELECTROSTATICS */
1032             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1033             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1034
1035             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1036
1037             /* Update potential sum for this i atom from the interaction with this j atom. */
1038             velec            = _mm256_and_pd(velec,cutoff_mask);
1039             velec            = _mm256_andnot_pd(dummy_mask,velec);
1040             velecsum         = _mm256_add_pd(velecsum,velec);
1041
1042             fscal            = felec;
1043
1044             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1045
1046             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1047
1048             /* Calculate temporary vectorial force */
1049             tx               = _mm256_mul_pd(fscal,dx22);
1050             ty               = _mm256_mul_pd(fscal,dy22);
1051             tz               = _mm256_mul_pd(fscal,dz22);
1052
1053             /* Update vectorial force */
1054             fix2             = _mm256_add_pd(fix2,tx);
1055             fiy2             = _mm256_add_pd(fiy2,ty);
1056             fiz2             = _mm256_add_pd(fiz2,tz);
1057
1058             fjx2             = _mm256_add_pd(fjx2,tx);
1059             fjy2             = _mm256_add_pd(fjy2,ty);
1060             fjz2             = _mm256_add_pd(fjz2,tz);
1061
1062             }
1063
1064             /**************************
1065              * CALCULATE INTERACTIONS *
1066              **************************/
1067
1068             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1069             {
1070
1071             /* REACTION-FIELD ELECTROSTATICS */
1072             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
1073             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1074
1075             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1076
1077             /* Update potential sum for this i atom from the interaction with this j atom. */
1078             velec            = _mm256_and_pd(velec,cutoff_mask);
1079             velec            = _mm256_andnot_pd(dummy_mask,velec);
1080             velecsum         = _mm256_add_pd(velecsum,velec);
1081
1082             fscal            = felec;
1083
1084             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1085
1086             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1087
1088             /* Calculate temporary vectorial force */
1089             tx               = _mm256_mul_pd(fscal,dx23);
1090             ty               = _mm256_mul_pd(fscal,dy23);
1091             tz               = _mm256_mul_pd(fscal,dz23);
1092
1093             /* Update vectorial force */
1094             fix2             = _mm256_add_pd(fix2,tx);
1095             fiy2             = _mm256_add_pd(fiy2,ty);
1096             fiz2             = _mm256_add_pd(fiz2,tz);
1097
1098             fjx3             = _mm256_add_pd(fjx3,tx);
1099             fjy3             = _mm256_add_pd(fjy3,ty);
1100             fjz3             = _mm256_add_pd(fjz3,tz);
1101
1102             }
1103
1104             /**************************
1105              * CALCULATE INTERACTIONS *
1106              **************************/
1107
1108             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1109             {
1110
1111             /* REACTION-FIELD ELECTROSTATICS */
1112             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
1113             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1114
1115             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1116
1117             /* Update potential sum for this i atom from the interaction with this j atom. */
1118             velec            = _mm256_and_pd(velec,cutoff_mask);
1119             velec            = _mm256_andnot_pd(dummy_mask,velec);
1120             velecsum         = _mm256_add_pd(velecsum,velec);
1121
1122             fscal            = felec;
1123
1124             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1125
1126             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1127
1128             /* Calculate temporary vectorial force */
1129             tx               = _mm256_mul_pd(fscal,dx31);
1130             ty               = _mm256_mul_pd(fscal,dy31);
1131             tz               = _mm256_mul_pd(fscal,dz31);
1132
1133             /* Update vectorial force */
1134             fix3             = _mm256_add_pd(fix3,tx);
1135             fiy3             = _mm256_add_pd(fiy3,ty);
1136             fiz3             = _mm256_add_pd(fiz3,tz);
1137
1138             fjx1             = _mm256_add_pd(fjx1,tx);
1139             fjy1             = _mm256_add_pd(fjy1,ty);
1140             fjz1             = _mm256_add_pd(fjz1,tz);
1141
1142             }
1143
1144             /**************************
1145              * CALCULATE INTERACTIONS *
1146              **************************/
1147
1148             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1149             {
1150
1151             /* REACTION-FIELD ELECTROSTATICS */
1152             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
1153             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1154
1155             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1156
1157             /* Update potential sum for this i atom from the interaction with this j atom. */
1158             velec            = _mm256_and_pd(velec,cutoff_mask);
1159             velec            = _mm256_andnot_pd(dummy_mask,velec);
1160             velecsum         = _mm256_add_pd(velecsum,velec);
1161
1162             fscal            = felec;
1163
1164             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1165
1166             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1167
1168             /* Calculate temporary vectorial force */
1169             tx               = _mm256_mul_pd(fscal,dx32);
1170             ty               = _mm256_mul_pd(fscal,dy32);
1171             tz               = _mm256_mul_pd(fscal,dz32);
1172
1173             /* Update vectorial force */
1174             fix3             = _mm256_add_pd(fix3,tx);
1175             fiy3             = _mm256_add_pd(fiy3,ty);
1176             fiz3             = _mm256_add_pd(fiz3,tz);
1177
1178             fjx2             = _mm256_add_pd(fjx2,tx);
1179             fjy2             = _mm256_add_pd(fjy2,ty);
1180             fjz2             = _mm256_add_pd(fjz2,tz);
1181
1182             }
1183
1184             /**************************
1185              * CALCULATE INTERACTIONS *
1186              **************************/
1187
1188             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1189             {
1190
1191             /* REACTION-FIELD ELECTROSTATICS */
1192             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
1193             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1194
1195             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1196
1197             /* Update potential sum for this i atom from the interaction with this j atom. */
1198             velec            = _mm256_and_pd(velec,cutoff_mask);
1199             velec            = _mm256_andnot_pd(dummy_mask,velec);
1200             velecsum         = _mm256_add_pd(velecsum,velec);
1201
1202             fscal            = felec;
1203
1204             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1205
1206             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1207
1208             /* Calculate temporary vectorial force */
1209             tx               = _mm256_mul_pd(fscal,dx33);
1210             ty               = _mm256_mul_pd(fscal,dy33);
1211             tz               = _mm256_mul_pd(fscal,dz33);
1212
1213             /* Update vectorial force */
1214             fix3             = _mm256_add_pd(fix3,tx);
1215             fiy3             = _mm256_add_pd(fiy3,ty);
1216             fiz3             = _mm256_add_pd(fiz3,tz);
1217
1218             fjx3             = _mm256_add_pd(fjx3,tx);
1219             fjy3             = _mm256_add_pd(fjy3,ty);
1220             fjz3             = _mm256_add_pd(fjz3,tz);
1221
1222             }
1223
1224             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1225             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1226             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1227             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1228
1229             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1230                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1231                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
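            /* The fj accumulators hold contributions with the same sign as those added
             * to the i-atom accumulators above; by Newton's third law the j atoms get
             * the opposite force, which is why the swizzle routine decrements (rather
             * than increments) the per-atom force array. Padded entries are routed to
             * the local scratch buffer instead of real force memory.
             */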
1232
1233             /* Inner loop uses 368 flops */
1234         }
1235
1236         /* End of innermost loop */
1237
1238         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1239                                                  f+i_coord_offset,fshift+i_shift_offset);
1240
1241         ggid                        = gid[iidx];
1242         /* Update potential energies */
1243         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1244         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
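        /* gmx_mm256_update_1pot_pd() reduces the four SIMD lanes with a horizontal add
         * and accumulates the result into the energy bin for this i-list's energy
         * group pair, selected by ggid.
         */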
1245
1246         /* Increment number of inner iterations */
1247         inneriter                  += j_index_end - j_index_start;
1248
1249         /* Outer loop uses 26 flops */
1250     }
1251
1252     /* Increment number of outer iterations */
1253     outeriter        += nri;
1254
1255     /* Update outer/inner flops */
1256
1257     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*368);
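    /* inc_nrnb() records the estimated operation count for this kernel, using the
     * per-inner/outer-iteration flop counts noted above, so that mdrun can include it
     * in its performance accounting.
     */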
1258 }
1259 /*
1260  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double
1261  * Electrostatics interaction: ReactionField
1262  * VdW interaction:            LennardJones
1263  * Geometry:                   Water4-Water4
1264  * Calculate force/pot:        Force
1265  */
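/*
 * Note: this force-only kernel evaluates the same pair interactions as the
 * potential-and-force kernel above, but skips the energy terms. As a scalar
 * sketch (with qq already including epsfac), the quantities computed per pair are
 *   reaction field:  F(r)/r = qq*(1/r^3 - 2*k_rf)
 *   Lennard-Jones:   F(r)/r = (12*C12/r^12 - 6*C6/r^6)/r^2
 * The factors 6 and 12 do not appear explicitly below because the parameters in
 * fr->nbfp appear to be stored premultiplied by them.
 */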
1266 void
1267 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double
1268                     (t_nblist                    * gmx_restrict       nlist,
1269                      rvec                        * gmx_restrict          xx,
1270                      rvec                        * gmx_restrict          ff,
1271                      struct t_forcerec           * gmx_restrict          fr,
1272                      t_mdatoms                   * gmx_restrict     mdatoms,
1273                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1274                      t_nrnb                      * gmx_restrict        nrnb)
1275 {
1276     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1277      * just 0 for non-waters.
1278      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1279      * jnr indices corresponding to data put in the four positions in the SIMD register.
1280      */
1281     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1282     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1283     int              jnrA,jnrB,jnrC,jnrD;
1284     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1285     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1286     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1287     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1288     real             rcutoff_scalar;
1289     real             *shiftvec,*fshift,*x,*f;
1290     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1291     real             scratch[4*DIM];
1292     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1293     real *           vdwioffsetptr0;
1294     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1295     real *           vdwioffsetptr1;
1296     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1297     real *           vdwioffsetptr2;
1298     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1299     real *           vdwioffsetptr3;
1300     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1301     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1302     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1303     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1304     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1305     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1306     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1307     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1308     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1309     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1310     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1311     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1312     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1313     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1314     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1315     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1316     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1317     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1318     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1319     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1320     real             *charge;
1321     int              nvdwtype;
1322     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1323     int              *vdwtype;
1324     real             *vdwparam;
1325     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1326     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1327     __m256d          dummy_mask,cutoff_mask;
1328     __m128           tmpmask0,tmpmask1;
1329     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1330     __m256d          one     = _mm256_set1_pd(1.0);
1331     __m256d          two     = _mm256_set1_pd(2.0);
1332     x                = xx[0];
1333     f                = ff[0];
1334
1335     nri              = nlist->nri;
1336     iinr             = nlist->iinr;
1337     jindex           = nlist->jindex;
1338     jjnr             = nlist->jjnr;
1339     shiftidx         = nlist->shift;
1340     gid              = nlist->gid;
1341     shiftvec         = fr->shift_vec[0];
1342     fshift           = fr->fshift[0];
1343     facel            = _mm256_set1_pd(fr->ic->epsfac);
1344     charge           = mdatoms->chargeA;
1345     krf              = _mm256_set1_pd(fr->ic->k_rf);
1346     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
1347     crf              = _mm256_set1_pd(fr->ic->c_rf);
1348     nvdwtype         = fr->ntype;
1349     vdwparam         = fr->nbfp;
1350     vdwtype          = mdatoms->typeA;
1351
1352     /* Setup water-specific parameters */
1353     inr              = nlist->iinr[0];
1354     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1355     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1356     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1357     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1358
1359     jq1              = _mm256_set1_pd(charge[inr+1]);
1360     jq2              = _mm256_set1_pd(charge[inr+2]);
1361     jq3              = _mm256_set1_pd(charge[inr+3]);
1362     vdwjidx0A        = 2*vdwtype[inr+0];
1363     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1364     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1365     qq11             = _mm256_mul_pd(iq1,jq1);
1366     qq12             = _mm256_mul_pd(iq1,jq2);
1367     qq13             = _mm256_mul_pd(iq1,jq3);
1368     qq21             = _mm256_mul_pd(iq2,jq1);
1369     qq22             = _mm256_mul_pd(iq2,jq2);
1370     qq23             = _mm256_mul_pd(iq2,jq3);
1371     qq31             = _mm256_mul_pd(iq3,jq1);
1372     qq32             = _mm256_mul_pd(iq3,jq2);
1373     qq33             = _mm256_mul_pd(iq3,jq3);
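    /* Since every i and j entry in a Water4-Water4 list uses the same water model,
     * the charge products can be formed once outside the loops. iq1..iq3 already
     * include the epsfac prefactor, so each qqXY equals epsfac*q_X*q_Y.
     */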
1374
1375     /* When explicit cutoffs are used, the electrostatics and VdW cutoff values must be identical, so we use the electrostatics cutoff as an arbitrary choice */
1376     rcutoff_scalar   = fr->ic->rcoulomb;
1377     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1378     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1379
1380     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
1381     rvdw             = _mm256_set1_pd(fr->ic->rvdw);
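    /* sh_vdw_invrcut6 is the 1/rc^6 constant used to shift the LJ potential at the
     * cutoff. Shifting the potential by a constant does not change the force, so it
     * is loaded here but not referenced in the force-only interaction blocks below.
     */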
1382
1383     /* Initialize to avoid spurious compiler warnings about possibly-uninitialized variables */
1384     jnrA = jnrB = jnrC = jnrD = 0;
1385     j_coord_offsetA = 0;
1386     j_coord_offsetB = 0;
1387     j_coord_offsetC = 0;
1388     j_coord_offsetD = 0;
1389
1390     outeriter        = 0;
1391     inneriter        = 0;
1392
1393     for(iidx=0;iidx<4*DIM;iidx++)
1394     {
1395         scratch[iidx] = 0.0;
1396     }
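    /* scratch acts as a dummy write target: in the masked tail loop below, force
     * updates for padded (negative-index) j entries are redirected here instead of
     * into the real force array.
     */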
1397
1398     /* Start outer loop over neighborlists */
1399     for(iidx=0; iidx<nri; iidx++)
1400     {
1401         /* Load shift vector for this list */
1402         i_shift_offset   = DIM*shiftidx[iidx];
1403
1404         /* Load limits for loop over neighbors */
1405         j_index_start    = jindex[iidx];
1406         j_index_end      = jindex[iidx+1];
1407
1408         /* Get outer coordinate index */
1409         inr              = iinr[iidx];
1410         i_coord_offset   = DIM*inr;
1411
1412         /* Load i particle coords and add shift vector */
1413         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1414                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
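        /* The shift vector translates the i water into the periodic image appropriate
         * for the j atoms in this list, and each shifted coordinate is broadcast to
         * all four SIMD lanes. Accumulating the i forces per shift index (fshift)
         * later allows the virial to be computed.
         */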
1415
1416         fix0             = _mm256_setzero_pd();
1417         fiy0             = _mm256_setzero_pd();
1418         fiz0             = _mm256_setzero_pd();
1419         fix1             = _mm256_setzero_pd();
1420         fiy1             = _mm256_setzero_pd();
1421         fiz1             = _mm256_setzero_pd();
1422         fix2             = _mm256_setzero_pd();
1423         fiy2             = _mm256_setzero_pd();
1424         fiz2             = _mm256_setzero_pd();
1425         fix3             = _mm256_setzero_pd();
1426         fiy3             = _mm256_setzero_pd();
1427         fiz3             = _mm256_setzero_pd();
1428
1429         /* Start inner kernel loop */
1430         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1431         {
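            /* This loop handles full quartets of real j atoms; the neighbor list is
             * padded with negative indices, so jjnr[jidx+3]>=0 guarantees that all
             * four entries are real. Any remaining 1-3 entries are processed by the
             * masked block after this loop.
             */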
1432
1433             /* Get j neighbor index, and coordinate index */
1434             jnrA             = jjnr[jidx];
1435             jnrB             = jjnr[jidx+1];
1436             jnrC             = jjnr[jidx+2];
1437             jnrD             = jjnr[jidx+3];
1438             j_coord_offsetA  = DIM*jnrA;
1439             j_coord_offsetB  = DIM*jnrB;
1440             j_coord_offsetC  = DIM*jnrC;
1441             j_coord_offsetD  = DIM*jnrD;
1442
1443             /* Load j atom coordinates */
1444             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1445                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1446                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1447                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1448
1449             /* Calculate displacement vector */
1450             dx00             = _mm256_sub_pd(ix0,jx0);
1451             dy00             = _mm256_sub_pd(iy0,jy0);
1452             dz00             = _mm256_sub_pd(iz0,jz0);
1453             dx11             = _mm256_sub_pd(ix1,jx1);
1454             dy11             = _mm256_sub_pd(iy1,jy1);
1455             dz11             = _mm256_sub_pd(iz1,jz1);
1456             dx12             = _mm256_sub_pd(ix1,jx2);
1457             dy12             = _mm256_sub_pd(iy1,jy2);
1458             dz12             = _mm256_sub_pd(iz1,jz2);
1459             dx13             = _mm256_sub_pd(ix1,jx3);
1460             dy13             = _mm256_sub_pd(iy1,jy3);
1461             dz13             = _mm256_sub_pd(iz1,jz3);
1462             dx21             = _mm256_sub_pd(ix2,jx1);
1463             dy21             = _mm256_sub_pd(iy2,jy1);
1464             dz21             = _mm256_sub_pd(iz2,jz1);
1465             dx22             = _mm256_sub_pd(ix2,jx2);
1466             dy22             = _mm256_sub_pd(iy2,jy2);
1467             dz22             = _mm256_sub_pd(iz2,jz2);
1468             dx23             = _mm256_sub_pd(ix2,jx3);
1469             dy23             = _mm256_sub_pd(iy2,jy3);
1470             dz23             = _mm256_sub_pd(iz2,jz3);
1471             dx31             = _mm256_sub_pd(ix3,jx1);
1472             dy31             = _mm256_sub_pd(iy3,jy1);
1473             dz31             = _mm256_sub_pd(iz3,jz1);
1474             dx32             = _mm256_sub_pd(ix3,jx2);
1475             dy32             = _mm256_sub_pd(iy3,jy2);
1476             dz32             = _mm256_sub_pd(iz3,jz2);
1477             dx33             = _mm256_sub_pd(ix3,jx3);
1478             dy33             = _mm256_sub_pd(iy3,jy3);
1479             dz33             = _mm256_sub_pd(iz3,jz3);
1480
1481             /* Calculate squared distances and quantities derived from them */
1482             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1483             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1484             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1485             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1486             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1487             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1488             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1489             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1490             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1491             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1492
1493             rinv11           = avx256_invsqrt_d(rsq11);
1494             rinv12           = avx256_invsqrt_d(rsq12);
1495             rinv13           = avx256_invsqrt_d(rsq13);
1496             rinv21           = avx256_invsqrt_d(rsq21);
1497             rinv22           = avx256_invsqrt_d(rsq22);
1498             rinv23           = avx256_invsqrt_d(rsq23);
1499             rinv31           = avx256_invsqrt_d(rsq31);
1500             rinv32           = avx256_invsqrt_d(rsq32);
1501             rinv33           = avx256_invsqrt_d(rsq33);
1502
1503             rinvsq00         = avx256_inv_d(rsq00);
1504             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1505             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1506             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1507             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1508             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1509             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1510             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1511             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1512             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1513
1514             fjx0             = _mm256_setzero_pd();
1515             fjy0             = _mm256_setzero_pd();
1516             fjz0             = _mm256_setzero_pd();
1517             fjx1             = _mm256_setzero_pd();
1518             fjy1             = _mm256_setzero_pd();
1519             fjz1             = _mm256_setzero_pd();
1520             fjx2             = _mm256_setzero_pd();
1521             fjy2             = _mm256_setzero_pd();
1522             fjz2             = _mm256_setzero_pd();
1523             fjx3             = _mm256_setzero_pd();
1524             fjy3             = _mm256_setzero_pd();
1525             fjz3             = _mm256_setzero_pd();
1526
1527             /**************************
1528              * CALCULATE INTERACTIONS *
1529              **************************/
1530
1531             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1532             {
1533
1534             /* LENNARD-JONES DISPERSION/REPULSION */
1535
1536             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1537             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
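            /* fvdw is the LJ scalar force divided by r, i.e. roughly
             * 12*C12/r^14 - 6*C6/r^8; the 6 and 12 factors appear to be absorbed into
             * c6_00/c12_00 as stored in fr->nbfp, so only the combination above is
             * needed here.
             */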
1538
1539             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1540
1541             fscal            = fvdw;
1542
1543             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1544
1545             /* Calculate temporary vectorial force */
1546             tx               = _mm256_mul_pd(fscal,dx00);
1547             ty               = _mm256_mul_pd(fscal,dy00);
1548             tz               = _mm256_mul_pd(fscal,dz00);
1549
1550             /* Update vectorial force */
1551             fix0             = _mm256_add_pd(fix0,tx);
1552             fiy0             = _mm256_add_pd(fiy0,ty);
1553             fiz0             = _mm256_add_pd(fiz0,tz);
1554
1555             fjx0             = _mm256_add_pd(fjx0,tx);
1556             fjy0             = _mm256_add_pd(fjy0,ty);
1557             fjz0             = _mm256_add_pd(fjz0,tz);
1558
1559             }
1560
1561             /**************************
1562              * CALCULATE INTERACTIONS *
1563              **************************/
1564
1565             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1566             {
1567
1568             /* REACTION-FIELD ELECTROSTATICS */
1569             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
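            /* felec is the reaction-field scalar force divided by r:
             * F(r)/r = qq*(1/r^3 - 2*k_rf), obtained from
             * V(r) = qq*(1/r + k_rf*r^2 - c_rf) via F = -dV/dr
             * (the constant c_rf shift does not contribute to the force).
             */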
1570
1571             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1572
1573             fscal            = felec;
1574
1575             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1576
1577             /* Calculate temporary vectorial force */
1578             tx               = _mm256_mul_pd(fscal,dx11);
1579             ty               = _mm256_mul_pd(fscal,dy11);
1580             tz               = _mm256_mul_pd(fscal,dz11);
1581
1582             /* Update vectorial force */
1583             fix1             = _mm256_add_pd(fix1,tx);
1584             fiy1             = _mm256_add_pd(fiy1,ty);
1585             fiz1             = _mm256_add_pd(fiz1,tz);
1586
1587             fjx1             = _mm256_add_pd(fjx1,tx);
1588             fjy1             = _mm256_add_pd(fjy1,ty);
1589             fjz1             = _mm256_add_pd(fjz1,tz);
1590
1591             }
1592
1593             /**************************
1594              * CALCULATE INTERACTIONS *
1595              **************************/
1596
1597             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1598             {
1599
1600             /* REACTION-FIELD ELECTROSTATICS */
1601             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1602
1603             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1604
1605             fscal            = felec;
1606
1607             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1608
1609             /* Calculate temporary vectorial force */
1610             tx               = _mm256_mul_pd(fscal,dx12);
1611             ty               = _mm256_mul_pd(fscal,dy12);
1612             tz               = _mm256_mul_pd(fscal,dz12);
1613
1614             /* Update vectorial force */
1615             fix1             = _mm256_add_pd(fix1,tx);
1616             fiy1             = _mm256_add_pd(fiy1,ty);
1617             fiz1             = _mm256_add_pd(fiz1,tz);
1618
1619             fjx2             = _mm256_add_pd(fjx2,tx);
1620             fjy2             = _mm256_add_pd(fjy2,ty);
1621             fjz2             = _mm256_add_pd(fjz2,tz);
1622
1623             }
1624
1625             /**************************
1626              * CALCULATE INTERACTIONS *
1627              **************************/
1628
1629             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1630             {
1631
1632             /* REACTION-FIELD ELECTROSTATICS */
1633             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1634
1635             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1636
1637             fscal            = felec;
1638
1639             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1640
1641             /* Calculate temporary vectorial force */
1642             tx               = _mm256_mul_pd(fscal,dx13);
1643             ty               = _mm256_mul_pd(fscal,dy13);
1644             tz               = _mm256_mul_pd(fscal,dz13);
1645
1646             /* Update vectorial force */
1647             fix1             = _mm256_add_pd(fix1,tx);
1648             fiy1             = _mm256_add_pd(fiy1,ty);
1649             fiz1             = _mm256_add_pd(fiz1,tz);
1650
1651             fjx3             = _mm256_add_pd(fjx3,tx);
1652             fjy3             = _mm256_add_pd(fjy3,ty);
1653             fjz3             = _mm256_add_pd(fjz3,tz);
1654
1655             }
1656
1657             /**************************
1658              * CALCULATE INTERACTIONS *
1659              **************************/
1660
1661             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1662             {
1663
1664             /* REACTION-FIELD ELECTROSTATICS */
1665             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1666
1667             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1668
1669             fscal            = felec;
1670
1671             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1672
1673             /* Calculate temporary vectorial force */
1674             tx               = _mm256_mul_pd(fscal,dx21);
1675             ty               = _mm256_mul_pd(fscal,dy21);
1676             tz               = _mm256_mul_pd(fscal,dz21);
1677
1678             /* Update vectorial force */
1679             fix2             = _mm256_add_pd(fix2,tx);
1680             fiy2             = _mm256_add_pd(fiy2,ty);
1681             fiz2             = _mm256_add_pd(fiz2,tz);
1682
1683             fjx1             = _mm256_add_pd(fjx1,tx);
1684             fjy1             = _mm256_add_pd(fjy1,ty);
1685             fjz1             = _mm256_add_pd(fjz1,tz);
1686
1687             }
1688
1689             /**************************
1690              * CALCULATE INTERACTIONS *
1691              **************************/
1692
1693             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1694             {
1695
1696             /* REACTION-FIELD ELECTROSTATICS */
1697             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1698
1699             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1700
1701             fscal            = felec;
1702
1703             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1704
1705             /* Calculate temporary vectorial force */
1706             tx               = _mm256_mul_pd(fscal,dx22);
1707             ty               = _mm256_mul_pd(fscal,dy22);
1708             tz               = _mm256_mul_pd(fscal,dz22);
1709
1710             /* Update vectorial force */
1711             fix2             = _mm256_add_pd(fix2,tx);
1712             fiy2             = _mm256_add_pd(fiy2,ty);
1713             fiz2             = _mm256_add_pd(fiz2,tz);
1714
1715             fjx2             = _mm256_add_pd(fjx2,tx);
1716             fjy2             = _mm256_add_pd(fjy2,ty);
1717             fjz2             = _mm256_add_pd(fjz2,tz);
1718
1719             }
1720
1721             /**************************
1722              * CALCULATE INTERACTIONS *
1723              **************************/
1724
1725             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1726             {
1727
1728             /* REACTION-FIELD ELECTROSTATICS */
1729             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1730
1731             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1732
1733             fscal            = felec;
1734
1735             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1736
1737             /* Calculate temporary vectorial force */
1738             tx               = _mm256_mul_pd(fscal,dx23);
1739             ty               = _mm256_mul_pd(fscal,dy23);
1740             tz               = _mm256_mul_pd(fscal,dz23);
1741
1742             /* Update vectorial force */
1743             fix2             = _mm256_add_pd(fix2,tx);
1744             fiy2             = _mm256_add_pd(fiy2,ty);
1745             fiz2             = _mm256_add_pd(fiz2,tz);
1746
1747             fjx3             = _mm256_add_pd(fjx3,tx);
1748             fjy3             = _mm256_add_pd(fjy3,ty);
1749             fjz3             = _mm256_add_pd(fjz3,tz);
1750
1751             }
1752
1753             /**************************
1754              * CALCULATE INTERACTIONS *
1755              **************************/
1756
1757             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1758             {
1759
1760             /* REACTION-FIELD ELECTROSTATICS */
1761             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1762
1763             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1764
1765             fscal            = felec;
1766
1767             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1768
1769             /* Calculate temporary vectorial force */
1770             tx               = _mm256_mul_pd(fscal,dx31);
1771             ty               = _mm256_mul_pd(fscal,dy31);
1772             tz               = _mm256_mul_pd(fscal,dz31);
1773
1774             /* Update vectorial force */
1775             fix3             = _mm256_add_pd(fix3,tx);
1776             fiy3             = _mm256_add_pd(fiy3,ty);
1777             fiz3             = _mm256_add_pd(fiz3,tz);
1778
1779             fjx1             = _mm256_add_pd(fjx1,tx);
1780             fjy1             = _mm256_add_pd(fjy1,ty);
1781             fjz1             = _mm256_add_pd(fjz1,tz);
1782
1783             }
1784
1785             /**************************
1786              * CALCULATE INTERACTIONS *
1787              **************************/
1788
1789             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1790             {
1791
1792             /* REACTION-FIELD ELECTROSTATICS */
1793             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1794
1795             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1796
1797             fscal            = felec;
1798
1799             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1800
1801             /* Calculate temporary vectorial force */
1802             tx               = _mm256_mul_pd(fscal,dx32);
1803             ty               = _mm256_mul_pd(fscal,dy32);
1804             tz               = _mm256_mul_pd(fscal,dz32);
1805
1806             /* Update vectorial force */
1807             fix3             = _mm256_add_pd(fix3,tx);
1808             fiy3             = _mm256_add_pd(fiy3,ty);
1809             fiz3             = _mm256_add_pd(fiz3,tz);
1810
1811             fjx2             = _mm256_add_pd(fjx2,tx);
1812             fjy2             = _mm256_add_pd(fjy2,ty);
1813             fjz2             = _mm256_add_pd(fjz2,tz);
1814
1815             }
1816
1817             /**************************
1818              * CALCULATE INTERACTIONS *
1819              **************************/
1820
1821             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1822             {
1823
1824             /* REACTION-FIELD ELECTROSTATICS */
1825             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1826
1827             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1828
1829             fscal            = felec;
1830
1831             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1832
1833             /* Calculate temporary vectorial force */
1834             tx               = _mm256_mul_pd(fscal,dx33);
1835             ty               = _mm256_mul_pd(fscal,dy33);
1836             tz               = _mm256_mul_pd(fscal,dz33);
1837
1838             /* Update vectorial force */
1839             fix3             = _mm256_add_pd(fix3,tx);
1840             fiy3             = _mm256_add_pd(fiy3,ty);
1841             fiz3             = _mm256_add_pd(fiz3,tz);
1842
1843             fjx3             = _mm256_add_pd(fjx3,tx);
1844             fjy3             = _mm256_add_pd(fjy3,ty);
1845             fjz3             = _mm256_add_pd(fjz3,tz);
1846
1847             }
1848
1849             fjptrA             = f+j_coord_offsetA;
1850             fjptrB             = f+j_coord_offsetB;
1851             fjptrC             = f+j_coord_offsetC;
1852             fjptrD             = f+j_coord_offsetD;
1853
1854             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1855                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1856                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1857
1858             /* Inner loop uses 303 flops */
1859         }
1860
1861         if(jidx<j_index_end)
1862         {
1863
1864             /* Get j neighbor index, and coordinate index */
1865             jnrlistA         = jjnr[jidx];
1866             jnrlistB         = jjnr[jidx+1];
1867             jnrlistC         = jjnr[jidx+2];
1868             jnrlistD         = jjnr[jidx+3];
1869             /* Sign of each element will be negative for non-real (padded) atoms.
1870              * The mask will have all bits set for dummy entries and be 0x0 for real ones,
1871              * so it is used as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1872              */
1873             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1874
1875             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1876             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1877             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
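            /* _mm_cmplt_epi32 yields a 32-bit all-ones mask for each negative (padded)
             * index; the two permutes duplicate each 32-bit element into a pair so
             * that, after merging the two __m128 halves and reinterpreting the result
             * as __m256d, every 64-bit lane belonging to a padded entry is fully set.
             */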
1878
1879             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1880             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1881             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1882             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1883             j_coord_offsetA  = DIM*jnrA;
1884             j_coord_offsetB  = DIM*jnrB;
1885             j_coord_offsetC  = DIM*jnrC;
1886             j_coord_offsetD  = DIM*jnrD;
1887
1888             /* Load j atom coordinates */
1889             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1890                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1891                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1892                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1893
1894             /* Calculate displacement vector */
1895             dx00             = _mm256_sub_pd(ix0,jx0);
1896             dy00             = _mm256_sub_pd(iy0,jy0);
1897             dz00             = _mm256_sub_pd(iz0,jz0);
1898             dx11             = _mm256_sub_pd(ix1,jx1);
1899             dy11             = _mm256_sub_pd(iy1,jy1);
1900             dz11             = _mm256_sub_pd(iz1,jz1);
1901             dx12             = _mm256_sub_pd(ix1,jx2);
1902             dy12             = _mm256_sub_pd(iy1,jy2);
1903             dz12             = _mm256_sub_pd(iz1,jz2);
1904             dx13             = _mm256_sub_pd(ix1,jx3);
1905             dy13             = _mm256_sub_pd(iy1,jy3);
1906             dz13             = _mm256_sub_pd(iz1,jz3);
1907             dx21             = _mm256_sub_pd(ix2,jx1);
1908             dy21             = _mm256_sub_pd(iy2,jy1);
1909             dz21             = _mm256_sub_pd(iz2,jz1);
1910             dx22             = _mm256_sub_pd(ix2,jx2);
1911             dy22             = _mm256_sub_pd(iy2,jy2);
1912             dz22             = _mm256_sub_pd(iz2,jz2);
1913             dx23             = _mm256_sub_pd(ix2,jx3);
1914             dy23             = _mm256_sub_pd(iy2,jy3);
1915             dz23             = _mm256_sub_pd(iz2,jz3);
1916             dx31             = _mm256_sub_pd(ix3,jx1);
1917             dy31             = _mm256_sub_pd(iy3,jy1);
1918             dz31             = _mm256_sub_pd(iz3,jz1);
1919             dx32             = _mm256_sub_pd(ix3,jx2);
1920             dy32             = _mm256_sub_pd(iy3,jy2);
1921             dz32             = _mm256_sub_pd(iz3,jz2);
1922             dx33             = _mm256_sub_pd(ix3,jx3);
1923             dy33             = _mm256_sub_pd(iy3,jy3);
1924             dz33             = _mm256_sub_pd(iz3,jz3);
1925
1926             /* Calculate squared distances and quantities derived from them */
1927             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1928             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1929             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1930             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1931             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1932             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1933             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1934             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1935             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1936             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1937
1938             rinv11           = avx256_invsqrt_d(rsq11);
1939             rinv12           = avx256_invsqrt_d(rsq12);
1940             rinv13           = avx256_invsqrt_d(rsq13);
1941             rinv21           = avx256_invsqrt_d(rsq21);
1942             rinv22           = avx256_invsqrt_d(rsq22);
1943             rinv23           = avx256_invsqrt_d(rsq23);
1944             rinv31           = avx256_invsqrt_d(rsq31);
1945             rinv32           = avx256_invsqrt_d(rsq32);
1946             rinv33           = avx256_invsqrt_d(rsq33);
1947
1948             rinvsq00         = avx256_inv_d(rsq00);
1949             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1950             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1951             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1952             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1953             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1954             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1955             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1956             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1957             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1958
1959             fjx0             = _mm256_setzero_pd();
1960             fjy0             = _mm256_setzero_pd();
1961             fjz0             = _mm256_setzero_pd();
1962             fjx1             = _mm256_setzero_pd();
1963             fjy1             = _mm256_setzero_pd();
1964             fjz1             = _mm256_setzero_pd();
1965             fjx2             = _mm256_setzero_pd();
1966             fjy2             = _mm256_setzero_pd();
1967             fjz2             = _mm256_setzero_pd();
1968             fjx3             = _mm256_setzero_pd();
1969             fjy3             = _mm256_setzero_pd();
1970             fjz3             = _mm256_setzero_pd();
1971
1972             /**************************
1973              * CALCULATE INTERACTIONS *
1974              **************************/
1975
1976             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1977             {
1978
1979             /* LENNARD-JONES DISPERSION/REPULSION */
1980
1981             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1982             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1983
1984             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1985
1986             fscal            = fvdw;
1987
1988             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1989
1990             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1991
1992             /* Calculate temporary vectorial force */
1993             tx               = _mm256_mul_pd(fscal,dx00);
1994             ty               = _mm256_mul_pd(fscal,dy00);
1995             tz               = _mm256_mul_pd(fscal,dz00);
1996
1997             /* Update vectorial force */
1998             fix0             = _mm256_add_pd(fix0,tx);
1999             fiy0             = _mm256_add_pd(fiy0,ty);
2000             fiz0             = _mm256_add_pd(fiz0,tz);
2001
2002             fjx0             = _mm256_add_pd(fjx0,tx);
2003             fjy0             = _mm256_add_pd(fjy0,ty);
2004             fjz0             = _mm256_add_pd(fjz0,tz);
2005
2006             }
2007
2008             /**************************
2009              * CALCULATE INTERACTIONS *
2010              **************************/
2011
2012             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2013             {
2014
2015             /* REACTION-FIELD ELECTROSTATICS */
2016             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2017
2018             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2019
2020             fscal            = felec;
2021
2022             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2023
2024             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2025
2026             /* Calculate temporary vectorial force */
2027             tx               = _mm256_mul_pd(fscal,dx11);
2028             ty               = _mm256_mul_pd(fscal,dy11);
2029             tz               = _mm256_mul_pd(fscal,dz11);
2030
2031             /* Update vectorial force */
2032             fix1             = _mm256_add_pd(fix1,tx);
2033             fiy1             = _mm256_add_pd(fiy1,ty);
2034             fiz1             = _mm256_add_pd(fiz1,tz);
2035
2036             fjx1             = _mm256_add_pd(fjx1,tx);
2037             fjy1             = _mm256_add_pd(fjy1,ty);
2038             fjz1             = _mm256_add_pd(fjz1,tz);
2039
2040             }
2041
2042             /**************************
2043              * CALCULATE INTERACTIONS *
2044              **************************/
2045
2046             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2047             {
2048
2049             /* REACTION-FIELD ELECTROSTATICS */
2050             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2051
2052             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2053
2054             fscal            = felec;
2055
2056             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2057
2058             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2059
2060             /* Calculate temporary vectorial force */
2061             tx               = _mm256_mul_pd(fscal,dx12);
2062             ty               = _mm256_mul_pd(fscal,dy12);
2063             tz               = _mm256_mul_pd(fscal,dz12);
2064
2065             /* Update vectorial force */
2066             fix1             = _mm256_add_pd(fix1,tx);
2067             fiy1             = _mm256_add_pd(fiy1,ty);
2068             fiz1             = _mm256_add_pd(fiz1,tz);
2069
2070             fjx2             = _mm256_add_pd(fjx2,tx);
2071             fjy2             = _mm256_add_pd(fjy2,ty);
2072             fjz2             = _mm256_add_pd(fjz2,tz);
2073
2074             }
2075
2076             /**************************
2077              * CALCULATE INTERACTIONS *
2078              **************************/
2079
2080             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2081             {
2082
2083             /* REACTION-FIELD ELECTROSTATICS */
2084             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
2085
2086             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2087
2088             fscal            = felec;
2089
2090             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2091
2092             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2093
2094             /* Calculate temporary vectorial force */
2095             tx               = _mm256_mul_pd(fscal,dx13);
2096             ty               = _mm256_mul_pd(fscal,dy13);
2097             tz               = _mm256_mul_pd(fscal,dz13);
2098
2099             /* Update vectorial force */
2100             fix1             = _mm256_add_pd(fix1,tx);
2101             fiy1             = _mm256_add_pd(fiy1,ty);
2102             fiz1             = _mm256_add_pd(fiz1,tz);
2103
2104             fjx3             = _mm256_add_pd(fjx3,tx);
2105             fjy3             = _mm256_add_pd(fjy3,ty);
2106             fjz3             = _mm256_add_pd(fjz3,tz);
2107
2108             }
2109
2110             /**************************
2111              * CALCULATE INTERACTIONS *
2112              **************************/
2113
2114             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2115             {
2116
2117             /* REACTION-FIELD ELECTROSTATICS */
2118             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2119
2120             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2121
2122             fscal            = felec;
2123
2124             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2125
2126             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2127
2128             /* Calculate temporary vectorial force */
2129             tx               = _mm256_mul_pd(fscal,dx21);
2130             ty               = _mm256_mul_pd(fscal,dy21);
2131             tz               = _mm256_mul_pd(fscal,dz21);
2132
2133             /* Update vectorial force */
2134             fix2             = _mm256_add_pd(fix2,tx);
2135             fiy2             = _mm256_add_pd(fiy2,ty);
2136             fiz2             = _mm256_add_pd(fiz2,tz);
2137
2138             fjx1             = _mm256_add_pd(fjx1,tx);
2139             fjy1             = _mm256_add_pd(fjy1,ty);
2140             fjz1             = _mm256_add_pd(fjz1,tz);
2141
2142             }
2143
2144             /**************************
2145              * CALCULATE INTERACTIONS *
2146              **************************/
2147
2148             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2149             {
2150
2151             /* REACTION-FIELD ELECTROSTATICS */
2152             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2153
2154             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2155
2156             fscal            = felec;
2157
2158             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2159
2160             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2161
2162             /* Calculate temporary vectorial force */
2163             tx               = _mm256_mul_pd(fscal,dx22);
2164             ty               = _mm256_mul_pd(fscal,dy22);
2165             tz               = _mm256_mul_pd(fscal,dz22);
2166
2167             /* Update vectorial force */
2168             fix2             = _mm256_add_pd(fix2,tx);
2169             fiy2             = _mm256_add_pd(fiy2,ty);
2170             fiz2             = _mm256_add_pd(fiz2,tz);
2171
2172             fjx2             = _mm256_add_pd(fjx2,tx);
2173             fjy2             = _mm256_add_pd(fjy2,ty);
2174             fjz2             = _mm256_add_pd(fjz2,tz);
2175
2176             }
2177
2178             /**************************
2179              * CALCULATE INTERACTIONS *
2180              **************************/
2181
2182             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2183             {
2184
2185             /* REACTION-FIELD ELECTROSTATICS */
2186             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
2187
2188             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2189
2190             fscal            = felec;
2191
2192             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2193
2194             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2195
2196             /* Calculate temporary vectorial force */
2197             tx               = _mm256_mul_pd(fscal,dx23);
2198             ty               = _mm256_mul_pd(fscal,dy23);
2199             tz               = _mm256_mul_pd(fscal,dz23);
2200
2201             /* Update vectorial force */
2202             fix2             = _mm256_add_pd(fix2,tx);
2203             fiy2             = _mm256_add_pd(fiy2,ty);
2204             fiz2             = _mm256_add_pd(fiz2,tz);
2205
2206             fjx3             = _mm256_add_pd(fjx3,tx);
2207             fjy3             = _mm256_add_pd(fjy3,ty);
2208             fjz3             = _mm256_add_pd(fjz3,tz);
2209
2210             }
2211
2212             /**************************
2213              * CALCULATE INTERACTIONS *
2214              **************************/
2215
2216             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2217             {
2218
2219             /* REACTION-FIELD ELECTROSTATICS */
2220             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
2221
2222             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2223
2224             fscal            = felec;
2225
2226             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2227
2228             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2229
2230             /* Calculate temporary vectorial force */
2231             tx               = _mm256_mul_pd(fscal,dx31);
2232             ty               = _mm256_mul_pd(fscal,dy31);
2233             tz               = _mm256_mul_pd(fscal,dz31);
2234
2235             /* Update vectorial force */
2236             fix3             = _mm256_add_pd(fix3,tx);
2237             fiy3             = _mm256_add_pd(fiy3,ty);
2238             fiz3             = _mm256_add_pd(fiz3,tz);
2239
2240             fjx1             = _mm256_add_pd(fjx1,tx);
2241             fjy1             = _mm256_add_pd(fjy1,ty);
2242             fjz1             = _mm256_add_pd(fjz1,tz);
2243
2244             }
2245
2246             /**************************
2247              * CALCULATE INTERACTIONS *
2248              **************************/
2249
2250             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2251             {
2252
2253             /* REACTION-FIELD ELECTROSTATICS */
2254             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
2255
2256             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2257
2258             fscal            = felec;
2259
2260             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2261
2262             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2263
2264             /* Calculate temporary vectorial force */
2265             tx               = _mm256_mul_pd(fscal,dx32);
2266             ty               = _mm256_mul_pd(fscal,dy32);
2267             tz               = _mm256_mul_pd(fscal,dz32);
2268
2269             /* Update vectorial force */
2270             fix3             = _mm256_add_pd(fix3,tx);
2271             fiy3             = _mm256_add_pd(fiy3,ty);
2272             fiz3             = _mm256_add_pd(fiz3,tz);
2273
2274             fjx2             = _mm256_add_pd(fjx2,tx);
2275             fjy2             = _mm256_add_pd(fjy2,ty);
2276             fjz2             = _mm256_add_pd(fjz2,tz);
2277
2278             }
2279
2280             /**************************
2281              * CALCULATE INTERACTIONS *
2282              **************************/
2283
2284             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2285             {
2286
2287             /* REACTION-FIELD ELECTROSTATICS */
2288             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
2289
2290             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2291
2292             fscal            = felec;
2293
2294             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2295
2296             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2297
2298             /* Calculate temporary vectorial force */
2299             tx               = _mm256_mul_pd(fscal,dx33);
2300             ty               = _mm256_mul_pd(fscal,dy33);
2301             tz               = _mm256_mul_pd(fscal,dz33);
2302
2303             /* Update vectorial force */
2304             fix3             = _mm256_add_pd(fix3,tx);
2305             fiy3             = _mm256_add_pd(fiy3,ty);
2306             fiz3             = _mm256_add_pd(fiz3,tz);
2307
2308             fjx3             = _mm256_add_pd(fjx3,tx);
2309             fjy3             = _mm256_add_pd(fjy3,ty);
2310             fjz3             = _mm256_add_pd(fjz3,tz);
2311
2312             }
2313
2314             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2315             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2316             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2317             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2318
2319             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2320                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2321                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2322
2323             /* Inner loop uses 303 flops */
2324         }
2325
2326         /* End of innermost loop */
2327
2328         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2329                                                  f+i_coord_offset,fshift+i_shift_offset);
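        /* The i-force accumulators are transposed back from SIMD layout, reduced over
         * the four lanes, and added to both the per-atom force array and the shift
         * force array used for the virial.
         */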
2330
2331         /* Increment number of inner iterations */
2332         inneriter                  += j_index_end - j_index_start;
2333
2334         /* Outer loop uses 24 flops */
2335     }
2336
2337     /* Increment number of outer iterations */
2338     outeriter        += nri;
2339
2340     /* Update outer/inner flops */
2341
2342     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
2343 }