Remove no-inline-max-size and suppress remark
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_avx_256_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double
54  * Electrostatics interaction: ReactionField
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84     real             scratch[4*DIM];
85     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     real *           vdwioffsetptr3;
93     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
95     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
97     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
99     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
101     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
102     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
103     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
106     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
109     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
110     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
111     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
112     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
113     real             *charge;
114     int              nvdwtype;
115     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
116     int              *vdwtype;
117     real             *vdwparam;
118     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
119     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
120     __m256d          dummy_mask,cutoff_mask;
121     __m128           tmpmask0,tmpmask1;
122     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
123     __m256d          one     = _mm256_set1_pd(1.0);
124     __m256d          two     = _mm256_set1_pd(2.0);
125     x                = xx[0];
126     f                = ff[0];
127
128     nri              = nlist->nri;
129     iinr             = nlist->iinr;
130     jindex           = nlist->jindex;
131     jjnr             = nlist->jjnr;
132     shiftidx         = nlist->shift;
133     gid              = nlist->gid;
134     shiftvec         = fr->shift_vec[0];
135     fshift           = fr->fshift[0];
136     facel            = _mm256_set1_pd(fr->epsfac);
137     charge           = mdatoms->chargeA;
138     krf              = _mm256_set1_pd(fr->ic->k_rf);
139     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
140     crf              = _mm256_set1_pd(fr->ic->c_rf);
141     nvdwtype         = fr->ntype;
142     vdwparam         = fr->nbfp;
143     vdwtype          = mdatoms->typeA;
144
145     /* Setup water-specific parameters */
146     inr              = nlist->iinr[0];
147     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
148     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
149     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
150     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
151
152     jq1              = _mm256_set1_pd(charge[inr+1]);
153     jq2              = _mm256_set1_pd(charge[inr+2]);
154     jq3              = _mm256_set1_pd(charge[inr+3]);
155     vdwjidx0A        = 2*vdwtype[inr+0];
156     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
157     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
158     qq11             = _mm256_mul_pd(iq1,jq1);
159     qq12             = _mm256_mul_pd(iq1,jq2);
160     qq13             = _mm256_mul_pd(iq1,jq3);
161     qq21             = _mm256_mul_pd(iq2,jq1);
162     qq22             = _mm256_mul_pd(iq2,jq2);
163     qq23             = _mm256_mul_pd(iq2,jq3);
164     qq31             = _mm256_mul_pd(iq3,jq1);
165     qq32             = _mm256_mul_pd(iq3,jq2);
166     qq33             = _mm256_mul_pd(iq3,jq3);
167
168     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169     rcutoff_scalar   = fr->rcoulomb;
170     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
171     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
172
173     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
174     rvdw             = _mm256_set1_pd(fr->rvdw);
175
176     /* Avoid stupid compiler warnings */
177     jnrA = jnrB = jnrC = jnrD = 0;
178     j_coord_offsetA = 0;
179     j_coord_offsetB = 0;
180     j_coord_offsetC = 0;
181     j_coord_offsetD = 0;
182
183     outeriter        = 0;
184     inneriter        = 0;
185
186     for(iidx=0;iidx<4*DIM;iidx++)
187     {
188         scratch[iidx] = 0.0;
189     }
190
191     /* Start outer loop over neighborlists */
192     for(iidx=0; iidx<nri; iidx++)
193     {
194         /* Load shift vector for this list */
195         i_shift_offset   = DIM*shiftidx[iidx];
196
197         /* Load limits for loop over neighbors */
198         j_index_start    = jindex[iidx];
199         j_index_end      = jindex[iidx+1];
200
201         /* Get outer coordinate index */
202         inr              = iinr[iidx];
203         i_coord_offset   = DIM*inr;
204
205         /* Load i particle coords and add shift vector */
206         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
207                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
208
209         fix0             = _mm256_setzero_pd();
210         fiy0             = _mm256_setzero_pd();
211         fiz0             = _mm256_setzero_pd();
212         fix1             = _mm256_setzero_pd();
213         fiy1             = _mm256_setzero_pd();
214         fiz1             = _mm256_setzero_pd();
215         fix2             = _mm256_setzero_pd();
216         fiy2             = _mm256_setzero_pd();
217         fiz2             = _mm256_setzero_pd();
218         fix3             = _mm256_setzero_pd();
219         fiy3             = _mm256_setzero_pd();
220         fiz3             = _mm256_setzero_pd();
221
222         /* Reset potential sums */
223         velecsum         = _mm256_setzero_pd();
224         vvdwsum          = _mm256_setzero_pd();
225
226         /* Start inner kernel loop */
227         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
228         {
229
230             /* Get j neighbor index, and coordinate index */
231             jnrA             = jjnr[jidx];
232             jnrB             = jjnr[jidx+1];
233             jnrC             = jjnr[jidx+2];
234             jnrD             = jjnr[jidx+3];
235             j_coord_offsetA  = DIM*jnrA;
236             j_coord_offsetB  = DIM*jnrB;
237             j_coord_offsetC  = DIM*jnrC;
238             j_coord_offsetD  = DIM*jnrD;
239
240             /* load j atom coordinates */
241             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
242                                                  x+j_coord_offsetC,x+j_coord_offsetD,
243                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
244                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
245
246             /* Calculate displacement vector */
247             dx00             = _mm256_sub_pd(ix0,jx0);
248             dy00             = _mm256_sub_pd(iy0,jy0);
249             dz00             = _mm256_sub_pd(iz0,jz0);
250             dx11             = _mm256_sub_pd(ix1,jx1);
251             dy11             = _mm256_sub_pd(iy1,jy1);
252             dz11             = _mm256_sub_pd(iz1,jz1);
253             dx12             = _mm256_sub_pd(ix1,jx2);
254             dy12             = _mm256_sub_pd(iy1,jy2);
255             dz12             = _mm256_sub_pd(iz1,jz2);
256             dx13             = _mm256_sub_pd(ix1,jx3);
257             dy13             = _mm256_sub_pd(iy1,jy3);
258             dz13             = _mm256_sub_pd(iz1,jz3);
259             dx21             = _mm256_sub_pd(ix2,jx1);
260             dy21             = _mm256_sub_pd(iy2,jy1);
261             dz21             = _mm256_sub_pd(iz2,jz1);
262             dx22             = _mm256_sub_pd(ix2,jx2);
263             dy22             = _mm256_sub_pd(iy2,jy2);
264             dz22             = _mm256_sub_pd(iz2,jz2);
265             dx23             = _mm256_sub_pd(ix2,jx3);
266             dy23             = _mm256_sub_pd(iy2,jy3);
267             dz23             = _mm256_sub_pd(iz2,jz3);
268             dx31             = _mm256_sub_pd(ix3,jx1);
269             dy31             = _mm256_sub_pd(iy3,jy1);
270             dz31             = _mm256_sub_pd(iz3,jz1);
271             dx32             = _mm256_sub_pd(ix3,jx2);
272             dy32             = _mm256_sub_pd(iy3,jy2);
273             dz32             = _mm256_sub_pd(iz3,jz2);
274             dx33             = _mm256_sub_pd(ix3,jx3);
275             dy33             = _mm256_sub_pd(iy3,jy3);
276             dz33             = _mm256_sub_pd(iz3,jz3);
277
278             /* Calculate squared distance and things based on it */
279             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
280             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
281             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
282             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
283             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
284             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
285             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
286             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
287             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
288             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
289
290             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
291             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
292             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
293             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
294             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
295             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
296             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
297             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
298             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
299
300             rinvsq00         = gmx_mm256_inv_pd(rsq00);
301             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
302             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
303             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
304             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
305             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
306             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
307             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
308             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
309             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
310
311             fjx0             = _mm256_setzero_pd();
312             fjy0             = _mm256_setzero_pd();
313             fjz0             = _mm256_setzero_pd();
314             fjx1             = _mm256_setzero_pd();
315             fjy1             = _mm256_setzero_pd();
316             fjz1             = _mm256_setzero_pd();
317             fjx2             = _mm256_setzero_pd();
318             fjy2             = _mm256_setzero_pd();
319             fjz2             = _mm256_setzero_pd();
320             fjx3             = _mm256_setzero_pd();
321             fjy3             = _mm256_setzero_pd();
322             fjz3             = _mm256_setzero_pd();
323
324             /**************************
325              * CALCULATE INTERACTIONS *
326              **************************/
327
328             if (gmx_mm256_any_lt(rsq00,rcutoff2))
329             {
330
331             /* LENNARD-JONES DISPERSION/REPULSION */
332
333             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
334             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
335             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
336             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
337                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
338             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
339
340             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
341
342             /* Update potential sum for this i atom from the interaction with this j atom. */
343             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
344             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
345
346             fscal            = fvdw;
347
348             fscal            = _mm256_and_pd(fscal,cutoff_mask);
349
350             /* Calculate temporary vectorial force */
351             tx               = _mm256_mul_pd(fscal,dx00);
352             ty               = _mm256_mul_pd(fscal,dy00);
353             tz               = _mm256_mul_pd(fscal,dz00);
354
355             /* Update vectorial force */
356             fix0             = _mm256_add_pd(fix0,tx);
357             fiy0             = _mm256_add_pd(fiy0,ty);
358             fiz0             = _mm256_add_pd(fiz0,tz);
359
360             fjx0             = _mm256_add_pd(fjx0,tx);
361             fjy0             = _mm256_add_pd(fjy0,ty);
362             fjz0             = _mm256_add_pd(fjz0,tz);
363
364             }
365
366             /**************************
367              * CALCULATE INTERACTIONS *
368              **************************/
369
370             if (gmx_mm256_any_lt(rsq11,rcutoff2))
371             {
372
373             /* REACTION-FIELD ELECTROSTATICS */
374             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
375             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
376
377             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
378
379             /* Update potential sum for this i atom from the interaction with this j atom. */
380             velec            = _mm256_and_pd(velec,cutoff_mask);
381             velecsum         = _mm256_add_pd(velecsum,velec);
382
383             fscal            = felec;
384
385             fscal            = _mm256_and_pd(fscal,cutoff_mask);
386
387             /* Calculate temporary vectorial force */
388             tx               = _mm256_mul_pd(fscal,dx11);
389             ty               = _mm256_mul_pd(fscal,dy11);
390             tz               = _mm256_mul_pd(fscal,dz11);
391
392             /* Update vectorial force */
393             fix1             = _mm256_add_pd(fix1,tx);
394             fiy1             = _mm256_add_pd(fiy1,ty);
395             fiz1             = _mm256_add_pd(fiz1,tz);
396
397             fjx1             = _mm256_add_pd(fjx1,tx);
398             fjy1             = _mm256_add_pd(fjy1,ty);
399             fjz1             = _mm256_add_pd(fjz1,tz);
400
401             }
402
403             /**************************
404              * CALCULATE INTERACTIONS *
405              **************************/
406
407             if (gmx_mm256_any_lt(rsq12,rcutoff2))
408             {
409
410             /* REACTION-FIELD ELECTROSTATICS */
411             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
412             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
413
414             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
415
416             /* Update potential sum for this i atom from the interaction with this j atom. */
417             velec            = _mm256_and_pd(velec,cutoff_mask);
418             velecsum         = _mm256_add_pd(velecsum,velec);
419
420             fscal            = felec;
421
422             fscal            = _mm256_and_pd(fscal,cutoff_mask);
423
424             /* Calculate temporary vectorial force */
425             tx               = _mm256_mul_pd(fscal,dx12);
426             ty               = _mm256_mul_pd(fscal,dy12);
427             tz               = _mm256_mul_pd(fscal,dz12);
428
429             /* Update vectorial force */
430             fix1             = _mm256_add_pd(fix1,tx);
431             fiy1             = _mm256_add_pd(fiy1,ty);
432             fiz1             = _mm256_add_pd(fiz1,tz);
433
434             fjx2             = _mm256_add_pd(fjx2,tx);
435             fjy2             = _mm256_add_pd(fjy2,ty);
436             fjz2             = _mm256_add_pd(fjz2,tz);
437
438             }
439
440             /**************************
441              * CALCULATE INTERACTIONS *
442              **************************/
443
444             if (gmx_mm256_any_lt(rsq13,rcutoff2))
445             {
446
447             /* REACTION-FIELD ELECTROSTATICS */
448             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
449             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
450
451             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
452
453             /* Update potential sum for this i atom from the interaction with this j atom. */
454             velec            = _mm256_and_pd(velec,cutoff_mask);
455             velecsum         = _mm256_add_pd(velecsum,velec);
456
457             fscal            = felec;
458
459             fscal            = _mm256_and_pd(fscal,cutoff_mask);
460
461             /* Calculate temporary vectorial force */
462             tx               = _mm256_mul_pd(fscal,dx13);
463             ty               = _mm256_mul_pd(fscal,dy13);
464             tz               = _mm256_mul_pd(fscal,dz13);
465
466             /* Update vectorial force */
467             fix1             = _mm256_add_pd(fix1,tx);
468             fiy1             = _mm256_add_pd(fiy1,ty);
469             fiz1             = _mm256_add_pd(fiz1,tz);
470
471             fjx3             = _mm256_add_pd(fjx3,tx);
472             fjy3             = _mm256_add_pd(fjy3,ty);
473             fjz3             = _mm256_add_pd(fjz3,tz);
474
475             }
476
477             /**************************
478              * CALCULATE INTERACTIONS *
479              **************************/
480
481             if (gmx_mm256_any_lt(rsq21,rcutoff2))
482             {
483
484             /* REACTION-FIELD ELECTROSTATICS */
485             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
486             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
487
488             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
489
490             /* Update potential sum for this i atom from the interaction with this j atom. */
491             velec            = _mm256_and_pd(velec,cutoff_mask);
492             velecsum         = _mm256_add_pd(velecsum,velec);
493
494             fscal            = felec;
495
496             fscal            = _mm256_and_pd(fscal,cutoff_mask);
497
498             /* Calculate temporary vectorial force */
499             tx               = _mm256_mul_pd(fscal,dx21);
500             ty               = _mm256_mul_pd(fscal,dy21);
501             tz               = _mm256_mul_pd(fscal,dz21);
502
503             /* Update vectorial force */
504             fix2             = _mm256_add_pd(fix2,tx);
505             fiy2             = _mm256_add_pd(fiy2,ty);
506             fiz2             = _mm256_add_pd(fiz2,tz);
507
508             fjx1             = _mm256_add_pd(fjx1,tx);
509             fjy1             = _mm256_add_pd(fjy1,ty);
510             fjz1             = _mm256_add_pd(fjz1,tz);
511
512             }
513
514             /**************************
515              * CALCULATE INTERACTIONS *
516              **************************/
517
518             if (gmx_mm256_any_lt(rsq22,rcutoff2))
519             {
520
521             /* REACTION-FIELD ELECTROSTATICS */
522             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
523             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
524
525             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
526
527             /* Update potential sum for this i atom from the interaction with this j atom. */
528             velec            = _mm256_and_pd(velec,cutoff_mask);
529             velecsum         = _mm256_add_pd(velecsum,velec);
530
531             fscal            = felec;
532
533             fscal            = _mm256_and_pd(fscal,cutoff_mask);
534
535             /* Calculate temporary vectorial force */
536             tx               = _mm256_mul_pd(fscal,dx22);
537             ty               = _mm256_mul_pd(fscal,dy22);
538             tz               = _mm256_mul_pd(fscal,dz22);
539
540             /* Update vectorial force */
541             fix2             = _mm256_add_pd(fix2,tx);
542             fiy2             = _mm256_add_pd(fiy2,ty);
543             fiz2             = _mm256_add_pd(fiz2,tz);
544
545             fjx2             = _mm256_add_pd(fjx2,tx);
546             fjy2             = _mm256_add_pd(fjy2,ty);
547             fjz2             = _mm256_add_pd(fjz2,tz);
548
549             }
550
551             /**************************
552              * CALCULATE INTERACTIONS *
553              **************************/
554
555             if (gmx_mm256_any_lt(rsq23,rcutoff2))
556             {
557
558             /* REACTION-FIELD ELECTROSTATICS */
559             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
560             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
561
562             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
563
564             /* Update potential sum for this i atom from the interaction with this j atom. */
565             velec            = _mm256_and_pd(velec,cutoff_mask);
566             velecsum         = _mm256_add_pd(velecsum,velec);
567
568             fscal            = felec;
569
570             fscal            = _mm256_and_pd(fscal,cutoff_mask);
571
572             /* Calculate temporary vectorial force */
573             tx               = _mm256_mul_pd(fscal,dx23);
574             ty               = _mm256_mul_pd(fscal,dy23);
575             tz               = _mm256_mul_pd(fscal,dz23);
576
577             /* Update vectorial force */
578             fix2             = _mm256_add_pd(fix2,tx);
579             fiy2             = _mm256_add_pd(fiy2,ty);
580             fiz2             = _mm256_add_pd(fiz2,tz);
581
582             fjx3             = _mm256_add_pd(fjx3,tx);
583             fjy3             = _mm256_add_pd(fjy3,ty);
584             fjz3             = _mm256_add_pd(fjz3,tz);
585
586             }
587
588             /**************************
589              * CALCULATE INTERACTIONS *
590              **************************/
591
592             if (gmx_mm256_any_lt(rsq31,rcutoff2))
593             {
594
595             /* REACTION-FIELD ELECTROSTATICS */
596             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
597             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
598
599             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
600
601             /* Update potential sum for this i atom from the interaction with this j atom. */
602             velec            = _mm256_and_pd(velec,cutoff_mask);
603             velecsum         = _mm256_add_pd(velecsum,velec);
604
605             fscal            = felec;
606
607             fscal            = _mm256_and_pd(fscal,cutoff_mask);
608
609             /* Calculate temporary vectorial force */
610             tx               = _mm256_mul_pd(fscal,dx31);
611             ty               = _mm256_mul_pd(fscal,dy31);
612             tz               = _mm256_mul_pd(fscal,dz31);
613
614             /* Update vectorial force */
615             fix3             = _mm256_add_pd(fix3,tx);
616             fiy3             = _mm256_add_pd(fiy3,ty);
617             fiz3             = _mm256_add_pd(fiz3,tz);
618
619             fjx1             = _mm256_add_pd(fjx1,tx);
620             fjy1             = _mm256_add_pd(fjy1,ty);
621             fjz1             = _mm256_add_pd(fjz1,tz);
622
623             }
624
625             /**************************
626              * CALCULATE INTERACTIONS *
627              **************************/
628
629             if (gmx_mm256_any_lt(rsq32,rcutoff2))
630             {
631
632             /* REACTION-FIELD ELECTROSTATICS */
633             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
634             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
635
636             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
637
638             /* Update potential sum for this i atom from the interaction with this j atom. */
639             velec            = _mm256_and_pd(velec,cutoff_mask);
640             velecsum         = _mm256_add_pd(velecsum,velec);
641
642             fscal            = felec;
643
644             fscal            = _mm256_and_pd(fscal,cutoff_mask);
645
646             /* Calculate temporary vectorial force */
647             tx               = _mm256_mul_pd(fscal,dx32);
648             ty               = _mm256_mul_pd(fscal,dy32);
649             tz               = _mm256_mul_pd(fscal,dz32);
650
651             /* Update vectorial force */
652             fix3             = _mm256_add_pd(fix3,tx);
653             fiy3             = _mm256_add_pd(fiy3,ty);
654             fiz3             = _mm256_add_pd(fiz3,tz);
655
656             fjx2             = _mm256_add_pd(fjx2,tx);
657             fjy2             = _mm256_add_pd(fjy2,ty);
658             fjz2             = _mm256_add_pd(fjz2,tz);
659
660             }
661
662             /**************************
663              * CALCULATE INTERACTIONS *
664              **************************/
665
666             if (gmx_mm256_any_lt(rsq33,rcutoff2))
667             {
668
669             /* REACTION-FIELD ELECTROSTATICS */
670             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
671             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
672
673             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
674
675             /* Update potential sum for this i atom from the interaction with this j atom. */
676             velec            = _mm256_and_pd(velec,cutoff_mask);
677             velecsum         = _mm256_add_pd(velecsum,velec);
678
679             fscal            = felec;
680
681             fscal            = _mm256_and_pd(fscal,cutoff_mask);
682
683             /* Calculate temporary vectorial force */
684             tx               = _mm256_mul_pd(fscal,dx33);
685             ty               = _mm256_mul_pd(fscal,dy33);
686             tz               = _mm256_mul_pd(fscal,dz33);
687
688             /* Update vectorial force */
689             fix3             = _mm256_add_pd(fix3,tx);
690             fiy3             = _mm256_add_pd(fiy3,ty);
691             fiz3             = _mm256_add_pd(fiz3,tz);
692
693             fjx3             = _mm256_add_pd(fjx3,tx);
694             fjy3             = _mm256_add_pd(fjy3,ty);
695             fjz3             = _mm256_add_pd(fjz3,tz);
696
697             }
698
699             fjptrA             = f+j_coord_offsetA;
700             fjptrB             = f+j_coord_offsetB;
701             fjptrC             = f+j_coord_offsetC;
702             fjptrD             = f+j_coord_offsetD;
703
704             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
705                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
706                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
707
708             /* Inner loop uses 368 flops */
709         }
710
711         if(jidx<j_index_end)
712         {
713
714             /* Get j neighbor index, and coordinate index */
715             jnrlistA         = jjnr[jidx];
716             jnrlistB         = jjnr[jidx+1];
717             jnrlistC         = jjnr[jidx+2];
718             jnrlistD         = jjnr[jidx+3];
719             /* Sign of each element will be negative for non-real atoms.
720              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
721              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
722              */
723             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
724
725             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
726             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
727             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
728
729             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
730             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
731             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
732             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
733             j_coord_offsetA  = DIM*jnrA;
734             j_coord_offsetB  = DIM*jnrB;
735             j_coord_offsetC  = DIM*jnrC;
736             j_coord_offsetD  = DIM*jnrD;
737
738             /* load j atom coordinates */
739             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
740                                                  x+j_coord_offsetC,x+j_coord_offsetD,
741                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
742                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
743
744             /* Calculate displacement vector */
745             dx00             = _mm256_sub_pd(ix0,jx0);
746             dy00             = _mm256_sub_pd(iy0,jy0);
747             dz00             = _mm256_sub_pd(iz0,jz0);
748             dx11             = _mm256_sub_pd(ix1,jx1);
749             dy11             = _mm256_sub_pd(iy1,jy1);
750             dz11             = _mm256_sub_pd(iz1,jz1);
751             dx12             = _mm256_sub_pd(ix1,jx2);
752             dy12             = _mm256_sub_pd(iy1,jy2);
753             dz12             = _mm256_sub_pd(iz1,jz2);
754             dx13             = _mm256_sub_pd(ix1,jx3);
755             dy13             = _mm256_sub_pd(iy1,jy3);
756             dz13             = _mm256_sub_pd(iz1,jz3);
757             dx21             = _mm256_sub_pd(ix2,jx1);
758             dy21             = _mm256_sub_pd(iy2,jy1);
759             dz21             = _mm256_sub_pd(iz2,jz1);
760             dx22             = _mm256_sub_pd(ix2,jx2);
761             dy22             = _mm256_sub_pd(iy2,jy2);
762             dz22             = _mm256_sub_pd(iz2,jz2);
763             dx23             = _mm256_sub_pd(ix2,jx3);
764             dy23             = _mm256_sub_pd(iy2,jy3);
765             dz23             = _mm256_sub_pd(iz2,jz3);
766             dx31             = _mm256_sub_pd(ix3,jx1);
767             dy31             = _mm256_sub_pd(iy3,jy1);
768             dz31             = _mm256_sub_pd(iz3,jz1);
769             dx32             = _mm256_sub_pd(ix3,jx2);
770             dy32             = _mm256_sub_pd(iy3,jy2);
771             dz32             = _mm256_sub_pd(iz3,jz2);
772             dx33             = _mm256_sub_pd(ix3,jx3);
773             dy33             = _mm256_sub_pd(iy3,jy3);
774             dz33             = _mm256_sub_pd(iz3,jz3);
775
776             /* Calculate squared distance and things based on it */
777             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
778             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
779             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
780             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
781             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
782             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
783             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
784             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
785             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
786             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
787
788             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
789             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
790             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
791             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
792             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
793             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
794             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
795             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
796             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
797
798             rinvsq00         = gmx_mm256_inv_pd(rsq00);
799             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
800             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
801             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
802             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
803             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
804             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
805             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
806             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
807             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
808
809             fjx0             = _mm256_setzero_pd();
810             fjy0             = _mm256_setzero_pd();
811             fjz0             = _mm256_setzero_pd();
812             fjx1             = _mm256_setzero_pd();
813             fjy1             = _mm256_setzero_pd();
814             fjz1             = _mm256_setzero_pd();
815             fjx2             = _mm256_setzero_pd();
816             fjy2             = _mm256_setzero_pd();
817             fjz2             = _mm256_setzero_pd();
818             fjx3             = _mm256_setzero_pd();
819             fjy3             = _mm256_setzero_pd();
820             fjz3             = _mm256_setzero_pd();
821
822             /**************************
823              * CALCULATE INTERACTIONS *
824              **************************/
825
826             if (gmx_mm256_any_lt(rsq00,rcutoff2))
827             {
828
829             /* LENNARD-JONES DISPERSION/REPULSION */
830
831             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
832             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
833             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
834             vvdw             = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
835                                           _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
836             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
837
838             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
839
840             /* Update potential sum for this i atom from the interaction with this j atom. */
841             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
842             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
843             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
844
845             fscal            = fvdw;
846
847             fscal            = _mm256_and_pd(fscal,cutoff_mask);
848
849             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
850
851             /* Calculate temporary vectorial force */
852             tx               = _mm256_mul_pd(fscal,dx00);
853             ty               = _mm256_mul_pd(fscal,dy00);
854             tz               = _mm256_mul_pd(fscal,dz00);
855
856             /* Update vectorial force */
857             fix0             = _mm256_add_pd(fix0,tx);
858             fiy0             = _mm256_add_pd(fiy0,ty);
859             fiz0             = _mm256_add_pd(fiz0,tz);
860
861             fjx0             = _mm256_add_pd(fjx0,tx);
862             fjy0             = _mm256_add_pd(fjy0,ty);
863             fjz0             = _mm256_add_pd(fjz0,tz);
864
865             }
866
867             /**************************
868              * CALCULATE INTERACTIONS *
869              **************************/
870
871             if (gmx_mm256_any_lt(rsq11,rcutoff2))
872             {
873
874             /* REACTION-FIELD ELECTROSTATICS */
875             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
876             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
877
878             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
879
880             /* Update potential sum for this i atom from the interaction with this j atom. */
881             velec            = _mm256_and_pd(velec,cutoff_mask);
882             velec            = _mm256_andnot_pd(dummy_mask,velec);
883             velecsum         = _mm256_add_pd(velecsum,velec);
884
885             fscal            = felec;
886
887             fscal            = _mm256_and_pd(fscal,cutoff_mask);
888
889             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
890
891             /* Calculate temporary vectorial force */
892             tx               = _mm256_mul_pd(fscal,dx11);
893             ty               = _mm256_mul_pd(fscal,dy11);
894             tz               = _mm256_mul_pd(fscal,dz11);
895
896             /* Update vectorial force */
897             fix1             = _mm256_add_pd(fix1,tx);
898             fiy1             = _mm256_add_pd(fiy1,ty);
899             fiz1             = _mm256_add_pd(fiz1,tz);
900
901             fjx1             = _mm256_add_pd(fjx1,tx);
902             fjy1             = _mm256_add_pd(fjy1,ty);
903             fjz1             = _mm256_add_pd(fjz1,tz);
904
905             }
906
907             /**************************
908              * CALCULATE INTERACTIONS *
909              **************************/
910
911             if (gmx_mm256_any_lt(rsq12,rcutoff2))
912             {
913
914             /* REACTION-FIELD ELECTROSTATICS */
915             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
916             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
917
918             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
919
920             /* Update potential sum for this i atom from the interaction with this j atom. */
921             velec            = _mm256_and_pd(velec,cutoff_mask);
922             velec            = _mm256_andnot_pd(dummy_mask,velec);
923             velecsum         = _mm256_add_pd(velecsum,velec);
924
925             fscal            = felec;
926
927             fscal            = _mm256_and_pd(fscal,cutoff_mask);
928
929             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
930
931             /* Calculate temporary vectorial force */
932             tx               = _mm256_mul_pd(fscal,dx12);
933             ty               = _mm256_mul_pd(fscal,dy12);
934             tz               = _mm256_mul_pd(fscal,dz12);
935
936             /* Update vectorial force */
937             fix1             = _mm256_add_pd(fix1,tx);
938             fiy1             = _mm256_add_pd(fiy1,ty);
939             fiz1             = _mm256_add_pd(fiz1,tz);
940
941             fjx2             = _mm256_add_pd(fjx2,tx);
942             fjy2             = _mm256_add_pd(fjy2,ty);
943             fjz2             = _mm256_add_pd(fjz2,tz);
944
945             }
946
947             /**************************
948              * CALCULATE INTERACTIONS *
949              **************************/
950
951             if (gmx_mm256_any_lt(rsq13,rcutoff2))
952             {
953
954             /* REACTION-FIELD ELECTROSTATICS */
955             velec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
956             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
957
958             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
959
960             /* Update potential sum for this i atom from the interaction with this j atom. */
961             velec            = _mm256_and_pd(velec,cutoff_mask);
962             velec            = _mm256_andnot_pd(dummy_mask,velec);
963             velecsum         = _mm256_add_pd(velecsum,velec);
964
965             fscal            = felec;
966
967             fscal            = _mm256_and_pd(fscal,cutoff_mask);
968
969             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
970
971             /* Calculate temporary vectorial force */
972             tx               = _mm256_mul_pd(fscal,dx13);
973             ty               = _mm256_mul_pd(fscal,dy13);
974             tz               = _mm256_mul_pd(fscal,dz13);
975
976             /* Update vectorial force */
977             fix1             = _mm256_add_pd(fix1,tx);
978             fiy1             = _mm256_add_pd(fiy1,ty);
979             fiz1             = _mm256_add_pd(fiz1,tz);
980
981             fjx3             = _mm256_add_pd(fjx3,tx);
982             fjy3             = _mm256_add_pd(fjy3,ty);
983             fjz3             = _mm256_add_pd(fjz3,tz);
984
985             }
986
987             /**************************
988              * CALCULATE INTERACTIONS *
989              **************************/
990
991             if (gmx_mm256_any_lt(rsq21,rcutoff2))
992             {
993
994             /* REACTION-FIELD ELECTROSTATICS */
995             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
996             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
997
998             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
999
1000             /* Update potential sum for this i atom from the interaction with this j atom. */
1001             velec            = _mm256_and_pd(velec,cutoff_mask);
1002             velec            = _mm256_andnot_pd(dummy_mask,velec);
1003             velecsum         = _mm256_add_pd(velecsum,velec);
1004
1005             fscal            = felec;
1006
1007             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1008
1009             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1010
1011             /* Calculate temporary vectorial force */
1012             tx               = _mm256_mul_pd(fscal,dx21);
1013             ty               = _mm256_mul_pd(fscal,dy21);
1014             tz               = _mm256_mul_pd(fscal,dz21);
1015
1016             /* Update vectorial force */
1017             fix2             = _mm256_add_pd(fix2,tx);
1018             fiy2             = _mm256_add_pd(fiy2,ty);
1019             fiz2             = _mm256_add_pd(fiz2,tz);
1020
1021             fjx1             = _mm256_add_pd(fjx1,tx);
1022             fjy1             = _mm256_add_pd(fjy1,ty);
1023             fjz1             = _mm256_add_pd(fjz1,tz);
1024
1025             }
1026
1027             /**************************
1028              * CALCULATE INTERACTIONS *
1029              **************************/
1030
1031             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1032             {
1033
1034             /* REACTION-FIELD ELECTROSTATICS */
1035             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1036             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1037
1038             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1039
1040             /* Update potential sum for this i atom from the interaction with this j atom. */
1041             velec            = _mm256_and_pd(velec,cutoff_mask);
1042             velec            = _mm256_andnot_pd(dummy_mask,velec);
1043             velecsum         = _mm256_add_pd(velecsum,velec);
1044
1045             fscal            = felec;
1046
1047             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1048
1049             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1050
1051             /* Calculate temporary vectorial force */
1052             tx               = _mm256_mul_pd(fscal,dx22);
1053             ty               = _mm256_mul_pd(fscal,dy22);
1054             tz               = _mm256_mul_pd(fscal,dz22);
1055
1056             /* Update vectorial force */
1057             fix2             = _mm256_add_pd(fix2,tx);
1058             fiy2             = _mm256_add_pd(fiy2,ty);
1059             fiz2             = _mm256_add_pd(fiz2,tz);
1060
1061             fjx2             = _mm256_add_pd(fjx2,tx);
1062             fjy2             = _mm256_add_pd(fjy2,ty);
1063             fjz2             = _mm256_add_pd(fjz2,tz);
1064
1065             }
1066
1067             /**************************
1068              * CALCULATE INTERACTIONS *
1069              **************************/
1070
1071             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1072             {
1073
1074             /* REACTION-FIELD ELECTROSTATICS */
1075             velec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
1076             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1077
1078             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1079
1080             /* Update potential sum for this i atom from the interaction with this j atom. */
1081             velec            = _mm256_and_pd(velec,cutoff_mask);
1082             velec            = _mm256_andnot_pd(dummy_mask,velec);
1083             velecsum         = _mm256_add_pd(velecsum,velec);
1084
1085             fscal            = felec;
1086
1087             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1088
1089             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1090
1091             /* Calculate temporary vectorial force */
1092             tx               = _mm256_mul_pd(fscal,dx23);
1093             ty               = _mm256_mul_pd(fscal,dy23);
1094             tz               = _mm256_mul_pd(fscal,dz23);
1095
1096             /* Update vectorial force */
1097             fix2             = _mm256_add_pd(fix2,tx);
1098             fiy2             = _mm256_add_pd(fiy2,ty);
1099             fiz2             = _mm256_add_pd(fiz2,tz);
1100
1101             fjx3             = _mm256_add_pd(fjx3,tx);
1102             fjy3             = _mm256_add_pd(fjy3,ty);
1103             fjz3             = _mm256_add_pd(fjz3,tz);
1104
1105             }
1106
1107             /**************************
1108              * CALCULATE INTERACTIONS *
1109              **************************/
1110
1111             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1112             {
1113
1114             /* REACTION-FIELD ELECTROSTATICS */
1115             velec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
1116             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1117
1118             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1119
1120             /* Update potential sum for this i atom from the interaction with this j atom. */
1121             velec            = _mm256_and_pd(velec,cutoff_mask);
1122             velec            = _mm256_andnot_pd(dummy_mask,velec);
1123             velecsum         = _mm256_add_pd(velecsum,velec);
1124
1125             fscal            = felec;
1126
1127             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1128
1129             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1130
1131             /* Calculate temporary vectorial force */
1132             tx               = _mm256_mul_pd(fscal,dx31);
1133             ty               = _mm256_mul_pd(fscal,dy31);
1134             tz               = _mm256_mul_pd(fscal,dz31);
1135
1136             /* Update vectorial force */
1137             fix3             = _mm256_add_pd(fix3,tx);
1138             fiy3             = _mm256_add_pd(fiy3,ty);
1139             fiz3             = _mm256_add_pd(fiz3,tz);
1140
1141             fjx1             = _mm256_add_pd(fjx1,tx);
1142             fjy1             = _mm256_add_pd(fjy1,ty);
1143             fjz1             = _mm256_add_pd(fjz1,tz);
1144
1145             }
1146
1147             /**************************
1148              * CALCULATE INTERACTIONS *
1149              **************************/
1150
1151             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1152             {
1153
1154             /* REACTION-FIELD ELECTROSTATICS */
1155             velec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
1156             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1157
1158             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1159
1160             /* Update potential sum for this i atom from the interaction with this j atom. */
1161             velec            = _mm256_and_pd(velec,cutoff_mask);
1162             velec            = _mm256_andnot_pd(dummy_mask,velec);
1163             velecsum         = _mm256_add_pd(velecsum,velec);
1164
1165             fscal            = felec;
1166
1167             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1168
1169             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1170
1171             /* Calculate temporary vectorial force */
1172             tx               = _mm256_mul_pd(fscal,dx32);
1173             ty               = _mm256_mul_pd(fscal,dy32);
1174             tz               = _mm256_mul_pd(fscal,dz32);
1175
1176             /* Update vectorial force */
1177             fix3             = _mm256_add_pd(fix3,tx);
1178             fiy3             = _mm256_add_pd(fiy3,ty);
1179             fiz3             = _mm256_add_pd(fiz3,tz);
1180
1181             fjx2             = _mm256_add_pd(fjx2,tx);
1182             fjy2             = _mm256_add_pd(fjy2,ty);
1183             fjz2             = _mm256_add_pd(fjz2,tz);
1184
1185             }
1186
1187             /**************************
1188              * CALCULATE INTERACTIONS *
1189              **************************/
1190
1191             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1192             {
1193
1194             /* REACTION-FIELD ELECTROSTATICS */
1195             velec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
1196             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1197
1198             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1199
1200             /* Update potential sum for this i atom from the interaction with this j atom. */
1201             velec            = _mm256_and_pd(velec,cutoff_mask);
1202             velec            = _mm256_andnot_pd(dummy_mask,velec);
1203             velecsum         = _mm256_add_pd(velecsum,velec);
1204
1205             fscal            = felec;
1206
1207             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1208
1209             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1210
1211             /* Calculate temporary vectorial force */
1212             tx               = _mm256_mul_pd(fscal,dx33);
1213             ty               = _mm256_mul_pd(fscal,dy33);
1214             tz               = _mm256_mul_pd(fscal,dz33);
1215
1216             /* Update vectorial force */
1217             fix3             = _mm256_add_pd(fix3,tx);
1218             fiy3             = _mm256_add_pd(fiy3,ty);
1219             fiz3             = _mm256_add_pd(fiz3,tz);
1220
1221             fjx3             = _mm256_add_pd(fjx3,tx);
1222             fjy3             = _mm256_add_pd(fjy3,ty);
1223             fjz3             = _mm256_add_pd(fjz3,tz);
1224
1225             }
1226
1227             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1228             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1229             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1230             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1231
1232             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1233                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1234                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1235
1236             /* Inner loop uses 368 flops */
1237         }
1238
1239         /* End of innermost loop */
1240
1241         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1242                                                  f+i_coord_offset,fshift+i_shift_offset);
1243
1244         ggid                        = gid[iidx];
1245         /* Update potential energies */
1246         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1247         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1248
1249         /* Increment number of inner iterations */
1250         inneriter                  += j_index_end - j_index_start;
1251
1252         /* Outer loop uses 26 flops */
1253     }
1254
1255     /* Increment number of outer iterations */
1256     outeriter        += nri;
1257
1258     /* Update outer/inner flops */
1259
1260     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*368);
1261 }
1262 /*
1263  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double
1264  * Electrostatics interaction: ReactionField
1265  * VdW interaction:            LennardJones
1266  * Geometry:                   Water4-Water4
1267  * Calculate force/pot:        Force
1268  */
1269 void
1270 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double
1271                     (t_nblist                    * gmx_restrict       nlist,
1272                      rvec                        * gmx_restrict          xx,
1273                      rvec                        * gmx_restrict          ff,
1274                      t_forcerec                  * gmx_restrict          fr,
1275                      t_mdatoms                   * gmx_restrict     mdatoms,
1276                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1277                      t_nrnb                      * gmx_restrict        nrnb)
1278 {
1279     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1280      * just 0 for non-waters.
1281      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1282      * jnr indices corresponding to data put in the four positions in the SIMD register.
1283      */
1284     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1285     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1286     int              jnrA,jnrB,jnrC,jnrD;
1287     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1288     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1289     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1290     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1291     real             rcutoff_scalar;
1292     real             *shiftvec,*fshift,*x,*f;
1293     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1294     real             scratch[4*DIM];
1295     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1296     real *           vdwioffsetptr0;
1297     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1298     real *           vdwioffsetptr1;
1299     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1300     real *           vdwioffsetptr2;
1301     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1302     real *           vdwioffsetptr3;
1303     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1304     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1305     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1306     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1307     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1308     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1309     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1310     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1311     __m256d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1312     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1313     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1314     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1315     __m256d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1316     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1317     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1318     __m256d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1319     __m256d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1320     __m256d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1321     __m256d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1322     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1323     real             *charge;
1324     int              nvdwtype;
1325     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1326     int              *vdwtype;
1327     real             *vdwparam;
1328     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1329     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1330     __m256d          dummy_mask,cutoff_mask;
1331     __m128           tmpmask0,tmpmask1;
1332     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1333     __m256d          one     = _mm256_set1_pd(1.0);
1334     __m256d          two     = _mm256_set1_pd(2.0);
1335     x                = xx[0];
1336     f                = ff[0];
1337
1338     nri              = nlist->nri;
1339     iinr             = nlist->iinr;
1340     jindex           = nlist->jindex;
1341     jjnr             = nlist->jjnr;
1342     shiftidx         = nlist->shift;
1343     gid              = nlist->gid;
1344     shiftvec         = fr->shift_vec[0];
1345     fshift           = fr->fshift[0];
1346     facel            = _mm256_set1_pd(fr->epsfac);
1347     charge           = mdatoms->chargeA;
1348     krf              = _mm256_set1_pd(fr->ic->k_rf);
1349     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
1350     crf              = _mm256_set1_pd(fr->ic->c_rf);
1351     nvdwtype         = fr->ntype;
1352     vdwparam         = fr->nbfp;
1353     vdwtype          = mdatoms->typeA;
1354
1355     /* Setup water-specific parameters */
1356     inr              = nlist->iinr[0];
1357     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1358     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1359     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1360     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1361
1362     jq1              = _mm256_set1_pd(charge[inr+1]);
1363     jq2              = _mm256_set1_pd(charge[inr+2]);
1364     jq3              = _mm256_set1_pd(charge[inr+3]);
1365     vdwjidx0A        = 2*vdwtype[inr+0];
1366     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1367     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1368     qq11             = _mm256_mul_pd(iq1,jq1);
1369     qq12             = _mm256_mul_pd(iq1,jq2);
1370     qq13             = _mm256_mul_pd(iq1,jq3);
1371     qq21             = _mm256_mul_pd(iq2,jq1);
1372     qq22             = _mm256_mul_pd(iq2,jq2);
1373     qq23             = _mm256_mul_pd(iq2,jq3);
1374     qq31             = _mm256_mul_pd(iq3,jq1);
1375     qq32             = _mm256_mul_pd(iq3,jq2);
1376     qq33             = _mm256_mul_pd(iq3,jq3);
1377
1378     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1379     rcutoff_scalar   = fr->rcoulomb;
1380     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1381     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1382
1383     sh_vdw_invrcut6  = _mm256_set1_pd(fr->ic->sh_invrc6);
1384     rvdw             = _mm256_set1_pd(fr->rvdw);
1385
1386     /* Avoid stupid compiler warnings */
1387     jnrA = jnrB = jnrC = jnrD = 0;
1388     j_coord_offsetA = 0;
1389     j_coord_offsetB = 0;
1390     j_coord_offsetC = 0;
1391     j_coord_offsetD = 0;
1392
1393     outeriter        = 0;
1394     inneriter        = 0;
1395
1396     for(iidx=0;iidx<4*DIM;iidx++)
1397     {
1398         scratch[iidx] = 0.0;
1399     }
1400
1401     /* Start outer loop over neighborlists */
1402     for(iidx=0; iidx<nri; iidx++)
1403     {
1404         /* Load shift vector for this list */
1405         i_shift_offset   = DIM*shiftidx[iidx];
1406
1407         /* Load limits for loop over neighbors */
1408         j_index_start    = jindex[iidx];
1409         j_index_end      = jindex[iidx+1];
1410
1411         /* Get outer coordinate index */
1412         inr              = iinr[iidx];
1413         i_coord_offset   = DIM*inr;
1414
1415         /* Load i particle coords and add shift vector */
1416         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1417                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1418
1419         fix0             = _mm256_setzero_pd();
1420         fiy0             = _mm256_setzero_pd();
1421         fiz0             = _mm256_setzero_pd();
1422         fix1             = _mm256_setzero_pd();
1423         fiy1             = _mm256_setzero_pd();
1424         fiz1             = _mm256_setzero_pd();
1425         fix2             = _mm256_setzero_pd();
1426         fiy2             = _mm256_setzero_pd();
1427         fiz2             = _mm256_setzero_pd();
1428         fix3             = _mm256_setzero_pd();
1429         fiy3             = _mm256_setzero_pd();
1430         fiz3             = _mm256_setzero_pd();
1431
1432         /* Start inner kernel loop */
1433         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1434         {
1435
1436             /* Get j neighbor index, and coordinate index */
1437             jnrA             = jjnr[jidx];
1438             jnrB             = jjnr[jidx+1];
1439             jnrC             = jjnr[jidx+2];
1440             jnrD             = jjnr[jidx+3];
1441             j_coord_offsetA  = DIM*jnrA;
1442             j_coord_offsetB  = DIM*jnrB;
1443             j_coord_offsetC  = DIM*jnrC;
1444             j_coord_offsetD  = DIM*jnrD;
1445
1446             /* load j atom coordinates */
1447             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1448                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1449                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1450                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1451
1452             /* Calculate displacement vector */
1453             dx00             = _mm256_sub_pd(ix0,jx0);
1454             dy00             = _mm256_sub_pd(iy0,jy0);
1455             dz00             = _mm256_sub_pd(iz0,jz0);
1456             dx11             = _mm256_sub_pd(ix1,jx1);
1457             dy11             = _mm256_sub_pd(iy1,jy1);
1458             dz11             = _mm256_sub_pd(iz1,jz1);
1459             dx12             = _mm256_sub_pd(ix1,jx2);
1460             dy12             = _mm256_sub_pd(iy1,jy2);
1461             dz12             = _mm256_sub_pd(iz1,jz2);
1462             dx13             = _mm256_sub_pd(ix1,jx3);
1463             dy13             = _mm256_sub_pd(iy1,jy3);
1464             dz13             = _mm256_sub_pd(iz1,jz3);
1465             dx21             = _mm256_sub_pd(ix2,jx1);
1466             dy21             = _mm256_sub_pd(iy2,jy1);
1467             dz21             = _mm256_sub_pd(iz2,jz1);
1468             dx22             = _mm256_sub_pd(ix2,jx2);
1469             dy22             = _mm256_sub_pd(iy2,jy2);
1470             dz22             = _mm256_sub_pd(iz2,jz2);
1471             dx23             = _mm256_sub_pd(ix2,jx3);
1472             dy23             = _mm256_sub_pd(iy2,jy3);
1473             dz23             = _mm256_sub_pd(iz2,jz3);
1474             dx31             = _mm256_sub_pd(ix3,jx1);
1475             dy31             = _mm256_sub_pd(iy3,jy1);
1476             dz31             = _mm256_sub_pd(iz3,jz1);
1477             dx32             = _mm256_sub_pd(ix3,jx2);
1478             dy32             = _mm256_sub_pd(iy3,jy2);
1479             dz32             = _mm256_sub_pd(iz3,jz2);
1480             dx33             = _mm256_sub_pd(ix3,jx3);
1481             dy33             = _mm256_sub_pd(iy3,jy3);
1482             dz33             = _mm256_sub_pd(iz3,jz3);
1483
1484             /* Calculate squared distance and things based on it */
1485             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1486             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1487             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1488             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1489             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1490             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1491             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1492             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1493             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1494             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1495
1496             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1497             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1498             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1499             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1500             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1501             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1502             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1503             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1504             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1505
1506             rinvsq00         = gmx_mm256_inv_pd(rsq00);
1507             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1508             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1509             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1510             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1511             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1512             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1513             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1514             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1515             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1516
1517             fjx0             = _mm256_setzero_pd();
1518             fjy0             = _mm256_setzero_pd();
1519             fjz0             = _mm256_setzero_pd();
1520             fjx1             = _mm256_setzero_pd();
1521             fjy1             = _mm256_setzero_pd();
1522             fjz1             = _mm256_setzero_pd();
1523             fjx2             = _mm256_setzero_pd();
1524             fjy2             = _mm256_setzero_pd();
1525             fjz2             = _mm256_setzero_pd();
1526             fjx3             = _mm256_setzero_pd();
1527             fjy3             = _mm256_setzero_pd();
1528             fjz3             = _mm256_setzero_pd();
1529
1530             /**************************
1531              * CALCULATE INTERACTIONS *
1532              **************************/
1533
1534             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1535             {
1536
1537             /* LENNARD-JONES DISPERSION/REPULSION */
1538
1539             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1540             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1541
1542             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1543
1544             fscal            = fvdw;
1545
1546             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1547
1548             /* Calculate temporary vectorial force */
1549             tx               = _mm256_mul_pd(fscal,dx00);
1550             ty               = _mm256_mul_pd(fscal,dy00);
1551             tz               = _mm256_mul_pd(fscal,dz00);
1552
1553             /* Update vectorial force */
1554             fix0             = _mm256_add_pd(fix0,tx);
1555             fiy0             = _mm256_add_pd(fiy0,ty);
1556             fiz0             = _mm256_add_pd(fiz0,tz);
1557
1558             fjx0             = _mm256_add_pd(fjx0,tx);
1559             fjy0             = _mm256_add_pd(fjy0,ty);
1560             fjz0             = _mm256_add_pd(fjz0,tz);
1561
1562             }
1563
1564             /**************************
1565              * CALCULATE INTERACTIONS *
1566              **************************/
1567
1568             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1569             {
1570
1571             /* REACTION-FIELD ELECTROSTATICS */
1572             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1573
1574             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1575
1576             fscal            = felec;
1577
1578             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1579
1580             /* Calculate temporary vectorial force */
1581             tx               = _mm256_mul_pd(fscal,dx11);
1582             ty               = _mm256_mul_pd(fscal,dy11);
1583             tz               = _mm256_mul_pd(fscal,dz11);
1584
1585             /* Update vectorial force */
1586             fix1             = _mm256_add_pd(fix1,tx);
1587             fiy1             = _mm256_add_pd(fiy1,ty);
1588             fiz1             = _mm256_add_pd(fiz1,tz);
1589
1590             fjx1             = _mm256_add_pd(fjx1,tx);
1591             fjy1             = _mm256_add_pd(fjy1,ty);
1592             fjz1             = _mm256_add_pd(fjz1,tz);
1593
1594             }
1595
1596             /**************************
1597              * CALCULATE INTERACTIONS *
1598              **************************/
1599
1600             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1601             {
1602
1603             /* REACTION-FIELD ELECTROSTATICS */
1604             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1605
1606             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1607
1608             fscal            = felec;
1609
1610             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1611
1612             /* Calculate temporary vectorial force */
1613             tx               = _mm256_mul_pd(fscal,dx12);
1614             ty               = _mm256_mul_pd(fscal,dy12);
1615             tz               = _mm256_mul_pd(fscal,dz12);
1616
1617             /* Update vectorial force */
1618             fix1             = _mm256_add_pd(fix1,tx);
1619             fiy1             = _mm256_add_pd(fiy1,ty);
1620             fiz1             = _mm256_add_pd(fiz1,tz);
1621
1622             fjx2             = _mm256_add_pd(fjx2,tx);
1623             fjy2             = _mm256_add_pd(fjy2,ty);
1624             fjz2             = _mm256_add_pd(fjz2,tz);
1625
1626             }
1627
1628             /**************************
1629              * CALCULATE INTERACTIONS *
1630              **************************/
1631
1632             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1633             {
1634
1635             /* REACTION-FIELD ELECTROSTATICS */
1636             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1637
1638             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1639
1640             fscal            = felec;
1641
1642             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1643
1644             /* Calculate temporary vectorial force */
1645             tx               = _mm256_mul_pd(fscal,dx13);
1646             ty               = _mm256_mul_pd(fscal,dy13);
1647             tz               = _mm256_mul_pd(fscal,dz13);
1648
1649             /* Update vectorial force */
1650             fix1             = _mm256_add_pd(fix1,tx);
1651             fiy1             = _mm256_add_pd(fiy1,ty);
1652             fiz1             = _mm256_add_pd(fiz1,tz);
1653
1654             fjx3             = _mm256_add_pd(fjx3,tx);
1655             fjy3             = _mm256_add_pd(fjy3,ty);
1656             fjz3             = _mm256_add_pd(fjz3,tz);
1657
1658             }
1659
1660             /**************************
1661              * CALCULATE INTERACTIONS *
1662              **************************/
1663
1664             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1665             {
1666
1667             /* REACTION-FIELD ELECTROSTATICS */
1668             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1669
1670             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1671
1672             fscal            = felec;
1673
1674             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1675
1676             /* Calculate temporary vectorial force */
1677             tx               = _mm256_mul_pd(fscal,dx21);
1678             ty               = _mm256_mul_pd(fscal,dy21);
1679             tz               = _mm256_mul_pd(fscal,dz21);
1680
1681             /* Update vectorial force */
1682             fix2             = _mm256_add_pd(fix2,tx);
1683             fiy2             = _mm256_add_pd(fiy2,ty);
1684             fiz2             = _mm256_add_pd(fiz2,tz);
1685
1686             fjx1             = _mm256_add_pd(fjx1,tx);
1687             fjy1             = _mm256_add_pd(fjy1,ty);
1688             fjz1             = _mm256_add_pd(fjz1,tz);
1689
1690             }
1691
1692             /**************************
1693              * CALCULATE INTERACTIONS *
1694              **************************/
1695
1696             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1697             {
1698
1699             /* REACTION-FIELD ELECTROSTATICS */
1700             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1701
1702             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1703
1704             fscal            = felec;
1705
1706             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1707
1708             /* Calculate temporary vectorial force */
1709             tx               = _mm256_mul_pd(fscal,dx22);
1710             ty               = _mm256_mul_pd(fscal,dy22);
1711             tz               = _mm256_mul_pd(fscal,dz22);
1712
1713             /* Update vectorial force */
1714             fix2             = _mm256_add_pd(fix2,tx);
1715             fiy2             = _mm256_add_pd(fiy2,ty);
1716             fiz2             = _mm256_add_pd(fiz2,tz);
1717
1718             fjx2             = _mm256_add_pd(fjx2,tx);
1719             fjy2             = _mm256_add_pd(fjy2,ty);
1720             fjz2             = _mm256_add_pd(fjz2,tz);
1721
1722             }
1723
1724             /**************************
1725              * CALCULATE INTERACTIONS *
1726              **************************/
1727
1728             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1729             {
1730
1731             /* REACTION-FIELD ELECTROSTATICS */
1732             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1733
1734             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1735
1736             fscal            = felec;
1737
1738             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1739
1740             /* Calculate temporary vectorial force */
1741             tx               = _mm256_mul_pd(fscal,dx23);
1742             ty               = _mm256_mul_pd(fscal,dy23);
1743             tz               = _mm256_mul_pd(fscal,dz23);
1744
1745             /* Update vectorial force */
1746             fix2             = _mm256_add_pd(fix2,tx);
1747             fiy2             = _mm256_add_pd(fiy2,ty);
1748             fiz2             = _mm256_add_pd(fiz2,tz);
1749
1750             fjx3             = _mm256_add_pd(fjx3,tx);
1751             fjy3             = _mm256_add_pd(fjy3,ty);
1752             fjz3             = _mm256_add_pd(fjz3,tz);
1753
1754             }
1755
1756             /**************************
1757              * CALCULATE INTERACTIONS *
1758              **************************/
1759
1760             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1761             {
1762
1763             /* REACTION-FIELD ELECTROSTATICS */
1764             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1765
1766             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1767
1768             fscal            = felec;
1769
1770             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1771
1772             /* Calculate temporary vectorial force */
1773             tx               = _mm256_mul_pd(fscal,dx31);
1774             ty               = _mm256_mul_pd(fscal,dy31);
1775             tz               = _mm256_mul_pd(fscal,dz31);
1776
1777             /* Update vectorial force */
1778             fix3             = _mm256_add_pd(fix3,tx);
1779             fiy3             = _mm256_add_pd(fiy3,ty);
1780             fiz3             = _mm256_add_pd(fiz3,tz);
1781
1782             fjx1             = _mm256_add_pd(fjx1,tx);
1783             fjy1             = _mm256_add_pd(fjy1,ty);
1784             fjz1             = _mm256_add_pd(fjz1,tz);
1785
1786             }
1787
1788             /**************************
1789              * CALCULATE INTERACTIONS *
1790              **************************/
1791
1792             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1793             {
1794
1795             /* REACTION-FIELD ELECTROSTATICS */
1796             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1797
1798             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1799
1800             fscal            = felec;
1801
1802             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1803
1804             /* Calculate temporary vectorial force */
1805             tx               = _mm256_mul_pd(fscal,dx32);
1806             ty               = _mm256_mul_pd(fscal,dy32);
1807             tz               = _mm256_mul_pd(fscal,dz32);
1808
1809             /* Update vectorial force */
1810             fix3             = _mm256_add_pd(fix3,tx);
1811             fiy3             = _mm256_add_pd(fiy3,ty);
1812             fiz3             = _mm256_add_pd(fiz3,tz);
1813
1814             fjx2             = _mm256_add_pd(fjx2,tx);
1815             fjy2             = _mm256_add_pd(fjy2,ty);
1816             fjz2             = _mm256_add_pd(fjz2,tz);
1817
1818             }
1819
1820             /**************************
1821              * CALCULATE INTERACTIONS *
1822              **************************/
1823
1824             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1825             {
1826
1827             /* REACTION-FIELD ELECTROSTATICS */
1828             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1829
1830             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1831
1832             fscal            = felec;
1833
1834             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1835
1836             /* Calculate temporary vectorial force */
1837             tx               = _mm256_mul_pd(fscal,dx33);
1838             ty               = _mm256_mul_pd(fscal,dy33);
1839             tz               = _mm256_mul_pd(fscal,dz33);
1840
1841             /* Update vectorial force */
1842             fix3             = _mm256_add_pd(fix3,tx);
1843             fiy3             = _mm256_add_pd(fiy3,ty);
1844             fiz3             = _mm256_add_pd(fiz3,tz);
1845
1846             fjx3             = _mm256_add_pd(fjx3,tx);
1847             fjy3             = _mm256_add_pd(fjy3,ty);
1848             fjz3             = _mm256_add_pd(fjz3,tz);
1849
1850             }
1851
1852             fjptrA             = f+j_coord_offsetA;
1853             fjptrB             = f+j_coord_offsetB;
1854             fjptrC             = f+j_coord_offsetC;
1855             fjptrD             = f+j_coord_offsetD;
1856
1857             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1858                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1859                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1860
1861             /* Inner loop uses 303 flops */
1862         }
1863
1864         if(jidx<j_index_end)
1865         {
1866
1867             /* Get j neighbor index, and coordinate index */
1868             jnrlistA         = jjnr[jidx];
1869             jnrlistB         = jjnr[jidx+1];
1870             jnrlistC         = jjnr[jidx+2];
1871             jnrlistD         = jjnr[jidx+3];
1872             /* Sign of each element will be negative for non-real atoms.
1873              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1874              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1875              */
1876             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1877
1878             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1879             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1880             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1881
1882             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1883             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1884             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1885             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1886             j_coord_offsetA  = DIM*jnrA;
1887             j_coord_offsetB  = DIM*jnrB;
1888             j_coord_offsetC  = DIM*jnrC;
1889             j_coord_offsetD  = DIM*jnrD;
1890
1891             /* load j atom coordinates */
1892             gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1893                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1894                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1895                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1896
1897             /* Calculate displacement vector */
1898             dx00             = _mm256_sub_pd(ix0,jx0);
1899             dy00             = _mm256_sub_pd(iy0,jy0);
1900             dz00             = _mm256_sub_pd(iz0,jz0);
1901             dx11             = _mm256_sub_pd(ix1,jx1);
1902             dy11             = _mm256_sub_pd(iy1,jy1);
1903             dz11             = _mm256_sub_pd(iz1,jz1);
1904             dx12             = _mm256_sub_pd(ix1,jx2);
1905             dy12             = _mm256_sub_pd(iy1,jy2);
1906             dz12             = _mm256_sub_pd(iz1,jz2);
1907             dx13             = _mm256_sub_pd(ix1,jx3);
1908             dy13             = _mm256_sub_pd(iy1,jy3);
1909             dz13             = _mm256_sub_pd(iz1,jz3);
1910             dx21             = _mm256_sub_pd(ix2,jx1);
1911             dy21             = _mm256_sub_pd(iy2,jy1);
1912             dz21             = _mm256_sub_pd(iz2,jz1);
1913             dx22             = _mm256_sub_pd(ix2,jx2);
1914             dy22             = _mm256_sub_pd(iy2,jy2);
1915             dz22             = _mm256_sub_pd(iz2,jz2);
1916             dx23             = _mm256_sub_pd(ix2,jx3);
1917             dy23             = _mm256_sub_pd(iy2,jy3);
1918             dz23             = _mm256_sub_pd(iz2,jz3);
1919             dx31             = _mm256_sub_pd(ix3,jx1);
1920             dy31             = _mm256_sub_pd(iy3,jy1);
1921             dz31             = _mm256_sub_pd(iz3,jz1);
1922             dx32             = _mm256_sub_pd(ix3,jx2);
1923             dy32             = _mm256_sub_pd(iy3,jy2);
1924             dz32             = _mm256_sub_pd(iz3,jz2);
1925             dx33             = _mm256_sub_pd(ix3,jx3);
1926             dy33             = _mm256_sub_pd(iy3,jy3);
1927             dz33             = _mm256_sub_pd(iz3,jz3);
1928
1929             /* Calculate squared distance and things based on it */
1930             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1931             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1932             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1933             rsq13            = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1934             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1935             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1936             rsq23            = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1937             rsq31            = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1938             rsq32            = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1939             rsq33            = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1940
1941             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1942             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1943             rinv13           = gmx_mm256_invsqrt_pd(rsq13);
1944             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1945             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1946             rinv23           = gmx_mm256_invsqrt_pd(rsq23);
1947             rinv31           = gmx_mm256_invsqrt_pd(rsq31);
1948             rinv32           = gmx_mm256_invsqrt_pd(rsq32);
1949             rinv33           = gmx_mm256_invsqrt_pd(rsq33);
1950
1951             rinvsq00         = gmx_mm256_inv_pd(rsq00);
1952             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1953             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1954             rinvsq13         = _mm256_mul_pd(rinv13,rinv13);
1955             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1956             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1957             rinvsq23         = _mm256_mul_pd(rinv23,rinv23);
1958             rinvsq31         = _mm256_mul_pd(rinv31,rinv31);
1959             rinvsq32         = _mm256_mul_pd(rinv32,rinv32);
1960             rinvsq33         = _mm256_mul_pd(rinv33,rinv33);
1961
1962             fjx0             = _mm256_setzero_pd();
1963             fjy0             = _mm256_setzero_pd();
1964             fjz0             = _mm256_setzero_pd();
1965             fjx1             = _mm256_setzero_pd();
1966             fjy1             = _mm256_setzero_pd();
1967             fjz1             = _mm256_setzero_pd();
1968             fjx2             = _mm256_setzero_pd();
1969             fjy2             = _mm256_setzero_pd();
1970             fjz2             = _mm256_setzero_pd();
1971             fjx3             = _mm256_setzero_pd();
1972             fjy3             = _mm256_setzero_pd();
1973             fjz3             = _mm256_setzero_pd();
1974
1975             /**************************
1976              * CALCULATE INTERACTIONS *
1977              **************************/
1978
1979             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1980             {
1981
1982             /* LENNARD-JONES DISPERSION/REPULSION */
1983
1984             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1985             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1986
1987             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1988
1989             fscal            = fvdw;
1990
1991             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1992
1993             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1994
1995             /* Calculate temporary vectorial force */
1996             tx               = _mm256_mul_pd(fscal,dx00);
1997             ty               = _mm256_mul_pd(fscal,dy00);
1998             tz               = _mm256_mul_pd(fscal,dz00);
1999
2000             /* Update vectorial force */
2001             fix0             = _mm256_add_pd(fix0,tx);
2002             fiy0             = _mm256_add_pd(fiy0,ty);
2003             fiz0             = _mm256_add_pd(fiz0,tz);
2004
2005             fjx0             = _mm256_add_pd(fjx0,tx);
2006             fjy0             = _mm256_add_pd(fjy0,ty);
2007             fjz0             = _mm256_add_pd(fjz0,tz);
2008
2009             }
2010
2011             /**************************
2012              * CALCULATE INTERACTIONS *
2013              **************************/
2014
2015             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2016             {
2017
2018             /* REACTION-FIELD ELECTROSTATICS */
2019             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2020
2021             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2022
2023             fscal            = felec;
2024
2025             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2026
2027             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2028
2029             /* Calculate temporary vectorial force */
2030             tx               = _mm256_mul_pd(fscal,dx11);
2031             ty               = _mm256_mul_pd(fscal,dy11);
2032             tz               = _mm256_mul_pd(fscal,dz11);
2033
2034             /* Update vectorial force */
2035             fix1             = _mm256_add_pd(fix1,tx);
2036             fiy1             = _mm256_add_pd(fiy1,ty);
2037             fiz1             = _mm256_add_pd(fiz1,tz);
2038
2039             fjx1             = _mm256_add_pd(fjx1,tx);
2040             fjy1             = _mm256_add_pd(fjy1,ty);
2041             fjz1             = _mm256_add_pd(fjz1,tz);
2042
2043             }
2044
2045             /**************************
2046              * CALCULATE INTERACTIONS *
2047              **************************/
2048
2049             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2050             {
2051
2052             /* REACTION-FIELD ELECTROSTATICS */
2053             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2054
2055             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2056
2057             fscal            = felec;
2058
2059             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2060
2061             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2062
2063             /* Calculate temporary vectorial force */
2064             tx               = _mm256_mul_pd(fscal,dx12);
2065             ty               = _mm256_mul_pd(fscal,dy12);
2066             tz               = _mm256_mul_pd(fscal,dz12);
2067
2068             /* Update vectorial force */
2069             fix1             = _mm256_add_pd(fix1,tx);
2070             fiy1             = _mm256_add_pd(fiy1,ty);
2071             fiz1             = _mm256_add_pd(fiz1,tz);
2072
2073             fjx2             = _mm256_add_pd(fjx2,tx);
2074             fjy2             = _mm256_add_pd(fjy2,ty);
2075             fjz2             = _mm256_add_pd(fjz2,tz);
2076
2077             }
2078
2079             /**************************
2080              * CALCULATE INTERACTIONS *
2081              **************************/
2082
2083             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2084             {
2085
2086             /* REACTION-FIELD ELECTROSTATICS */
2087             felec            = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
2088
2089             cutoff_mask      = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2090
2091             fscal            = felec;
2092
2093             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2094
2095             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2096
2097             /* Calculate temporary vectorial force */
2098             tx               = _mm256_mul_pd(fscal,dx13);
2099             ty               = _mm256_mul_pd(fscal,dy13);
2100             tz               = _mm256_mul_pd(fscal,dz13);
2101
2102             /* Update vectorial force */
2103             fix1             = _mm256_add_pd(fix1,tx);
2104             fiy1             = _mm256_add_pd(fiy1,ty);
2105             fiz1             = _mm256_add_pd(fiz1,tz);
2106
2107             fjx3             = _mm256_add_pd(fjx3,tx);
2108             fjy3             = _mm256_add_pd(fjy3,ty);
2109             fjz3             = _mm256_add_pd(fjz3,tz);
2110
2111             }
2112
2113             /**************************
2114              * CALCULATE INTERACTIONS *
2115              **************************/
2116
2117             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2118             {
2119
2120             /* REACTION-FIELD ELECTROSTATICS */
2121             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2122
2123             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2124
2125             fscal            = felec;
2126
2127             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2128
2129             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2130
2131             /* Calculate temporary vectorial force */
2132             tx               = _mm256_mul_pd(fscal,dx21);
2133             ty               = _mm256_mul_pd(fscal,dy21);
2134             tz               = _mm256_mul_pd(fscal,dz21);
2135
2136             /* Update vectorial force */
2137             fix2             = _mm256_add_pd(fix2,tx);
2138             fiy2             = _mm256_add_pd(fiy2,ty);
2139             fiz2             = _mm256_add_pd(fiz2,tz);
2140
2141             fjx1             = _mm256_add_pd(fjx1,tx);
2142             fjy1             = _mm256_add_pd(fjy1,ty);
2143             fjz1             = _mm256_add_pd(fjz1,tz);
2144
2145             }
2146
2147             /**************************
2148              * CALCULATE INTERACTIONS *
2149              **************************/
2150
2151             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2152             {
2153
2154             /* REACTION-FIELD ELECTROSTATICS */
2155             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2156
2157             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2158
2159             fscal            = felec;
2160
2161             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2162
2163             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2164
2165             /* Calculate temporary vectorial force */
2166             tx               = _mm256_mul_pd(fscal,dx22);
2167             ty               = _mm256_mul_pd(fscal,dy22);
2168             tz               = _mm256_mul_pd(fscal,dz22);
2169
2170             /* Update vectorial force */
2171             fix2             = _mm256_add_pd(fix2,tx);
2172             fiy2             = _mm256_add_pd(fiy2,ty);
2173             fiz2             = _mm256_add_pd(fiz2,tz);
2174
2175             fjx2             = _mm256_add_pd(fjx2,tx);
2176             fjy2             = _mm256_add_pd(fjy2,ty);
2177             fjz2             = _mm256_add_pd(fjz2,tz);
2178
2179             }
2180
2181             /**************************
2182              * CALCULATE INTERACTIONS *
2183              **************************/
2184
2185             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2186             {
2187
2188             /* REACTION-FIELD ELECTROSTATICS */
2189             felec            = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
2190
2191             cutoff_mask      = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2192
2193             fscal            = felec;
2194
2195             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2196
2197             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2198
2199             /* Calculate temporary vectorial force */
2200             tx               = _mm256_mul_pd(fscal,dx23);
2201             ty               = _mm256_mul_pd(fscal,dy23);
2202             tz               = _mm256_mul_pd(fscal,dz23);
2203
2204             /* Update vectorial force */
2205             fix2             = _mm256_add_pd(fix2,tx);
2206             fiy2             = _mm256_add_pd(fiy2,ty);
2207             fiz2             = _mm256_add_pd(fiz2,tz);
2208
2209             fjx3             = _mm256_add_pd(fjx3,tx);
2210             fjy3             = _mm256_add_pd(fjy3,ty);
2211             fjz3             = _mm256_add_pd(fjz3,tz);
2212
2213             }
2214
2215             /**************************
2216              * CALCULATE INTERACTIONS *
2217              **************************/
2218
2219             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2220             {
2221
2222             /* REACTION-FIELD ELECTROSTATICS */
2223             felec            = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
2224
2225             cutoff_mask      = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2226
2227             fscal            = felec;
2228
2229             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2230
2231             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2232
2233             /* Calculate temporary vectorial force */
2234             tx               = _mm256_mul_pd(fscal,dx31);
2235             ty               = _mm256_mul_pd(fscal,dy31);
2236             tz               = _mm256_mul_pd(fscal,dz31);
2237
2238             /* Update vectorial force */
2239             fix3             = _mm256_add_pd(fix3,tx);
2240             fiy3             = _mm256_add_pd(fiy3,ty);
2241             fiz3             = _mm256_add_pd(fiz3,tz);
2242
2243             fjx1             = _mm256_add_pd(fjx1,tx);
2244             fjy1             = _mm256_add_pd(fjy1,ty);
2245             fjz1             = _mm256_add_pd(fjz1,tz);
2246
2247             }
2248
2249             /**************************
2250              * CALCULATE INTERACTIONS *
2251              **************************/
2252
2253             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2254             {
2255
2256             /* REACTION-FIELD ELECTROSTATICS */
2257             felec            = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
2258
2259             cutoff_mask      = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2260
2261             fscal            = felec;
2262
2263             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2264
2265             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2266
2267             /* Calculate temporary vectorial force */
2268             tx               = _mm256_mul_pd(fscal,dx32);
2269             ty               = _mm256_mul_pd(fscal,dy32);
2270             tz               = _mm256_mul_pd(fscal,dz32);
2271
2272             /* Update vectorial force */
2273             fix3             = _mm256_add_pd(fix3,tx);
2274             fiy3             = _mm256_add_pd(fiy3,ty);
2275             fiz3             = _mm256_add_pd(fiz3,tz);
2276
2277             fjx2             = _mm256_add_pd(fjx2,tx);
2278             fjy2             = _mm256_add_pd(fjy2,ty);
2279             fjz2             = _mm256_add_pd(fjz2,tz);
2280
2281             }
2282
2283             /**************************
2284              * CALCULATE INTERACTIONS *
2285              **************************/
2286
2287             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2288             {
2289
2290             /* REACTION-FIELD ELECTROSTATICS */
2291             felec            = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
2292
2293             cutoff_mask      = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2294
2295             fscal            = felec;
2296
2297             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2298
2299             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2300
2301             /* Calculate temporary vectorial force */
2302             tx               = _mm256_mul_pd(fscal,dx33);
2303             ty               = _mm256_mul_pd(fscal,dy33);
2304             tz               = _mm256_mul_pd(fscal,dz33);
2305
2306             /* Update vectorial force */
2307             fix3             = _mm256_add_pd(fix3,tx);
2308             fiy3             = _mm256_add_pd(fiy3,ty);
2309             fiz3             = _mm256_add_pd(fiz3,tz);
2310
2311             fjx3             = _mm256_add_pd(fjx3,tx);
2312             fjy3             = _mm256_add_pd(fjy3,ty);
2313             fjz3             = _mm256_add_pd(fjz3,tz);
2314
2315             }
2316
2317             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2318             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2319             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2320             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2321
2322             gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2323                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2324                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2325
2326             /* Inner loop uses 303 flops */
2327         }
2328
2329         /* End of innermost loop */
2330
2331         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2332                                                  f+i_coord_offset,fshift+i_shift_offset);
2333
2334         /* Increment number of inner iterations */
2335         inneriter                  += j_index_end - j_index_start;
2336
2337         /* Outer loop uses 24 flops */
2338     }
2339
2340     /* Increment number of outer iterations */
2341     outeriter        += nri;
2342
2343     /* Update outer/inner flops */
2344
2345     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
2346 }