Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_avx_256_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_single
51  * Electrostatics interaction: ReactionField
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrE,jnrF,jnrG,jnrH;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
83     real             scratch[4*DIM];
84     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     real *           vdwioffsetptr0;
86     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     real *           vdwioffsetptr1;
88     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     real *           vdwioffsetptr2;
90     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
92     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
94     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
96     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
113     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
114     __m256           dummy_mask,cutoff_mask;
115     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
116     __m256           one     = _mm256_set1_ps(1.0);
117     __m256           two     = _mm256_set1_ps(2.0);
118     x                = xx[0];
119     f                = ff[0];
120
121     nri              = nlist->nri;
122     iinr             = nlist->iinr;
123     jindex           = nlist->jindex;
124     jjnr             = nlist->jjnr;
125     shiftidx         = nlist->shift;
126     gid              = nlist->gid;
127     shiftvec         = fr->shift_vec[0];
128     fshift           = fr->fshift[0];
129     facel            = _mm256_set1_ps(fr->ic->epsfac);
130     charge           = mdatoms->chargeA;
131     krf              = _mm256_set1_ps(fr->ic->k_rf);
132     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
133     crf              = _mm256_set1_ps(fr->ic->c_rf);
134     nvdwtype         = fr->ntype;
135     vdwparam         = fr->nbfp;
136     vdwtype          = mdatoms->typeA;
137
138     /* Setup water-specific parameters */
139     inr              = nlist->iinr[0];
140     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
141     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
142     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
143     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
144
145     jq0              = _mm256_set1_ps(charge[inr+0]);
146     jq1              = _mm256_set1_ps(charge[inr+1]);
147     jq2              = _mm256_set1_ps(charge[inr+2]);
148     vdwjidx0A        = 2*vdwtype[inr+0];
149     qq00             = _mm256_mul_ps(iq0,jq0);
150     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
151     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
152     qq01             = _mm256_mul_ps(iq0,jq1);
153     qq02             = _mm256_mul_ps(iq0,jq2);
154     qq10             = _mm256_mul_ps(iq1,jq0);
155     qq11             = _mm256_mul_ps(iq1,jq1);
156     qq12             = _mm256_mul_ps(iq1,jq2);
157     qq20             = _mm256_mul_ps(iq2,jq0);
158     qq21             = _mm256_mul_ps(iq2,jq1);
159     qq22             = _mm256_mul_ps(iq2,jq2);
160
161     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
162     rcutoff_scalar   = fr->ic->rcoulomb;
163     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
164     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
165
166     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
167     rvdw             = _mm256_set1_ps(fr->ic->rvdw);
168
169     /* Avoid stupid compiler warnings */
170     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
171     j_coord_offsetA = 0;
172     j_coord_offsetB = 0;
173     j_coord_offsetC = 0;
174     j_coord_offsetD = 0;
175     j_coord_offsetE = 0;
176     j_coord_offsetF = 0;
177     j_coord_offsetG = 0;
178     j_coord_offsetH = 0;
179
180     outeriter        = 0;
181     inneriter        = 0;
182
183     for(iidx=0;iidx<4*DIM;iidx++)
184     {
185         scratch[iidx] = 0.0;
186     }
187
188     /* Start outer loop over neighborlists */
189     for(iidx=0; iidx<nri; iidx++)
190     {
191         /* Load shift vector for this list */
192         i_shift_offset   = DIM*shiftidx[iidx];
193
194         /* Load limits for loop over neighbors */
195         j_index_start    = jindex[iidx];
196         j_index_end      = jindex[iidx+1];
197
198         /* Get outer coordinate index */
199         inr              = iinr[iidx];
200         i_coord_offset   = DIM*inr;
201
202         /* Load i particle coords and add shift vector */
203         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
204                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
205
206         fix0             = _mm256_setzero_ps();
207         fiy0             = _mm256_setzero_ps();
208         fiz0             = _mm256_setzero_ps();
209         fix1             = _mm256_setzero_ps();
210         fiy1             = _mm256_setzero_ps();
211         fiz1             = _mm256_setzero_ps();
212         fix2             = _mm256_setzero_ps();
213         fiy2             = _mm256_setzero_ps();
214         fiz2             = _mm256_setzero_ps();
215
216         /* Reset potential sums */
217         velecsum         = _mm256_setzero_ps();
218         vvdwsum          = _mm256_setzero_ps();
219
220         /* Start inner kernel loop */
221         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
222         {
223
224             /* Get j neighbor index, and coordinate index */
225             jnrA             = jjnr[jidx];
226             jnrB             = jjnr[jidx+1];
227             jnrC             = jjnr[jidx+2];
228             jnrD             = jjnr[jidx+3];
229             jnrE             = jjnr[jidx+4];
230             jnrF             = jjnr[jidx+5];
231             jnrG             = jjnr[jidx+6];
232             jnrH             = jjnr[jidx+7];
233             j_coord_offsetA  = DIM*jnrA;
234             j_coord_offsetB  = DIM*jnrB;
235             j_coord_offsetC  = DIM*jnrC;
236             j_coord_offsetD  = DIM*jnrD;
237             j_coord_offsetE  = DIM*jnrE;
238             j_coord_offsetF  = DIM*jnrF;
239             j_coord_offsetG  = DIM*jnrG;
240             j_coord_offsetH  = DIM*jnrH;
241
242             /* load j atom coordinates */
243             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
244                                                  x+j_coord_offsetC,x+j_coord_offsetD,
245                                                  x+j_coord_offsetE,x+j_coord_offsetF,
246                                                  x+j_coord_offsetG,x+j_coord_offsetH,
247                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
248
249             /* Calculate displacement vector */
250             dx00             = _mm256_sub_ps(ix0,jx0);
251             dy00             = _mm256_sub_ps(iy0,jy0);
252             dz00             = _mm256_sub_ps(iz0,jz0);
253             dx01             = _mm256_sub_ps(ix0,jx1);
254             dy01             = _mm256_sub_ps(iy0,jy1);
255             dz01             = _mm256_sub_ps(iz0,jz1);
256             dx02             = _mm256_sub_ps(ix0,jx2);
257             dy02             = _mm256_sub_ps(iy0,jy2);
258             dz02             = _mm256_sub_ps(iz0,jz2);
259             dx10             = _mm256_sub_ps(ix1,jx0);
260             dy10             = _mm256_sub_ps(iy1,jy0);
261             dz10             = _mm256_sub_ps(iz1,jz0);
262             dx11             = _mm256_sub_ps(ix1,jx1);
263             dy11             = _mm256_sub_ps(iy1,jy1);
264             dz11             = _mm256_sub_ps(iz1,jz1);
265             dx12             = _mm256_sub_ps(ix1,jx2);
266             dy12             = _mm256_sub_ps(iy1,jy2);
267             dz12             = _mm256_sub_ps(iz1,jz2);
268             dx20             = _mm256_sub_ps(ix2,jx0);
269             dy20             = _mm256_sub_ps(iy2,jy0);
270             dz20             = _mm256_sub_ps(iz2,jz0);
271             dx21             = _mm256_sub_ps(ix2,jx1);
272             dy21             = _mm256_sub_ps(iy2,jy1);
273             dz21             = _mm256_sub_ps(iz2,jz1);
274             dx22             = _mm256_sub_ps(ix2,jx2);
275             dy22             = _mm256_sub_ps(iy2,jy2);
276             dz22             = _mm256_sub_ps(iz2,jz2);
277
278             /* Calculate squared distance and things based on it */
279             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
280             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
281             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
282             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
283             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
284             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
285             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
286             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
287             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
288
289             rinv00           = avx256_invsqrt_f(rsq00);
290             rinv01           = avx256_invsqrt_f(rsq01);
291             rinv02           = avx256_invsqrt_f(rsq02);
292             rinv10           = avx256_invsqrt_f(rsq10);
293             rinv11           = avx256_invsqrt_f(rsq11);
294             rinv12           = avx256_invsqrt_f(rsq12);
295             rinv20           = avx256_invsqrt_f(rsq20);
296             rinv21           = avx256_invsqrt_f(rsq21);
297             rinv22           = avx256_invsqrt_f(rsq22);
298
299             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
300             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
301             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
302             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
303             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
304             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
305             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
306             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
307             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
308
309             fjx0             = _mm256_setzero_ps();
310             fjy0             = _mm256_setzero_ps();
311             fjz0             = _mm256_setzero_ps();
312             fjx1             = _mm256_setzero_ps();
313             fjy1             = _mm256_setzero_ps();
314             fjz1             = _mm256_setzero_ps();
315             fjx2             = _mm256_setzero_ps();
316             fjy2             = _mm256_setzero_ps();
317             fjz2             = _mm256_setzero_ps();
318
319             /**************************
320              * CALCULATE INTERACTIONS *
321              **************************/
322
323             if (gmx_mm256_any_lt(rsq00,rcutoff2))
324             {
325
326             /* REACTION-FIELD ELECTROSTATICS */
327             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
328             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
329
330             /* LENNARD-JONES DISPERSION/REPULSION */
331
332             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
333             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
334             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
335             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
336                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
337             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
338
339             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
340
341             /* Update potential sum for this i atom from the interaction with this j atom. */
342             velec            = _mm256_and_ps(velec,cutoff_mask);
343             velecsum         = _mm256_add_ps(velecsum,velec);
344             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
345             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
346
347             fscal            = _mm256_add_ps(felec,fvdw);
348
349             fscal            = _mm256_and_ps(fscal,cutoff_mask);
350
351             /* Calculate temporary vectorial force */
352             tx               = _mm256_mul_ps(fscal,dx00);
353             ty               = _mm256_mul_ps(fscal,dy00);
354             tz               = _mm256_mul_ps(fscal,dz00);
355
356             /* Update vectorial force */
357             fix0             = _mm256_add_ps(fix0,tx);
358             fiy0             = _mm256_add_ps(fiy0,ty);
359             fiz0             = _mm256_add_ps(fiz0,tz);
360
361             fjx0             = _mm256_add_ps(fjx0,tx);
362             fjy0             = _mm256_add_ps(fjy0,ty);
363             fjz0             = _mm256_add_ps(fjz0,tz);
364
365             }
366
367             /**************************
368              * CALCULATE INTERACTIONS *
369              **************************/
370
371             if (gmx_mm256_any_lt(rsq01,rcutoff2))
372             {
373
374             /* REACTION-FIELD ELECTROSTATICS */
375             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
376             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
377
378             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
379
380             /* Update potential sum for this i atom from the interaction with this j atom. */
381             velec            = _mm256_and_ps(velec,cutoff_mask);
382             velecsum         = _mm256_add_ps(velecsum,velec);
383
384             fscal            = felec;
385
386             fscal            = _mm256_and_ps(fscal,cutoff_mask);
387
388             /* Calculate temporary vectorial force */
389             tx               = _mm256_mul_ps(fscal,dx01);
390             ty               = _mm256_mul_ps(fscal,dy01);
391             tz               = _mm256_mul_ps(fscal,dz01);
392
393             /* Update vectorial force */
394             fix0             = _mm256_add_ps(fix0,tx);
395             fiy0             = _mm256_add_ps(fiy0,ty);
396             fiz0             = _mm256_add_ps(fiz0,tz);
397
398             fjx1             = _mm256_add_ps(fjx1,tx);
399             fjy1             = _mm256_add_ps(fjy1,ty);
400             fjz1             = _mm256_add_ps(fjz1,tz);
401
402             }
403
404             /**************************
405              * CALCULATE INTERACTIONS *
406              **************************/
407
408             if (gmx_mm256_any_lt(rsq02,rcutoff2))
409             {
410
411             /* REACTION-FIELD ELECTROSTATICS */
412             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
413             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
414
415             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
416
417             /* Update potential sum for this i atom from the interaction with this j atom. */
418             velec            = _mm256_and_ps(velec,cutoff_mask);
419             velecsum         = _mm256_add_ps(velecsum,velec);
420
421             fscal            = felec;
422
423             fscal            = _mm256_and_ps(fscal,cutoff_mask);
424
425             /* Calculate temporary vectorial force */
426             tx               = _mm256_mul_ps(fscal,dx02);
427             ty               = _mm256_mul_ps(fscal,dy02);
428             tz               = _mm256_mul_ps(fscal,dz02);
429
430             /* Update vectorial force */
431             fix0             = _mm256_add_ps(fix0,tx);
432             fiy0             = _mm256_add_ps(fiy0,ty);
433             fiz0             = _mm256_add_ps(fiz0,tz);
434
435             fjx2             = _mm256_add_ps(fjx2,tx);
436             fjy2             = _mm256_add_ps(fjy2,ty);
437             fjz2             = _mm256_add_ps(fjz2,tz);
438
439             }
440
441             /**************************
442              * CALCULATE INTERACTIONS *
443              **************************/
444
445             if (gmx_mm256_any_lt(rsq10,rcutoff2))
446             {
447
448             /* REACTION-FIELD ELECTROSTATICS */
449             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
450             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
451
452             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
453
454             /* Update potential sum for this i atom from the interaction with this j atom. */
455             velec            = _mm256_and_ps(velec,cutoff_mask);
456             velecsum         = _mm256_add_ps(velecsum,velec);
457
458             fscal            = felec;
459
460             fscal            = _mm256_and_ps(fscal,cutoff_mask);
461
462             /* Calculate temporary vectorial force */
463             tx               = _mm256_mul_ps(fscal,dx10);
464             ty               = _mm256_mul_ps(fscal,dy10);
465             tz               = _mm256_mul_ps(fscal,dz10);
466
467             /* Update vectorial force */
468             fix1             = _mm256_add_ps(fix1,tx);
469             fiy1             = _mm256_add_ps(fiy1,ty);
470             fiz1             = _mm256_add_ps(fiz1,tz);
471
472             fjx0             = _mm256_add_ps(fjx0,tx);
473             fjy0             = _mm256_add_ps(fjy0,ty);
474             fjz0             = _mm256_add_ps(fjz0,tz);
475
476             }
477
478             /**************************
479              * CALCULATE INTERACTIONS *
480              **************************/
481
482             if (gmx_mm256_any_lt(rsq11,rcutoff2))
483             {
484
485             /* REACTION-FIELD ELECTROSTATICS */
486             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
487             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
488
489             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
490
491             /* Update potential sum for this i atom from the interaction with this j atom. */
492             velec            = _mm256_and_ps(velec,cutoff_mask);
493             velecsum         = _mm256_add_ps(velecsum,velec);
494
495             fscal            = felec;
496
497             fscal            = _mm256_and_ps(fscal,cutoff_mask);
498
499             /* Calculate temporary vectorial force */
500             tx               = _mm256_mul_ps(fscal,dx11);
501             ty               = _mm256_mul_ps(fscal,dy11);
502             tz               = _mm256_mul_ps(fscal,dz11);
503
504             /* Update vectorial force */
505             fix1             = _mm256_add_ps(fix1,tx);
506             fiy1             = _mm256_add_ps(fiy1,ty);
507             fiz1             = _mm256_add_ps(fiz1,tz);
508
509             fjx1             = _mm256_add_ps(fjx1,tx);
510             fjy1             = _mm256_add_ps(fjy1,ty);
511             fjz1             = _mm256_add_ps(fjz1,tz);
512
513             }
514
515             /**************************
516              * CALCULATE INTERACTIONS *
517              **************************/
518
519             if (gmx_mm256_any_lt(rsq12,rcutoff2))
520             {
521
522             /* REACTION-FIELD ELECTROSTATICS */
523             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
524             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
525
526             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
527
528             /* Update potential sum for this i atom from the interaction with this j atom. */
529             velec            = _mm256_and_ps(velec,cutoff_mask);
530             velecsum         = _mm256_add_ps(velecsum,velec);
531
532             fscal            = felec;
533
534             fscal            = _mm256_and_ps(fscal,cutoff_mask);
535
536             /* Calculate temporary vectorial force */
537             tx               = _mm256_mul_ps(fscal,dx12);
538             ty               = _mm256_mul_ps(fscal,dy12);
539             tz               = _mm256_mul_ps(fscal,dz12);
540
541             /* Update vectorial force */
542             fix1             = _mm256_add_ps(fix1,tx);
543             fiy1             = _mm256_add_ps(fiy1,ty);
544             fiz1             = _mm256_add_ps(fiz1,tz);
545
546             fjx2             = _mm256_add_ps(fjx2,tx);
547             fjy2             = _mm256_add_ps(fjy2,ty);
548             fjz2             = _mm256_add_ps(fjz2,tz);
549
550             }
551
552             /**************************
553              * CALCULATE INTERACTIONS *
554              **************************/
555
556             if (gmx_mm256_any_lt(rsq20,rcutoff2))
557             {
558
559             /* REACTION-FIELD ELECTROSTATICS */
560             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
561             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
562
563             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
564
565             /* Update potential sum for this i atom from the interaction with this j atom. */
566             velec            = _mm256_and_ps(velec,cutoff_mask);
567             velecsum         = _mm256_add_ps(velecsum,velec);
568
569             fscal            = felec;
570
571             fscal            = _mm256_and_ps(fscal,cutoff_mask);
572
573             /* Calculate temporary vectorial force */
574             tx               = _mm256_mul_ps(fscal,dx20);
575             ty               = _mm256_mul_ps(fscal,dy20);
576             tz               = _mm256_mul_ps(fscal,dz20);
577
578             /* Update vectorial force */
579             fix2             = _mm256_add_ps(fix2,tx);
580             fiy2             = _mm256_add_ps(fiy2,ty);
581             fiz2             = _mm256_add_ps(fiz2,tz);
582
583             fjx0             = _mm256_add_ps(fjx0,tx);
584             fjy0             = _mm256_add_ps(fjy0,ty);
585             fjz0             = _mm256_add_ps(fjz0,tz);
586
587             }
588
589             /**************************
590              * CALCULATE INTERACTIONS *
591              **************************/
592
593             if (gmx_mm256_any_lt(rsq21,rcutoff2))
594             {
595
596             /* REACTION-FIELD ELECTROSTATICS */
597             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
598             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
599
600             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
601
602             /* Update potential sum for this i atom from the interaction with this j atom. */
603             velec            = _mm256_and_ps(velec,cutoff_mask);
604             velecsum         = _mm256_add_ps(velecsum,velec);
605
606             fscal            = felec;
607
608             fscal            = _mm256_and_ps(fscal,cutoff_mask);
609
610             /* Calculate temporary vectorial force */
611             tx               = _mm256_mul_ps(fscal,dx21);
612             ty               = _mm256_mul_ps(fscal,dy21);
613             tz               = _mm256_mul_ps(fscal,dz21);
614
615             /* Update vectorial force */
616             fix2             = _mm256_add_ps(fix2,tx);
617             fiy2             = _mm256_add_ps(fiy2,ty);
618             fiz2             = _mm256_add_ps(fiz2,tz);
619
620             fjx1             = _mm256_add_ps(fjx1,tx);
621             fjy1             = _mm256_add_ps(fjy1,ty);
622             fjz1             = _mm256_add_ps(fjz1,tz);
623
624             }
625
626             /**************************
627              * CALCULATE INTERACTIONS *
628              **************************/
629
630             if (gmx_mm256_any_lt(rsq22,rcutoff2))
631             {
632
633             /* REACTION-FIELD ELECTROSTATICS */
634             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
635             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
636
637             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
638
639             /* Update potential sum for this i atom from the interaction with this j atom. */
640             velec            = _mm256_and_ps(velec,cutoff_mask);
641             velecsum         = _mm256_add_ps(velecsum,velec);
642
643             fscal            = felec;
644
645             fscal            = _mm256_and_ps(fscal,cutoff_mask);
646
647             /* Calculate temporary vectorial force */
648             tx               = _mm256_mul_ps(fscal,dx22);
649             ty               = _mm256_mul_ps(fscal,dy22);
650             tz               = _mm256_mul_ps(fscal,dz22);
651
652             /* Update vectorial force */
653             fix2             = _mm256_add_ps(fix2,tx);
654             fiy2             = _mm256_add_ps(fiy2,ty);
655             fiz2             = _mm256_add_ps(fiz2,tz);
656
657             fjx2             = _mm256_add_ps(fjx2,tx);
658             fjy2             = _mm256_add_ps(fjy2,ty);
659             fjz2             = _mm256_add_ps(fjz2,tz);
660
661             }
662
663             fjptrA             = f+j_coord_offsetA;
664             fjptrB             = f+j_coord_offsetB;
665             fjptrC             = f+j_coord_offsetC;
666             fjptrD             = f+j_coord_offsetD;
667             fjptrE             = f+j_coord_offsetE;
668             fjptrF             = f+j_coord_offsetF;
669             fjptrG             = f+j_coord_offsetG;
670             fjptrH             = f+j_coord_offsetH;
671
672             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
673                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
674
675             /* Inner loop uses 342 flops */
676         }
677
678         if(jidx<j_index_end)
679         {
680
681             /* Get j neighbor index, and coordinate index */
682             jnrlistA         = jjnr[jidx];
683             jnrlistB         = jjnr[jidx+1];
684             jnrlistC         = jjnr[jidx+2];
685             jnrlistD         = jjnr[jidx+3];
686             jnrlistE         = jjnr[jidx+4];
687             jnrlistF         = jjnr[jidx+5];
688             jnrlistG         = jjnr[jidx+6];
689             jnrlistH         = jjnr[jidx+7];
690             /* Sign of each element will be negative for non-real atoms.
691              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
692              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
693              */
694             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
695                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
696                                             
697             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
698             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
699             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
700             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
701             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
702             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
703             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
704             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
705             j_coord_offsetA  = DIM*jnrA;
706             j_coord_offsetB  = DIM*jnrB;
707             j_coord_offsetC  = DIM*jnrC;
708             j_coord_offsetD  = DIM*jnrD;
709             j_coord_offsetE  = DIM*jnrE;
710             j_coord_offsetF  = DIM*jnrF;
711             j_coord_offsetG  = DIM*jnrG;
712             j_coord_offsetH  = DIM*jnrH;
713
714             /* load j atom coordinates */
715             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
716                                                  x+j_coord_offsetC,x+j_coord_offsetD,
717                                                  x+j_coord_offsetE,x+j_coord_offsetF,
718                                                  x+j_coord_offsetG,x+j_coord_offsetH,
719                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
720
721             /* Calculate displacement vector */
722             dx00             = _mm256_sub_ps(ix0,jx0);
723             dy00             = _mm256_sub_ps(iy0,jy0);
724             dz00             = _mm256_sub_ps(iz0,jz0);
725             dx01             = _mm256_sub_ps(ix0,jx1);
726             dy01             = _mm256_sub_ps(iy0,jy1);
727             dz01             = _mm256_sub_ps(iz0,jz1);
728             dx02             = _mm256_sub_ps(ix0,jx2);
729             dy02             = _mm256_sub_ps(iy0,jy2);
730             dz02             = _mm256_sub_ps(iz0,jz2);
731             dx10             = _mm256_sub_ps(ix1,jx0);
732             dy10             = _mm256_sub_ps(iy1,jy0);
733             dz10             = _mm256_sub_ps(iz1,jz0);
734             dx11             = _mm256_sub_ps(ix1,jx1);
735             dy11             = _mm256_sub_ps(iy1,jy1);
736             dz11             = _mm256_sub_ps(iz1,jz1);
737             dx12             = _mm256_sub_ps(ix1,jx2);
738             dy12             = _mm256_sub_ps(iy1,jy2);
739             dz12             = _mm256_sub_ps(iz1,jz2);
740             dx20             = _mm256_sub_ps(ix2,jx0);
741             dy20             = _mm256_sub_ps(iy2,jy0);
742             dz20             = _mm256_sub_ps(iz2,jz0);
743             dx21             = _mm256_sub_ps(ix2,jx1);
744             dy21             = _mm256_sub_ps(iy2,jy1);
745             dz21             = _mm256_sub_ps(iz2,jz1);
746             dx22             = _mm256_sub_ps(ix2,jx2);
747             dy22             = _mm256_sub_ps(iy2,jy2);
748             dz22             = _mm256_sub_ps(iz2,jz2);
749
750             /* Calculate squared distance and things based on it */
751             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
752             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
753             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
754             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
755             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
756             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
757             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
758             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
759             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
760
761             rinv00           = avx256_invsqrt_f(rsq00);
762             rinv01           = avx256_invsqrt_f(rsq01);
763             rinv02           = avx256_invsqrt_f(rsq02);
764             rinv10           = avx256_invsqrt_f(rsq10);
765             rinv11           = avx256_invsqrt_f(rsq11);
766             rinv12           = avx256_invsqrt_f(rsq12);
767             rinv20           = avx256_invsqrt_f(rsq20);
768             rinv21           = avx256_invsqrt_f(rsq21);
769             rinv22           = avx256_invsqrt_f(rsq22);
770
771             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
772             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
773             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
774             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
775             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
776             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
777             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
778             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
779             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
780
781             fjx0             = _mm256_setzero_ps();
782             fjy0             = _mm256_setzero_ps();
783             fjz0             = _mm256_setzero_ps();
784             fjx1             = _mm256_setzero_ps();
785             fjy1             = _mm256_setzero_ps();
786             fjz1             = _mm256_setzero_ps();
787             fjx2             = _mm256_setzero_ps();
788             fjy2             = _mm256_setzero_ps();
789             fjz2             = _mm256_setzero_ps();
790
791             /**************************
792              * CALCULATE INTERACTIONS *
793              **************************/
794
795             if (gmx_mm256_any_lt(rsq00,rcutoff2))
796             {
797
798             /* REACTION-FIELD ELECTROSTATICS */
799             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
800             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
801
802             /* LENNARD-JONES DISPERSION/REPULSION */
803
804             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
805             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
806             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
807             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
808                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
809             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
810
811             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
812
813             /* Update potential sum for this i atom from the interaction with this j atom. */
814             velec            = _mm256_and_ps(velec,cutoff_mask);
815             velec            = _mm256_andnot_ps(dummy_mask,velec);
816             velecsum         = _mm256_add_ps(velecsum,velec);
817             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
818             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
819             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
820
821             fscal            = _mm256_add_ps(felec,fvdw);
822
823             fscal            = _mm256_and_ps(fscal,cutoff_mask);
824
825             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
826
827             /* Calculate temporary vectorial force */
828             tx               = _mm256_mul_ps(fscal,dx00);
829             ty               = _mm256_mul_ps(fscal,dy00);
830             tz               = _mm256_mul_ps(fscal,dz00);
831
832             /* Update vectorial force */
833             fix0             = _mm256_add_ps(fix0,tx);
834             fiy0             = _mm256_add_ps(fiy0,ty);
835             fiz0             = _mm256_add_ps(fiz0,tz);
836
837             fjx0             = _mm256_add_ps(fjx0,tx);
838             fjy0             = _mm256_add_ps(fjy0,ty);
839             fjz0             = _mm256_add_ps(fjz0,tz);
840
841             }
842
843             /**************************
844              * CALCULATE INTERACTIONS *
845              **************************/
846
847             if (gmx_mm256_any_lt(rsq01,rcutoff2))
848             {
849
850             /* REACTION-FIELD ELECTROSTATICS */
851             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
852             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
853
854             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
855
856             /* Update potential sum for this i atom from the interaction with this j atom. */
857             velec            = _mm256_and_ps(velec,cutoff_mask);
858             velec            = _mm256_andnot_ps(dummy_mask,velec);
859             velecsum         = _mm256_add_ps(velecsum,velec);
860
861             fscal            = felec;
862
863             fscal            = _mm256_and_ps(fscal,cutoff_mask);
864
865             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
866
867             /* Calculate temporary vectorial force */
868             tx               = _mm256_mul_ps(fscal,dx01);
869             ty               = _mm256_mul_ps(fscal,dy01);
870             tz               = _mm256_mul_ps(fscal,dz01);
871
872             /* Update vectorial force */
873             fix0             = _mm256_add_ps(fix0,tx);
874             fiy0             = _mm256_add_ps(fiy0,ty);
875             fiz0             = _mm256_add_ps(fiz0,tz);
876
877             fjx1             = _mm256_add_ps(fjx1,tx);
878             fjy1             = _mm256_add_ps(fjy1,ty);
879             fjz1             = _mm256_add_ps(fjz1,tz);
880
881             }
882
883             /**************************
884              * CALCULATE INTERACTIONS *
885              **************************/
886
887             if (gmx_mm256_any_lt(rsq02,rcutoff2))
888             {
889
890             /* REACTION-FIELD ELECTROSTATICS */
891             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
892             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
893
894             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
895
896             /* Update potential sum for this i atom from the interaction with this j atom. */
897             velec            = _mm256_and_ps(velec,cutoff_mask);
898             velec            = _mm256_andnot_ps(dummy_mask,velec);
899             velecsum         = _mm256_add_ps(velecsum,velec);
900
901             fscal            = felec;
902
903             fscal            = _mm256_and_ps(fscal,cutoff_mask);
904
905             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
906
907             /* Calculate temporary vectorial force */
908             tx               = _mm256_mul_ps(fscal,dx02);
909             ty               = _mm256_mul_ps(fscal,dy02);
910             tz               = _mm256_mul_ps(fscal,dz02);
911
912             /* Update vectorial force */
913             fix0             = _mm256_add_ps(fix0,tx);
914             fiy0             = _mm256_add_ps(fiy0,ty);
915             fiz0             = _mm256_add_ps(fiz0,tz);
916
917             fjx2             = _mm256_add_ps(fjx2,tx);
918             fjy2             = _mm256_add_ps(fjy2,ty);
919             fjz2             = _mm256_add_ps(fjz2,tz);
920
921             }
922
923             /**************************
924              * CALCULATE INTERACTIONS *
925              **************************/
926
927             if (gmx_mm256_any_lt(rsq10,rcutoff2))
928             {
929
930             /* REACTION-FIELD ELECTROSTATICS */
931             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
932             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
933
934             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
935
936             /* Update potential sum for this i atom from the interaction with this j atom. */
937             velec            = _mm256_and_ps(velec,cutoff_mask);
938             velec            = _mm256_andnot_ps(dummy_mask,velec);
939             velecsum         = _mm256_add_ps(velecsum,velec);
940
941             fscal            = felec;
942
943             fscal            = _mm256_and_ps(fscal,cutoff_mask);
944
945             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
946
947             /* Calculate temporary vectorial force */
948             tx               = _mm256_mul_ps(fscal,dx10);
949             ty               = _mm256_mul_ps(fscal,dy10);
950             tz               = _mm256_mul_ps(fscal,dz10);
951
952             /* Update vectorial force */
953             fix1             = _mm256_add_ps(fix1,tx);
954             fiy1             = _mm256_add_ps(fiy1,ty);
955             fiz1             = _mm256_add_ps(fiz1,tz);
956
957             fjx0             = _mm256_add_ps(fjx0,tx);
958             fjy0             = _mm256_add_ps(fjy0,ty);
959             fjz0             = _mm256_add_ps(fjz0,tz);
960
961             }
962
963             /**************************
964              * CALCULATE INTERACTIONS *
965              **************************/
966
967             if (gmx_mm256_any_lt(rsq11,rcutoff2))
968             {
969
970             /* REACTION-FIELD ELECTROSTATICS */
971             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
972             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
973
974             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
975
976             /* Update potential sum for this i atom from the interaction with this j atom. */
977             velec            = _mm256_and_ps(velec,cutoff_mask);
978             velec            = _mm256_andnot_ps(dummy_mask,velec);
979             velecsum         = _mm256_add_ps(velecsum,velec);
980
981             fscal            = felec;
982
983             fscal            = _mm256_and_ps(fscal,cutoff_mask);
984
985             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
986
987             /* Calculate temporary vectorial force */
988             tx               = _mm256_mul_ps(fscal,dx11);
989             ty               = _mm256_mul_ps(fscal,dy11);
990             tz               = _mm256_mul_ps(fscal,dz11);
991
992             /* Update vectorial force */
993             fix1             = _mm256_add_ps(fix1,tx);
994             fiy1             = _mm256_add_ps(fiy1,ty);
995             fiz1             = _mm256_add_ps(fiz1,tz);
996
997             fjx1             = _mm256_add_ps(fjx1,tx);
998             fjy1             = _mm256_add_ps(fjy1,ty);
999             fjz1             = _mm256_add_ps(fjz1,tz);
1000
1001             }
1002
1003             /**************************
1004              * CALCULATE INTERACTIONS *
1005              **************************/
1006
1007             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1008             {
1009
1010             /* REACTION-FIELD ELECTROSTATICS */
1011             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
1012             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1013
1014             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1015
1016             /* Update potential sum for this i atom from the interaction with this j atom. */
1017             velec            = _mm256_and_ps(velec,cutoff_mask);
1018             velec            = _mm256_andnot_ps(dummy_mask,velec);
1019             velecsum         = _mm256_add_ps(velecsum,velec);
1020
1021             fscal            = felec;
1022
1023             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1024
1025             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1026
1027             /* Calculate temporary vectorial force */
1028             tx               = _mm256_mul_ps(fscal,dx12);
1029             ty               = _mm256_mul_ps(fscal,dy12);
1030             tz               = _mm256_mul_ps(fscal,dz12);
1031
1032             /* Update vectorial force */
1033             fix1             = _mm256_add_ps(fix1,tx);
1034             fiy1             = _mm256_add_ps(fiy1,ty);
1035             fiz1             = _mm256_add_ps(fiz1,tz);
1036
1037             fjx2             = _mm256_add_ps(fjx2,tx);
1038             fjy2             = _mm256_add_ps(fjy2,ty);
1039             fjz2             = _mm256_add_ps(fjz2,tz);
1040
1041             }
1042
1043             /**************************
1044              * CALCULATE INTERACTIONS *
1045              **************************/
1046
1047             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1048             {
1049
1050             /* REACTION-FIELD ELECTROSTATICS */
1051             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
1052             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1053
1054             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1055
1056             /* Update potential sum for this i atom from the interaction with this j atom. */
1057             velec            = _mm256_and_ps(velec,cutoff_mask);
1058             velec            = _mm256_andnot_ps(dummy_mask,velec);
1059             velecsum         = _mm256_add_ps(velecsum,velec);
1060
1061             fscal            = felec;
1062
1063             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1064
1065             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1066
1067             /* Calculate temporary vectorial force */
1068             tx               = _mm256_mul_ps(fscal,dx20);
1069             ty               = _mm256_mul_ps(fscal,dy20);
1070             tz               = _mm256_mul_ps(fscal,dz20);
1071
1072             /* Update vectorial force */
1073             fix2             = _mm256_add_ps(fix2,tx);
1074             fiy2             = _mm256_add_ps(fiy2,ty);
1075             fiz2             = _mm256_add_ps(fiz2,tz);
1076
1077             fjx0             = _mm256_add_ps(fjx0,tx);
1078             fjy0             = _mm256_add_ps(fjy0,ty);
1079             fjz0             = _mm256_add_ps(fjz0,tz);
1080
1081             }
1082
1083             /**************************
1084              * CALCULATE INTERACTIONS *
1085              **************************/
1086
1087             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1088             {
1089
1090             /* REACTION-FIELD ELECTROSTATICS */
1091             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
1092             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1093
1094             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1095
1096             /* Update potential sum for this i atom from the interaction with this j atom. */
1097             velec            = _mm256_and_ps(velec,cutoff_mask);
1098             velec            = _mm256_andnot_ps(dummy_mask,velec);
1099             velecsum         = _mm256_add_ps(velecsum,velec);
1100
1101             fscal            = felec;
1102
1103             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1104
1105             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1106
1107             /* Calculate temporary vectorial force */
1108             tx               = _mm256_mul_ps(fscal,dx21);
1109             ty               = _mm256_mul_ps(fscal,dy21);
1110             tz               = _mm256_mul_ps(fscal,dz21);
1111
1112             /* Update vectorial force */
1113             fix2             = _mm256_add_ps(fix2,tx);
1114             fiy2             = _mm256_add_ps(fiy2,ty);
1115             fiz2             = _mm256_add_ps(fiz2,tz);
1116
1117             fjx1             = _mm256_add_ps(fjx1,tx);
1118             fjy1             = _mm256_add_ps(fjy1,ty);
1119             fjz1             = _mm256_add_ps(fjz1,tz);
1120
1121             }
1122
1123             /**************************
1124              * CALCULATE INTERACTIONS *
1125              **************************/
1126
1127             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1128             {
1129
1130             /* REACTION-FIELD ELECTROSTATICS */
1131             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
1132             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1133
1134             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1135
1136             /* Update potential sum for this i atom from the interaction with this j atom. */
1137             velec            = _mm256_and_ps(velec,cutoff_mask);
1138             velec            = _mm256_andnot_ps(dummy_mask,velec);
1139             velecsum         = _mm256_add_ps(velecsum,velec);
1140
1141             fscal            = felec;
1142
1143             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1144
1145             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1146
1147             /* Calculate temporary vectorial force */
1148             tx               = _mm256_mul_ps(fscal,dx22);
1149             ty               = _mm256_mul_ps(fscal,dy22);
1150             tz               = _mm256_mul_ps(fscal,dz22);
1151
1152             /* Update vectorial force */
1153             fix2             = _mm256_add_ps(fix2,tx);
1154             fiy2             = _mm256_add_ps(fiy2,ty);
1155             fiz2             = _mm256_add_ps(fiz2,tz);
1156
1157             fjx2             = _mm256_add_ps(fjx2,tx);
1158             fjy2             = _mm256_add_ps(fjy2,ty);
1159             fjz2             = _mm256_add_ps(fjz2,tz);
1160
1161             }
1162
1163             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1164             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1165             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1166             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1167             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1168             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1169             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1170             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1171
1172             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1173                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1174
1175             /* Inner loop uses 342 flops */
1176         }
1177
1178         /* End of innermost loop */
1179
1180         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1181                                                  f+i_coord_offset,fshift+i_shift_offset);
1182
1183         ggid                        = gid[iidx];
1184         /* Update potential energies */
1185         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1186         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1187
1188         /* Increment number of inner iterations */
1189         inneriter                  += j_index_end - j_index_start;
1190
1191         /* Outer loop uses 20 flops */
1192     }
1193
1194     /* Increment number of outer iterations */
1195     outeriter        += nri;
1196
1197     /* Update outer/inner flops */
1198
1199     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*342);
1200 }
1201 /*
1202  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_single
1203  * Electrostatics interaction: ReactionField
1204  * VdW interaction:            LennardJones
1205  * Geometry:                   Water3-Water3
1206  * Calculate force/pot:        Force
1207  */
1208 void
1209 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_single
1210                     (t_nblist                    * gmx_restrict       nlist,
1211                      rvec                        * gmx_restrict          xx,
1212                      rvec                        * gmx_restrict          ff,
1213                      struct t_forcerec           * gmx_restrict          fr,
1214                      t_mdatoms                   * gmx_restrict     mdatoms,
1215                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1216                      t_nrnb                      * gmx_restrict        nrnb)
1217 {
1218     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1219      * just 0 for non-waters.
1220      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1221      * jnr indices corresponding to data put in the four positions in the SIMD register.
1222      */
1223     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1224     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1225     int              jnrA,jnrB,jnrC,jnrD;
1226     int              jnrE,jnrF,jnrG,jnrH;
1227     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1228     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1229     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1230     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1231     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1232     real             rcutoff_scalar;
1233     real             *shiftvec,*fshift,*x,*f;
1234     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1235     real             scratch[4*DIM];
1236     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1237     real *           vdwioffsetptr0;
1238     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1239     real *           vdwioffsetptr1;
1240     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1241     real *           vdwioffsetptr2;
1242     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1243     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1244     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1245     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1246     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1247     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1248     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1249     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1250     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1251     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1252     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1253     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1254     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1255     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1256     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1257     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1258     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1259     real             *charge;
1260     int              nvdwtype;
1261     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1262     int              *vdwtype;
1263     real             *vdwparam;
1264     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1265     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1266     __m256           dummy_mask,cutoff_mask;
1267     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1268     __m256           one     = _mm256_set1_ps(1.0);
1269     __m256           two     = _mm256_set1_ps(2.0);
1270     x                = xx[0];
1271     f                = ff[0];
1272
1273     nri              = nlist->nri;
1274     iinr             = nlist->iinr;
1275     jindex           = nlist->jindex;
1276     jjnr             = nlist->jjnr;
1277     shiftidx         = nlist->shift;
1278     gid              = nlist->gid;
1279     shiftvec         = fr->shift_vec[0];
1280     fshift           = fr->fshift[0];
1281     facel            = _mm256_set1_ps(fr->ic->epsfac);
1282     charge           = mdatoms->chargeA;
1283     krf              = _mm256_set1_ps(fr->ic->k_rf);
1284     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1285     crf              = _mm256_set1_ps(fr->ic->c_rf);
1286     nvdwtype         = fr->ntype;
1287     vdwparam         = fr->nbfp;
1288     vdwtype          = mdatoms->typeA;
1289
1290     /* Setup water-specific parameters */
1291     inr              = nlist->iinr[0];
1292     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1293     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1294     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1295     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1296
1297     jq0              = _mm256_set1_ps(charge[inr+0]);
1298     jq1              = _mm256_set1_ps(charge[inr+1]);
1299     jq2              = _mm256_set1_ps(charge[inr+2]);
1300     vdwjidx0A        = 2*vdwtype[inr+0];
1301     qq00             = _mm256_mul_ps(iq0,jq0);
1302     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1303     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1304     qq01             = _mm256_mul_ps(iq0,jq1);
1305     qq02             = _mm256_mul_ps(iq0,jq2);
1306     qq10             = _mm256_mul_ps(iq1,jq0);
1307     qq11             = _mm256_mul_ps(iq1,jq1);
1308     qq12             = _mm256_mul_ps(iq1,jq2);
1309     qq20             = _mm256_mul_ps(iq2,jq0);
1310     qq21             = _mm256_mul_ps(iq2,jq1);
1311     qq22             = _mm256_mul_ps(iq2,jq2);
1312
1313     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1314     rcutoff_scalar   = fr->ic->rcoulomb;
1315     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1316     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1317
1318     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
1319     rvdw             = _mm256_set1_ps(fr->ic->rvdw);
1320
1321     /* Avoid stupid compiler warnings */
1322     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1323     j_coord_offsetA = 0;
1324     j_coord_offsetB = 0;
1325     j_coord_offsetC = 0;
1326     j_coord_offsetD = 0;
1327     j_coord_offsetE = 0;
1328     j_coord_offsetF = 0;
1329     j_coord_offsetG = 0;
1330     j_coord_offsetH = 0;
1331
1332     outeriter        = 0;
1333     inneriter        = 0;
1334
1335     for(iidx=0;iidx<4*DIM;iidx++)
1336     {
1337         scratch[iidx] = 0.0;
1338     }
1339
1340     /* Start outer loop over neighborlists */
1341     for(iidx=0; iidx<nri; iidx++)
1342     {
1343         /* Load shift vector for this list */
1344         i_shift_offset   = DIM*shiftidx[iidx];
1345
1346         /* Load limits for loop over neighbors */
1347         j_index_start    = jindex[iidx];
1348         j_index_end      = jindex[iidx+1];
1349
1350         /* Get outer coordinate index */
1351         inr              = iinr[iidx];
1352         i_coord_offset   = DIM*inr;
1353
1354         /* Load i particle coords and add shift vector */
1355         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1356                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1357
1358         fix0             = _mm256_setzero_ps();
1359         fiy0             = _mm256_setzero_ps();
1360         fiz0             = _mm256_setzero_ps();
1361         fix1             = _mm256_setzero_ps();
1362         fiy1             = _mm256_setzero_ps();
1363         fiz1             = _mm256_setzero_ps();
1364         fix2             = _mm256_setzero_ps();
1365         fiy2             = _mm256_setzero_ps();
1366         fiz2             = _mm256_setzero_ps();
1367
1368         /* Start inner kernel loop */
1369         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1370         {
1371
1372             /* Get j neighbor index, and coordinate index */
1373             jnrA             = jjnr[jidx];
1374             jnrB             = jjnr[jidx+1];
1375             jnrC             = jjnr[jidx+2];
1376             jnrD             = jjnr[jidx+3];
1377             jnrE             = jjnr[jidx+4];
1378             jnrF             = jjnr[jidx+5];
1379             jnrG             = jjnr[jidx+6];
1380             jnrH             = jjnr[jidx+7];
1381             j_coord_offsetA  = DIM*jnrA;
1382             j_coord_offsetB  = DIM*jnrB;
1383             j_coord_offsetC  = DIM*jnrC;
1384             j_coord_offsetD  = DIM*jnrD;
1385             j_coord_offsetE  = DIM*jnrE;
1386             j_coord_offsetF  = DIM*jnrF;
1387             j_coord_offsetG  = DIM*jnrG;
1388             j_coord_offsetH  = DIM*jnrH;
1389
1390             /* load j atom coordinates */
1391             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1392                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1393                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1394                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1395                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1396
1397             /* Calculate displacement vector */
1398             dx00             = _mm256_sub_ps(ix0,jx0);
1399             dy00             = _mm256_sub_ps(iy0,jy0);
1400             dz00             = _mm256_sub_ps(iz0,jz0);
1401             dx01             = _mm256_sub_ps(ix0,jx1);
1402             dy01             = _mm256_sub_ps(iy0,jy1);
1403             dz01             = _mm256_sub_ps(iz0,jz1);
1404             dx02             = _mm256_sub_ps(ix0,jx2);
1405             dy02             = _mm256_sub_ps(iy0,jy2);
1406             dz02             = _mm256_sub_ps(iz0,jz2);
1407             dx10             = _mm256_sub_ps(ix1,jx0);
1408             dy10             = _mm256_sub_ps(iy1,jy0);
1409             dz10             = _mm256_sub_ps(iz1,jz0);
1410             dx11             = _mm256_sub_ps(ix1,jx1);
1411             dy11             = _mm256_sub_ps(iy1,jy1);
1412             dz11             = _mm256_sub_ps(iz1,jz1);
1413             dx12             = _mm256_sub_ps(ix1,jx2);
1414             dy12             = _mm256_sub_ps(iy1,jy2);
1415             dz12             = _mm256_sub_ps(iz1,jz2);
1416             dx20             = _mm256_sub_ps(ix2,jx0);
1417             dy20             = _mm256_sub_ps(iy2,jy0);
1418             dz20             = _mm256_sub_ps(iz2,jz0);
1419             dx21             = _mm256_sub_ps(ix2,jx1);
1420             dy21             = _mm256_sub_ps(iy2,jy1);
1421             dz21             = _mm256_sub_ps(iz2,jz1);
1422             dx22             = _mm256_sub_ps(ix2,jx2);
1423             dy22             = _mm256_sub_ps(iy2,jy2);
1424             dz22             = _mm256_sub_ps(iz2,jz2);
1425
1426             /* Calculate squared distance and things based on it */
1427             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1428             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1429             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1430             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1431             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1432             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1433             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1434             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1435             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1436
1437             rinv00           = avx256_invsqrt_f(rsq00);
1438             rinv01           = avx256_invsqrt_f(rsq01);
1439             rinv02           = avx256_invsqrt_f(rsq02);
1440             rinv10           = avx256_invsqrt_f(rsq10);
1441             rinv11           = avx256_invsqrt_f(rsq11);
1442             rinv12           = avx256_invsqrt_f(rsq12);
1443             rinv20           = avx256_invsqrt_f(rsq20);
1444             rinv21           = avx256_invsqrt_f(rsq21);
1445             rinv22           = avx256_invsqrt_f(rsq22);
1446
1447             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1448             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1449             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1450             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1451             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1452             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1453             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1454             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1455             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1456
1457             fjx0             = _mm256_setzero_ps();
1458             fjy0             = _mm256_setzero_ps();
1459             fjz0             = _mm256_setzero_ps();
1460             fjx1             = _mm256_setzero_ps();
1461             fjy1             = _mm256_setzero_ps();
1462             fjz1             = _mm256_setzero_ps();
1463             fjx2             = _mm256_setzero_ps();
1464             fjy2             = _mm256_setzero_ps();
1465             fjz2             = _mm256_setzero_ps();
1466
1467             /**************************
1468              * CALCULATE INTERACTIONS *
1469              **************************/
1470
1471             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1472             {
1473
1474             /* REACTION-FIELD ELECTROSTATICS */
1475             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1476
1477             /* LENNARD-JONES DISPERSION/REPULSION */
1478
1479             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1480             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1481
1482             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1483
1484             fscal            = _mm256_add_ps(felec,fvdw);
1485
1486             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1487
1488             /* Calculate temporary vectorial force */
1489             tx               = _mm256_mul_ps(fscal,dx00);
1490             ty               = _mm256_mul_ps(fscal,dy00);
1491             tz               = _mm256_mul_ps(fscal,dz00);
1492
1493             /* Update vectorial force */
1494             fix0             = _mm256_add_ps(fix0,tx);
1495             fiy0             = _mm256_add_ps(fiy0,ty);
1496             fiz0             = _mm256_add_ps(fiz0,tz);
1497
1498             fjx0             = _mm256_add_ps(fjx0,tx);
1499             fjy0             = _mm256_add_ps(fjy0,ty);
1500             fjz0             = _mm256_add_ps(fjz0,tz);
1501
1502             }
1503
1504             /**************************
1505              * CALCULATE INTERACTIONS *
1506              **************************/
1507
1508             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1509             {
1510
1511             /* REACTION-FIELD ELECTROSTATICS */
1512             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1513
1514             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1515
1516             fscal            = felec;
1517
1518             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1519
1520             /* Calculate temporary vectorial force */
1521             tx               = _mm256_mul_ps(fscal,dx01);
1522             ty               = _mm256_mul_ps(fscal,dy01);
1523             tz               = _mm256_mul_ps(fscal,dz01);
1524
1525             /* Update vectorial force */
1526             fix0             = _mm256_add_ps(fix0,tx);
1527             fiy0             = _mm256_add_ps(fiy0,ty);
1528             fiz0             = _mm256_add_ps(fiz0,tz);
1529
1530             fjx1             = _mm256_add_ps(fjx1,tx);
1531             fjy1             = _mm256_add_ps(fjy1,ty);
1532             fjz1             = _mm256_add_ps(fjz1,tz);
1533
1534             }
1535
1536             /**************************
1537              * CALCULATE INTERACTIONS *
1538              **************************/
1539
1540             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1541             {
1542
1543             /* REACTION-FIELD ELECTROSTATICS */
1544             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1545
1546             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1547
1548             fscal            = felec;
1549
1550             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1551
1552             /* Calculate temporary vectorial force */
1553             tx               = _mm256_mul_ps(fscal,dx02);
1554             ty               = _mm256_mul_ps(fscal,dy02);
1555             tz               = _mm256_mul_ps(fscal,dz02);
1556
1557             /* Update vectorial force */
1558             fix0             = _mm256_add_ps(fix0,tx);
1559             fiy0             = _mm256_add_ps(fiy0,ty);
1560             fiz0             = _mm256_add_ps(fiz0,tz);
1561
1562             fjx2             = _mm256_add_ps(fjx2,tx);
1563             fjy2             = _mm256_add_ps(fjy2,ty);
1564             fjz2             = _mm256_add_ps(fjz2,tz);
1565
1566             }
1567
1568             /**************************
1569              * CALCULATE INTERACTIONS *
1570              **************************/
1571
1572             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1573             {
1574
1575             /* REACTION-FIELD ELECTROSTATICS */
1576             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1577
1578             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1579
1580             fscal            = felec;
1581
1582             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1583
1584             /* Calculate temporary vectorial force */
1585             tx               = _mm256_mul_ps(fscal,dx10);
1586             ty               = _mm256_mul_ps(fscal,dy10);
1587             tz               = _mm256_mul_ps(fscal,dz10);
1588
1589             /* Update vectorial force */
1590             fix1             = _mm256_add_ps(fix1,tx);
1591             fiy1             = _mm256_add_ps(fiy1,ty);
1592             fiz1             = _mm256_add_ps(fiz1,tz);
1593
1594             fjx0             = _mm256_add_ps(fjx0,tx);
1595             fjy0             = _mm256_add_ps(fjy0,ty);
1596             fjz0             = _mm256_add_ps(fjz0,tz);
1597
1598             }
1599
1600             /**************************
1601              * CALCULATE INTERACTIONS *
1602              **************************/
1603
1604             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1605             {
1606
1607             /* REACTION-FIELD ELECTROSTATICS */
1608             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1609
1610             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1611
1612             fscal            = felec;
1613
1614             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1615
1616             /* Calculate temporary vectorial force */
1617             tx               = _mm256_mul_ps(fscal,dx11);
1618             ty               = _mm256_mul_ps(fscal,dy11);
1619             tz               = _mm256_mul_ps(fscal,dz11);
1620
1621             /* Update vectorial force */
1622             fix1             = _mm256_add_ps(fix1,tx);
1623             fiy1             = _mm256_add_ps(fiy1,ty);
1624             fiz1             = _mm256_add_ps(fiz1,tz);
1625
1626             fjx1             = _mm256_add_ps(fjx1,tx);
1627             fjy1             = _mm256_add_ps(fjy1,ty);
1628             fjz1             = _mm256_add_ps(fjz1,tz);
1629
1630             }
1631
1632             /**************************
1633              * CALCULATE INTERACTIONS *
1634              **************************/
1635
1636             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1637             {
1638
1639             /* REACTION-FIELD ELECTROSTATICS */
1640             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1641
1642             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1643
1644             fscal            = felec;
1645
1646             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1647
1648             /* Calculate temporary vectorial force */
1649             tx               = _mm256_mul_ps(fscal,dx12);
1650             ty               = _mm256_mul_ps(fscal,dy12);
1651             tz               = _mm256_mul_ps(fscal,dz12);
1652
1653             /* Update vectorial force */
1654             fix1             = _mm256_add_ps(fix1,tx);
1655             fiy1             = _mm256_add_ps(fiy1,ty);
1656             fiz1             = _mm256_add_ps(fiz1,tz);
1657
1658             fjx2             = _mm256_add_ps(fjx2,tx);
1659             fjy2             = _mm256_add_ps(fjy2,ty);
1660             fjz2             = _mm256_add_ps(fjz2,tz);
1661
1662             }
1663
1664             /**************************
1665              * CALCULATE INTERACTIONS *
1666              **************************/
1667
1668             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1669             {
1670
1671             /* REACTION-FIELD ELECTROSTATICS */
1672             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1673
1674             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1675
1676             fscal            = felec;
1677
1678             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1679
1680             /* Calculate temporary vectorial force */
1681             tx               = _mm256_mul_ps(fscal,dx20);
1682             ty               = _mm256_mul_ps(fscal,dy20);
1683             tz               = _mm256_mul_ps(fscal,dz20);
1684
1685             /* Update vectorial force */
1686             fix2             = _mm256_add_ps(fix2,tx);
1687             fiy2             = _mm256_add_ps(fiy2,ty);
1688             fiz2             = _mm256_add_ps(fiz2,tz);
1689
1690             fjx0             = _mm256_add_ps(fjx0,tx);
1691             fjy0             = _mm256_add_ps(fjy0,ty);
1692             fjz0             = _mm256_add_ps(fjz0,tz);
1693
1694             }
1695
1696             /**************************
1697              * CALCULATE INTERACTIONS *
1698              **************************/
1699
1700             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1701             {
1702
1703             /* REACTION-FIELD ELECTROSTATICS */
1704             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1705
1706             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1707
1708             fscal            = felec;
1709
1710             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1711
1712             /* Calculate temporary vectorial force */
1713             tx               = _mm256_mul_ps(fscal,dx21);
1714             ty               = _mm256_mul_ps(fscal,dy21);
1715             tz               = _mm256_mul_ps(fscal,dz21);
1716
1717             /* Update vectorial force */
1718             fix2             = _mm256_add_ps(fix2,tx);
1719             fiy2             = _mm256_add_ps(fiy2,ty);
1720             fiz2             = _mm256_add_ps(fiz2,tz);
1721
1722             fjx1             = _mm256_add_ps(fjx1,tx);
1723             fjy1             = _mm256_add_ps(fjy1,ty);
1724             fjz1             = _mm256_add_ps(fjz1,tz);
1725
1726             }
1727
1728             /**************************
1729              * CALCULATE INTERACTIONS *
1730              **************************/
1731
1732             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1733             {
1734
1735             /* REACTION-FIELD ELECTROSTATICS */
1736             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1737
1738             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1739
1740             fscal            = felec;
1741
1742             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1743
1744             /* Calculate temporary vectorial force */
1745             tx               = _mm256_mul_ps(fscal,dx22);
1746             ty               = _mm256_mul_ps(fscal,dy22);
1747             tz               = _mm256_mul_ps(fscal,dz22);
1748
1749             /* Update vectorial force */
1750             fix2             = _mm256_add_ps(fix2,tx);
1751             fiy2             = _mm256_add_ps(fiy2,ty);
1752             fiz2             = _mm256_add_ps(fiz2,tz);
1753
1754             fjx2             = _mm256_add_ps(fjx2,tx);
1755             fjy2             = _mm256_add_ps(fjy2,ty);
1756             fjz2             = _mm256_add_ps(fjz2,tz);
1757
1758             }
1759
1760             fjptrA             = f+j_coord_offsetA;
1761             fjptrB             = f+j_coord_offsetB;
1762             fjptrC             = f+j_coord_offsetC;
1763             fjptrD             = f+j_coord_offsetD;
1764             fjptrE             = f+j_coord_offsetE;
1765             fjptrF             = f+j_coord_offsetF;
1766             fjptrG             = f+j_coord_offsetG;
1767             fjptrH             = f+j_coord_offsetH;
1768
1769             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1770                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1771
1772             /* Inner loop uses 277 flops */
1773         }
1774
1775         if(jidx<j_index_end)
1776         {
1777
1778             /* Get j neighbor index, and coordinate index */
1779             jnrlistA         = jjnr[jidx];
1780             jnrlistB         = jjnr[jidx+1];
1781             jnrlistC         = jjnr[jidx+2];
1782             jnrlistD         = jjnr[jidx+3];
1783             jnrlistE         = jjnr[jidx+4];
1784             jnrlistF         = jjnr[jidx+5];
1785             jnrlistG         = jjnr[jidx+6];
1786             jnrlistH         = jjnr[jidx+7];
1787             /* Sign of each element will be negative for non-real atoms.
1788              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1789              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1790              */
1791             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1792                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1793                                             
1794             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1795             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1796             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1797             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1798             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1799             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1800             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1801             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1802             j_coord_offsetA  = DIM*jnrA;
1803             j_coord_offsetB  = DIM*jnrB;
1804             j_coord_offsetC  = DIM*jnrC;
1805             j_coord_offsetD  = DIM*jnrD;
1806             j_coord_offsetE  = DIM*jnrE;
1807             j_coord_offsetF  = DIM*jnrF;
1808             j_coord_offsetG  = DIM*jnrG;
1809             j_coord_offsetH  = DIM*jnrH;
1810
1811             /* load j atom coordinates */
1812             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1813                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1814                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1815                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1816                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1817
1818             /* Calculate displacement vector */
1819             dx00             = _mm256_sub_ps(ix0,jx0);
1820             dy00             = _mm256_sub_ps(iy0,jy0);
1821             dz00             = _mm256_sub_ps(iz0,jz0);
1822             dx01             = _mm256_sub_ps(ix0,jx1);
1823             dy01             = _mm256_sub_ps(iy0,jy1);
1824             dz01             = _mm256_sub_ps(iz0,jz1);
1825             dx02             = _mm256_sub_ps(ix0,jx2);
1826             dy02             = _mm256_sub_ps(iy0,jy2);
1827             dz02             = _mm256_sub_ps(iz0,jz2);
1828             dx10             = _mm256_sub_ps(ix1,jx0);
1829             dy10             = _mm256_sub_ps(iy1,jy0);
1830             dz10             = _mm256_sub_ps(iz1,jz0);
1831             dx11             = _mm256_sub_ps(ix1,jx1);
1832             dy11             = _mm256_sub_ps(iy1,jy1);
1833             dz11             = _mm256_sub_ps(iz1,jz1);
1834             dx12             = _mm256_sub_ps(ix1,jx2);
1835             dy12             = _mm256_sub_ps(iy1,jy2);
1836             dz12             = _mm256_sub_ps(iz1,jz2);
1837             dx20             = _mm256_sub_ps(ix2,jx0);
1838             dy20             = _mm256_sub_ps(iy2,jy0);
1839             dz20             = _mm256_sub_ps(iz2,jz0);
1840             dx21             = _mm256_sub_ps(ix2,jx1);
1841             dy21             = _mm256_sub_ps(iy2,jy1);
1842             dz21             = _mm256_sub_ps(iz2,jz1);
1843             dx22             = _mm256_sub_ps(ix2,jx2);
1844             dy22             = _mm256_sub_ps(iy2,jy2);
1845             dz22             = _mm256_sub_ps(iz2,jz2);
1846
1847             /* Calculate squared distance and things based on it */
1848             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1849             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1850             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1851             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1852             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1853             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1854             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1855             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1856             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1857
1858             rinv00           = avx256_invsqrt_f(rsq00);
1859             rinv01           = avx256_invsqrt_f(rsq01);
1860             rinv02           = avx256_invsqrt_f(rsq02);
1861             rinv10           = avx256_invsqrt_f(rsq10);
1862             rinv11           = avx256_invsqrt_f(rsq11);
1863             rinv12           = avx256_invsqrt_f(rsq12);
1864             rinv20           = avx256_invsqrt_f(rsq20);
1865             rinv21           = avx256_invsqrt_f(rsq21);
1866             rinv22           = avx256_invsqrt_f(rsq22);
1867
1868             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1869             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1870             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1871             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1872             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1873             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1874             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1875             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1876             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1877
1878             fjx0             = _mm256_setzero_ps();
1879             fjy0             = _mm256_setzero_ps();
1880             fjz0             = _mm256_setzero_ps();
1881             fjx1             = _mm256_setzero_ps();
1882             fjy1             = _mm256_setzero_ps();
1883             fjz1             = _mm256_setzero_ps();
1884             fjx2             = _mm256_setzero_ps();
1885             fjy2             = _mm256_setzero_ps();
1886             fjz2             = _mm256_setzero_ps();
1887
1888             /**************************
1889              * CALCULATE INTERACTIONS *
1890              **************************/
1891
1892             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1893             {
1894
1895             /* REACTION-FIELD ELECTROSTATICS */
1896             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1897
1898             /* LENNARD-JONES DISPERSION/REPULSION */
1899
1900             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1901             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1902
1903             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1904
1905             fscal            = _mm256_add_ps(felec,fvdw);
1906
1907             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1908
1909             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1910
1911             /* Calculate temporary vectorial force */
1912             tx               = _mm256_mul_ps(fscal,dx00);
1913             ty               = _mm256_mul_ps(fscal,dy00);
1914             tz               = _mm256_mul_ps(fscal,dz00);
1915
1916             /* Update vectorial force */
1917             fix0             = _mm256_add_ps(fix0,tx);
1918             fiy0             = _mm256_add_ps(fiy0,ty);
1919             fiz0             = _mm256_add_ps(fiz0,tz);
1920
1921             fjx0             = _mm256_add_ps(fjx0,tx);
1922             fjy0             = _mm256_add_ps(fjy0,ty);
1923             fjz0             = _mm256_add_ps(fjz0,tz);
1924
1925             }
1926
1927             /**************************
1928              * CALCULATE INTERACTIONS *
1929              **************************/
1930
1931             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1932             {
1933
1934             /* REACTION-FIELD ELECTROSTATICS */
1935             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1936
1937             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1938
1939             fscal            = felec;
1940
1941             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1942
1943             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1944
1945             /* Calculate temporary vectorial force */
1946             tx               = _mm256_mul_ps(fscal,dx01);
1947             ty               = _mm256_mul_ps(fscal,dy01);
1948             tz               = _mm256_mul_ps(fscal,dz01);
1949
1950             /* Update vectorial force */
1951             fix0             = _mm256_add_ps(fix0,tx);
1952             fiy0             = _mm256_add_ps(fiy0,ty);
1953             fiz0             = _mm256_add_ps(fiz0,tz);
1954
1955             fjx1             = _mm256_add_ps(fjx1,tx);
1956             fjy1             = _mm256_add_ps(fjy1,ty);
1957             fjz1             = _mm256_add_ps(fjz1,tz);
1958
1959             }
1960
1961             /**************************
1962              * CALCULATE INTERACTIONS *
1963              **************************/
1964
1965             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1966             {
1967
1968             /* REACTION-FIELD ELECTROSTATICS */
1969             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1970
1971             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1972
1973             fscal            = felec;
1974
1975             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1976
1977             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1978
1979             /* Calculate temporary vectorial force */
1980             tx               = _mm256_mul_ps(fscal,dx02);
1981             ty               = _mm256_mul_ps(fscal,dy02);
1982             tz               = _mm256_mul_ps(fscal,dz02);
1983
1984             /* Update vectorial force */
1985             fix0             = _mm256_add_ps(fix0,tx);
1986             fiy0             = _mm256_add_ps(fiy0,ty);
1987             fiz0             = _mm256_add_ps(fiz0,tz);
1988
1989             fjx2             = _mm256_add_ps(fjx2,tx);
1990             fjy2             = _mm256_add_ps(fjy2,ty);
1991             fjz2             = _mm256_add_ps(fjz2,tz);
1992
1993             }
1994
1995             /**************************
1996              * CALCULATE INTERACTIONS *
1997              **************************/
1998
1999             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2000             {
2001
2002             /* REACTION-FIELD ELECTROSTATICS */
2003             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
2004
2005             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2006
2007             fscal            = felec;
2008
2009             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2010
2011             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2012
2013             /* Calculate temporary vectorial force */
2014             tx               = _mm256_mul_ps(fscal,dx10);
2015             ty               = _mm256_mul_ps(fscal,dy10);
2016             tz               = _mm256_mul_ps(fscal,dz10);
2017
2018             /* Update vectorial force */
2019             fix1             = _mm256_add_ps(fix1,tx);
2020             fiy1             = _mm256_add_ps(fiy1,ty);
2021             fiz1             = _mm256_add_ps(fiz1,tz);
2022
2023             fjx0             = _mm256_add_ps(fjx0,tx);
2024             fjy0             = _mm256_add_ps(fjy0,ty);
2025             fjz0             = _mm256_add_ps(fjz0,tz);
2026
2027             }
2028
2029             /**************************
2030              * CALCULATE INTERACTIONS *
2031              **************************/
2032
2033             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2034             {
2035
2036             /* REACTION-FIELD ELECTROSTATICS */
2037             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
2038
2039             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2040
2041             fscal            = felec;
2042
2043             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2044
2045             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2046
2047             /* Calculate temporary vectorial force */
2048             tx               = _mm256_mul_ps(fscal,dx11);
2049             ty               = _mm256_mul_ps(fscal,dy11);
2050             tz               = _mm256_mul_ps(fscal,dz11);
2051
2052             /* Update vectorial force */
2053             fix1             = _mm256_add_ps(fix1,tx);
2054             fiy1             = _mm256_add_ps(fiy1,ty);
2055             fiz1             = _mm256_add_ps(fiz1,tz);
2056
2057             fjx1             = _mm256_add_ps(fjx1,tx);
2058             fjy1             = _mm256_add_ps(fjy1,ty);
2059             fjz1             = _mm256_add_ps(fjz1,tz);
2060
2061             }
2062
2063             /**************************
2064              * CALCULATE INTERACTIONS *
2065              **************************/
2066
2067             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2068             {
2069
2070             /* REACTION-FIELD ELECTROSTATICS */
2071             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
2072
2073             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2074
2075             fscal            = felec;
2076
2077             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2078
2079             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2080
2081             /* Calculate temporary vectorial force */
2082             tx               = _mm256_mul_ps(fscal,dx12);
2083             ty               = _mm256_mul_ps(fscal,dy12);
2084             tz               = _mm256_mul_ps(fscal,dz12);
2085
2086             /* Update vectorial force */
2087             fix1             = _mm256_add_ps(fix1,tx);
2088             fiy1             = _mm256_add_ps(fiy1,ty);
2089             fiz1             = _mm256_add_ps(fiz1,tz);
2090
2091             fjx2             = _mm256_add_ps(fjx2,tx);
2092             fjy2             = _mm256_add_ps(fjy2,ty);
2093             fjz2             = _mm256_add_ps(fjz2,tz);
2094
2095             }
2096
2097             /**************************
2098              * CALCULATE INTERACTIONS *
2099              **************************/
2100
2101             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2102             {
2103
2104             /* REACTION-FIELD ELECTROSTATICS */
2105             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
2106
2107             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2108
2109             fscal            = felec;
2110
2111             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2112
2113             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2114
2115             /* Calculate temporary vectorial force */
2116             tx               = _mm256_mul_ps(fscal,dx20);
2117             ty               = _mm256_mul_ps(fscal,dy20);
2118             tz               = _mm256_mul_ps(fscal,dz20);
2119
2120             /* Update vectorial force */
2121             fix2             = _mm256_add_ps(fix2,tx);
2122             fiy2             = _mm256_add_ps(fiy2,ty);
2123             fiz2             = _mm256_add_ps(fiz2,tz);
2124
2125             fjx0             = _mm256_add_ps(fjx0,tx);
2126             fjy0             = _mm256_add_ps(fjy0,ty);
2127             fjz0             = _mm256_add_ps(fjz0,tz);
2128
2129             }
2130
2131             /**************************
2132              * CALCULATE INTERACTIONS *
2133              **************************/
2134
2135             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2136             {
2137
2138             /* REACTION-FIELD ELECTROSTATICS */
2139             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2140
2141             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2142
2143             fscal            = felec;
2144
2145             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2146
2147             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2148
2149             /* Calculate temporary vectorial force */
2150             tx               = _mm256_mul_ps(fscal,dx21);
2151             ty               = _mm256_mul_ps(fscal,dy21);
2152             tz               = _mm256_mul_ps(fscal,dz21);
2153
2154             /* Update vectorial force */
2155             fix2             = _mm256_add_ps(fix2,tx);
2156             fiy2             = _mm256_add_ps(fiy2,ty);
2157             fiz2             = _mm256_add_ps(fiz2,tz);
2158
2159             fjx1             = _mm256_add_ps(fjx1,tx);
2160             fjy1             = _mm256_add_ps(fjy1,ty);
2161             fjz1             = _mm256_add_ps(fjz1,tz);
2162
2163             }
2164
2165             /**************************
2166              * CALCULATE INTERACTIONS *
2167              **************************/
2168
2169             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2170             {
2171
2172             /* REACTION-FIELD ELECTROSTATICS */
2173             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2174
2175             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2176
2177             fscal            = felec;
2178
2179             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2180
2181             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2182
2183             /* Calculate temporary vectorial force */
2184             tx               = _mm256_mul_ps(fscal,dx22);
2185             ty               = _mm256_mul_ps(fscal,dy22);
2186             tz               = _mm256_mul_ps(fscal,dz22);
2187
2188             /* Update vectorial force */
2189             fix2             = _mm256_add_ps(fix2,tx);
2190             fiy2             = _mm256_add_ps(fiy2,ty);
2191             fiz2             = _mm256_add_ps(fiz2,tz);
2192
2193             fjx2             = _mm256_add_ps(fjx2,tx);
2194             fjy2             = _mm256_add_ps(fjy2,ty);
2195             fjz2             = _mm256_add_ps(fjz2,tz);
2196
2197             }
2198
2199             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2200             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2201             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2202             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2203             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2204             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2205             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2206             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2207
2208             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2209                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2210
2211             /* Inner loop uses 277 flops */
2212         }
2213
2214         /* End of innermost loop */
2215
2216         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2217                                                  f+i_coord_offset,fshift+i_shift_offset);
2218
2219         /* Increment number of inner iterations */
2220         inneriter                  += j_index_end - j_index_start;
2221
2222         /* Outer loop uses 18 flops */
2223     }
2224
2225     /* Increment number of outer iterations */
2226     outeriter        += nri;
2227
2228     /* Update outer/inner flops */
2229
2230     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
2231 }