Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_single
52  * Electrostatics interaction: ReactionField
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70      * jnr indices corresponding to data put in the eight positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrE,jnrF,jnrG,jnrH;
76     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84     real             scratch[4*DIM];
85     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
93     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
108     real             *charge;
109     int              nvdwtype;
110     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111     int              *vdwtype;
112     real             *vdwparam;
113     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
114     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
115     __m256           dummy_mask,cutoff_mask;
116     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
117     __m256           one     = _mm256_set1_ps(1.0);
118     __m256           two     = _mm256_set1_ps(2.0);
119     x                = xx[0];
120     f                = ff[0];
121
122     nri              = nlist->nri;
123     iinr             = nlist->iinr;
124     jindex           = nlist->jindex;
125     jjnr             = nlist->jjnr;
126     shiftidx         = nlist->shift;
127     gid              = nlist->gid;
128     shiftvec         = fr->shift_vec[0];
129     fshift           = fr->fshift[0];
130     facel            = _mm256_set1_ps(fr->epsfac);
131     charge           = mdatoms->chargeA;
132     krf              = _mm256_set1_ps(fr->ic->k_rf);
133     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
134     crf              = _mm256_set1_ps(fr->ic->c_rf);
135     nvdwtype         = fr->ntype;
136     vdwparam         = fr->nbfp;
137     vdwtype          = mdatoms->typeA;
138
139     /* Setup water-specific parameters */
140     inr              = nlist->iinr[0];
141     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
142     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
143     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
144     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
145
146     jq0              = _mm256_set1_ps(charge[inr+0]);
147     jq1              = _mm256_set1_ps(charge[inr+1]);
148     jq2              = _mm256_set1_ps(charge[inr+2]);
149     vdwjidx0A        = 2*vdwtype[inr+0];
150     qq00             = _mm256_mul_ps(iq0,jq0);
151     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
152     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
153     qq01             = _mm256_mul_ps(iq0,jq1);
154     qq02             = _mm256_mul_ps(iq0,jq2);
155     qq10             = _mm256_mul_ps(iq1,jq0);
156     qq11             = _mm256_mul_ps(iq1,jq1);
157     qq12             = _mm256_mul_ps(iq1,jq2);
158     qq20             = _mm256_mul_ps(iq2,jq0);
159     qq21             = _mm256_mul_ps(iq2,jq1);
160     qq22             = _mm256_mul_ps(iq2,jq2);
161
162     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163     rcutoff_scalar   = fr->rcoulomb;
164     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
165     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
166
167     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
168     rvdw             = _mm256_set1_ps(fr->rvdw);
169
170     /* Avoid stupid compiler warnings */
171     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
172     j_coord_offsetA = 0;
173     j_coord_offsetB = 0;
174     j_coord_offsetC = 0;
175     j_coord_offsetD = 0;
176     j_coord_offsetE = 0;
177     j_coord_offsetF = 0;
178     j_coord_offsetG = 0;
179     j_coord_offsetH = 0;
180
181     outeriter        = 0;
182     inneriter        = 0;
183
184     for(iidx=0;iidx<4*DIM;iidx++)
185     {
186         scratch[iidx] = 0.0;
187     }
188
189     /* Start outer loop over neighborlists */
190     for(iidx=0; iidx<nri; iidx++)
191     {
192         /* Load shift vector for this list */
193         i_shift_offset   = DIM*shiftidx[iidx];
194
195         /* Load limits for loop over neighbors */
196         j_index_start    = jindex[iidx];
197         j_index_end      = jindex[iidx+1];
198
199         /* Get outer coordinate index */
200         inr              = iinr[iidx];
201         i_coord_offset   = DIM*inr;
202
203         /* Load i particle coords and add shift vector */
204         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
205                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
206
207         fix0             = _mm256_setzero_ps();
208         fiy0             = _mm256_setzero_ps();
209         fiz0             = _mm256_setzero_ps();
210         fix1             = _mm256_setzero_ps();
211         fiy1             = _mm256_setzero_ps();
212         fiz1             = _mm256_setzero_ps();
213         fix2             = _mm256_setzero_ps();
214         fiy2             = _mm256_setzero_ps();
215         fiz2             = _mm256_setzero_ps();
216
217         /* Reset potential sums */
218         velecsum         = _mm256_setzero_ps();
219         vvdwsum          = _mm256_setzero_ps();
220
221         /* Start inner kernel loop */
222         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
223         {
224
225             /* Get j neighbor index, and coordinate index */
226             jnrA             = jjnr[jidx];
227             jnrB             = jjnr[jidx+1];
228             jnrC             = jjnr[jidx+2];
229             jnrD             = jjnr[jidx+3];
230             jnrE             = jjnr[jidx+4];
231             jnrF             = jjnr[jidx+5];
232             jnrG             = jjnr[jidx+6];
233             jnrH             = jjnr[jidx+7];
234             j_coord_offsetA  = DIM*jnrA;
235             j_coord_offsetB  = DIM*jnrB;
236             j_coord_offsetC  = DIM*jnrC;
237             j_coord_offsetD  = DIM*jnrD;
238             j_coord_offsetE  = DIM*jnrE;
239             j_coord_offsetF  = DIM*jnrF;
240             j_coord_offsetG  = DIM*jnrG;
241             j_coord_offsetH  = DIM*jnrH;
242
243             /* load j atom coordinates */
244             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
245                                                  x+j_coord_offsetC,x+j_coord_offsetD,
246                                                  x+j_coord_offsetE,x+j_coord_offsetF,
247                                                  x+j_coord_offsetG,x+j_coord_offsetH,
248                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
249
250             /* Calculate displacement vector */
251             dx00             = _mm256_sub_ps(ix0,jx0);
252             dy00             = _mm256_sub_ps(iy0,jy0);
253             dz00             = _mm256_sub_ps(iz0,jz0);
254             dx01             = _mm256_sub_ps(ix0,jx1);
255             dy01             = _mm256_sub_ps(iy0,jy1);
256             dz01             = _mm256_sub_ps(iz0,jz1);
257             dx02             = _mm256_sub_ps(ix0,jx2);
258             dy02             = _mm256_sub_ps(iy0,jy2);
259             dz02             = _mm256_sub_ps(iz0,jz2);
260             dx10             = _mm256_sub_ps(ix1,jx0);
261             dy10             = _mm256_sub_ps(iy1,jy0);
262             dz10             = _mm256_sub_ps(iz1,jz0);
263             dx11             = _mm256_sub_ps(ix1,jx1);
264             dy11             = _mm256_sub_ps(iy1,jy1);
265             dz11             = _mm256_sub_ps(iz1,jz1);
266             dx12             = _mm256_sub_ps(ix1,jx2);
267             dy12             = _mm256_sub_ps(iy1,jy2);
268             dz12             = _mm256_sub_ps(iz1,jz2);
269             dx20             = _mm256_sub_ps(ix2,jx0);
270             dy20             = _mm256_sub_ps(iy2,jy0);
271             dz20             = _mm256_sub_ps(iz2,jz0);
272             dx21             = _mm256_sub_ps(ix2,jx1);
273             dy21             = _mm256_sub_ps(iy2,jy1);
274             dz21             = _mm256_sub_ps(iz2,jz1);
275             dx22             = _mm256_sub_ps(ix2,jx2);
276             dy22             = _mm256_sub_ps(iy2,jy2);
277             dz22             = _mm256_sub_ps(iz2,jz2);
278
279             /* Calculate squared distance and things based on it */
280             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
281             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
282             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
283             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
284             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
285             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
286             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
287             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
288             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
289
290             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
291             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
292             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
293             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
294             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
295             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
296             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
297             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
298             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
299
300             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
301             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
302             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
303             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
304             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
305             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
306             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
307             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
308             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
309
310             fjx0             = _mm256_setzero_ps();
311             fjy0             = _mm256_setzero_ps();
312             fjz0             = _mm256_setzero_ps();
313             fjx1             = _mm256_setzero_ps();
314             fjy1             = _mm256_setzero_ps();
315             fjz1             = _mm256_setzero_ps();
316             fjx2             = _mm256_setzero_ps();
317             fjy2             = _mm256_setzero_ps();
318             fjz2             = _mm256_setzero_ps();
319
320             /**************************
321              * CALCULATE INTERACTIONS *
322              **************************/
323
324             if (gmx_mm256_any_lt(rsq00,rcutoff2))
325             {
326
327             /* REACTION-FIELD ELECTROSTATICS */
328             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
329             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
330
331             /* LENNARD-JONES DISPERSION/REPULSION */
332
333             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
334             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
335             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
336             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
337                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
338             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
339
340             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
341
342             /* Update potential sum for this i atom from the interaction with this j atom. */
343             velec            = _mm256_and_ps(velec,cutoff_mask);
344             velecsum         = _mm256_add_ps(velecsum,velec);
345             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
346             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
347
348             fscal            = _mm256_add_ps(felec,fvdw);
349
350             fscal            = _mm256_and_ps(fscal,cutoff_mask);
351
352             /* Calculate temporary vectorial force */
353             tx               = _mm256_mul_ps(fscal,dx00);
354             ty               = _mm256_mul_ps(fscal,dy00);
355             tz               = _mm256_mul_ps(fscal,dz00);
356
357             /* Update vectorial force */
358             fix0             = _mm256_add_ps(fix0,tx);
359             fiy0             = _mm256_add_ps(fiy0,ty);
360             fiz0             = _mm256_add_ps(fiz0,tz);
361
362             fjx0             = _mm256_add_ps(fjx0,tx);
363             fjy0             = _mm256_add_ps(fjy0,ty);
364             fjz0             = _mm256_add_ps(fjz0,tz);
365
366             }
367
368             /**************************
369              * CALCULATE INTERACTIONS *
370              **************************/
371
372             if (gmx_mm256_any_lt(rsq01,rcutoff2))
373             {
374
375             /* REACTION-FIELD ELECTROSTATICS */
376             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
377             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
378
379             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
380
381             /* Update potential sum for this i atom from the interaction with this j atom. */
382             velec            = _mm256_and_ps(velec,cutoff_mask);
383             velecsum         = _mm256_add_ps(velecsum,velec);
384
385             fscal            = felec;
386
387             fscal            = _mm256_and_ps(fscal,cutoff_mask);
388
389             /* Calculate temporary vectorial force */
390             tx               = _mm256_mul_ps(fscal,dx01);
391             ty               = _mm256_mul_ps(fscal,dy01);
392             tz               = _mm256_mul_ps(fscal,dz01);
393
394             /* Update vectorial force */
395             fix0             = _mm256_add_ps(fix0,tx);
396             fiy0             = _mm256_add_ps(fiy0,ty);
397             fiz0             = _mm256_add_ps(fiz0,tz);
398
399             fjx1             = _mm256_add_ps(fjx1,tx);
400             fjy1             = _mm256_add_ps(fjy1,ty);
401             fjz1             = _mm256_add_ps(fjz1,tz);
402
403             }
404
405             /**************************
406              * CALCULATE INTERACTIONS *
407              **************************/
408
409             if (gmx_mm256_any_lt(rsq02,rcutoff2))
410             {
411
412             /* REACTION-FIELD ELECTROSTATICS */
413             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
414             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
415
416             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
417
418             /* Update potential sum for this i atom from the interaction with this j atom. */
419             velec            = _mm256_and_ps(velec,cutoff_mask);
420             velecsum         = _mm256_add_ps(velecsum,velec);
421
422             fscal            = felec;
423
424             fscal            = _mm256_and_ps(fscal,cutoff_mask);
425
426             /* Calculate temporary vectorial force */
427             tx               = _mm256_mul_ps(fscal,dx02);
428             ty               = _mm256_mul_ps(fscal,dy02);
429             tz               = _mm256_mul_ps(fscal,dz02);
430
431             /* Update vectorial force */
432             fix0             = _mm256_add_ps(fix0,tx);
433             fiy0             = _mm256_add_ps(fiy0,ty);
434             fiz0             = _mm256_add_ps(fiz0,tz);
435
436             fjx2             = _mm256_add_ps(fjx2,tx);
437             fjy2             = _mm256_add_ps(fjy2,ty);
438             fjz2             = _mm256_add_ps(fjz2,tz);
439
440             }
441
442             /**************************
443              * CALCULATE INTERACTIONS *
444              **************************/
445
446             if (gmx_mm256_any_lt(rsq10,rcutoff2))
447             {
448
449             /* REACTION-FIELD ELECTROSTATICS */
450             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
451             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
452
453             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
454
455             /* Update potential sum for this i atom from the interaction with this j atom. */
456             velec            = _mm256_and_ps(velec,cutoff_mask);
457             velecsum         = _mm256_add_ps(velecsum,velec);
458
459             fscal            = felec;
460
461             fscal            = _mm256_and_ps(fscal,cutoff_mask);
462
463             /* Calculate temporary vectorial force */
464             tx               = _mm256_mul_ps(fscal,dx10);
465             ty               = _mm256_mul_ps(fscal,dy10);
466             tz               = _mm256_mul_ps(fscal,dz10);
467
468             /* Update vectorial force */
469             fix1             = _mm256_add_ps(fix1,tx);
470             fiy1             = _mm256_add_ps(fiy1,ty);
471             fiz1             = _mm256_add_ps(fiz1,tz);
472
473             fjx0             = _mm256_add_ps(fjx0,tx);
474             fjy0             = _mm256_add_ps(fjy0,ty);
475             fjz0             = _mm256_add_ps(fjz0,tz);
476
477             }
478
479             /**************************
480              * CALCULATE INTERACTIONS *
481              **************************/
482
483             if (gmx_mm256_any_lt(rsq11,rcutoff2))
484             {
485
486             /* REACTION-FIELD ELECTROSTATICS */
487             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
488             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
489
490             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
491
492             /* Update potential sum for this i atom from the interaction with this j atom. */
493             velec            = _mm256_and_ps(velec,cutoff_mask);
494             velecsum         = _mm256_add_ps(velecsum,velec);
495
496             fscal            = felec;
497
498             fscal            = _mm256_and_ps(fscal,cutoff_mask);
499
500             /* Calculate temporary vectorial force */
501             tx               = _mm256_mul_ps(fscal,dx11);
502             ty               = _mm256_mul_ps(fscal,dy11);
503             tz               = _mm256_mul_ps(fscal,dz11);
504
505             /* Update vectorial force */
506             fix1             = _mm256_add_ps(fix1,tx);
507             fiy1             = _mm256_add_ps(fiy1,ty);
508             fiz1             = _mm256_add_ps(fiz1,tz);
509
510             fjx1             = _mm256_add_ps(fjx1,tx);
511             fjy1             = _mm256_add_ps(fjy1,ty);
512             fjz1             = _mm256_add_ps(fjz1,tz);
513
514             }
515
516             /**************************
517              * CALCULATE INTERACTIONS *
518              **************************/
519
520             if (gmx_mm256_any_lt(rsq12,rcutoff2))
521             {
522
523             /* REACTION-FIELD ELECTROSTATICS */
524             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
525             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
526
527             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
528
529             /* Update potential sum for this i atom from the interaction with this j atom. */
530             velec            = _mm256_and_ps(velec,cutoff_mask);
531             velecsum         = _mm256_add_ps(velecsum,velec);
532
533             fscal            = felec;
534
535             fscal            = _mm256_and_ps(fscal,cutoff_mask);
536
537             /* Calculate temporary vectorial force */
538             tx               = _mm256_mul_ps(fscal,dx12);
539             ty               = _mm256_mul_ps(fscal,dy12);
540             tz               = _mm256_mul_ps(fscal,dz12);
541
542             /* Update vectorial force */
543             fix1             = _mm256_add_ps(fix1,tx);
544             fiy1             = _mm256_add_ps(fiy1,ty);
545             fiz1             = _mm256_add_ps(fiz1,tz);
546
547             fjx2             = _mm256_add_ps(fjx2,tx);
548             fjy2             = _mm256_add_ps(fjy2,ty);
549             fjz2             = _mm256_add_ps(fjz2,tz);
550
551             }
552
553             /**************************
554              * CALCULATE INTERACTIONS *
555              **************************/
556
557             if (gmx_mm256_any_lt(rsq20,rcutoff2))
558             {
559
560             /* REACTION-FIELD ELECTROSTATICS */
561             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
562             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
563
564             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
565
566             /* Update potential sum for this i atom from the interaction with this j atom. */
567             velec            = _mm256_and_ps(velec,cutoff_mask);
568             velecsum         = _mm256_add_ps(velecsum,velec);
569
570             fscal            = felec;
571
572             fscal            = _mm256_and_ps(fscal,cutoff_mask);
573
574             /* Calculate temporary vectorial force */
575             tx               = _mm256_mul_ps(fscal,dx20);
576             ty               = _mm256_mul_ps(fscal,dy20);
577             tz               = _mm256_mul_ps(fscal,dz20);
578
579             /* Update vectorial force */
580             fix2             = _mm256_add_ps(fix2,tx);
581             fiy2             = _mm256_add_ps(fiy2,ty);
582             fiz2             = _mm256_add_ps(fiz2,tz);
583
584             fjx0             = _mm256_add_ps(fjx0,tx);
585             fjy0             = _mm256_add_ps(fjy0,ty);
586             fjz0             = _mm256_add_ps(fjz0,tz);
587
588             }
589
590             /**************************
591              * CALCULATE INTERACTIONS *
592              **************************/
593
594             if (gmx_mm256_any_lt(rsq21,rcutoff2))
595             {
596
597             /* REACTION-FIELD ELECTROSTATICS */
598             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
599             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
600
601             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
602
603             /* Update potential sum for this i atom from the interaction with this j atom. */
604             velec            = _mm256_and_ps(velec,cutoff_mask);
605             velecsum         = _mm256_add_ps(velecsum,velec);
606
607             fscal            = felec;
608
609             fscal            = _mm256_and_ps(fscal,cutoff_mask);
610
611             /* Calculate temporary vectorial force */
612             tx               = _mm256_mul_ps(fscal,dx21);
613             ty               = _mm256_mul_ps(fscal,dy21);
614             tz               = _mm256_mul_ps(fscal,dz21);
615
616             /* Update vectorial force */
617             fix2             = _mm256_add_ps(fix2,tx);
618             fiy2             = _mm256_add_ps(fiy2,ty);
619             fiz2             = _mm256_add_ps(fiz2,tz);
620
621             fjx1             = _mm256_add_ps(fjx1,tx);
622             fjy1             = _mm256_add_ps(fjy1,ty);
623             fjz1             = _mm256_add_ps(fjz1,tz);
624
625             }
626
627             /**************************
628              * CALCULATE INTERACTIONS *
629              **************************/
630
631             if (gmx_mm256_any_lt(rsq22,rcutoff2))
632             {
633
634             /* REACTION-FIELD ELECTROSTATICS */
635             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
636             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
637
638             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
639
640             /* Update potential sum for this i atom from the interaction with this j atom. */
641             velec            = _mm256_and_ps(velec,cutoff_mask);
642             velecsum         = _mm256_add_ps(velecsum,velec);
643
644             fscal            = felec;
645
646             fscal            = _mm256_and_ps(fscal,cutoff_mask);
647
648             /* Calculate temporary vectorial force */
649             tx               = _mm256_mul_ps(fscal,dx22);
650             ty               = _mm256_mul_ps(fscal,dy22);
651             tz               = _mm256_mul_ps(fscal,dz22);
652
653             /* Update vectorial force */
654             fix2             = _mm256_add_ps(fix2,tx);
655             fiy2             = _mm256_add_ps(fiy2,ty);
656             fiz2             = _mm256_add_ps(fiz2,tz);
657
658             fjx2             = _mm256_add_ps(fjx2,tx);
659             fjy2             = _mm256_add_ps(fjy2,ty);
660             fjz2             = _mm256_add_ps(fjz2,tz);
661
662             }
663
664             fjptrA             = f+j_coord_offsetA;
665             fjptrB             = f+j_coord_offsetB;
666             fjptrC             = f+j_coord_offsetC;
667             fjptrD             = f+j_coord_offsetD;
668             fjptrE             = f+j_coord_offsetE;
669             fjptrF             = f+j_coord_offsetF;
670             fjptrG             = f+j_coord_offsetG;
671             fjptrH             = f+j_coord_offsetH;
672
673             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
674                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
675
676             /* Inner loop uses 342 flops */
677         }
678
679         if(jidx<j_index_end)
680         {
681
682             /* Get j neighbor index, and coordinate index */
683             jnrlistA         = jjnr[jidx];
684             jnrlistB         = jjnr[jidx+1];
685             jnrlistC         = jjnr[jidx+2];
686             jnrlistD         = jjnr[jidx+3];
687             jnrlistE         = jjnr[jidx+4];
688             jnrlistF         = jjnr[jidx+5];
689             jnrlistG         = jjnr[jidx+6];
690             jnrlistH         = jjnr[jidx+7];
691             /* The neighbor index of each element is negative for dummy (non-real) entries.
692              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
693              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
694              */
695             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
696                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
697                                             
698             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
699             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
700             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
701             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
702             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
703             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
704             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
705             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
706             j_coord_offsetA  = DIM*jnrA;
707             j_coord_offsetB  = DIM*jnrB;
708             j_coord_offsetC  = DIM*jnrC;
709             j_coord_offsetD  = DIM*jnrD;
710             j_coord_offsetE  = DIM*jnrE;
711             j_coord_offsetF  = DIM*jnrF;
712             j_coord_offsetG  = DIM*jnrG;
713             j_coord_offsetH  = DIM*jnrH;
714
715             /* load j atom coordinates */
716             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
717                                                  x+j_coord_offsetC,x+j_coord_offsetD,
718                                                  x+j_coord_offsetE,x+j_coord_offsetF,
719                                                  x+j_coord_offsetG,x+j_coord_offsetH,
720                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
721
722             /* Calculate displacement vector */
723             dx00             = _mm256_sub_ps(ix0,jx0);
724             dy00             = _mm256_sub_ps(iy0,jy0);
725             dz00             = _mm256_sub_ps(iz0,jz0);
726             dx01             = _mm256_sub_ps(ix0,jx1);
727             dy01             = _mm256_sub_ps(iy0,jy1);
728             dz01             = _mm256_sub_ps(iz0,jz1);
729             dx02             = _mm256_sub_ps(ix0,jx2);
730             dy02             = _mm256_sub_ps(iy0,jy2);
731             dz02             = _mm256_sub_ps(iz0,jz2);
732             dx10             = _mm256_sub_ps(ix1,jx0);
733             dy10             = _mm256_sub_ps(iy1,jy0);
734             dz10             = _mm256_sub_ps(iz1,jz0);
735             dx11             = _mm256_sub_ps(ix1,jx1);
736             dy11             = _mm256_sub_ps(iy1,jy1);
737             dz11             = _mm256_sub_ps(iz1,jz1);
738             dx12             = _mm256_sub_ps(ix1,jx2);
739             dy12             = _mm256_sub_ps(iy1,jy2);
740             dz12             = _mm256_sub_ps(iz1,jz2);
741             dx20             = _mm256_sub_ps(ix2,jx0);
742             dy20             = _mm256_sub_ps(iy2,jy0);
743             dz20             = _mm256_sub_ps(iz2,jz0);
744             dx21             = _mm256_sub_ps(ix2,jx1);
745             dy21             = _mm256_sub_ps(iy2,jy1);
746             dz21             = _mm256_sub_ps(iz2,jz1);
747             dx22             = _mm256_sub_ps(ix2,jx2);
748             dy22             = _mm256_sub_ps(iy2,jy2);
749             dz22             = _mm256_sub_ps(iz2,jz2);
750
751             /* Calculate squared distance and things based on it */
752             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
753             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
754             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
755             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
756             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
757             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
758             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
759             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
760             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
761
762             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
763             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
764             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
765             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
766             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
767             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
768             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
769             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
770             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
771
772             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
773             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
774             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
775             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
776             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
777             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
778             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
779             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
780             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
781
782             fjx0             = _mm256_setzero_ps();
783             fjy0             = _mm256_setzero_ps();
784             fjz0             = _mm256_setzero_ps();
785             fjx1             = _mm256_setzero_ps();
786             fjy1             = _mm256_setzero_ps();
787             fjz1             = _mm256_setzero_ps();
788             fjx2             = _mm256_setzero_ps();
789             fjy2             = _mm256_setzero_ps();
790             fjz2             = _mm256_setzero_ps();
791
792             /**************************
793              * CALCULATE INTERACTIONS *
794              **************************/
795
796             if (gmx_mm256_any_lt(rsq00,rcutoff2))
797             {
798
799             /* REACTION-FIELD ELECTROSTATICS */
800             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
801             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
802
803             /* LENNARD-JONES DISPERSION/REPULSION */
804
805             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
806             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
807             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
808             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
809                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
810             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
811
812             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
813
814             /* Update potential sum for this i atom from the interaction with this j atom. */
815             velec            = _mm256_and_ps(velec,cutoff_mask);
816             velec            = _mm256_andnot_ps(dummy_mask,velec);
817             velecsum         = _mm256_add_ps(velecsum,velec);
818             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
819             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
820             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
821
822             fscal            = _mm256_add_ps(felec,fvdw);
823
824             fscal            = _mm256_and_ps(fscal,cutoff_mask);
825
826             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
827
828             /* Calculate temporary vectorial force */
829             tx               = _mm256_mul_ps(fscal,dx00);
830             ty               = _mm256_mul_ps(fscal,dy00);
831             tz               = _mm256_mul_ps(fscal,dz00);
832
833             /* Update vectorial force */
834             fix0             = _mm256_add_ps(fix0,tx);
835             fiy0             = _mm256_add_ps(fiy0,ty);
836             fiz0             = _mm256_add_ps(fiz0,tz);
837
838             fjx0             = _mm256_add_ps(fjx0,tx);
839             fjy0             = _mm256_add_ps(fjy0,ty);
840             fjz0             = _mm256_add_ps(fjz0,tz);
841
842             }
843
844             /**************************
845              * CALCULATE INTERACTIONS *
846              **************************/
847
848             if (gmx_mm256_any_lt(rsq01,rcutoff2))
849             {
850
851             /* REACTION-FIELD ELECTROSTATICS */
852             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
853             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
854
855             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
856
857             /* Update potential sum for this i atom from the interaction with this j atom. */
858             velec            = _mm256_and_ps(velec,cutoff_mask);
859             velec            = _mm256_andnot_ps(dummy_mask,velec);
860             velecsum         = _mm256_add_ps(velecsum,velec);
861
862             fscal            = felec;
863
864             fscal            = _mm256_and_ps(fscal,cutoff_mask);
865
866             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
867
868             /* Calculate temporary vectorial force */
869             tx               = _mm256_mul_ps(fscal,dx01);
870             ty               = _mm256_mul_ps(fscal,dy01);
871             tz               = _mm256_mul_ps(fscal,dz01);
872
873             /* Update vectorial force */
874             fix0             = _mm256_add_ps(fix0,tx);
875             fiy0             = _mm256_add_ps(fiy0,ty);
876             fiz0             = _mm256_add_ps(fiz0,tz);
877
878             fjx1             = _mm256_add_ps(fjx1,tx);
879             fjy1             = _mm256_add_ps(fjy1,ty);
880             fjz1             = _mm256_add_ps(fjz1,tz);
881
882             }
883
884             /**************************
885              * CALCULATE INTERACTIONS *
886              **************************/
887
888             if (gmx_mm256_any_lt(rsq02,rcutoff2))
889             {
890
891             /* REACTION-FIELD ELECTROSTATICS */
892             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
893             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
894
895             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
896
897             /* Update potential sum for this i atom from the interaction with this j atom. */
898             velec            = _mm256_and_ps(velec,cutoff_mask);
899             velec            = _mm256_andnot_ps(dummy_mask,velec);
900             velecsum         = _mm256_add_ps(velecsum,velec);
901
902             fscal            = felec;
903
904             fscal            = _mm256_and_ps(fscal,cutoff_mask);
905
906             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
907
908             /* Calculate temporary vectorial force */
909             tx               = _mm256_mul_ps(fscal,dx02);
910             ty               = _mm256_mul_ps(fscal,dy02);
911             tz               = _mm256_mul_ps(fscal,dz02);
912
913             /* Update vectorial force */
914             fix0             = _mm256_add_ps(fix0,tx);
915             fiy0             = _mm256_add_ps(fiy0,ty);
916             fiz0             = _mm256_add_ps(fiz0,tz);
917
918             fjx2             = _mm256_add_ps(fjx2,tx);
919             fjy2             = _mm256_add_ps(fjy2,ty);
920             fjz2             = _mm256_add_ps(fjz2,tz);
921
922             }
923
924             /**************************
925              * CALCULATE INTERACTIONS *
926              **************************/
927
928             if (gmx_mm256_any_lt(rsq10,rcutoff2))
929             {
930
931             /* REACTION-FIELD ELECTROSTATICS */
932             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
933             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
934
935             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
936
937             /* Update potential sum for this i atom from the interaction with this j atom. */
938             velec            = _mm256_and_ps(velec,cutoff_mask);
939             velec            = _mm256_andnot_ps(dummy_mask,velec);
940             velecsum         = _mm256_add_ps(velecsum,velec);
941
942             fscal            = felec;
943
944             fscal            = _mm256_and_ps(fscal,cutoff_mask);
945
946             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
947
948             /* Calculate temporary vectorial force */
949             tx               = _mm256_mul_ps(fscal,dx10);
950             ty               = _mm256_mul_ps(fscal,dy10);
951             tz               = _mm256_mul_ps(fscal,dz10);
952
953             /* Update vectorial force */
954             fix1             = _mm256_add_ps(fix1,tx);
955             fiy1             = _mm256_add_ps(fiy1,ty);
956             fiz1             = _mm256_add_ps(fiz1,tz);
957
958             fjx0             = _mm256_add_ps(fjx0,tx);
959             fjy0             = _mm256_add_ps(fjy0,ty);
960             fjz0             = _mm256_add_ps(fjz0,tz);
961
962             }
963
964             /**************************
965              * CALCULATE INTERACTIONS *
966              **************************/
967
968             if (gmx_mm256_any_lt(rsq11,rcutoff2))
969             {
970
971             /* REACTION-FIELD ELECTROSTATICS */
972             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
973             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
974
975             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
976
977             /* Update potential sum for this i atom from the interaction with this j atom. */
978             velec            = _mm256_and_ps(velec,cutoff_mask);
979             velec            = _mm256_andnot_ps(dummy_mask,velec);
980             velecsum         = _mm256_add_ps(velecsum,velec);
981
982             fscal            = felec;
983
984             fscal            = _mm256_and_ps(fscal,cutoff_mask);
985
986             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
987
988             /* Calculate temporary vectorial force */
989             tx               = _mm256_mul_ps(fscal,dx11);
990             ty               = _mm256_mul_ps(fscal,dy11);
991             tz               = _mm256_mul_ps(fscal,dz11);
992
993             /* Update vectorial force */
994             fix1             = _mm256_add_ps(fix1,tx);
995             fiy1             = _mm256_add_ps(fiy1,ty);
996             fiz1             = _mm256_add_ps(fiz1,tz);
997
998             fjx1             = _mm256_add_ps(fjx1,tx);
999             fjy1             = _mm256_add_ps(fjy1,ty);
1000             fjz1             = _mm256_add_ps(fjz1,tz);
1001
1002             }
1003
1004             /**************************
1005              * CALCULATE INTERACTIONS *
1006              **************************/
1007
1008             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1009             {
1010
1011             /* REACTION-FIELD ELECTROSTATICS */
1012             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
1013             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1014
1015             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1016
1017             /* Update potential sum for this i atom from the interaction with this j atom. */
1018             velec            = _mm256_and_ps(velec,cutoff_mask);
1019             velec            = _mm256_andnot_ps(dummy_mask,velec);
1020             velecsum         = _mm256_add_ps(velecsum,velec);
1021
1022             fscal            = felec;
1023
1024             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1025
1026             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1027
1028             /* Calculate temporary vectorial force */
1029             tx               = _mm256_mul_ps(fscal,dx12);
1030             ty               = _mm256_mul_ps(fscal,dy12);
1031             tz               = _mm256_mul_ps(fscal,dz12);
1032
1033             /* Update vectorial force */
1034             fix1             = _mm256_add_ps(fix1,tx);
1035             fiy1             = _mm256_add_ps(fiy1,ty);
1036             fiz1             = _mm256_add_ps(fiz1,tz);
1037
1038             fjx2             = _mm256_add_ps(fjx2,tx);
1039             fjy2             = _mm256_add_ps(fjy2,ty);
1040             fjz2             = _mm256_add_ps(fjz2,tz);
1041
1042             }
1043
1044             /**************************
1045              * CALCULATE INTERACTIONS *
1046              **************************/
1047
1048             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1049             {
1050
1051             /* REACTION-FIELD ELECTROSTATICS */
1052             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
1053             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1054
1055             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1056
1057             /* Update potential sum for this i atom from the interaction with this j atom. */
1058             velec            = _mm256_and_ps(velec,cutoff_mask);
1059             velec            = _mm256_andnot_ps(dummy_mask,velec);
1060             velecsum         = _mm256_add_ps(velecsum,velec);
1061
1062             fscal            = felec;
1063
1064             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1065
1066             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1067
1068             /* Calculate temporary vectorial force */
1069             tx               = _mm256_mul_ps(fscal,dx20);
1070             ty               = _mm256_mul_ps(fscal,dy20);
1071             tz               = _mm256_mul_ps(fscal,dz20);
1072
1073             /* Update vectorial force */
1074             fix2             = _mm256_add_ps(fix2,tx);
1075             fiy2             = _mm256_add_ps(fiy2,ty);
1076             fiz2             = _mm256_add_ps(fiz2,tz);
1077
1078             fjx0             = _mm256_add_ps(fjx0,tx);
1079             fjy0             = _mm256_add_ps(fjy0,ty);
1080             fjz0             = _mm256_add_ps(fjz0,tz);
1081
1082             }
1083
1084             /**************************
1085              * CALCULATE INTERACTIONS *
1086              **************************/
1087
1088             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1089             {
1090
1091             /* REACTION-FIELD ELECTROSTATICS */
1092             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
1093             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1094
1095             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1096
1097             /* Update potential sum for this i atom from the interaction with this j atom. */
1098             velec            = _mm256_and_ps(velec,cutoff_mask);
1099             velec            = _mm256_andnot_ps(dummy_mask,velec);
1100             velecsum         = _mm256_add_ps(velecsum,velec);
1101
1102             fscal            = felec;
1103
1104             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1105
1106             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1107
1108             /* Calculate temporary vectorial force */
1109             tx               = _mm256_mul_ps(fscal,dx21);
1110             ty               = _mm256_mul_ps(fscal,dy21);
1111             tz               = _mm256_mul_ps(fscal,dz21);
1112
1113             /* Update vectorial force */
1114             fix2             = _mm256_add_ps(fix2,tx);
1115             fiy2             = _mm256_add_ps(fiy2,ty);
1116             fiz2             = _mm256_add_ps(fiz2,tz);
1117
1118             fjx1             = _mm256_add_ps(fjx1,tx);
1119             fjy1             = _mm256_add_ps(fjy1,ty);
1120             fjz1             = _mm256_add_ps(fjz1,tz);
1121
1122             }
1123
1124             /**************************
1125              * CALCULATE INTERACTIONS *
1126              **************************/
1127
1128             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1129             {
1130
1131             /* REACTION-FIELD ELECTROSTATICS */
1132             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
1133             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1134
1135             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1136
1137             /* Update potential sum for this i atom from the interaction with this j atom. */
1138             velec            = _mm256_and_ps(velec,cutoff_mask);
1139             velec            = _mm256_andnot_ps(dummy_mask,velec);
1140             velecsum         = _mm256_add_ps(velecsum,velec);
1141
1142             fscal            = felec;
1143
1144             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1145
1146             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1147
1148             /* Calculate temporary vectorial force */
1149             tx               = _mm256_mul_ps(fscal,dx22);
1150             ty               = _mm256_mul_ps(fscal,dy22);
1151             tz               = _mm256_mul_ps(fscal,dz22);
1152
1153             /* Update vectorial force */
1154             fix2             = _mm256_add_ps(fix2,tx);
1155             fiy2             = _mm256_add_ps(fiy2,ty);
1156             fiz2             = _mm256_add_ps(fiz2,tz);
1157
1158             fjx2             = _mm256_add_ps(fjx2,tx);
1159             fjy2             = _mm256_add_ps(fjy2,ty);
1160             fjz2             = _mm256_add_ps(fjz2,tz);
1161
1162             }
1163
1164             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1165             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1166             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1167             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1168             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1169             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1170             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1171             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1172
1173             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1174                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1175
1176             /* Inner loop uses 342 flops */
1177         }
1178
1179         /* End of innermost loop */
1180
1181         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1182                                                  f+i_coord_offset,fshift+i_shift_offset);
1183
1184         ggid                        = gid[iidx];
1185         /* Update potential energies */
1186         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1187         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1188
1189         /* Increment number of inner iterations */
1190         inneriter                  += j_index_end - j_index_start;
1191
1192         /* Outer loop uses 20 flops */
1193     }
1194
1195     /* Increment number of outer iterations */
1196     outeriter        += nri;
1197
1198     /* Update outer/inner flops */
1199
1200     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*342);
1201 }
1202 /*
1203  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_single
1204  * Electrostatics interaction: ReactionField
1205  * VdW interaction:            LennardJones
1206  * Geometry:                   Water3-Water3
1207  * Calculate force/pot:        Force
1208  */
1209 void
1210 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_single
1211                     (t_nblist                    * gmx_restrict       nlist,
1212                      rvec                        * gmx_restrict          xx,
1213                      rvec                        * gmx_restrict          ff,
1214                      t_forcerec                  * gmx_restrict          fr,
1215                      t_mdatoms                   * gmx_restrict     mdatoms,
1216                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1217                      t_nrnb                      * gmx_restrict        nrnb)
1218 {
1219     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1220      * just 0 for non-waters.
1221      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1222      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1223      */
1224     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1225     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1226     int              jnrA,jnrB,jnrC,jnrD;
1227     int              jnrE,jnrF,jnrG,jnrH;
1228     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1229     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1230     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1231     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1232     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1233     real             rcutoff_scalar;
1234     real             *shiftvec,*fshift,*x,*f;
1235     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1236     real             scratch[4*DIM];
1237     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1238     real *           vdwioffsetptr0;
1239     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1240     real *           vdwioffsetptr1;
1241     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1242     real *           vdwioffsetptr2;
1243     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1244     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1245     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1246     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1247     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1248     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1249     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1250     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1251     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1252     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1253     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1254     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1255     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1256     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1257     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1258     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1259     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1260     real             *charge;
1261     int              nvdwtype;
1262     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1263     int              *vdwtype;
1264     real             *vdwparam;
1265     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1266     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1267     __m256           dummy_mask,cutoff_mask;
1268     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1269     __m256           one     = _mm256_set1_ps(1.0);
1270     __m256           two     = _mm256_set1_ps(2.0);
1271     x                = xx[0];
1272     f                = ff[0];
1273
1274     nri              = nlist->nri;
1275     iinr             = nlist->iinr;
1276     jindex           = nlist->jindex;
1277     jjnr             = nlist->jjnr;
1278     shiftidx         = nlist->shift;
1279     gid              = nlist->gid;
1280     shiftvec         = fr->shift_vec[0];
1281     fshift           = fr->fshift[0];
1282     facel            = _mm256_set1_ps(fr->epsfac);
1283     charge           = mdatoms->chargeA;
1284     krf              = _mm256_set1_ps(fr->ic->k_rf);
1285     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1286     crf              = _mm256_set1_ps(fr->ic->c_rf);
1287     nvdwtype         = fr->ntype;
1288     vdwparam         = fr->nbfp;
1289     vdwtype          = mdatoms->typeA;
1290
1291     /* Setup water-specific parameters */
1292     inr              = nlist->iinr[0];
1293     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1294     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1295     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1296     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1297
1298     jq0              = _mm256_set1_ps(charge[inr+0]);
1299     jq1              = _mm256_set1_ps(charge[inr+1]);
1300     jq2              = _mm256_set1_ps(charge[inr+2]);
1301     vdwjidx0A        = 2*vdwtype[inr+0];
1302     qq00             = _mm256_mul_ps(iq0,jq0);
1303     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1304     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1305     qq01             = _mm256_mul_ps(iq0,jq1);
1306     qq02             = _mm256_mul_ps(iq0,jq2);
1307     qq10             = _mm256_mul_ps(iq1,jq0);
1308     qq11             = _mm256_mul_ps(iq1,jq1);
1309     qq12             = _mm256_mul_ps(iq1,jq2);
1310     qq20             = _mm256_mul_ps(iq2,jq0);
1311     qq21             = _mm256_mul_ps(iq2,jq1);
1312     qq22             = _mm256_mul_ps(iq2,jq2);
1313
1314     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1315     rcutoff_scalar   = fr->rcoulomb;
1316     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1317     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1318
1319     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
1320     rvdw             = _mm256_set1_ps(fr->rvdw);
1321
1322     /* Avoid stupid compiler warnings */
1323     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1324     j_coord_offsetA = 0;
1325     j_coord_offsetB = 0;
1326     j_coord_offsetC = 0;
1327     j_coord_offsetD = 0;
1328     j_coord_offsetE = 0;
1329     j_coord_offsetF = 0;
1330     j_coord_offsetG = 0;
1331     j_coord_offsetH = 0;
1332
1333     outeriter        = 0;
1334     inneriter        = 0;
1335
1336     for(iidx=0;iidx<4*DIM;iidx++)
1337     {
1338         scratch[iidx] = 0.0;
1339     }
1340
1341     /* Start outer loop over neighborlists */
1342     for(iidx=0; iidx<nri; iidx++)
1343     {
1344         /* Load shift vector for this list */
1345         i_shift_offset   = DIM*shiftidx[iidx];
1346
1347         /* Load limits for loop over neighbors */
1348         j_index_start    = jindex[iidx];
1349         j_index_end      = jindex[iidx+1];
1350
1351         /* Get outer coordinate index */
1352         inr              = iinr[iidx];
1353         i_coord_offset   = DIM*inr;
1354
1355         /* Load i particle coords and add shift vector */
1356         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1357                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1358
1359         fix0             = _mm256_setzero_ps();
1360         fiy0             = _mm256_setzero_ps();
1361         fiz0             = _mm256_setzero_ps();
1362         fix1             = _mm256_setzero_ps();
1363         fiy1             = _mm256_setzero_ps();
1364         fiz1             = _mm256_setzero_ps();
1365         fix2             = _mm256_setzero_ps();
1366         fiy2             = _mm256_setzero_ps();
1367         fiz2             = _mm256_setzero_ps();
1368
1369         /* Start inner kernel loop */
1370         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1371         {
1372
1373             /* Get j neighbor index, and coordinate index */
1374             jnrA             = jjnr[jidx];
1375             jnrB             = jjnr[jidx+1];
1376             jnrC             = jjnr[jidx+2];
1377             jnrD             = jjnr[jidx+3];
1378             jnrE             = jjnr[jidx+4];
1379             jnrF             = jjnr[jidx+5];
1380             jnrG             = jjnr[jidx+6];
1381             jnrH             = jjnr[jidx+7];
1382             j_coord_offsetA  = DIM*jnrA;
1383             j_coord_offsetB  = DIM*jnrB;
1384             j_coord_offsetC  = DIM*jnrC;
1385             j_coord_offsetD  = DIM*jnrD;
1386             j_coord_offsetE  = DIM*jnrE;
1387             j_coord_offsetF  = DIM*jnrF;
1388             j_coord_offsetG  = DIM*jnrG;
1389             j_coord_offsetH  = DIM*jnrH;
1390
1391             /* load j atom coordinates */
1392             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1393                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1394                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1395                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1396                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1397
1398             /* Calculate displacement vector */
1399             dx00             = _mm256_sub_ps(ix0,jx0);
1400             dy00             = _mm256_sub_ps(iy0,jy0);
1401             dz00             = _mm256_sub_ps(iz0,jz0);
1402             dx01             = _mm256_sub_ps(ix0,jx1);
1403             dy01             = _mm256_sub_ps(iy0,jy1);
1404             dz01             = _mm256_sub_ps(iz0,jz1);
1405             dx02             = _mm256_sub_ps(ix0,jx2);
1406             dy02             = _mm256_sub_ps(iy0,jy2);
1407             dz02             = _mm256_sub_ps(iz0,jz2);
1408             dx10             = _mm256_sub_ps(ix1,jx0);
1409             dy10             = _mm256_sub_ps(iy1,jy0);
1410             dz10             = _mm256_sub_ps(iz1,jz0);
1411             dx11             = _mm256_sub_ps(ix1,jx1);
1412             dy11             = _mm256_sub_ps(iy1,jy1);
1413             dz11             = _mm256_sub_ps(iz1,jz1);
1414             dx12             = _mm256_sub_ps(ix1,jx2);
1415             dy12             = _mm256_sub_ps(iy1,jy2);
1416             dz12             = _mm256_sub_ps(iz1,jz2);
1417             dx20             = _mm256_sub_ps(ix2,jx0);
1418             dy20             = _mm256_sub_ps(iy2,jy0);
1419             dz20             = _mm256_sub_ps(iz2,jz0);
1420             dx21             = _mm256_sub_ps(ix2,jx1);
1421             dy21             = _mm256_sub_ps(iy2,jy1);
1422             dz21             = _mm256_sub_ps(iz2,jz1);
1423             dx22             = _mm256_sub_ps(ix2,jx2);
1424             dy22             = _mm256_sub_ps(iy2,jy2);
1425             dz22             = _mm256_sub_ps(iz2,jz2);
1426
1427             /* Calculate squared distance and things based on it */
1428             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1429             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1430             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1431             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1432             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1433             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1434             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1435             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1436             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1437
1438             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1439             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1440             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1441             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1442             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1443             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1444             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1445             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1446             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1447
1448             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1449             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1450             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1451             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1452             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1453             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1454             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1455             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1456             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1457
1458             fjx0             = _mm256_setzero_ps();
1459             fjy0             = _mm256_setzero_ps();
1460             fjz0             = _mm256_setzero_ps();
1461             fjx1             = _mm256_setzero_ps();
1462             fjy1             = _mm256_setzero_ps();
1463             fjz1             = _mm256_setzero_ps();
1464             fjx2             = _mm256_setzero_ps();
1465             fjy2             = _mm256_setzero_ps();
1466             fjz2             = _mm256_setzero_ps();
1467
1468             /**************************
1469              * CALCULATE INTERACTIONS *
1470              **************************/
1471
1472             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1473             {
1474
1475             /* REACTION-FIELD ELECTROSTATICS */
1476             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1477
1478             /* LENNARD-JONES DISPERSION/REPULSION */
1479
1480             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1481             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1482
1483             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1484
1485             fscal            = _mm256_add_ps(felec,fvdw);
1486
1487             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1488
1489             /* Calculate temporary vectorial force */
1490             tx               = _mm256_mul_ps(fscal,dx00);
1491             ty               = _mm256_mul_ps(fscal,dy00);
1492             tz               = _mm256_mul_ps(fscal,dz00);
1493
1494             /* Update vectorial force */
1495             fix0             = _mm256_add_ps(fix0,tx);
1496             fiy0             = _mm256_add_ps(fiy0,ty);
1497             fiz0             = _mm256_add_ps(fiz0,tz);
1498
1499             fjx0             = _mm256_add_ps(fjx0,tx);
1500             fjy0             = _mm256_add_ps(fjy0,ty);
1501             fjz0             = _mm256_add_ps(fjz0,tz);
1502
1503             }
1504
1505             /**************************
1506              * CALCULATE INTERACTIONS *
1507              **************************/
1508
1509             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1510             {
1511
1512             /* REACTION-FIELD ELECTROSTATICS */
1513             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1514
1515             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1516
1517             fscal            = felec;
1518
1519             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1520
1521             /* Calculate temporary vectorial force */
1522             tx               = _mm256_mul_ps(fscal,dx01);
1523             ty               = _mm256_mul_ps(fscal,dy01);
1524             tz               = _mm256_mul_ps(fscal,dz01);
1525
1526             /* Update vectorial force */
1527             fix0             = _mm256_add_ps(fix0,tx);
1528             fiy0             = _mm256_add_ps(fiy0,ty);
1529             fiz0             = _mm256_add_ps(fiz0,tz);
1530
1531             fjx1             = _mm256_add_ps(fjx1,tx);
1532             fjy1             = _mm256_add_ps(fjy1,ty);
1533             fjz1             = _mm256_add_ps(fjz1,tz);
1534
1535             }
1536
1537             /**************************
1538              * CALCULATE INTERACTIONS *
1539              **************************/
1540
1541             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1542             {
1543
1544             /* REACTION-FIELD ELECTROSTATICS */
1545             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1546
1547             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1548
1549             fscal            = felec;
1550
1551             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1552
1553             /* Calculate temporary vectorial force */
1554             tx               = _mm256_mul_ps(fscal,dx02);
1555             ty               = _mm256_mul_ps(fscal,dy02);
1556             tz               = _mm256_mul_ps(fscal,dz02);
1557
1558             /* Update vectorial force */
1559             fix0             = _mm256_add_ps(fix0,tx);
1560             fiy0             = _mm256_add_ps(fiy0,ty);
1561             fiz0             = _mm256_add_ps(fiz0,tz);
1562
1563             fjx2             = _mm256_add_ps(fjx2,tx);
1564             fjy2             = _mm256_add_ps(fjy2,ty);
1565             fjz2             = _mm256_add_ps(fjz2,tz);
1566
1567             }
1568
1569             /**************************
1570              * CALCULATE INTERACTIONS *
1571              **************************/
1572
1573             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1574             {
1575
1576             /* REACTION-FIELD ELECTROSTATICS */
1577             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1578
1579             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1580
1581             fscal            = felec;
1582
1583             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1584
1585             /* Calculate temporary vectorial force */
1586             tx               = _mm256_mul_ps(fscal,dx10);
1587             ty               = _mm256_mul_ps(fscal,dy10);
1588             tz               = _mm256_mul_ps(fscal,dz10);
1589
1590             /* Update vectorial force */
1591             fix1             = _mm256_add_ps(fix1,tx);
1592             fiy1             = _mm256_add_ps(fiy1,ty);
1593             fiz1             = _mm256_add_ps(fiz1,tz);
1594
1595             fjx0             = _mm256_add_ps(fjx0,tx);
1596             fjy0             = _mm256_add_ps(fjy0,ty);
1597             fjz0             = _mm256_add_ps(fjz0,tz);
1598
1599             }
1600
1601             /**************************
1602              * CALCULATE INTERACTIONS *
1603              **************************/
1604
1605             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1606             {
1607
1608             /* REACTION-FIELD ELECTROSTATICS */
1609             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1610
1611             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1612
1613             fscal            = felec;
1614
1615             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1616
1617             /* Calculate temporary vectorial force */
1618             tx               = _mm256_mul_ps(fscal,dx11);
1619             ty               = _mm256_mul_ps(fscal,dy11);
1620             tz               = _mm256_mul_ps(fscal,dz11);
1621
1622             /* Update vectorial force */
1623             fix1             = _mm256_add_ps(fix1,tx);
1624             fiy1             = _mm256_add_ps(fiy1,ty);
1625             fiz1             = _mm256_add_ps(fiz1,tz);
1626
1627             fjx1             = _mm256_add_ps(fjx1,tx);
1628             fjy1             = _mm256_add_ps(fjy1,ty);
1629             fjz1             = _mm256_add_ps(fjz1,tz);
1630
1631             }
1632
1633             /**************************
1634              * CALCULATE INTERACTIONS *
1635              **************************/
1636
1637             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1638             {
1639
1640             /* REACTION-FIELD ELECTROSTATICS */
1641             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1642
1643             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1644
1645             fscal            = felec;
1646
1647             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1648
1649             /* Calculate temporary vectorial force */
1650             tx               = _mm256_mul_ps(fscal,dx12);
1651             ty               = _mm256_mul_ps(fscal,dy12);
1652             tz               = _mm256_mul_ps(fscal,dz12);
1653
1654             /* Update vectorial force */
1655             fix1             = _mm256_add_ps(fix1,tx);
1656             fiy1             = _mm256_add_ps(fiy1,ty);
1657             fiz1             = _mm256_add_ps(fiz1,tz);
1658
1659             fjx2             = _mm256_add_ps(fjx2,tx);
1660             fjy2             = _mm256_add_ps(fjy2,ty);
1661             fjz2             = _mm256_add_ps(fjz2,tz);
1662
1663             }
1664
1665             /**************************
1666              * CALCULATE INTERACTIONS *
1667              **************************/
1668
1669             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1670             {
1671
1672             /* REACTION-FIELD ELECTROSTATICS */
1673             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1674
1675             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1676
1677             fscal            = felec;
1678
1679             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1680
1681             /* Calculate temporary vectorial force */
1682             tx               = _mm256_mul_ps(fscal,dx20);
1683             ty               = _mm256_mul_ps(fscal,dy20);
1684             tz               = _mm256_mul_ps(fscal,dz20);
1685
1686             /* Update vectorial force */
1687             fix2             = _mm256_add_ps(fix2,tx);
1688             fiy2             = _mm256_add_ps(fiy2,ty);
1689             fiz2             = _mm256_add_ps(fiz2,tz);
1690
1691             fjx0             = _mm256_add_ps(fjx0,tx);
1692             fjy0             = _mm256_add_ps(fjy0,ty);
1693             fjz0             = _mm256_add_ps(fjz0,tz);
1694
1695             }
1696
1697             /**************************
1698              * CALCULATE INTERACTIONS *
1699              **************************/
1700
1701             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1702             {
1703
1704             /* REACTION-FIELD ELECTROSTATICS */
1705             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1706
1707             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1708
1709             fscal            = felec;
1710
1711             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1712
1713             /* Calculate temporary vectorial force */
1714             tx               = _mm256_mul_ps(fscal,dx21);
1715             ty               = _mm256_mul_ps(fscal,dy21);
1716             tz               = _mm256_mul_ps(fscal,dz21);
1717
1718             /* Update vectorial force */
1719             fix2             = _mm256_add_ps(fix2,tx);
1720             fiy2             = _mm256_add_ps(fiy2,ty);
1721             fiz2             = _mm256_add_ps(fiz2,tz);
1722
1723             fjx1             = _mm256_add_ps(fjx1,tx);
1724             fjy1             = _mm256_add_ps(fjy1,ty);
1725             fjz1             = _mm256_add_ps(fjz1,tz);
1726
1727             }
1728
1729             /**************************
1730              * CALCULATE INTERACTIONS *
1731              **************************/
1732
1733             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1734             {
1735
1736             /* REACTION-FIELD ELECTROSTATICS */
1737             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1738
1739             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1740
1741             fscal            = felec;
1742
1743             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1744
1745             /* Calculate temporary vectorial force */
1746             tx               = _mm256_mul_ps(fscal,dx22);
1747             ty               = _mm256_mul_ps(fscal,dy22);
1748             tz               = _mm256_mul_ps(fscal,dz22);
1749
1750             /* Update vectorial force */
1751             fix2             = _mm256_add_ps(fix2,tx);
1752             fiy2             = _mm256_add_ps(fiy2,ty);
1753             fiz2             = _mm256_add_ps(fiz2,tz);
1754
1755             fjx2             = _mm256_add_ps(fjx2,tx);
1756             fjy2             = _mm256_add_ps(fjy2,ty);
1757             fjz2             = _mm256_add_ps(fjz2,tz);
1758
1759             }
1760
1761             fjptrA             = f+j_coord_offsetA;
1762             fjptrB             = f+j_coord_offsetB;
1763             fjptrC             = f+j_coord_offsetC;
1764             fjptrD             = f+j_coord_offsetD;
1765             fjptrE             = f+j_coord_offsetE;
1766             fjptrF             = f+j_coord_offsetF;
1767             fjptrG             = f+j_coord_offsetG;
1768             fjptrH             = f+j_coord_offsetH;
1769
1770             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1771                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1772
1773             /* Inner loop uses 277 flops */
1774         }
1775
1776         if(jidx<j_index_end)
1777         {
1778
1779             /* Get j neighbor index, and coordinate index */
1780             jnrlistA         = jjnr[jidx];
1781             jnrlistB         = jjnr[jidx+1];
1782             jnrlistC         = jjnr[jidx+2];
1783             jnrlistD         = jjnr[jidx+3];
1784             jnrlistE         = jjnr[jidx+4];
1785             jnrlistF         = jjnr[jidx+5];
1786             jnrlistG         = jjnr[jidx+6];
1787             jnrlistH         = jjnr[jidx+7];
1788             /* Sign of each element will be negative for non-real atoms.
1789              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1790              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1791              */
1792             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1793                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1794                                             
1795             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1796             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1797             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1798             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1799             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1800             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1801             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1802             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1803             j_coord_offsetA  = DIM*jnrA;
1804             j_coord_offsetB  = DIM*jnrB;
1805             j_coord_offsetC  = DIM*jnrC;
1806             j_coord_offsetD  = DIM*jnrD;
1807             j_coord_offsetE  = DIM*jnrE;
1808             j_coord_offsetF  = DIM*jnrF;
1809             j_coord_offsetG  = DIM*jnrG;
1810             j_coord_offsetH  = DIM*jnrH;
1811
1812             /* load j atom coordinates */
1813             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1814                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1815                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1816                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1817                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1818
1819             /* Calculate displacement vector */
1820             dx00             = _mm256_sub_ps(ix0,jx0);
1821             dy00             = _mm256_sub_ps(iy0,jy0);
1822             dz00             = _mm256_sub_ps(iz0,jz0);
1823             dx01             = _mm256_sub_ps(ix0,jx1);
1824             dy01             = _mm256_sub_ps(iy0,jy1);
1825             dz01             = _mm256_sub_ps(iz0,jz1);
1826             dx02             = _mm256_sub_ps(ix0,jx2);
1827             dy02             = _mm256_sub_ps(iy0,jy2);
1828             dz02             = _mm256_sub_ps(iz0,jz2);
1829             dx10             = _mm256_sub_ps(ix1,jx0);
1830             dy10             = _mm256_sub_ps(iy1,jy0);
1831             dz10             = _mm256_sub_ps(iz1,jz0);
1832             dx11             = _mm256_sub_ps(ix1,jx1);
1833             dy11             = _mm256_sub_ps(iy1,jy1);
1834             dz11             = _mm256_sub_ps(iz1,jz1);
1835             dx12             = _mm256_sub_ps(ix1,jx2);
1836             dy12             = _mm256_sub_ps(iy1,jy2);
1837             dz12             = _mm256_sub_ps(iz1,jz2);
1838             dx20             = _mm256_sub_ps(ix2,jx0);
1839             dy20             = _mm256_sub_ps(iy2,jy0);
1840             dz20             = _mm256_sub_ps(iz2,jz0);
1841             dx21             = _mm256_sub_ps(ix2,jx1);
1842             dy21             = _mm256_sub_ps(iy2,jy1);
1843             dz21             = _mm256_sub_ps(iz2,jz1);
1844             dx22             = _mm256_sub_ps(ix2,jx2);
1845             dy22             = _mm256_sub_ps(iy2,jy2);
1846             dz22             = _mm256_sub_ps(iz2,jz2);
1847
1848             /* Calculate squared distance and things based on it */
1849             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1850             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1851             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1852             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1853             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1854             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1855             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1856             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1857             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1858
1859             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1860             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1861             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1862             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1863             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1864             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1865             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1866             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1867             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1868
1869             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1870             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1871             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1872             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1873             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1874             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1875             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1876             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1877             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1878
1879             fjx0             = _mm256_setzero_ps();
1880             fjy0             = _mm256_setzero_ps();
1881             fjz0             = _mm256_setzero_ps();
1882             fjx1             = _mm256_setzero_ps();
1883             fjy1             = _mm256_setzero_ps();
1884             fjz1             = _mm256_setzero_ps();
1885             fjx2             = _mm256_setzero_ps();
1886             fjy2             = _mm256_setzero_ps();
1887             fjz2             = _mm256_setzero_ps();
1888
1889             /**************************
1890              * CALCULATE INTERACTIONS *
1891              **************************/
1892
1893             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1894             {
1895
1896             /* REACTION-FIELD ELECTROSTATICS */
1897             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1898
1899             /* LENNARD-JONES DISPERSION/REPULSION */
1900
1901             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1902             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1903
1904             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1905
1906             fscal            = _mm256_add_ps(felec,fvdw);
1907
1908             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1909
1910             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1911
1912             /* Calculate temporary vectorial force */
1913             tx               = _mm256_mul_ps(fscal,dx00);
1914             ty               = _mm256_mul_ps(fscal,dy00);
1915             tz               = _mm256_mul_ps(fscal,dz00);
1916
1917             /* Update vectorial force */
1918             fix0             = _mm256_add_ps(fix0,tx);
1919             fiy0             = _mm256_add_ps(fiy0,ty);
1920             fiz0             = _mm256_add_ps(fiz0,tz);
1921
1922             fjx0             = _mm256_add_ps(fjx0,tx);
1923             fjy0             = _mm256_add_ps(fjy0,ty);
1924             fjz0             = _mm256_add_ps(fjz0,tz);
1925
1926             }
1927
1928             /**************************
1929              * CALCULATE INTERACTIONS *
1930              **************************/
1931
1932             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1933             {
1934
1935             /* REACTION-FIELD ELECTROSTATICS */
1936             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1937
1938             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1939
1940             fscal            = felec;
1941
1942             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1943
1944             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1945
1946             /* Calculate temporary vectorial force */
1947             tx               = _mm256_mul_ps(fscal,dx01);
1948             ty               = _mm256_mul_ps(fscal,dy01);
1949             tz               = _mm256_mul_ps(fscal,dz01);
1950
1951             /* Update vectorial force */
1952             fix0             = _mm256_add_ps(fix0,tx);
1953             fiy0             = _mm256_add_ps(fiy0,ty);
1954             fiz0             = _mm256_add_ps(fiz0,tz);
1955
1956             fjx1             = _mm256_add_ps(fjx1,tx);
1957             fjy1             = _mm256_add_ps(fjy1,ty);
1958             fjz1             = _mm256_add_ps(fjz1,tz);
1959
1960             }
1961
1962             /**************************
1963              * CALCULATE INTERACTIONS *
1964              **************************/
1965
1966             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1967             {
1968
1969             /* REACTION-FIELD ELECTROSTATICS */
1970             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1971
1972             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1973
1974             fscal            = felec;
1975
1976             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1977
1978             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1979
1980             /* Calculate temporary vectorial force */
1981             tx               = _mm256_mul_ps(fscal,dx02);
1982             ty               = _mm256_mul_ps(fscal,dy02);
1983             tz               = _mm256_mul_ps(fscal,dz02);
1984
1985             /* Update vectorial force */
1986             fix0             = _mm256_add_ps(fix0,tx);
1987             fiy0             = _mm256_add_ps(fiy0,ty);
1988             fiz0             = _mm256_add_ps(fiz0,tz);
1989
1990             fjx2             = _mm256_add_ps(fjx2,tx);
1991             fjy2             = _mm256_add_ps(fjy2,ty);
1992             fjz2             = _mm256_add_ps(fjz2,tz);
1993
1994             }
1995
1996             /**************************
1997              * CALCULATE INTERACTIONS *
1998              **************************/
1999
2000             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2001             {
2002
2003             /* REACTION-FIELD ELECTROSTATICS */
2004             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
2005
2006             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2007
2008             fscal            = felec;
2009
2010             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2011
2012             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2013
2014             /* Calculate temporary vectorial force */
2015             tx               = _mm256_mul_ps(fscal,dx10);
2016             ty               = _mm256_mul_ps(fscal,dy10);
2017             tz               = _mm256_mul_ps(fscal,dz10);
2018
2019             /* Update vectorial force */
2020             fix1             = _mm256_add_ps(fix1,tx);
2021             fiy1             = _mm256_add_ps(fiy1,ty);
2022             fiz1             = _mm256_add_ps(fiz1,tz);
2023
2024             fjx0             = _mm256_add_ps(fjx0,tx);
2025             fjy0             = _mm256_add_ps(fjy0,ty);
2026             fjz0             = _mm256_add_ps(fjz0,tz);
2027
2028             }
2029
2030             /**************************
2031              * CALCULATE INTERACTIONS *
2032              **************************/
2033
2034             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2035             {
2036
2037             /* REACTION-FIELD ELECTROSTATICS */
2038             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
2039
2040             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2041
2042             fscal            = felec;
2043
2044             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2045
2046             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2047
2048             /* Calculate temporary vectorial force */
2049             tx               = _mm256_mul_ps(fscal,dx11);
2050             ty               = _mm256_mul_ps(fscal,dy11);
2051             tz               = _mm256_mul_ps(fscal,dz11);
2052
2053             /* Update vectorial force */
2054             fix1             = _mm256_add_ps(fix1,tx);
2055             fiy1             = _mm256_add_ps(fiy1,ty);
2056             fiz1             = _mm256_add_ps(fiz1,tz);
2057
2058             fjx1             = _mm256_add_ps(fjx1,tx);
2059             fjy1             = _mm256_add_ps(fjy1,ty);
2060             fjz1             = _mm256_add_ps(fjz1,tz);
2061
2062             }
2063
2064             /**************************
2065              * CALCULATE INTERACTIONS *
2066              **************************/
2067
2068             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2069             {
2070
2071             /* REACTION-FIELD ELECTROSTATICS */
2072             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
2073
2074             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2075
2076             fscal            = felec;
2077
2078             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2079
2080             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2081
2082             /* Calculate temporary vectorial force */
2083             tx               = _mm256_mul_ps(fscal,dx12);
2084             ty               = _mm256_mul_ps(fscal,dy12);
2085             tz               = _mm256_mul_ps(fscal,dz12);
2086
2087             /* Update vectorial force */
2088             fix1             = _mm256_add_ps(fix1,tx);
2089             fiy1             = _mm256_add_ps(fiy1,ty);
2090             fiz1             = _mm256_add_ps(fiz1,tz);
2091
2092             fjx2             = _mm256_add_ps(fjx2,tx);
2093             fjy2             = _mm256_add_ps(fjy2,ty);
2094             fjz2             = _mm256_add_ps(fjz2,tz);
2095
2096             }
2097
2098             /**************************
2099              * CALCULATE INTERACTIONS *
2100              **************************/
2101
2102             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2103             {
2104
2105             /* REACTION-FIELD ELECTROSTATICS */
2106             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
2107
2108             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2109
2110             fscal            = felec;
2111
2112             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2113
2114             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2115
2116             /* Calculate temporary vectorial force */
2117             tx               = _mm256_mul_ps(fscal,dx20);
2118             ty               = _mm256_mul_ps(fscal,dy20);
2119             tz               = _mm256_mul_ps(fscal,dz20);
2120
2121             /* Update vectorial force */
2122             fix2             = _mm256_add_ps(fix2,tx);
2123             fiy2             = _mm256_add_ps(fiy2,ty);
2124             fiz2             = _mm256_add_ps(fiz2,tz);
2125
2126             fjx0             = _mm256_add_ps(fjx0,tx);
2127             fjy0             = _mm256_add_ps(fjy0,ty);
2128             fjz0             = _mm256_add_ps(fjz0,tz);
2129
2130             }
2131
2132             /**************************
2133              * CALCULATE INTERACTIONS *
2134              **************************/
2135
2136             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2137             {
2138
2139             /* REACTION-FIELD ELECTROSTATICS */
2140             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2141
2142             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2143
2144             fscal            = felec;
2145
2146             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2147
2148             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2149
2150             /* Calculate temporary vectorial force */
2151             tx               = _mm256_mul_ps(fscal,dx21);
2152             ty               = _mm256_mul_ps(fscal,dy21);
2153             tz               = _mm256_mul_ps(fscal,dz21);
2154
2155             /* Update vectorial force */
2156             fix2             = _mm256_add_ps(fix2,tx);
2157             fiy2             = _mm256_add_ps(fiy2,ty);
2158             fiz2             = _mm256_add_ps(fiz2,tz);
2159
2160             fjx1             = _mm256_add_ps(fjx1,tx);
2161             fjy1             = _mm256_add_ps(fjy1,ty);
2162             fjz1             = _mm256_add_ps(fjz1,tz);
2163
2164             }
2165
2166             /**************************
2167              * CALCULATE INTERACTIONS *
2168              **************************/
2169
2170             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2171             {
2172
2173             /* REACTION-FIELD ELECTROSTATICS */
2174             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2175
2176             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2177
2178             fscal            = felec;
2179
2180             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2181
2182             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2183
2184             /* Calculate temporary vectorial force */
2185             tx               = _mm256_mul_ps(fscal,dx22);
2186             ty               = _mm256_mul_ps(fscal,dy22);
2187             tz               = _mm256_mul_ps(fscal,dz22);
2188
2189             /* Update vectorial force */
2190             fix2             = _mm256_add_ps(fix2,tx);
2191             fiy2             = _mm256_add_ps(fiy2,ty);
2192             fiz2             = _mm256_add_ps(fiz2,tz);
2193
2194             fjx2             = _mm256_add_ps(fjx2,tx);
2195             fjy2             = _mm256_add_ps(fjy2,ty);
2196             fjz2             = _mm256_add_ps(fjz2,tz);
2197
2198             }
2199
2200             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2201             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2202             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2203             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2204             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2205             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2206             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2207             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2208
2209             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2210                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2211
2212             /* Inner loop uses 277 flops */
2213         }
2214
2215         /* End of innermost loop */
2216
2217         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2218                                                  f+i_coord_offset,fshift+i_shift_offset);
2219
2220         /* Increment number of inner iterations */
2221         inneriter                  += j_index_end - j_index_start;
2222
2223         /* Outer loop uses 18 flops */
2224     }
2225
2226     /* Increment number of outer iterations */
2227     outeriter        += nri;
2228
2229     /* Update outer/inner flops */
2230
2231     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
2232 }