Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecRF_VdwLJ_GeomW3W3_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_256_single
52  * Electrostatics interaction: ReactionField
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_avx_256_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrE,jnrF,jnrG,jnrH;
76     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84     real             scratch[4*DIM];
85     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
93     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
108     real             *charge;
109     int              nvdwtype;
110     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111     int              *vdwtype;
112     real             *vdwparam;
113     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
114     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
115     __m256           dummy_mask,cutoff_mask;
116     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
117     __m256           one     = _mm256_set1_ps(1.0);
118     __m256           two     = _mm256_set1_ps(2.0);
119     x                = xx[0];
120     f                = ff[0];
121
122     nri              = nlist->nri;
123     iinr             = nlist->iinr;
124     jindex           = nlist->jindex;
125     jjnr             = nlist->jjnr;
126     shiftidx         = nlist->shift;
127     gid              = nlist->gid;
128     shiftvec         = fr->shift_vec[0];
129     fshift           = fr->fshift[0];
130     facel            = _mm256_set1_ps(fr->epsfac);
131     charge           = mdatoms->chargeA;
132     krf              = _mm256_set1_ps(fr->ic->k_rf);
133     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
134     crf              = _mm256_set1_ps(fr->ic->c_rf);
135     nvdwtype         = fr->ntype;
136     vdwparam         = fr->nbfp;
137     vdwtype          = mdatoms->typeA;
138
139     /* Setup water-specific parameters */
140     inr              = nlist->iinr[0];
141     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
142     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
143     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
144     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
145
146     jq0              = _mm256_set1_ps(charge[inr+0]);
147     jq1              = _mm256_set1_ps(charge[inr+1]);
148     jq2              = _mm256_set1_ps(charge[inr+2]);
149     vdwjidx0A        = 2*vdwtype[inr+0];
150     qq00             = _mm256_mul_ps(iq0,jq0);
151     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
152     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
153     qq01             = _mm256_mul_ps(iq0,jq1);
154     qq02             = _mm256_mul_ps(iq0,jq2);
155     qq10             = _mm256_mul_ps(iq1,jq0);
156     qq11             = _mm256_mul_ps(iq1,jq1);
157     qq12             = _mm256_mul_ps(iq1,jq2);
158     qq20             = _mm256_mul_ps(iq2,jq0);
159     qq21             = _mm256_mul_ps(iq2,jq1);
160     qq22             = _mm256_mul_ps(iq2,jq2);
161
162     /* Avoid stupid compiler warnings */
163     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
164     j_coord_offsetA = 0;
165     j_coord_offsetB = 0;
166     j_coord_offsetC = 0;
167     j_coord_offsetD = 0;
168     j_coord_offsetE = 0;
169     j_coord_offsetF = 0;
170     j_coord_offsetG = 0;
171     j_coord_offsetH = 0;
172
173     outeriter        = 0;
174     inneriter        = 0;
175
176     for(iidx=0;iidx<4*DIM;iidx++)
177     {
178         scratch[iidx] = 0.0;
179     }
180
181     /* Start outer loop over neighborlists */
182     for(iidx=0; iidx<nri; iidx++)
183     {
184         /* Load shift vector for this list */
185         i_shift_offset   = DIM*shiftidx[iidx];
186
187         /* Load limits for loop over neighbors */
188         j_index_start    = jindex[iidx];
189         j_index_end      = jindex[iidx+1];
190
191         /* Get outer coordinate index */
192         inr              = iinr[iidx];
193         i_coord_offset   = DIM*inr;
194
195         /* Load i particle coords and add shift vector */
196         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
197                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
198
199         fix0             = _mm256_setzero_ps();
200         fiy0             = _mm256_setzero_ps();
201         fiz0             = _mm256_setzero_ps();
202         fix1             = _mm256_setzero_ps();
203         fiy1             = _mm256_setzero_ps();
204         fiz1             = _mm256_setzero_ps();
205         fix2             = _mm256_setzero_ps();
206         fiy2             = _mm256_setzero_ps();
207         fiz2             = _mm256_setzero_ps();
208
209         /* Reset potential sums */
210         velecsum         = _mm256_setzero_ps();
211         vvdwsum          = _mm256_setzero_ps();
212
213         /* Start inner kernel loop */
214         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
215         {
216
217             /* Get j neighbor index, and coordinate index */
218             jnrA             = jjnr[jidx];
219             jnrB             = jjnr[jidx+1];
220             jnrC             = jjnr[jidx+2];
221             jnrD             = jjnr[jidx+3];
222             jnrE             = jjnr[jidx+4];
223             jnrF             = jjnr[jidx+5];
224             jnrG             = jjnr[jidx+6];
225             jnrH             = jjnr[jidx+7];
226             j_coord_offsetA  = DIM*jnrA;
227             j_coord_offsetB  = DIM*jnrB;
228             j_coord_offsetC  = DIM*jnrC;
229             j_coord_offsetD  = DIM*jnrD;
230             j_coord_offsetE  = DIM*jnrE;
231             j_coord_offsetF  = DIM*jnrF;
232             j_coord_offsetG  = DIM*jnrG;
233             j_coord_offsetH  = DIM*jnrH;
234
235             /* load j atom coordinates */
236             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
237                                                  x+j_coord_offsetC,x+j_coord_offsetD,
238                                                  x+j_coord_offsetE,x+j_coord_offsetF,
239                                                  x+j_coord_offsetG,x+j_coord_offsetH,
240                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
241
242             /* Calculate displacement vector */
243             dx00             = _mm256_sub_ps(ix0,jx0);
244             dy00             = _mm256_sub_ps(iy0,jy0);
245             dz00             = _mm256_sub_ps(iz0,jz0);
246             dx01             = _mm256_sub_ps(ix0,jx1);
247             dy01             = _mm256_sub_ps(iy0,jy1);
248             dz01             = _mm256_sub_ps(iz0,jz1);
249             dx02             = _mm256_sub_ps(ix0,jx2);
250             dy02             = _mm256_sub_ps(iy0,jy2);
251             dz02             = _mm256_sub_ps(iz0,jz2);
252             dx10             = _mm256_sub_ps(ix1,jx0);
253             dy10             = _mm256_sub_ps(iy1,jy0);
254             dz10             = _mm256_sub_ps(iz1,jz0);
255             dx11             = _mm256_sub_ps(ix1,jx1);
256             dy11             = _mm256_sub_ps(iy1,jy1);
257             dz11             = _mm256_sub_ps(iz1,jz1);
258             dx12             = _mm256_sub_ps(ix1,jx2);
259             dy12             = _mm256_sub_ps(iy1,jy2);
260             dz12             = _mm256_sub_ps(iz1,jz2);
261             dx20             = _mm256_sub_ps(ix2,jx0);
262             dy20             = _mm256_sub_ps(iy2,jy0);
263             dz20             = _mm256_sub_ps(iz2,jz0);
264             dx21             = _mm256_sub_ps(ix2,jx1);
265             dy21             = _mm256_sub_ps(iy2,jy1);
266             dz21             = _mm256_sub_ps(iz2,jz1);
267             dx22             = _mm256_sub_ps(ix2,jx2);
268             dy22             = _mm256_sub_ps(iy2,jy2);
269             dz22             = _mm256_sub_ps(iz2,jz2);
270
271             /* Calculate squared distance and things based on it */
272             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
273             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
274             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
275             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
276             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
277             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
278             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
279             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
280             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
281
282             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
283             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
284             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
285             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
286             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
287             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
288             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
289             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
290             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
291
292             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
293             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
294             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
295             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
296             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
297             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
298             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
299             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
300             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
301
302             fjx0             = _mm256_setzero_ps();
303             fjy0             = _mm256_setzero_ps();
304             fjz0             = _mm256_setzero_ps();
305             fjx1             = _mm256_setzero_ps();
306             fjy1             = _mm256_setzero_ps();
307             fjz1             = _mm256_setzero_ps();
308             fjx2             = _mm256_setzero_ps();
309             fjy2             = _mm256_setzero_ps();
310             fjz2             = _mm256_setzero_ps();
311
312             /**************************
313              * CALCULATE INTERACTIONS *
314              **************************/
315
316             /* REACTION-FIELD ELECTROSTATICS */
317             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
318             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
319
320             /* LENNARD-JONES DISPERSION/REPULSION */
321
322             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
323             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
324             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
325             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
326             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
327
328             /* Update potential sum for this i atom from the interaction with this j atom. */
329             velecsum         = _mm256_add_ps(velecsum,velec);
330             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
331
332             fscal            = _mm256_add_ps(felec,fvdw);
333
334             /* Calculate temporary vectorial force */
335             tx               = _mm256_mul_ps(fscal,dx00);
336             ty               = _mm256_mul_ps(fscal,dy00);
337             tz               = _mm256_mul_ps(fscal,dz00);
338
339             /* Update vectorial force */
340             fix0             = _mm256_add_ps(fix0,tx);
341             fiy0             = _mm256_add_ps(fiy0,ty);
342             fiz0             = _mm256_add_ps(fiz0,tz);
343
344             fjx0             = _mm256_add_ps(fjx0,tx);
345             fjy0             = _mm256_add_ps(fjy0,ty);
346             fjz0             = _mm256_add_ps(fjz0,tz);
347
348             /**************************
349              * CALCULATE INTERACTIONS *
350              **************************/
351
352             /* REACTION-FIELD ELECTROSTATICS */
353             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
354             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
355
356             /* Update potential sum for this i atom from the interaction with this j atom. */
357             velecsum         = _mm256_add_ps(velecsum,velec);
358
359             fscal            = felec;
360
361             /* Calculate temporary vectorial force */
362             tx               = _mm256_mul_ps(fscal,dx01);
363             ty               = _mm256_mul_ps(fscal,dy01);
364             tz               = _mm256_mul_ps(fscal,dz01);
365
366             /* Update vectorial force */
367             fix0             = _mm256_add_ps(fix0,tx);
368             fiy0             = _mm256_add_ps(fiy0,ty);
369             fiz0             = _mm256_add_ps(fiz0,tz);
370
371             fjx1             = _mm256_add_ps(fjx1,tx);
372             fjy1             = _mm256_add_ps(fjy1,ty);
373             fjz1             = _mm256_add_ps(fjz1,tz);
374
375             /**************************
376              * CALCULATE INTERACTIONS *
377              **************************/
378
379             /* REACTION-FIELD ELECTROSTATICS */
380             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
381             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
382
383             /* Update potential sum for this i atom from the interaction with this j atom. */
384             velecsum         = _mm256_add_ps(velecsum,velec);
385
386             fscal            = felec;
387
388             /* Calculate temporary vectorial force */
389             tx               = _mm256_mul_ps(fscal,dx02);
390             ty               = _mm256_mul_ps(fscal,dy02);
391             tz               = _mm256_mul_ps(fscal,dz02);
392
393             /* Update vectorial force */
394             fix0             = _mm256_add_ps(fix0,tx);
395             fiy0             = _mm256_add_ps(fiy0,ty);
396             fiz0             = _mm256_add_ps(fiz0,tz);
397
398             fjx2             = _mm256_add_ps(fjx2,tx);
399             fjy2             = _mm256_add_ps(fjy2,ty);
400             fjz2             = _mm256_add_ps(fjz2,tz);
401
402             /**************************
403              * CALCULATE INTERACTIONS *
404              **************************/
405
406             /* REACTION-FIELD ELECTROSTATICS */
407             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
408             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
409
410             /* Update potential sum for this i atom from the interaction with this j atom. */
411             velecsum         = _mm256_add_ps(velecsum,velec);
412
413             fscal            = felec;
414
415             /* Calculate temporary vectorial force */
416             tx               = _mm256_mul_ps(fscal,dx10);
417             ty               = _mm256_mul_ps(fscal,dy10);
418             tz               = _mm256_mul_ps(fscal,dz10);
419
420             /* Update vectorial force */
421             fix1             = _mm256_add_ps(fix1,tx);
422             fiy1             = _mm256_add_ps(fiy1,ty);
423             fiz1             = _mm256_add_ps(fiz1,tz);
424
425             fjx0             = _mm256_add_ps(fjx0,tx);
426             fjy0             = _mm256_add_ps(fjy0,ty);
427             fjz0             = _mm256_add_ps(fjz0,tz);
428
429             /**************************
430              * CALCULATE INTERACTIONS *
431              **************************/
432
433             /* REACTION-FIELD ELECTROSTATICS */
434             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
435             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
436
437             /* Update potential sum for this i atom from the interaction with this j atom. */
438             velecsum         = _mm256_add_ps(velecsum,velec);
439
440             fscal            = felec;
441
442             /* Calculate temporary vectorial force */
443             tx               = _mm256_mul_ps(fscal,dx11);
444             ty               = _mm256_mul_ps(fscal,dy11);
445             tz               = _mm256_mul_ps(fscal,dz11);
446
447             /* Update vectorial force */
448             fix1             = _mm256_add_ps(fix1,tx);
449             fiy1             = _mm256_add_ps(fiy1,ty);
450             fiz1             = _mm256_add_ps(fiz1,tz);
451
452             fjx1             = _mm256_add_ps(fjx1,tx);
453             fjy1             = _mm256_add_ps(fjy1,ty);
454             fjz1             = _mm256_add_ps(fjz1,tz);
455
456             /**************************
457              * CALCULATE INTERACTIONS *
458              **************************/
459
460             /* REACTION-FIELD ELECTROSTATICS */
461             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
462             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
463
464             /* Update potential sum for this i atom from the interaction with this j atom. */
465             velecsum         = _mm256_add_ps(velecsum,velec);
466
467             fscal            = felec;
468
469             /* Calculate temporary vectorial force */
470             tx               = _mm256_mul_ps(fscal,dx12);
471             ty               = _mm256_mul_ps(fscal,dy12);
472             tz               = _mm256_mul_ps(fscal,dz12);
473
474             /* Update vectorial force */
475             fix1             = _mm256_add_ps(fix1,tx);
476             fiy1             = _mm256_add_ps(fiy1,ty);
477             fiz1             = _mm256_add_ps(fiz1,tz);
478
479             fjx2             = _mm256_add_ps(fjx2,tx);
480             fjy2             = _mm256_add_ps(fjy2,ty);
481             fjz2             = _mm256_add_ps(fjz2,tz);
482
483             /**************************
484              * CALCULATE INTERACTIONS *
485              **************************/
486
487             /* REACTION-FIELD ELECTROSTATICS */
488             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
489             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
490
491             /* Update potential sum for this i atom from the interaction with this j atom. */
492             velecsum         = _mm256_add_ps(velecsum,velec);
493
494             fscal            = felec;
495
496             /* Calculate temporary vectorial force */
497             tx               = _mm256_mul_ps(fscal,dx20);
498             ty               = _mm256_mul_ps(fscal,dy20);
499             tz               = _mm256_mul_ps(fscal,dz20);
500
501             /* Update vectorial force */
502             fix2             = _mm256_add_ps(fix2,tx);
503             fiy2             = _mm256_add_ps(fiy2,ty);
504             fiz2             = _mm256_add_ps(fiz2,tz);
505
506             fjx0             = _mm256_add_ps(fjx0,tx);
507             fjy0             = _mm256_add_ps(fjy0,ty);
508             fjz0             = _mm256_add_ps(fjz0,tz);
509
510             /**************************
511              * CALCULATE INTERACTIONS *
512              **************************/
513
514             /* REACTION-FIELD ELECTROSTATICS */
515             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
516             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
517
518             /* Update potential sum for this i atom from the interaction with this j atom. */
519             velecsum         = _mm256_add_ps(velecsum,velec);
520
521             fscal            = felec;
522
523             /* Calculate temporary vectorial force */
524             tx               = _mm256_mul_ps(fscal,dx21);
525             ty               = _mm256_mul_ps(fscal,dy21);
526             tz               = _mm256_mul_ps(fscal,dz21);
527
528             /* Update vectorial force */
529             fix2             = _mm256_add_ps(fix2,tx);
530             fiy2             = _mm256_add_ps(fiy2,ty);
531             fiz2             = _mm256_add_ps(fiz2,tz);
532
533             fjx1             = _mm256_add_ps(fjx1,tx);
534             fjy1             = _mm256_add_ps(fjy1,ty);
535             fjz1             = _mm256_add_ps(fjz1,tz);
536
537             /**************************
538              * CALCULATE INTERACTIONS *
539              **************************/
540
541             /* REACTION-FIELD ELECTROSTATICS */
542             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
543             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
544
545             /* Update potential sum for this i atom from the interaction with this j atom. */
546             velecsum         = _mm256_add_ps(velecsum,velec);
547
548             fscal            = felec;
549
550             /* Calculate temporary vectorial force */
551             tx               = _mm256_mul_ps(fscal,dx22);
552             ty               = _mm256_mul_ps(fscal,dy22);
553             tz               = _mm256_mul_ps(fscal,dz22);
554
555             /* Update vectorial force */
556             fix2             = _mm256_add_ps(fix2,tx);
557             fiy2             = _mm256_add_ps(fiy2,ty);
558             fiz2             = _mm256_add_ps(fiz2,tz);
559
560             fjx2             = _mm256_add_ps(fjx2,tx);
561             fjy2             = _mm256_add_ps(fjy2,ty);
562             fjz2             = _mm256_add_ps(fjz2,tz);
563
564             fjptrA             = f+j_coord_offsetA;
565             fjptrB             = f+j_coord_offsetB;
566             fjptrC             = f+j_coord_offsetC;
567             fjptrD             = f+j_coord_offsetD;
568             fjptrE             = f+j_coord_offsetE;
569             fjptrF             = f+j_coord_offsetF;
570             fjptrG             = f+j_coord_offsetG;
571             fjptrH             = f+j_coord_offsetH;
572
573             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
574                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
575
576             /* Inner loop uses 300 flops */
577         }
578
579         if(jidx<j_index_end)
580         {
581
582             /* Get j neighbor index, and coordinate index */
583             jnrlistA         = jjnr[jidx];
584             jnrlistB         = jjnr[jidx+1];
585             jnrlistC         = jjnr[jidx+2];
586             jnrlistD         = jjnr[jidx+3];
587             jnrlistE         = jjnr[jidx+4];
588             jnrlistF         = jjnr[jidx+5];
589             jnrlistG         = jjnr[jidx+6];
590             jnrlistH         = jjnr[jidx+7];
591             /* Sign of each element will be negative for non-real atoms.
592              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
593              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
594              */
595             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
596                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
597                                             
598             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
599             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
600             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
601             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
602             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
603             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
604             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
605             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
606             j_coord_offsetA  = DIM*jnrA;
607             j_coord_offsetB  = DIM*jnrB;
608             j_coord_offsetC  = DIM*jnrC;
609             j_coord_offsetD  = DIM*jnrD;
610             j_coord_offsetE  = DIM*jnrE;
611             j_coord_offsetF  = DIM*jnrF;
612             j_coord_offsetG  = DIM*jnrG;
613             j_coord_offsetH  = DIM*jnrH;
614
615             /* load j atom coordinates */
616             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
617                                                  x+j_coord_offsetC,x+j_coord_offsetD,
618                                                  x+j_coord_offsetE,x+j_coord_offsetF,
619                                                  x+j_coord_offsetG,x+j_coord_offsetH,
620                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
621
622             /* Calculate displacement vector */
623             dx00             = _mm256_sub_ps(ix0,jx0);
624             dy00             = _mm256_sub_ps(iy0,jy0);
625             dz00             = _mm256_sub_ps(iz0,jz0);
626             dx01             = _mm256_sub_ps(ix0,jx1);
627             dy01             = _mm256_sub_ps(iy0,jy1);
628             dz01             = _mm256_sub_ps(iz0,jz1);
629             dx02             = _mm256_sub_ps(ix0,jx2);
630             dy02             = _mm256_sub_ps(iy0,jy2);
631             dz02             = _mm256_sub_ps(iz0,jz2);
632             dx10             = _mm256_sub_ps(ix1,jx0);
633             dy10             = _mm256_sub_ps(iy1,jy0);
634             dz10             = _mm256_sub_ps(iz1,jz0);
635             dx11             = _mm256_sub_ps(ix1,jx1);
636             dy11             = _mm256_sub_ps(iy1,jy1);
637             dz11             = _mm256_sub_ps(iz1,jz1);
638             dx12             = _mm256_sub_ps(ix1,jx2);
639             dy12             = _mm256_sub_ps(iy1,jy2);
640             dz12             = _mm256_sub_ps(iz1,jz2);
641             dx20             = _mm256_sub_ps(ix2,jx0);
642             dy20             = _mm256_sub_ps(iy2,jy0);
643             dz20             = _mm256_sub_ps(iz2,jz0);
644             dx21             = _mm256_sub_ps(ix2,jx1);
645             dy21             = _mm256_sub_ps(iy2,jy1);
646             dz21             = _mm256_sub_ps(iz2,jz1);
647             dx22             = _mm256_sub_ps(ix2,jx2);
648             dy22             = _mm256_sub_ps(iy2,jy2);
649             dz22             = _mm256_sub_ps(iz2,jz2);
650
651             /* Calculate squared distance and things based on it */
652             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
653             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
654             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
655             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
656             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
657             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
658             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
659             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
660             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
661
662             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
663             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
664             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
665             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
666             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
667             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
668             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
669             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
670             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
671
672             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
673             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
674             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
675             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
676             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
677             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
678             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
679             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
680             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
681
682             fjx0             = _mm256_setzero_ps();
683             fjy0             = _mm256_setzero_ps();
684             fjz0             = _mm256_setzero_ps();
685             fjx1             = _mm256_setzero_ps();
686             fjy1             = _mm256_setzero_ps();
687             fjz1             = _mm256_setzero_ps();
688             fjx2             = _mm256_setzero_ps();
689             fjy2             = _mm256_setzero_ps();
690             fjz2             = _mm256_setzero_ps();
691
692             /**************************
693              * CALCULATE INTERACTIONS *
694              **************************/
695
696             /* REACTION-FIELD ELECTROSTATICS */
697             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
698             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
699
700             /* LENNARD-JONES DISPERSION/REPULSION */
701
702             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
703             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
704             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
705             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
706             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
707
708             /* Update potential sum for this i atom from the interaction with this j atom. */
709             velec            = _mm256_andnot_ps(dummy_mask,velec);
710             velecsum         = _mm256_add_ps(velecsum,velec);
711             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
712             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
713
714             fscal            = _mm256_add_ps(felec,fvdw);
715
716             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
717
718             /* Calculate temporary vectorial force */
719             tx               = _mm256_mul_ps(fscal,dx00);
720             ty               = _mm256_mul_ps(fscal,dy00);
721             tz               = _mm256_mul_ps(fscal,dz00);
722
723             /* Update vectorial force */
724             fix0             = _mm256_add_ps(fix0,tx);
725             fiy0             = _mm256_add_ps(fiy0,ty);
726             fiz0             = _mm256_add_ps(fiz0,tz);
727
728             fjx0             = _mm256_add_ps(fjx0,tx);
729             fjy0             = _mm256_add_ps(fjy0,ty);
730             fjz0             = _mm256_add_ps(fjz0,tz);
731
732             /**************************
733              * CALCULATE INTERACTIONS *
734              **************************/
735
736             /* REACTION-FIELD ELECTROSTATICS */
737             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
738             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
739
740             /* Update potential sum for this i atom from the interaction with this j atom. */
741             velec            = _mm256_andnot_ps(dummy_mask,velec);
742             velecsum         = _mm256_add_ps(velecsum,velec);
743
744             fscal            = felec;
745
746             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
747
748             /* Calculate temporary vectorial force */
749             tx               = _mm256_mul_ps(fscal,dx01);
750             ty               = _mm256_mul_ps(fscal,dy01);
751             tz               = _mm256_mul_ps(fscal,dz01);
752
753             /* Update vectorial force */
754             fix0             = _mm256_add_ps(fix0,tx);
755             fiy0             = _mm256_add_ps(fiy0,ty);
756             fiz0             = _mm256_add_ps(fiz0,tz);
757
758             fjx1             = _mm256_add_ps(fjx1,tx);
759             fjy1             = _mm256_add_ps(fjy1,ty);
760             fjz1             = _mm256_add_ps(fjz1,tz);
761
762             /**************************
763              * CALCULATE INTERACTIONS *
764              **************************/
765
766             /* REACTION-FIELD ELECTROSTATICS */
767             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
768             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
769
770             /* Update potential sum for this i atom from the interaction with this j atom. */
771             velec            = _mm256_andnot_ps(dummy_mask,velec);
772             velecsum         = _mm256_add_ps(velecsum,velec);
773
774             fscal            = felec;
775
776             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
777
778             /* Calculate temporary vectorial force */
779             tx               = _mm256_mul_ps(fscal,dx02);
780             ty               = _mm256_mul_ps(fscal,dy02);
781             tz               = _mm256_mul_ps(fscal,dz02);
782
783             /* Update vectorial force */
784             fix0             = _mm256_add_ps(fix0,tx);
785             fiy0             = _mm256_add_ps(fiy0,ty);
786             fiz0             = _mm256_add_ps(fiz0,tz);
787
788             fjx2             = _mm256_add_ps(fjx2,tx);
789             fjy2             = _mm256_add_ps(fjy2,ty);
790             fjz2             = _mm256_add_ps(fjz2,tz);
791
792             /**************************
793              * CALCULATE INTERACTIONS *
794              **************************/
795
796             /* REACTION-FIELD ELECTROSTATICS */
797             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
798             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
799
800             /* Update potential sum for this i atom from the interaction with this j atom. */
801             velec            = _mm256_andnot_ps(dummy_mask,velec);
802             velecsum         = _mm256_add_ps(velecsum,velec);
803
804             fscal            = felec;
805
806             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
807
808             /* Calculate temporary vectorial force */
809             tx               = _mm256_mul_ps(fscal,dx10);
810             ty               = _mm256_mul_ps(fscal,dy10);
811             tz               = _mm256_mul_ps(fscal,dz10);
812
813             /* Update vectorial force */
814             fix1             = _mm256_add_ps(fix1,tx);
815             fiy1             = _mm256_add_ps(fiy1,ty);
816             fiz1             = _mm256_add_ps(fiz1,tz);
817
818             fjx0             = _mm256_add_ps(fjx0,tx);
819             fjy0             = _mm256_add_ps(fjy0,ty);
820             fjz0             = _mm256_add_ps(fjz0,tz);
821
822             /**************************
823              * CALCULATE INTERACTIONS *
824              **************************/
825
826             /* REACTION-FIELD ELECTROSTATICS */
827             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
828             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
829
830             /* Update potential sum for this i atom from the interaction with this j atom. */
831             velec            = _mm256_andnot_ps(dummy_mask,velec);
832             velecsum         = _mm256_add_ps(velecsum,velec);
833
834             fscal            = felec;
835
836             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
837
838             /* Calculate temporary vectorial force */
839             tx               = _mm256_mul_ps(fscal,dx11);
840             ty               = _mm256_mul_ps(fscal,dy11);
841             tz               = _mm256_mul_ps(fscal,dz11);
842
843             /* Update vectorial force */
844             fix1             = _mm256_add_ps(fix1,tx);
845             fiy1             = _mm256_add_ps(fiy1,ty);
846             fiz1             = _mm256_add_ps(fiz1,tz);
847
848             fjx1             = _mm256_add_ps(fjx1,tx);
849             fjy1             = _mm256_add_ps(fjy1,ty);
850             fjz1             = _mm256_add_ps(fjz1,tz);
851
852             /**************************
853              * CALCULATE INTERACTIONS *
854              **************************/
855
856             /* REACTION-FIELD ELECTROSTATICS */
857             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
858             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
859
860             /* Update potential sum for this i atom from the interaction with this j atom. */
861             velec            = _mm256_andnot_ps(dummy_mask,velec);
862             velecsum         = _mm256_add_ps(velecsum,velec);
863
864             fscal            = felec;
865
866             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
867
868             /* Calculate temporary vectorial force */
869             tx               = _mm256_mul_ps(fscal,dx12);
870             ty               = _mm256_mul_ps(fscal,dy12);
871             tz               = _mm256_mul_ps(fscal,dz12);
872
873             /* Update vectorial force */
874             fix1             = _mm256_add_ps(fix1,tx);
875             fiy1             = _mm256_add_ps(fiy1,ty);
876             fiz1             = _mm256_add_ps(fiz1,tz);
877
878             fjx2             = _mm256_add_ps(fjx2,tx);
879             fjy2             = _mm256_add_ps(fjy2,ty);
880             fjz2             = _mm256_add_ps(fjz2,tz);
881
882             /**************************
883              * CALCULATE INTERACTIONS *
884              **************************/
885
886             /* REACTION-FIELD ELECTROSTATICS */
887             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
888             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
889
890             /* Update potential sum for this i atom from the interaction with this j atom. */
891             velec            = _mm256_andnot_ps(dummy_mask,velec);
892             velecsum         = _mm256_add_ps(velecsum,velec);
893
894             fscal            = felec;
895
896             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
897
898             /* Calculate temporary vectorial force */
899             tx               = _mm256_mul_ps(fscal,dx20);
900             ty               = _mm256_mul_ps(fscal,dy20);
901             tz               = _mm256_mul_ps(fscal,dz20);
902
903             /* Update vectorial force */
904             fix2             = _mm256_add_ps(fix2,tx);
905             fiy2             = _mm256_add_ps(fiy2,ty);
906             fiz2             = _mm256_add_ps(fiz2,tz);
907
908             fjx0             = _mm256_add_ps(fjx0,tx);
909             fjy0             = _mm256_add_ps(fjy0,ty);
910             fjz0             = _mm256_add_ps(fjz0,tz);
911
912             /**************************
913              * CALCULATE INTERACTIONS *
914              **************************/
915
916             /* REACTION-FIELD ELECTROSTATICS */
917             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
918             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
919
920             /* Update potential sum for this i atom from the interaction with this j atom. */
921             velec            = _mm256_andnot_ps(dummy_mask,velec);
922             velecsum         = _mm256_add_ps(velecsum,velec);
923
924             fscal            = felec;
925
926             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
927
928             /* Calculate temporary vectorial force */
929             tx               = _mm256_mul_ps(fscal,dx21);
930             ty               = _mm256_mul_ps(fscal,dy21);
931             tz               = _mm256_mul_ps(fscal,dz21);
932
933             /* Update vectorial force */
934             fix2             = _mm256_add_ps(fix2,tx);
935             fiy2             = _mm256_add_ps(fiy2,ty);
936             fiz2             = _mm256_add_ps(fiz2,tz);
937
938             fjx1             = _mm256_add_ps(fjx1,tx);
939             fjy1             = _mm256_add_ps(fjy1,ty);
940             fjz1             = _mm256_add_ps(fjz1,tz);
941
942             /**************************
943              * CALCULATE INTERACTIONS *
944              **************************/
945
946             /* REACTION-FIELD ELECTROSTATICS */
947             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
948             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
949
950             /* Update potential sum for this i atom from the interaction with this j atom. */
951             velec            = _mm256_andnot_ps(dummy_mask,velec);
952             velecsum         = _mm256_add_ps(velecsum,velec);
953
954             fscal            = felec;
955
956             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
957
958             /* Calculate temporary vectorial force */
959             tx               = _mm256_mul_ps(fscal,dx22);
960             ty               = _mm256_mul_ps(fscal,dy22);
961             tz               = _mm256_mul_ps(fscal,dz22);
962
963             /* Update vectorial force */
964             fix2             = _mm256_add_ps(fix2,tx);
965             fiy2             = _mm256_add_ps(fiy2,ty);
966             fiz2             = _mm256_add_ps(fiz2,tz);
967
968             fjx2             = _mm256_add_ps(fjx2,tx);
969             fjy2             = _mm256_add_ps(fjy2,ty);
970             fjz2             = _mm256_add_ps(fjz2,tz);
971
972             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
973             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
974             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
975             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
976             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
977             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
978             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
979             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
980
981             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
982                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
983
984             /* Inner loop uses 300 flops */
985         }
986
987         /* End of innermost loop */
988
989         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
990                                                  f+i_coord_offset,fshift+i_shift_offset);
991
992         ggid                        = gid[iidx];
993         /* Update potential energies */
994         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
995         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
996
997         /* Increment number of inner iterations */
998         inneriter                  += j_index_end - j_index_start;
999
1000         /* Outer loop uses 20 flops */
1001     }
1002
1003     /* Increment number of outer iterations */
1004     outeriter        += nri;
1005
1006     /* Update outer/inner flops */
1007
1008     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*300);
1009 }
1010 /*
1011  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_256_single
1012  * Electrostatics interaction: ReactionField
1013  * VdW interaction:            LennardJones
1014  * Geometry:                   Water3-Water3
1015  * Calculate force/pot:        Force
1016  */
1017 void
1018 nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_avx_256_single
1019                     (t_nblist                    * gmx_restrict       nlist,
1020                      rvec                        * gmx_restrict          xx,
1021                      rvec                        * gmx_restrict          ff,
1022                      t_forcerec                  * gmx_restrict          fr,
1023                      t_mdatoms                   * gmx_restrict     mdatoms,
1024                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1025                      t_nrnb                      * gmx_restrict        nrnb)
1026 {
1027     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1028      * just 0 for non-waters.
1029      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1030      * jnr indices corresponding to data put in the four positions in the SIMD register.
1031      */
1032     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1033     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1034     int              jnrA,jnrB,jnrC,jnrD;
1035     int              jnrE,jnrF,jnrG,jnrH;
1036     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1037     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1038     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1039     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1040     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1041     real             rcutoff_scalar;
1042     real             *shiftvec,*fshift,*x,*f;
1043     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1044     real             scratch[4*DIM];
1045     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1046     real *           vdwioffsetptr0;
1047     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1048     real *           vdwioffsetptr1;
1049     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1050     real *           vdwioffsetptr2;
1051     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1052     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1053     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1054     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1055     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1056     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1057     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1058     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1059     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1060     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1061     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1062     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1063     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1064     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1065     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1066     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1067     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1068     real             *charge;
1069     int              nvdwtype;
1070     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1071     int              *vdwtype;
1072     real             *vdwparam;
1073     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1074     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1075     __m256           dummy_mask,cutoff_mask;
1076     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1077     __m256           one     = _mm256_set1_ps(1.0);
1078     __m256           two     = _mm256_set1_ps(2.0);
1079     x                = xx[0];
1080     f                = ff[0];
1081
1082     nri              = nlist->nri;
1083     iinr             = nlist->iinr;
1084     jindex           = nlist->jindex;
1085     jjnr             = nlist->jjnr;
1086     shiftidx         = nlist->shift;
1087     gid              = nlist->gid;
1088     shiftvec         = fr->shift_vec[0];
1089     fshift           = fr->fshift[0];
1090     facel            = _mm256_set1_ps(fr->epsfac);
1091     charge           = mdatoms->chargeA;
1092     krf              = _mm256_set1_ps(fr->ic->k_rf);
1093     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1094     crf              = _mm256_set1_ps(fr->ic->c_rf);
1095     nvdwtype         = fr->ntype;
1096     vdwparam         = fr->nbfp;
1097     vdwtype          = mdatoms->typeA;
1098
1099     /* Setup water-specific parameters */
1100     inr              = nlist->iinr[0];
1101     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1102     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1103     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1104     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1105
1106     jq0              = _mm256_set1_ps(charge[inr+0]);
1107     jq1              = _mm256_set1_ps(charge[inr+1]);
1108     jq2              = _mm256_set1_ps(charge[inr+2]);
1109     vdwjidx0A        = 2*vdwtype[inr+0];
1110     qq00             = _mm256_mul_ps(iq0,jq0);
1111     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1112     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1113     qq01             = _mm256_mul_ps(iq0,jq1);
1114     qq02             = _mm256_mul_ps(iq0,jq2);
1115     qq10             = _mm256_mul_ps(iq1,jq0);
1116     qq11             = _mm256_mul_ps(iq1,jq1);
1117     qq12             = _mm256_mul_ps(iq1,jq2);
1118     qq20             = _mm256_mul_ps(iq2,jq0);
1119     qq21             = _mm256_mul_ps(iq2,jq1);
1120     qq22             = _mm256_mul_ps(iq2,jq2);
1121
1122     /* Avoid stupid compiler warnings */
1123     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1124     j_coord_offsetA = 0;
1125     j_coord_offsetB = 0;
1126     j_coord_offsetC = 0;
1127     j_coord_offsetD = 0;
1128     j_coord_offsetE = 0;
1129     j_coord_offsetF = 0;
1130     j_coord_offsetG = 0;
1131     j_coord_offsetH = 0;
1132
1133     outeriter        = 0;
1134     inneriter        = 0;
1135
1136     for(iidx=0;iidx<4*DIM;iidx++)
1137     {
1138         scratch[iidx] = 0.0;
1139     }
1140
1141     /* Start outer loop over neighborlists */
1142     for(iidx=0; iidx<nri; iidx++)
1143     {
1144         /* Load shift vector for this list */
1145         i_shift_offset   = DIM*shiftidx[iidx];
1146
1147         /* Load limits for loop over neighbors */
1148         j_index_start    = jindex[iidx];
1149         j_index_end      = jindex[iidx+1];
1150
1151         /* Get outer coordinate index */
1152         inr              = iinr[iidx];
1153         i_coord_offset   = DIM*inr;
1154
1155         /* Load i particle coords and add shift vector */
1156         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1157                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1158
1159         fix0             = _mm256_setzero_ps();
1160         fiy0             = _mm256_setzero_ps();
1161         fiz0             = _mm256_setzero_ps();
1162         fix1             = _mm256_setzero_ps();
1163         fiy1             = _mm256_setzero_ps();
1164         fiz1             = _mm256_setzero_ps();
1165         fix2             = _mm256_setzero_ps();
1166         fiy2             = _mm256_setzero_ps();
1167         fiz2             = _mm256_setzero_ps();
1168
1169         /* Start inner kernel loop */
1170         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1171         {
1172
1173             /* Get j neighbor index, and coordinate index */
1174             jnrA             = jjnr[jidx];
1175             jnrB             = jjnr[jidx+1];
1176             jnrC             = jjnr[jidx+2];
1177             jnrD             = jjnr[jidx+3];
1178             jnrE             = jjnr[jidx+4];
1179             jnrF             = jjnr[jidx+5];
1180             jnrG             = jjnr[jidx+6];
1181             jnrH             = jjnr[jidx+7];
1182             j_coord_offsetA  = DIM*jnrA;
1183             j_coord_offsetB  = DIM*jnrB;
1184             j_coord_offsetC  = DIM*jnrC;
1185             j_coord_offsetD  = DIM*jnrD;
1186             j_coord_offsetE  = DIM*jnrE;
1187             j_coord_offsetF  = DIM*jnrF;
1188             j_coord_offsetG  = DIM*jnrG;
1189             j_coord_offsetH  = DIM*jnrH;
1190
1191             /* load j atom coordinates */
1192             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1193                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1194                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1195                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1196                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1197
1198             /* Calculate displacement vector */
1199             dx00             = _mm256_sub_ps(ix0,jx0);
1200             dy00             = _mm256_sub_ps(iy0,jy0);
1201             dz00             = _mm256_sub_ps(iz0,jz0);
1202             dx01             = _mm256_sub_ps(ix0,jx1);
1203             dy01             = _mm256_sub_ps(iy0,jy1);
1204             dz01             = _mm256_sub_ps(iz0,jz1);
1205             dx02             = _mm256_sub_ps(ix0,jx2);
1206             dy02             = _mm256_sub_ps(iy0,jy2);
1207             dz02             = _mm256_sub_ps(iz0,jz2);
1208             dx10             = _mm256_sub_ps(ix1,jx0);
1209             dy10             = _mm256_sub_ps(iy1,jy0);
1210             dz10             = _mm256_sub_ps(iz1,jz0);
1211             dx11             = _mm256_sub_ps(ix1,jx1);
1212             dy11             = _mm256_sub_ps(iy1,jy1);
1213             dz11             = _mm256_sub_ps(iz1,jz1);
1214             dx12             = _mm256_sub_ps(ix1,jx2);
1215             dy12             = _mm256_sub_ps(iy1,jy2);
1216             dz12             = _mm256_sub_ps(iz1,jz2);
1217             dx20             = _mm256_sub_ps(ix2,jx0);
1218             dy20             = _mm256_sub_ps(iy2,jy0);
1219             dz20             = _mm256_sub_ps(iz2,jz0);
1220             dx21             = _mm256_sub_ps(ix2,jx1);
1221             dy21             = _mm256_sub_ps(iy2,jy1);
1222             dz21             = _mm256_sub_ps(iz2,jz1);
1223             dx22             = _mm256_sub_ps(ix2,jx2);
1224             dy22             = _mm256_sub_ps(iy2,jy2);
1225             dz22             = _mm256_sub_ps(iz2,jz2);
1226
1227             /* Calculate squared distance and things based on it */
1228             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1229             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1230             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1231             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1232             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1233             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1234             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1235             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1236             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1237
1238             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1239             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1240             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1241             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1242             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1243             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1244             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1245             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1246             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1247
1248             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1249             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1250             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1251             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1252             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1253             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1254             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1255             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1256             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1257
1258             fjx0             = _mm256_setzero_ps();
1259             fjy0             = _mm256_setzero_ps();
1260             fjz0             = _mm256_setzero_ps();
1261             fjx1             = _mm256_setzero_ps();
1262             fjy1             = _mm256_setzero_ps();
1263             fjz1             = _mm256_setzero_ps();
1264             fjx2             = _mm256_setzero_ps();
1265             fjy2             = _mm256_setzero_ps();
1266             fjz2             = _mm256_setzero_ps();
1267
1268             /**************************
1269              * CALCULATE INTERACTIONS *
1270              **************************/
1271
1272             /* REACTION-FIELD ELECTROSTATICS */
1273             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1274
1275             /* LENNARD-JONES DISPERSION/REPULSION */
1276
1277             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1278             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1279
1280             fscal            = _mm256_add_ps(felec,fvdw);
1281
1282             /* Calculate temporary vectorial force */
1283             tx               = _mm256_mul_ps(fscal,dx00);
1284             ty               = _mm256_mul_ps(fscal,dy00);
1285             tz               = _mm256_mul_ps(fscal,dz00);
1286
1287             /* Update vectorial force */
1288             fix0             = _mm256_add_ps(fix0,tx);
1289             fiy0             = _mm256_add_ps(fiy0,ty);
1290             fiz0             = _mm256_add_ps(fiz0,tz);
1291
1292             fjx0             = _mm256_add_ps(fjx0,tx);
1293             fjy0             = _mm256_add_ps(fjy0,ty);
1294             fjz0             = _mm256_add_ps(fjz0,tz);
1295
1296             /**************************
1297              * CALCULATE INTERACTIONS *
1298              **************************/
1299
1300             /* REACTION-FIELD ELECTROSTATICS */
1301             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1302
1303             fscal            = felec;
1304
1305             /* Calculate temporary vectorial force */
1306             tx               = _mm256_mul_ps(fscal,dx01);
1307             ty               = _mm256_mul_ps(fscal,dy01);
1308             tz               = _mm256_mul_ps(fscal,dz01);
1309
1310             /* Update vectorial force */
1311             fix0             = _mm256_add_ps(fix0,tx);
1312             fiy0             = _mm256_add_ps(fiy0,ty);
1313             fiz0             = _mm256_add_ps(fiz0,tz);
1314
1315             fjx1             = _mm256_add_ps(fjx1,tx);
1316             fjy1             = _mm256_add_ps(fjy1,ty);
1317             fjz1             = _mm256_add_ps(fjz1,tz);
1318
1319             /**************************
1320              * CALCULATE INTERACTIONS *
1321              **************************/
1322
1323             /* REACTION-FIELD ELECTROSTATICS */
1324             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1325
1326             fscal            = felec;
1327
1328             /* Calculate temporary vectorial force */
1329             tx               = _mm256_mul_ps(fscal,dx02);
1330             ty               = _mm256_mul_ps(fscal,dy02);
1331             tz               = _mm256_mul_ps(fscal,dz02);
1332
1333             /* Update vectorial force */
1334             fix0             = _mm256_add_ps(fix0,tx);
1335             fiy0             = _mm256_add_ps(fiy0,ty);
1336             fiz0             = _mm256_add_ps(fiz0,tz);
1337
1338             fjx2             = _mm256_add_ps(fjx2,tx);
1339             fjy2             = _mm256_add_ps(fjy2,ty);
1340             fjz2             = _mm256_add_ps(fjz2,tz);
1341
1342             /**************************
1343              * CALCULATE INTERACTIONS *
1344              **************************/
1345
1346             /* REACTION-FIELD ELECTROSTATICS */
1347             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1348
1349             fscal            = felec;
1350
1351             /* Calculate temporary vectorial force */
1352             tx               = _mm256_mul_ps(fscal,dx10);
1353             ty               = _mm256_mul_ps(fscal,dy10);
1354             tz               = _mm256_mul_ps(fscal,dz10);
1355
1356             /* Update vectorial force */
1357             fix1             = _mm256_add_ps(fix1,tx);
1358             fiy1             = _mm256_add_ps(fiy1,ty);
1359             fiz1             = _mm256_add_ps(fiz1,tz);
1360
1361             fjx0             = _mm256_add_ps(fjx0,tx);
1362             fjy0             = _mm256_add_ps(fjy0,ty);
1363             fjz0             = _mm256_add_ps(fjz0,tz);
1364
1365             /**************************
1366              * CALCULATE INTERACTIONS *
1367              **************************/
1368
1369             /* REACTION-FIELD ELECTROSTATICS */
1370             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1371
1372             fscal            = felec;
1373
1374             /* Calculate temporary vectorial force */
1375             tx               = _mm256_mul_ps(fscal,dx11);
1376             ty               = _mm256_mul_ps(fscal,dy11);
1377             tz               = _mm256_mul_ps(fscal,dz11);
1378
1379             /* Update vectorial force */
1380             fix1             = _mm256_add_ps(fix1,tx);
1381             fiy1             = _mm256_add_ps(fiy1,ty);
1382             fiz1             = _mm256_add_ps(fiz1,tz);
1383
1384             fjx1             = _mm256_add_ps(fjx1,tx);
1385             fjy1             = _mm256_add_ps(fjy1,ty);
1386             fjz1             = _mm256_add_ps(fjz1,tz);
1387
1388             /**************************
1389              * CALCULATE INTERACTIONS *
1390              **************************/
1391
1392             /* REACTION-FIELD ELECTROSTATICS */
1393             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1394
1395             fscal            = felec;
1396
1397             /* Calculate temporary vectorial force */
1398             tx               = _mm256_mul_ps(fscal,dx12);
1399             ty               = _mm256_mul_ps(fscal,dy12);
1400             tz               = _mm256_mul_ps(fscal,dz12);
1401
1402             /* Update vectorial force */
1403             fix1             = _mm256_add_ps(fix1,tx);
1404             fiy1             = _mm256_add_ps(fiy1,ty);
1405             fiz1             = _mm256_add_ps(fiz1,tz);
1406
1407             fjx2             = _mm256_add_ps(fjx2,tx);
1408             fjy2             = _mm256_add_ps(fjy2,ty);
1409             fjz2             = _mm256_add_ps(fjz2,tz);
1410
1411             /**************************
1412              * CALCULATE INTERACTIONS *
1413              **************************/
1414
1415             /* REACTION-FIELD ELECTROSTATICS */
1416             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1417
1418             fscal            = felec;
1419
1420             /* Calculate temporary vectorial force */
1421             tx               = _mm256_mul_ps(fscal,dx20);
1422             ty               = _mm256_mul_ps(fscal,dy20);
1423             tz               = _mm256_mul_ps(fscal,dz20);
1424
1425             /* Update vectorial force */
1426             fix2             = _mm256_add_ps(fix2,tx);
1427             fiy2             = _mm256_add_ps(fiy2,ty);
1428             fiz2             = _mm256_add_ps(fiz2,tz);
1429
1430             fjx0             = _mm256_add_ps(fjx0,tx);
1431             fjy0             = _mm256_add_ps(fjy0,ty);
1432             fjz0             = _mm256_add_ps(fjz0,tz);
1433
1434             /**************************
1435              * CALCULATE INTERACTIONS *
1436              **************************/
1437
1438             /* REACTION-FIELD ELECTROSTATICS */
1439             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1440
1441             fscal            = felec;
1442
1443             /* Calculate temporary vectorial force */
1444             tx               = _mm256_mul_ps(fscal,dx21);
1445             ty               = _mm256_mul_ps(fscal,dy21);
1446             tz               = _mm256_mul_ps(fscal,dz21);
1447
1448             /* Update vectorial force */
1449             fix2             = _mm256_add_ps(fix2,tx);
1450             fiy2             = _mm256_add_ps(fiy2,ty);
1451             fiz2             = _mm256_add_ps(fiz2,tz);
1452
1453             fjx1             = _mm256_add_ps(fjx1,tx);
1454             fjy1             = _mm256_add_ps(fjy1,ty);
1455             fjz1             = _mm256_add_ps(fjz1,tz);
1456
1457             /**************************
1458              * CALCULATE INTERACTIONS *
1459              **************************/
1460
1461             /* REACTION-FIELD ELECTROSTATICS */
1462             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1463
1464             fscal            = felec;
1465
1466             /* Calculate temporary vectorial force */
1467             tx               = _mm256_mul_ps(fscal,dx22);
1468             ty               = _mm256_mul_ps(fscal,dy22);
1469             tz               = _mm256_mul_ps(fscal,dz22);
1470
1471             /* Update vectorial force */
1472             fix2             = _mm256_add_ps(fix2,tx);
1473             fiy2             = _mm256_add_ps(fiy2,ty);
1474             fiz2             = _mm256_add_ps(fiz2,tz);
1475
1476             fjx2             = _mm256_add_ps(fjx2,tx);
1477             fjy2             = _mm256_add_ps(fjy2,ty);
1478             fjz2             = _mm256_add_ps(fjz2,tz);
1479
1480             fjptrA             = f+j_coord_offsetA;
1481             fjptrB             = f+j_coord_offsetB;
1482             fjptrC             = f+j_coord_offsetC;
1483             fjptrD             = f+j_coord_offsetD;
1484             fjptrE             = f+j_coord_offsetE;
1485             fjptrF             = f+j_coord_offsetF;
1486             fjptrG             = f+j_coord_offsetG;
1487             fjptrH             = f+j_coord_offsetH;
1488
1489             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1490                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1491
1492             /* Inner loop uses 250 flops */
1493         }
1494
1495         if(jidx<j_index_end)
1496         {
1497
1498             /* Get j neighbor index, and coordinate index */
1499             jnrlistA         = jjnr[jidx];
1500             jnrlistB         = jjnr[jidx+1];
1501             jnrlistC         = jjnr[jidx+2];
1502             jnrlistD         = jjnr[jidx+3];
1503             jnrlistE         = jjnr[jidx+4];
1504             jnrlistF         = jjnr[jidx+5];
1505             jnrlistG         = jjnr[jidx+6];
1506             jnrlistH         = jjnr[jidx+7];
1507             /* Sign of each element will be negative for non-real atoms.
1508              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1509              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1510              */
1511             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1512                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1513                                             
1514             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1515             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1516             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1517             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1518             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1519             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1520             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1521             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1522             j_coord_offsetA  = DIM*jnrA;
1523             j_coord_offsetB  = DIM*jnrB;
1524             j_coord_offsetC  = DIM*jnrC;
1525             j_coord_offsetD  = DIM*jnrD;
1526             j_coord_offsetE  = DIM*jnrE;
1527             j_coord_offsetF  = DIM*jnrF;
1528             j_coord_offsetG  = DIM*jnrG;
1529             j_coord_offsetH  = DIM*jnrH;
1530
1531             /* load j atom coordinates */
1532             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1533                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1534                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1535                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1536                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1537
1538             /* Calculate displacement vector */
1539             dx00             = _mm256_sub_ps(ix0,jx0);
1540             dy00             = _mm256_sub_ps(iy0,jy0);
1541             dz00             = _mm256_sub_ps(iz0,jz0);
1542             dx01             = _mm256_sub_ps(ix0,jx1);
1543             dy01             = _mm256_sub_ps(iy0,jy1);
1544             dz01             = _mm256_sub_ps(iz0,jz1);
1545             dx02             = _mm256_sub_ps(ix0,jx2);
1546             dy02             = _mm256_sub_ps(iy0,jy2);
1547             dz02             = _mm256_sub_ps(iz0,jz2);
1548             dx10             = _mm256_sub_ps(ix1,jx0);
1549             dy10             = _mm256_sub_ps(iy1,jy0);
1550             dz10             = _mm256_sub_ps(iz1,jz0);
1551             dx11             = _mm256_sub_ps(ix1,jx1);
1552             dy11             = _mm256_sub_ps(iy1,jy1);
1553             dz11             = _mm256_sub_ps(iz1,jz1);
1554             dx12             = _mm256_sub_ps(ix1,jx2);
1555             dy12             = _mm256_sub_ps(iy1,jy2);
1556             dz12             = _mm256_sub_ps(iz1,jz2);
1557             dx20             = _mm256_sub_ps(ix2,jx0);
1558             dy20             = _mm256_sub_ps(iy2,jy0);
1559             dz20             = _mm256_sub_ps(iz2,jz0);
1560             dx21             = _mm256_sub_ps(ix2,jx1);
1561             dy21             = _mm256_sub_ps(iy2,jy1);
1562             dz21             = _mm256_sub_ps(iz2,jz1);
1563             dx22             = _mm256_sub_ps(ix2,jx2);
1564             dy22             = _mm256_sub_ps(iy2,jy2);
1565             dz22             = _mm256_sub_ps(iz2,jz2);
1566
1567             /* Calculate squared distance and things based on it */
1568             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1569             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1570             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1571             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1572             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1573             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1574             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1575             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1576             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1577
1578             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1579             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1580             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1581             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1582             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1583             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1584             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1585             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1586             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1587
1588             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1589             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1590             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1591             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1592             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1593             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1594             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1595             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1596             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1597
1598             fjx0             = _mm256_setzero_ps();
1599             fjy0             = _mm256_setzero_ps();
1600             fjz0             = _mm256_setzero_ps();
1601             fjx1             = _mm256_setzero_ps();
1602             fjy1             = _mm256_setzero_ps();
1603             fjz1             = _mm256_setzero_ps();
1604             fjx2             = _mm256_setzero_ps();
1605             fjy2             = _mm256_setzero_ps();
1606             fjz2             = _mm256_setzero_ps();
1607
1608             /**************************
1609              * CALCULATE INTERACTIONS *
1610              **************************/
1611
1612             /* REACTION-FIELD ELECTROSTATICS */
1613             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1614
1615             /* LENNARD-JONES DISPERSION/REPULSION */
1616
1617             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1618             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1619
1620             fscal            = _mm256_add_ps(felec,fvdw);
1621
1622             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1623
1624             /* Calculate temporary vectorial force */
1625             tx               = _mm256_mul_ps(fscal,dx00);
1626             ty               = _mm256_mul_ps(fscal,dy00);
1627             tz               = _mm256_mul_ps(fscal,dz00);
1628
1629             /* Update vectorial force */
1630             fix0             = _mm256_add_ps(fix0,tx);
1631             fiy0             = _mm256_add_ps(fiy0,ty);
1632             fiz0             = _mm256_add_ps(fiz0,tz);
1633
1634             fjx0             = _mm256_add_ps(fjx0,tx);
1635             fjy0             = _mm256_add_ps(fjy0,ty);
1636             fjz0             = _mm256_add_ps(fjz0,tz);
1637
1638             /**************************
1639              * CALCULATE INTERACTIONS *
1640              **************************/
1641
1642             /* REACTION-FIELD ELECTROSTATICS */
1643             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1644
1645             fscal            = felec;
1646
1647             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1648
1649             /* Calculate temporary vectorial force */
1650             tx               = _mm256_mul_ps(fscal,dx01);
1651             ty               = _mm256_mul_ps(fscal,dy01);
1652             tz               = _mm256_mul_ps(fscal,dz01);
1653
1654             /* Update vectorial force */
1655             fix0             = _mm256_add_ps(fix0,tx);
1656             fiy0             = _mm256_add_ps(fiy0,ty);
1657             fiz0             = _mm256_add_ps(fiz0,tz);
1658
1659             fjx1             = _mm256_add_ps(fjx1,tx);
1660             fjy1             = _mm256_add_ps(fjy1,ty);
1661             fjz1             = _mm256_add_ps(fjz1,tz);
1662
1663             /**************************
1664              * CALCULATE INTERACTIONS *
1665              **************************/
1666
1667             /* REACTION-FIELD ELECTROSTATICS */
1668             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1669
1670             fscal            = felec;
1671
1672             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1673
1674             /* Calculate temporary vectorial force */
1675             tx               = _mm256_mul_ps(fscal,dx02);
1676             ty               = _mm256_mul_ps(fscal,dy02);
1677             tz               = _mm256_mul_ps(fscal,dz02);
1678
1679             /* Update vectorial force */
1680             fix0             = _mm256_add_ps(fix0,tx);
1681             fiy0             = _mm256_add_ps(fiy0,ty);
1682             fiz0             = _mm256_add_ps(fiz0,tz);
1683
1684             fjx2             = _mm256_add_ps(fjx2,tx);
1685             fjy2             = _mm256_add_ps(fjy2,ty);
1686             fjz2             = _mm256_add_ps(fjz2,tz);
1687
1688             /**************************
1689              * CALCULATE INTERACTIONS *
1690              **************************/
1691
1692             /* REACTION-FIELD ELECTROSTATICS */
1693             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1694
1695             fscal            = felec;
1696
1697             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1698
1699             /* Calculate temporary vectorial force */
1700             tx               = _mm256_mul_ps(fscal,dx10);
1701             ty               = _mm256_mul_ps(fscal,dy10);
1702             tz               = _mm256_mul_ps(fscal,dz10);
1703
1704             /* Update vectorial force */
1705             fix1             = _mm256_add_ps(fix1,tx);
1706             fiy1             = _mm256_add_ps(fiy1,ty);
1707             fiz1             = _mm256_add_ps(fiz1,tz);
1708
1709             fjx0             = _mm256_add_ps(fjx0,tx);
1710             fjy0             = _mm256_add_ps(fjy0,ty);
1711             fjz0             = _mm256_add_ps(fjz0,tz);
1712
1713             /**************************
1714              * CALCULATE INTERACTIONS *
1715              **************************/
1716
1717             /* REACTION-FIELD ELECTROSTATICS */
1718             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1719
1720             fscal            = felec;
1721
1722             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1723
1724             /* Calculate temporary vectorial force */
1725             tx               = _mm256_mul_ps(fscal,dx11);
1726             ty               = _mm256_mul_ps(fscal,dy11);
1727             tz               = _mm256_mul_ps(fscal,dz11);
1728
1729             /* Update vectorial force */
1730             fix1             = _mm256_add_ps(fix1,tx);
1731             fiy1             = _mm256_add_ps(fiy1,ty);
1732             fiz1             = _mm256_add_ps(fiz1,tz);
1733
1734             fjx1             = _mm256_add_ps(fjx1,tx);
1735             fjy1             = _mm256_add_ps(fjy1,ty);
1736             fjz1             = _mm256_add_ps(fjz1,tz);
1737
1738             /**************************
1739              * CALCULATE INTERACTIONS *
1740              **************************/
1741
1742             /* REACTION-FIELD ELECTROSTATICS */
1743             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1744
1745             fscal            = felec;
1746
1747             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1748
1749             /* Calculate temporary vectorial force */
1750             tx               = _mm256_mul_ps(fscal,dx12);
1751             ty               = _mm256_mul_ps(fscal,dy12);
1752             tz               = _mm256_mul_ps(fscal,dz12);
1753
1754             /* Update vectorial force */
1755             fix1             = _mm256_add_ps(fix1,tx);
1756             fiy1             = _mm256_add_ps(fiy1,ty);
1757             fiz1             = _mm256_add_ps(fiz1,tz);
1758
1759             fjx2             = _mm256_add_ps(fjx2,tx);
1760             fjy2             = _mm256_add_ps(fjy2,ty);
1761             fjz2             = _mm256_add_ps(fjz2,tz);
1762
1763             /**************************
1764              * CALCULATE INTERACTIONS *
1765              **************************/
1766
1767             /* REACTION-FIELD ELECTROSTATICS */
1768             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1769
1770             fscal            = felec;
1771
1772             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1773
1774             /* Calculate temporary vectorial force */
1775             tx               = _mm256_mul_ps(fscal,dx20);
1776             ty               = _mm256_mul_ps(fscal,dy20);
1777             tz               = _mm256_mul_ps(fscal,dz20);
1778
1779             /* Update vectorial force */
1780             fix2             = _mm256_add_ps(fix2,tx);
1781             fiy2             = _mm256_add_ps(fiy2,ty);
1782             fiz2             = _mm256_add_ps(fiz2,tz);
1783
1784             fjx0             = _mm256_add_ps(fjx0,tx);
1785             fjy0             = _mm256_add_ps(fjy0,ty);
1786             fjz0             = _mm256_add_ps(fjz0,tz);
1787
1788             /**************************
1789              * CALCULATE INTERACTIONS *
1790              **************************/
1791
1792             /* REACTION-FIELD ELECTROSTATICS */
1793             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1794
1795             fscal            = felec;
1796
1797             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1798
1799             /* Calculate temporary vectorial force */
1800             tx               = _mm256_mul_ps(fscal,dx21);
1801             ty               = _mm256_mul_ps(fscal,dy21);
1802             tz               = _mm256_mul_ps(fscal,dz21);
1803
1804             /* Update vectorial force */
1805             fix2             = _mm256_add_ps(fix2,tx);
1806             fiy2             = _mm256_add_ps(fiy2,ty);
1807             fiz2             = _mm256_add_ps(fiz2,tz);
1808
1809             fjx1             = _mm256_add_ps(fjx1,tx);
1810             fjy1             = _mm256_add_ps(fjy1,ty);
1811             fjz1             = _mm256_add_ps(fjz1,tz);
1812
1813             /**************************
1814              * CALCULATE INTERACTIONS *
1815              **************************/
1816
1817             /* REACTION-FIELD ELECTROSTATICS */
1818             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1819
1820             fscal            = felec;
1821
1822             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1823
1824             /* Calculate temporary vectorial force */
1825             tx               = _mm256_mul_ps(fscal,dx22);
1826             ty               = _mm256_mul_ps(fscal,dy22);
1827             tz               = _mm256_mul_ps(fscal,dz22);
1828
1829             /* Update vectorial force */
1830             fix2             = _mm256_add_ps(fix2,tx);
1831             fiy2             = _mm256_add_ps(fiy2,ty);
1832             fiz2             = _mm256_add_ps(fiz2,tz);
1833
1834             fjx2             = _mm256_add_ps(fjx2,tx);
1835             fjy2             = _mm256_add_ps(fjy2,ty);
1836             fjz2             = _mm256_add_ps(fjz2,tz);
1837
1838             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1839             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1840             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1841             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1842             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1843             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1844             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1845             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1846
1847             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1848                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1849
1850             /* Inner loop uses 250 flops */
1851         }
1852
1853         /* End of innermost loop */
1854
1855         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1856                                                  f+i_coord_offset,fshift+i_shift_offset);
1857
1858         /* Increment number of inner iterations */
1859         inneriter                  += j_index_end - j_index_start;
1860
1861         /* Outer loop uses 18 flops */
1862     }
1863
1864     /* Increment number of outer iterations */
1865     outeriter        += nri;
1866
1867     /* Update outer/inner flops */
1868
1869     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*250);
1870 }