Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEw_VdwNone_GeomW4W4_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            None
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwNone_GeomW4W4_VF_avx_256_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70      * jnr indices corresponding to data put in the eight positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrE,jnrF,jnrG,jnrH;
76     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84     real             scratch[4*DIM];
85     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr1;
87     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88     real *           vdwioffsetptr2;
89     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90     real *           vdwioffsetptr3;
91     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
92     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
93     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
95     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
97     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
101     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
104     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
105     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
106     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
107     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
108     real             *charge;
109     __m256i          ewitab;
110     __m128i          ewitab_lo,ewitab_hi;
111     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
112     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
113     real             *ewtab;
114     __m256           dummy_mask,cutoff_mask;
115     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
116     __m256           one     = _mm256_set1_ps(1.0);
117     __m256           two     = _mm256_set1_ps(2.0);
118     x                = xx[0];
119     f                = ff[0];
120
121     nri              = nlist->nri;
122     iinr             = nlist->iinr;
123     jindex           = nlist->jindex;
124     jjnr             = nlist->jjnr;
125     shiftidx         = nlist->shift;
126     gid              = nlist->gid;
127     shiftvec         = fr->shift_vec[0];
128     fshift           = fr->fshift[0];
129     facel            = _mm256_set1_ps(fr->epsfac);
130     charge           = mdatoms->chargeA;
131
132     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
133     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
134     beta2            = _mm256_mul_ps(beta,beta);
135     beta3            = _mm256_mul_ps(beta,beta2);
136
137     ewtab            = fr->ic->tabq_coul_FDV0;
138     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
139     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
140
141     /* Setup water-specific parameters */
142     inr              = nlist->iinr[0];
143     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
144     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
145     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
146
147     jq1              = _mm256_set1_ps(charge[inr+1]);
148     jq2              = _mm256_set1_ps(charge[inr+2]);
149     jq3              = _mm256_set1_ps(charge[inr+3]);
150     qq11             = _mm256_mul_ps(iq1,jq1);
151     qq12             = _mm256_mul_ps(iq1,jq2);
152     qq13             = _mm256_mul_ps(iq1,jq3);
153     qq21             = _mm256_mul_ps(iq2,jq1);
154     qq22             = _mm256_mul_ps(iq2,jq2);
155     qq23             = _mm256_mul_ps(iq2,jq3);
156     qq31             = _mm256_mul_ps(iq3,jq1);
157     qq32             = _mm256_mul_ps(iq3,jq2);
158     qq33             = _mm256_mul_ps(iq3,jq3);
159
160     /* Avoid stupid compiler warnings */
161     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
162     j_coord_offsetA = 0;
163     j_coord_offsetB = 0;
164     j_coord_offsetC = 0;
165     j_coord_offsetD = 0;
166     j_coord_offsetE = 0;
167     j_coord_offsetF = 0;
168     j_coord_offsetG = 0;
169     j_coord_offsetH = 0;
170
171     outeriter        = 0;
172     inneriter        = 0;
173
174     for(iidx=0;iidx<4*DIM;iidx++)
175     {
176         scratch[iidx] = 0.0;
177     }
178
179     /* Start outer loop over neighborlists */
180     for(iidx=0; iidx<nri; iidx++)
181     {
182         /* Load shift vector for this list */
183         i_shift_offset   = DIM*shiftidx[iidx];
184
185         /* Load limits for loop over neighbors */
186         j_index_start    = jindex[iidx];
187         j_index_end      = jindex[iidx+1];
188
189         /* Get outer coordinate index */
190         inr              = iinr[iidx];
191         i_coord_offset   = DIM*inr;
192
193         /* Load i particle coords and add shift vector */
194         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
195                                                     &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
196
197         fix1             = _mm256_setzero_ps();
198         fiy1             = _mm256_setzero_ps();
199         fiz1             = _mm256_setzero_ps();
200         fix2             = _mm256_setzero_ps();
201         fiy2             = _mm256_setzero_ps();
202         fiz2             = _mm256_setzero_ps();
203         fix3             = _mm256_setzero_ps();
204         fiy3             = _mm256_setzero_ps();
205         fiz3             = _mm256_setzero_ps();
206
207         /* Reset potential sums */
208         velecsum         = _mm256_setzero_ps();
209
210         /* Start inner kernel loop */
211         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
212         {
213
214             /* Get j neighbor index, and coordinate index */
215             jnrA             = jjnr[jidx];
216             jnrB             = jjnr[jidx+1];
217             jnrC             = jjnr[jidx+2];
218             jnrD             = jjnr[jidx+3];
219             jnrE             = jjnr[jidx+4];
220             jnrF             = jjnr[jidx+5];
221             jnrG             = jjnr[jidx+6];
222             jnrH             = jjnr[jidx+7];
223             j_coord_offsetA  = DIM*jnrA;
224             j_coord_offsetB  = DIM*jnrB;
225             j_coord_offsetC  = DIM*jnrC;
226             j_coord_offsetD  = DIM*jnrD;
227             j_coord_offsetE  = DIM*jnrE;
228             j_coord_offsetF  = DIM*jnrF;
229             j_coord_offsetG  = DIM*jnrG;
230             j_coord_offsetH  = DIM*jnrH;
231
232             /* load j atom coordinates */
233             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
234                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
235                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
236                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
237                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
238
239             /* Calculate displacement vector */
240             dx11             = _mm256_sub_ps(ix1,jx1);
241             dy11             = _mm256_sub_ps(iy1,jy1);
242             dz11             = _mm256_sub_ps(iz1,jz1);
243             dx12             = _mm256_sub_ps(ix1,jx2);
244             dy12             = _mm256_sub_ps(iy1,jy2);
245             dz12             = _mm256_sub_ps(iz1,jz2);
246             dx13             = _mm256_sub_ps(ix1,jx3);
247             dy13             = _mm256_sub_ps(iy1,jy3);
248             dz13             = _mm256_sub_ps(iz1,jz3);
249             dx21             = _mm256_sub_ps(ix2,jx1);
250             dy21             = _mm256_sub_ps(iy2,jy1);
251             dz21             = _mm256_sub_ps(iz2,jz1);
252             dx22             = _mm256_sub_ps(ix2,jx2);
253             dy22             = _mm256_sub_ps(iy2,jy2);
254             dz22             = _mm256_sub_ps(iz2,jz2);
255             dx23             = _mm256_sub_ps(ix2,jx3);
256             dy23             = _mm256_sub_ps(iy2,jy3);
257             dz23             = _mm256_sub_ps(iz2,jz3);
258             dx31             = _mm256_sub_ps(ix3,jx1);
259             dy31             = _mm256_sub_ps(iy3,jy1);
260             dz31             = _mm256_sub_ps(iz3,jz1);
261             dx32             = _mm256_sub_ps(ix3,jx2);
262             dy32             = _mm256_sub_ps(iy3,jy2);
263             dz32             = _mm256_sub_ps(iz3,jz2);
264             dx33             = _mm256_sub_ps(ix3,jx3);
265             dy33             = _mm256_sub_ps(iy3,jy3);
266             dz33             = _mm256_sub_ps(iz3,jz3);
267
268             /* Calculate squared distance and things based on it */
269             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
270             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
271             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
272             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
273             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
274             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
275             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
276             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
277             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
278
279             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
280             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
281             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
282             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
283             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
284             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
285             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
286             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
287             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
288
289             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
290             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
291             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
292             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
293             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
294             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
295             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
296             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
297             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
298
299             fjx1             = _mm256_setzero_ps();
300             fjy1             = _mm256_setzero_ps();
301             fjz1             = _mm256_setzero_ps();
302             fjx2             = _mm256_setzero_ps();
303             fjy2             = _mm256_setzero_ps();
304             fjz2             = _mm256_setzero_ps();
305             fjx3             = _mm256_setzero_ps();
306             fjy3             = _mm256_setzero_ps();
307             fjz3             = _mm256_setzero_ps();
308
309             /**************************
310              * CALCULATE INTERACTIONS *
311              **************************/
312
313             r11              = _mm256_mul_ps(rsq11,rinv11);
314
315             /* EWALD ELECTROSTATICS */
316             
317             /* Analytical PME correction */
318             zeta2            = _mm256_mul_ps(beta2,rsq11);
319             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
320             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
321             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
322             felec            = _mm256_mul_ps(qq11,felec);
323             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
324             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
325             velec            = _mm256_sub_ps(rinv11,pmecorrV);
326             velec            = _mm256_mul_ps(qq11,velec);
327             
328             /* Update potential sum for this i atom from the interaction with this j atom. */
329             velecsum         = _mm256_add_ps(velecsum,velec);
330
331             fscal            = felec;
332
333             /* Calculate temporary vectorial force */
334             tx               = _mm256_mul_ps(fscal,dx11);
335             ty               = _mm256_mul_ps(fscal,dy11);
336             tz               = _mm256_mul_ps(fscal,dz11);
337
338             /* Update vectorial force */
339             fix1             = _mm256_add_ps(fix1,tx);
340             fiy1             = _mm256_add_ps(fiy1,ty);
341             fiz1             = _mm256_add_ps(fiz1,tz);
342
343             fjx1             = _mm256_add_ps(fjx1,tx);
344             fjy1             = _mm256_add_ps(fjy1,ty);
345             fjz1             = _mm256_add_ps(fjz1,tz);
346
347             /**************************
348              * CALCULATE INTERACTIONS *
349              **************************/
350
351             r12              = _mm256_mul_ps(rsq12,rinv12);
352
353             /* EWALD ELECTROSTATICS */
354             
355             /* Analytical PME correction */
356             zeta2            = _mm256_mul_ps(beta2,rsq12);
357             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
358             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
359             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
360             felec            = _mm256_mul_ps(qq12,felec);
361             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
362             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
363             velec            = _mm256_sub_ps(rinv12,pmecorrV);
364             velec            = _mm256_mul_ps(qq12,velec);
365             
366             /* Update potential sum for this i atom from the interaction with this j atom. */
367             velecsum         = _mm256_add_ps(velecsum,velec);
368
369             fscal            = felec;
370
371             /* Calculate temporary vectorial force */
372             tx               = _mm256_mul_ps(fscal,dx12);
373             ty               = _mm256_mul_ps(fscal,dy12);
374             tz               = _mm256_mul_ps(fscal,dz12);
375
376             /* Update vectorial force */
377             fix1             = _mm256_add_ps(fix1,tx);
378             fiy1             = _mm256_add_ps(fiy1,ty);
379             fiz1             = _mm256_add_ps(fiz1,tz);
380
381             fjx2             = _mm256_add_ps(fjx2,tx);
382             fjy2             = _mm256_add_ps(fjy2,ty);
383             fjz2             = _mm256_add_ps(fjz2,tz);
384
385             /**************************
386              * CALCULATE INTERACTIONS *
387              **************************/
388
389             r13              = _mm256_mul_ps(rsq13,rinv13);
390
391             /* EWALD ELECTROSTATICS */
392             
393             /* Analytical PME correction */
394             zeta2            = _mm256_mul_ps(beta2,rsq13);
395             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
396             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
397             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
398             felec            = _mm256_mul_ps(qq13,felec);
399             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
400             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
401             velec            = _mm256_sub_ps(rinv13,pmecorrV);
402             velec            = _mm256_mul_ps(qq13,velec);
403             
404             /* Update potential sum for this i atom from the interaction with this j atom. */
405             velecsum         = _mm256_add_ps(velecsum,velec);
406
407             fscal            = felec;
408
409             /* Calculate temporary vectorial force */
410             tx               = _mm256_mul_ps(fscal,dx13);
411             ty               = _mm256_mul_ps(fscal,dy13);
412             tz               = _mm256_mul_ps(fscal,dz13);
413
414             /* Update vectorial force */
415             fix1             = _mm256_add_ps(fix1,tx);
416             fiy1             = _mm256_add_ps(fiy1,ty);
417             fiz1             = _mm256_add_ps(fiz1,tz);
418
419             fjx3             = _mm256_add_ps(fjx3,tx);
420             fjy3             = _mm256_add_ps(fjy3,ty);
421             fjz3             = _mm256_add_ps(fjz3,tz);
422
423             /**************************
424              * CALCULATE INTERACTIONS *
425              **************************/
426
427             r21              = _mm256_mul_ps(rsq21,rinv21);
428
429             /* EWALD ELECTROSTATICS */
430             
431             /* Analytical PME correction */
432             zeta2            = _mm256_mul_ps(beta2,rsq21);
433             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
434             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
435             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
436             felec            = _mm256_mul_ps(qq21,felec);
437             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
438             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
439             velec            = _mm256_sub_ps(rinv21,pmecorrV);
440             velec            = _mm256_mul_ps(qq21,velec);
441             
442             /* Update potential sum for this i atom from the interaction with this j atom. */
443             velecsum         = _mm256_add_ps(velecsum,velec);
444
445             fscal            = felec;
446
447             /* Calculate temporary vectorial force */
448             tx               = _mm256_mul_ps(fscal,dx21);
449             ty               = _mm256_mul_ps(fscal,dy21);
450             tz               = _mm256_mul_ps(fscal,dz21);
451
452             /* Update vectorial force */
453             fix2             = _mm256_add_ps(fix2,tx);
454             fiy2             = _mm256_add_ps(fiy2,ty);
455             fiz2             = _mm256_add_ps(fiz2,tz);
456
457             fjx1             = _mm256_add_ps(fjx1,tx);
458             fjy1             = _mm256_add_ps(fjy1,ty);
459             fjz1             = _mm256_add_ps(fjz1,tz);
460
461             /**************************
462              * CALCULATE INTERACTIONS *
463              **************************/
464
465             r22              = _mm256_mul_ps(rsq22,rinv22);
466
467             /* EWALD ELECTROSTATICS */
468             
469             /* Analytical PME correction */
470             zeta2            = _mm256_mul_ps(beta2,rsq22);
471             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
472             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
473             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
474             felec            = _mm256_mul_ps(qq22,felec);
475             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
476             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
477             velec            = _mm256_sub_ps(rinv22,pmecorrV);
478             velec            = _mm256_mul_ps(qq22,velec);
479             
480             /* Update potential sum for this i atom from the interaction with this j atom. */
481             velecsum         = _mm256_add_ps(velecsum,velec);
482
483             fscal            = felec;
484
485             /* Calculate temporary vectorial force */
486             tx               = _mm256_mul_ps(fscal,dx22);
487             ty               = _mm256_mul_ps(fscal,dy22);
488             tz               = _mm256_mul_ps(fscal,dz22);
489
490             /* Update vectorial force */
491             fix2             = _mm256_add_ps(fix2,tx);
492             fiy2             = _mm256_add_ps(fiy2,ty);
493             fiz2             = _mm256_add_ps(fiz2,tz);
494
495             fjx2             = _mm256_add_ps(fjx2,tx);
496             fjy2             = _mm256_add_ps(fjy2,ty);
497             fjz2             = _mm256_add_ps(fjz2,tz);
498
499             /**************************
500              * CALCULATE INTERACTIONS *
501              **************************/
502
503             r23              = _mm256_mul_ps(rsq23,rinv23);
504
505             /* EWALD ELECTROSTATICS */
506             
507             /* Analytical PME correction */
508             zeta2            = _mm256_mul_ps(beta2,rsq23);
509             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
510             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
511             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
512             felec            = _mm256_mul_ps(qq23,felec);
513             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
514             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
515             velec            = _mm256_sub_ps(rinv23,pmecorrV);
516             velec            = _mm256_mul_ps(qq23,velec);
517             
518             /* Update potential sum for this i atom from the interaction with this j atom. */
519             velecsum         = _mm256_add_ps(velecsum,velec);
520
521             fscal            = felec;
522
523             /* Calculate temporary vectorial force */
524             tx               = _mm256_mul_ps(fscal,dx23);
525             ty               = _mm256_mul_ps(fscal,dy23);
526             tz               = _mm256_mul_ps(fscal,dz23);
527
528             /* Update vectorial force */
529             fix2             = _mm256_add_ps(fix2,tx);
530             fiy2             = _mm256_add_ps(fiy2,ty);
531             fiz2             = _mm256_add_ps(fiz2,tz);
532
533             fjx3             = _mm256_add_ps(fjx3,tx);
534             fjy3             = _mm256_add_ps(fjy3,ty);
535             fjz3             = _mm256_add_ps(fjz3,tz);
536
537             /**************************
538              * CALCULATE INTERACTIONS *
539              **************************/
540
541             r31              = _mm256_mul_ps(rsq31,rinv31);
542
543             /* EWALD ELECTROSTATICS */
544             
545             /* Analytical PME correction */
546             zeta2            = _mm256_mul_ps(beta2,rsq31);
547             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
548             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
549             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
550             felec            = _mm256_mul_ps(qq31,felec);
551             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
552             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
553             velec            = _mm256_sub_ps(rinv31,pmecorrV);
554             velec            = _mm256_mul_ps(qq31,velec);
555             
556             /* Update potential sum for this i atom from the interaction with this j atom. */
557             velecsum         = _mm256_add_ps(velecsum,velec);
558
559             fscal            = felec;
560
561             /* Calculate temporary vectorial force */
562             tx               = _mm256_mul_ps(fscal,dx31);
563             ty               = _mm256_mul_ps(fscal,dy31);
564             tz               = _mm256_mul_ps(fscal,dz31);
565
566             /* Update vectorial force */
567             fix3             = _mm256_add_ps(fix3,tx);
568             fiy3             = _mm256_add_ps(fiy3,ty);
569             fiz3             = _mm256_add_ps(fiz3,tz);
570
571             fjx1             = _mm256_add_ps(fjx1,tx);
572             fjy1             = _mm256_add_ps(fjy1,ty);
573             fjz1             = _mm256_add_ps(fjz1,tz);
574
575             /**************************
576              * CALCULATE INTERACTIONS *
577              **************************/
578
579             r32              = _mm256_mul_ps(rsq32,rinv32);
580
581             /* EWALD ELECTROSTATICS */
582             
583             /* Analytical PME correction */
584             zeta2            = _mm256_mul_ps(beta2,rsq32);
585             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
586             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
587             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
588             felec            = _mm256_mul_ps(qq32,felec);
589             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
590             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
591             velec            = _mm256_sub_ps(rinv32,pmecorrV);
592             velec            = _mm256_mul_ps(qq32,velec);
593             
594             /* Update potential sum for this i atom from the interaction with this j atom. */
595             velecsum         = _mm256_add_ps(velecsum,velec);
596
597             fscal            = felec;
598
599             /* Calculate temporary vectorial force */
600             tx               = _mm256_mul_ps(fscal,dx32);
601             ty               = _mm256_mul_ps(fscal,dy32);
602             tz               = _mm256_mul_ps(fscal,dz32);
603
604             /* Update vectorial force */
605             fix3             = _mm256_add_ps(fix3,tx);
606             fiy3             = _mm256_add_ps(fiy3,ty);
607             fiz3             = _mm256_add_ps(fiz3,tz);
608
609             fjx2             = _mm256_add_ps(fjx2,tx);
610             fjy2             = _mm256_add_ps(fjy2,ty);
611             fjz2             = _mm256_add_ps(fjz2,tz);
612
613             /**************************
614              * CALCULATE INTERACTIONS *
615              **************************/
616
617             r33              = _mm256_mul_ps(rsq33,rinv33);
618
619             /* EWALD ELECTROSTATICS */
620             
621             /* Analytical PME correction */
622             zeta2            = _mm256_mul_ps(beta2,rsq33);
623             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
624             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
625             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
626             felec            = _mm256_mul_ps(qq33,felec);
627             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
628             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
629             velec            = _mm256_sub_ps(rinv33,pmecorrV);
630             velec            = _mm256_mul_ps(qq33,velec);
631             
632             /* Update potential sum for this i atom from the interaction with this j atom. */
633             velecsum         = _mm256_add_ps(velecsum,velec);
634
635             fscal            = felec;
636
637             /* Calculate temporary vectorial force */
638             tx               = _mm256_mul_ps(fscal,dx33);
639             ty               = _mm256_mul_ps(fscal,dy33);
640             tz               = _mm256_mul_ps(fscal,dz33);
641
642             /* Update vectorial force */
643             fix3             = _mm256_add_ps(fix3,tx);
644             fiy3             = _mm256_add_ps(fiy3,ty);
645             fiz3             = _mm256_add_ps(fiz3,tz);
646
647             fjx3             = _mm256_add_ps(fjx3,tx);
648             fjy3             = _mm256_add_ps(fjy3,ty);
649             fjz3             = _mm256_add_ps(fjz3,tz);
650
651             fjptrA             = f+j_coord_offsetA;
652             fjptrB             = f+j_coord_offsetB;
653             fjptrC             = f+j_coord_offsetC;
654             fjptrD             = f+j_coord_offsetD;
655             fjptrE             = f+j_coord_offsetE;
656             fjptrF             = f+j_coord_offsetF;
657             fjptrG             = f+j_coord_offsetG;
658             fjptrH             = f+j_coord_offsetH;
659
660             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
661                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
662                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
663
664             /* Inner loop uses 756 flops */
665         }
666
667         if(jidx<j_index_end)
668         {
669
670             /* Get j neighbor index, and coordinate index */
671             jnrlistA         = jjnr[jidx];
672             jnrlistB         = jjnr[jidx+1];
673             jnrlistC         = jjnr[jidx+2];
674             jnrlistD         = jjnr[jidx+3];
675             jnrlistE         = jjnr[jidx+4];
676             jnrlistF         = jjnr[jidx+5];
677             jnrlistG         = jjnr[jidx+6];
678             jnrlistH         = jjnr[jidx+7];
679             /* Sign of each element will be negative for non-real atoms.
680              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
681              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
682              */
683             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
684                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
685                                             
686             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
687             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
688             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
689             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
690             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
691             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
692             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
693             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
694             j_coord_offsetA  = DIM*jnrA;
695             j_coord_offsetB  = DIM*jnrB;
696             j_coord_offsetC  = DIM*jnrC;
697             j_coord_offsetD  = DIM*jnrD;
698             j_coord_offsetE  = DIM*jnrE;
699             j_coord_offsetF  = DIM*jnrF;
700             j_coord_offsetG  = DIM*jnrG;
701             j_coord_offsetH  = DIM*jnrH;
702
703             /* load j atom coordinates */
704             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
705                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
706                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
707                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
708                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
709
710             /* Calculate displacement vector */
711             dx11             = _mm256_sub_ps(ix1,jx1);
712             dy11             = _mm256_sub_ps(iy1,jy1);
713             dz11             = _mm256_sub_ps(iz1,jz1);
714             dx12             = _mm256_sub_ps(ix1,jx2);
715             dy12             = _mm256_sub_ps(iy1,jy2);
716             dz12             = _mm256_sub_ps(iz1,jz2);
717             dx13             = _mm256_sub_ps(ix1,jx3);
718             dy13             = _mm256_sub_ps(iy1,jy3);
719             dz13             = _mm256_sub_ps(iz1,jz3);
720             dx21             = _mm256_sub_ps(ix2,jx1);
721             dy21             = _mm256_sub_ps(iy2,jy1);
722             dz21             = _mm256_sub_ps(iz2,jz1);
723             dx22             = _mm256_sub_ps(ix2,jx2);
724             dy22             = _mm256_sub_ps(iy2,jy2);
725             dz22             = _mm256_sub_ps(iz2,jz2);
726             dx23             = _mm256_sub_ps(ix2,jx3);
727             dy23             = _mm256_sub_ps(iy2,jy3);
728             dz23             = _mm256_sub_ps(iz2,jz3);
729             dx31             = _mm256_sub_ps(ix3,jx1);
730             dy31             = _mm256_sub_ps(iy3,jy1);
731             dz31             = _mm256_sub_ps(iz3,jz1);
732             dx32             = _mm256_sub_ps(ix3,jx2);
733             dy32             = _mm256_sub_ps(iy3,jy2);
734             dz32             = _mm256_sub_ps(iz3,jz2);
735             dx33             = _mm256_sub_ps(ix3,jx3);
736             dy33             = _mm256_sub_ps(iy3,jy3);
737             dz33             = _mm256_sub_ps(iz3,jz3);
738
739             /* Calculate squared distance and things based on it */
740             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
741             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
742             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
743             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
744             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
745             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
746             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
747             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
748             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
749
750             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
751             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
752             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
753             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
754             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
755             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
756             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
757             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
758             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
759
760             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
761             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
762             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
763             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
764             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
765             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
766             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
767             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
768             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
769
770             fjx1             = _mm256_setzero_ps();
771             fjy1             = _mm256_setzero_ps();
772             fjz1             = _mm256_setzero_ps();
773             fjx2             = _mm256_setzero_ps();
774             fjy2             = _mm256_setzero_ps();
775             fjz2             = _mm256_setzero_ps();
776             fjx3             = _mm256_setzero_ps();
777             fjy3             = _mm256_setzero_ps();
778             fjz3             = _mm256_setzero_ps();
779
780             /**************************
781              * CALCULATE INTERACTIONS *
782              **************************/
783
784             r11              = _mm256_mul_ps(rsq11,rinv11);
785             r11              = _mm256_andnot_ps(dummy_mask,r11);
786
787             /* EWALD ELECTROSTATICS */
788             
789             /* Analytical PME correction */
790             zeta2            = _mm256_mul_ps(beta2,rsq11);
791             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
792             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
793             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
794             felec            = _mm256_mul_ps(qq11,felec);
795             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
796             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
797             velec            = _mm256_sub_ps(rinv11,pmecorrV);
798             velec            = _mm256_mul_ps(qq11,velec);
799             
800             /* Update potential sum for this i atom from the interaction with this j atom. */
801             velec            = _mm256_andnot_ps(dummy_mask,velec);
802             velecsum         = _mm256_add_ps(velecsum,velec);
803
804             fscal            = felec;
805
806             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
807
808             /* Calculate temporary vectorial force */
809             tx               = _mm256_mul_ps(fscal,dx11);
810             ty               = _mm256_mul_ps(fscal,dy11);
811             tz               = _mm256_mul_ps(fscal,dz11);
812
813             /* Update vectorial force */
814             fix1             = _mm256_add_ps(fix1,tx);
815             fiy1             = _mm256_add_ps(fiy1,ty);
816             fiz1             = _mm256_add_ps(fiz1,tz);
817
818             fjx1             = _mm256_add_ps(fjx1,tx);
819             fjy1             = _mm256_add_ps(fjy1,ty);
820             fjz1             = _mm256_add_ps(fjz1,tz);
821
822             /**************************
823              * CALCULATE INTERACTIONS *
824              **************************/
825
826             r12              = _mm256_mul_ps(rsq12,rinv12);
827             r12              = _mm256_andnot_ps(dummy_mask,r12);
828
829             /* EWALD ELECTROSTATICS */
830             
831             /* Analytical PME correction */
832             zeta2            = _mm256_mul_ps(beta2,rsq12);
833             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
834             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
835             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
836             felec            = _mm256_mul_ps(qq12,felec);
837             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
838             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
839             velec            = _mm256_sub_ps(rinv12,pmecorrV);
840             velec            = _mm256_mul_ps(qq12,velec);
841             
842             /* Update potential sum for this i atom from the interaction with this j atom. */
843             velec            = _mm256_andnot_ps(dummy_mask,velec);
844             velecsum         = _mm256_add_ps(velecsum,velec);
845
846             fscal            = felec;
847
848             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
849
850             /* Calculate temporary vectorial force */
851             tx               = _mm256_mul_ps(fscal,dx12);
852             ty               = _mm256_mul_ps(fscal,dy12);
853             tz               = _mm256_mul_ps(fscal,dz12);
854
855             /* Update vectorial force */
856             fix1             = _mm256_add_ps(fix1,tx);
857             fiy1             = _mm256_add_ps(fiy1,ty);
858             fiz1             = _mm256_add_ps(fiz1,tz);
859
860             fjx2             = _mm256_add_ps(fjx2,tx);
861             fjy2             = _mm256_add_ps(fjy2,ty);
862             fjz2             = _mm256_add_ps(fjz2,tz);
863
864             /**************************
865              * CALCULATE INTERACTIONS *
866              **************************/
867
868             r13              = _mm256_mul_ps(rsq13,rinv13);
869             r13              = _mm256_andnot_ps(dummy_mask,r13);
870
871             /* EWALD ELECTROSTATICS */
872             
873             /* Analytical PME correction */
874             zeta2            = _mm256_mul_ps(beta2,rsq13);
875             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
876             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
877             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
878             felec            = _mm256_mul_ps(qq13,felec);
879             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
880             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
881             velec            = _mm256_sub_ps(rinv13,pmecorrV);
882             velec            = _mm256_mul_ps(qq13,velec);
883             
884             /* Update potential sum for this i atom from the interaction with this j atom. */
885             velec            = _mm256_andnot_ps(dummy_mask,velec);
886             velecsum         = _mm256_add_ps(velecsum,velec);
887
888             fscal            = felec;
889
890             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
891
892             /* Calculate temporary vectorial force */
893             tx               = _mm256_mul_ps(fscal,dx13);
894             ty               = _mm256_mul_ps(fscal,dy13);
895             tz               = _mm256_mul_ps(fscal,dz13);
896
897             /* Update vectorial force */
898             fix1             = _mm256_add_ps(fix1,tx);
899             fiy1             = _mm256_add_ps(fiy1,ty);
900             fiz1             = _mm256_add_ps(fiz1,tz);
901
902             fjx3             = _mm256_add_ps(fjx3,tx);
903             fjy3             = _mm256_add_ps(fjy3,ty);
904             fjz3             = _mm256_add_ps(fjz3,tz);
905
906             /**************************
907              * CALCULATE INTERACTIONS *
908              **************************/
909
910             r21              = _mm256_mul_ps(rsq21,rinv21);
911             r21              = _mm256_andnot_ps(dummy_mask,r21);
912
913             /* EWALD ELECTROSTATICS */
914             
915             /* Analytical PME correction */
916             zeta2            = _mm256_mul_ps(beta2,rsq21);
917             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
918             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
919             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
920             felec            = _mm256_mul_ps(qq21,felec);
921             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
922             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
923             velec            = _mm256_sub_ps(rinv21,pmecorrV);
924             velec            = _mm256_mul_ps(qq21,velec);
925             
926             /* Update potential sum for this i atom from the interaction with this j atom. */
927             velec            = _mm256_andnot_ps(dummy_mask,velec);
928             velecsum         = _mm256_add_ps(velecsum,velec);
929
930             fscal            = felec;
931
932             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
933
934             /* Calculate temporary vectorial force */
935             tx               = _mm256_mul_ps(fscal,dx21);
936             ty               = _mm256_mul_ps(fscal,dy21);
937             tz               = _mm256_mul_ps(fscal,dz21);
938
939             /* Update vectorial force */
940             fix2             = _mm256_add_ps(fix2,tx);
941             fiy2             = _mm256_add_ps(fiy2,ty);
942             fiz2             = _mm256_add_ps(fiz2,tz);
943
944             fjx1             = _mm256_add_ps(fjx1,tx);
945             fjy1             = _mm256_add_ps(fjy1,ty);
946             fjz1             = _mm256_add_ps(fjz1,tz);
947
948             /**************************
949              * CALCULATE INTERACTIONS *
950              **************************/
951
952             r22              = _mm256_mul_ps(rsq22,rinv22);
953             r22              = _mm256_andnot_ps(dummy_mask,r22);
954
955             /* EWALD ELECTROSTATICS */
956             
957             /* Analytical PME correction */
958             zeta2            = _mm256_mul_ps(beta2,rsq22);
959             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
960             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
961             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
962             felec            = _mm256_mul_ps(qq22,felec);
963             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
964             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
965             velec            = _mm256_sub_ps(rinv22,pmecorrV);
966             velec            = _mm256_mul_ps(qq22,velec);
967             
968             /* Update potential sum for this i atom from the interaction with this j atom. */
969             velec            = _mm256_andnot_ps(dummy_mask,velec);
970             velecsum         = _mm256_add_ps(velecsum,velec);
971
972             fscal            = felec;
973
974             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
975
976             /* Calculate temporary vectorial force */
977             tx               = _mm256_mul_ps(fscal,dx22);
978             ty               = _mm256_mul_ps(fscal,dy22);
979             tz               = _mm256_mul_ps(fscal,dz22);
980
981             /* Update vectorial force */
982             fix2             = _mm256_add_ps(fix2,tx);
983             fiy2             = _mm256_add_ps(fiy2,ty);
984             fiz2             = _mm256_add_ps(fiz2,tz);
985
986             fjx2             = _mm256_add_ps(fjx2,tx);
987             fjy2             = _mm256_add_ps(fjy2,ty);
988             fjz2             = _mm256_add_ps(fjz2,tz);
989
990             /**************************
991              * CALCULATE INTERACTIONS *
992              **************************/
993
994             r23              = _mm256_mul_ps(rsq23,rinv23);
995             r23              = _mm256_andnot_ps(dummy_mask,r23);
996
997             /* EWALD ELECTROSTATICS */
998             
999             /* Analytical PME correction */
1000             zeta2            = _mm256_mul_ps(beta2,rsq23);
1001             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
1002             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1003             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1004             felec            = _mm256_mul_ps(qq23,felec);
1005             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1006             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1007             velec            = _mm256_sub_ps(rinv23,pmecorrV);
1008             velec            = _mm256_mul_ps(qq23,velec);
1009             
1010             /* Update potential sum for this i atom from the interaction with this j atom. */
1011             velec            = _mm256_andnot_ps(dummy_mask,velec);
1012             velecsum         = _mm256_add_ps(velecsum,velec);
1013
1014             fscal            = felec;
1015
1016             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1017
1018             /* Calculate temporary vectorial force */
1019             tx               = _mm256_mul_ps(fscal,dx23);
1020             ty               = _mm256_mul_ps(fscal,dy23);
1021             tz               = _mm256_mul_ps(fscal,dz23);
1022
1023             /* Update vectorial force */
1024             fix2             = _mm256_add_ps(fix2,tx);
1025             fiy2             = _mm256_add_ps(fiy2,ty);
1026             fiz2             = _mm256_add_ps(fiz2,tz);
1027
1028             fjx3             = _mm256_add_ps(fjx3,tx);
1029             fjy3             = _mm256_add_ps(fjy3,ty);
1030             fjz3             = _mm256_add_ps(fjz3,tz);
1031
1032             /**************************
1033              * CALCULATE INTERACTIONS *
1034              **************************/
1035
1036             r31              = _mm256_mul_ps(rsq31,rinv31);
1037             r31              = _mm256_andnot_ps(dummy_mask,r31);
1038
1039             /* EWALD ELECTROSTATICS */
1040             
1041             /* Analytical PME correction */
1042             zeta2            = _mm256_mul_ps(beta2,rsq31);
1043             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
1044             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1045             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1046             felec            = _mm256_mul_ps(qq31,felec);
1047             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1048             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1049             velec            = _mm256_sub_ps(rinv31,pmecorrV);
1050             velec            = _mm256_mul_ps(qq31,velec);
1051             
1052             /* Update potential sum for this i atom from the interaction with this j atom. */
1053             velec            = _mm256_andnot_ps(dummy_mask,velec);
1054             velecsum         = _mm256_add_ps(velecsum,velec);
1055
1056             fscal            = felec;
1057
1058             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1059
1060             /* Calculate temporary vectorial force */
1061             tx               = _mm256_mul_ps(fscal,dx31);
1062             ty               = _mm256_mul_ps(fscal,dy31);
1063             tz               = _mm256_mul_ps(fscal,dz31);
1064
1065             /* Update vectorial force */
1066             fix3             = _mm256_add_ps(fix3,tx);
1067             fiy3             = _mm256_add_ps(fiy3,ty);
1068             fiz3             = _mm256_add_ps(fiz3,tz);
1069
1070             fjx1             = _mm256_add_ps(fjx1,tx);
1071             fjy1             = _mm256_add_ps(fjy1,ty);
1072             fjz1             = _mm256_add_ps(fjz1,tz);
1073
1074             /**************************
1075              * CALCULATE INTERACTIONS *
1076              **************************/
1077
1078             r32              = _mm256_mul_ps(rsq32,rinv32);
1079             r32              = _mm256_andnot_ps(dummy_mask,r32);
1080
1081             /* EWALD ELECTROSTATICS */
1082             
1083             /* Analytical PME correction */
1084             zeta2            = _mm256_mul_ps(beta2,rsq32);
1085             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
1086             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1087             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1088             felec            = _mm256_mul_ps(qq32,felec);
1089             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1090             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1091             velec            = _mm256_sub_ps(rinv32,pmecorrV);
1092             velec            = _mm256_mul_ps(qq32,velec);
1093             
1094             /* Update potential sum for this i atom from the interaction with this j atom. */
1095             velec            = _mm256_andnot_ps(dummy_mask,velec);
1096             velecsum         = _mm256_add_ps(velecsum,velec);
1097
1098             fscal            = felec;
1099
1100             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1101
1102             /* Calculate temporary vectorial force */
1103             tx               = _mm256_mul_ps(fscal,dx32);
1104             ty               = _mm256_mul_ps(fscal,dy32);
1105             tz               = _mm256_mul_ps(fscal,dz32);
1106
1107             /* Update vectorial force */
1108             fix3             = _mm256_add_ps(fix3,tx);
1109             fiy3             = _mm256_add_ps(fiy3,ty);
1110             fiz3             = _mm256_add_ps(fiz3,tz);
1111
1112             fjx2             = _mm256_add_ps(fjx2,tx);
1113             fjy2             = _mm256_add_ps(fjy2,ty);
1114             fjz2             = _mm256_add_ps(fjz2,tz);
1115
1116             /**************************
1117              * CALCULATE INTERACTIONS *
1118              **************************/
1119
1120             r33              = _mm256_mul_ps(rsq33,rinv33);
1121             r33              = _mm256_andnot_ps(dummy_mask,r33);
1122
1123             /* EWALD ELECTROSTATICS */
1124             
1125             /* Analytical PME correction */
1126             zeta2            = _mm256_mul_ps(beta2,rsq33);
1127             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
1128             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1129             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1130             felec            = _mm256_mul_ps(qq33,felec);
1131             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1132             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1133             velec            = _mm256_sub_ps(rinv33,pmecorrV);
1134             velec            = _mm256_mul_ps(qq33,velec);
1135             
1136             /* Update potential sum for this i atom from the interaction with this j atom. */
1137             velec            = _mm256_andnot_ps(dummy_mask,velec);
1138             velecsum         = _mm256_add_ps(velecsum,velec);
1139
1140             fscal            = felec;
1141
1142             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1143
1144             /* Calculate temporary vectorial force */
1145             tx               = _mm256_mul_ps(fscal,dx33);
1146             ty               = _mm256_mul_ps(fscal,dy33);
1147             tz               = _mm256_mul_ps(fscal,dz33);
1148
1149             /* Update vectorial force */
1150             fix3             = _mm256_add_ps(fix3,tx);
1151             fiy3             = _mm256_add_ps(fiy3,ty);
1152             fiz3             = _mm256_add_ps(fiz3,tz);
1153
1154             fjx3             = _mm256_add_ps(fjx3,tx);
1155             fjy3             = _mm256_add_ps(fjy3,ty);
1156             fjz3             = _mm256_add_ps(fjz3,tz);
1157
1158             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1159             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1160             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1161             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1162             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1163             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1164             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1165             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1166
1167             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1168                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1169                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1170
1171             /* Inner loop uses 765 flops */
1172         }
1173
1174         /* End of innermost loop */
1175
1176         gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1177                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
1178
1179         ggid                        = gid[iidx];
1180         /* Update potential energies */
1181         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1182
1183         /* Increment number of inner iterations */
1184         inneriter                  += j_index_end - j_index_start;
1185
1186         /* Outer loop uses 19 flops */
1187     }
1188
1189     /* Increment number of outer iterations */
1190     outeriter        += nri;
1191
1192     /* Update outer/inner flops */
1193
1194     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*765);
1195 }
1196 /*
1197  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_single
1198  * Electrostatics interaction: Ewald
1199  * VdW interaction:            None
1200  * Geometry:                   Water4-Water4
1201  * Calculate force/pot:        Force
1202  */
1203 void
1204 nb_kernel_ElecEw_VdwNone_GeomW4W4_F_avx_256_single
1205                     (t_nblist                    * gmx_restrict       nlist,
1206                      rvec                        * gmx_restrict          xx,
1207                      rvec                        * gmx_restrict          ff,
1208                      t_forcerec                  * gmx_restrict          fr,
1209                      t_mdatoms                   * gmx_restrict     mdatoms,
1210                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1211                      t_nrnb                      * gmx_restrict        nrnb)
1212 {
1213     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1214      * just 0 for non-waters.
1215      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1216      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1217      */
1218     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1219     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1220     int              jnrA,jnrB,jnrC,jnrD;
1221     int              jnrE,jnrF,jnrG,jnrH;
1222     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1223     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1224     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1225     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1226     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1227     real             rcutoff_scalar;
1228     real             *shiftvec,*fshift,*x,*f;
1229     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1230     real             scratch[4*DIM];
1231     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1232     real *           vdwioffsetptr1;
1233     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1234     real *           vdwioffsetptr2;
1235     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1236     real *           vdwioffsetptr3;
1237     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1238     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1239     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1240     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1241     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1242     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1243     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1244     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1245     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1246     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1247     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1248     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1249     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1250     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1251     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1252     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1253     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1254     real             *charge;
1255     __m256i          ewitab;
1256     __m128i          ewitab_lo,ewitab_hi;
1257     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1258     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1259     real             *ewtab;
1260     __m256           dummy_mask,cutoff_mask;
1261     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1262     __m256           one     = _mm256_set1_ps(1.0);
1263     __m256           two     = _mm256_set1_ps(2.0);
1264     x                = xx[0];
1265     f                = ff[0];
1266
1267     nri              = nlist->nri;
1268     iinr             = nlist->iinr;
1269     jindex           = nlist->jindex;
1270     jjnr             = nlist->jjnr;
1271     shiftidx         = nlist->shift;
1272     gid              = nlist->gid;
1273     shiftvec         = fr->shift_vec[0];
1274     fshift           = fr->fshift[0];
1275     facel            = _mm256_set1_ps(fr->epsfac);
1276     charge           = mdatoms->chargeA;
1277
1278     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1279     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1280     beta2            = _mm256_mul_ps(beta,beta);
1281     beta3            = _mm256_mul_ps(beta,beta2);
1282
1283     ewtab            = fr->ic->tabq_coul_F;
1284     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1285     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1286
1287     /* Setup water-specific parameters */
1288     inr              = nlist->iinr[0];
1289     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1290     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1291     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1292
1293     jq1              = _mm256_set1_ps(charge[inr+1]);
1294     jq2              = _mm256_set1_ps(charge[inr+2]);
1295     jq3              = _mm256_set1_ps(charge[inr+3]);
1296     qq11             = _mm256_mul_ps(iq1,jq1);
1297     qq12             = _mm256_mul_ps(iq1,jq2);
1298     qq13             = _mm256_mul_ps(iq1,jq3);
1299     qq21             = _mm256_mul_ps(iq2,jq1);
1300     qq22             = _mm256_mul_ps(iq2,jq2);
1301     qq23             = _mm256_mul_ps(iq2,jq3);
1302     qq31             = _mm256_mul_ps(iq3,jq1);
1303     qq32             = _mm256_mul_ps(iq3,jq2);
1304     qq33             = _mm256_mul_ps(iq3,jq3);
1305
1306     /* Avoid stupid compiler warnings */
1307     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1308     j_coord_offsetA = 0;
1309     j_coord_offsetB = 0;
1310     j_coord_offsetC = 0;
1311     j_coord_offsetD = 0;
1312     j_coord_offsetE = 0;
1313     j_coord_offsetF = 0;
1314     j_coord_offsetG = 0;
1315     j_coord_offsetH = 0;
1316
1317     outeriter        = 0;
1318     inneriter        = 0;
1319
1320     for(iidx=0;iidx<4*DIM;iidx++)
1321     {
1322         scratch[iidx] = 0.0;
1323     }
1324
1325     /* Start outer loop over neighborlists */
1326     for(iidx=0; iidx<nri; iidx++)
1327     {
1328         /* Load shift vector for this list */
1329         i_shift_offset   = DIM*shiftidx[iidx];
1330
1331         /* Load limits for loop over neighbors */
1332         j_index_start    = jindex[iidx];
1333         j_index_end      = jindex[iidx+1];
1334
1335         /* Get outer coordinate index */
1336         inr              = iinr[iidx];
1337         i_coord_offset   = DIM*inr;
1338
1339         /* Load i particle coords and add shift vector */
1340         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1341                                                     &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1342
1343         fix1             = _mm256_setzero_ps();
1344         fiy1             = _mm256_setzero_ps();
1345         fiz1             = _mm256_setzero_ps();
1346         fix2             = _mm256_setzero_ps();
1347         fiy2             = _mm256_setzero_ps();
1348         fiz2             = _mm256_setzero_ps();
1349         fix3             = _mm256_setzero_ps();
1350         fiy3             = _mm256_setzero_ps();
1351         fiz3             = _mm256_setzero_ps();
1352
1353         /* Start inner kernel loop */
1354         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1355         {
1356
1357             /* Get j neighbor index, and coordinate index */
1358             jnrA             = jjnr[jidx];
1359             jnrB             = jjnr[jidx+1];
1360             jnrC             = jjnr[jidx+2];
1361             jnrD             = jjnr[jidx+3];
1362             jnrE             = jjnr[jidx+4];
1363             jnrF             = jjnr[jidx+5];
1364             jnrG             = jjnr[jidx+6];
1365             jnrH             = jjnr[jidx+7];
1366             j_coord_offsetA  = DIM*jnrA;
1367             j_coord_offsetB  = DIM*jnrB;
1368             j_coord_offsetC  = DIM*jnrC;
1369             j_coord_offsetD  = DIM*jnrD;
1370             j_coord_offsetE  = DIM*jnrE;
1371             j_coord_offsetF  = DIM*jnrF;
1372             j_coord_offsetG  = DIM*jnrG;
1373             j_coord_offsetH  = DIM*jnrH;
1374
1375             /* load j atom coordinates */
1376             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1377                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1378                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1379                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1380                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1381
1382             /* Calculate displacement vector */
1383             dx11             = _mm256_sub_ps(ix1,jx1);
1384             dy11             = _mm256_sub_ps(iy1,jy1);
1385             dz11             = _mm256_sub_ps(iz1,jz1);
1386             dx12             = _mm256_sub_ps(ix1,jx2);
1387             dy12             = _mm256_sub_ps(iy1,jy2);
1388             dz12             = _mm256_sub_ps(iz1,jz2);
1389             dx13             = _mm256_sub_ps(ix1,jx3);
1390             dy13             = _mm256_sub_ps(iy1,jy3);
1391             dz13             = _mm256_sub_ps(iz1,jz3);
1392             dx21             = _mm256_sub_ps(ix2,jx1);
1393             dy21             = _mm256_sub_ps(iy2,jy1);
1394             dz21             = _mm256_sub_ps(iz2,jz1);
1395             dx22             = _mm256_sub_ps(ix2,jx2);
1396             dy22             = _mm256_sub_ps(iy2,jy2);
1397             dz22             = _mm256_sub_ps(iz2,jz2);
1398             dx23             = _mm256_sub_ps(ix2,jx3);
1399             dy23             = _mm256_sub_ps(iy2,jy3);
1400             dz23             = _mm256_sub_ps(iz2,jz3);
1401             dx31             = _mm256_sub_ps(ix3,jx1);
1402             dy31             = _mm256_sub_ps(iy3,jy1);
1403             dz31             = _mm256_sub_ps(iz3,jz1);
1404             dx32             = _mm256_sub_ps(ix3,jx2);
1405             dy32             = _mm256_sub_ps(iy3,jy2);
1406             dz32             = _mm256_sub_ps(iz3,jz2);
1407             dx33             = _mm256_sub_ps(ix3,jx3);
1408             dy33             = _mm256_sub_ps(iy3,jy3);
1409             dz33             = _mm256_sub_ps(iz3,jz3);
1410
1411             /* Calculate squared distance and things based on it */
1412             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1413             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1414             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1415             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1416             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1417             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1418             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1419             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1420             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1421
1422             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1423             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1424             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1425             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1426             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1427             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1428             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1429             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1430             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1431
1432             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1433             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1434             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1435             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1436             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1437             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1438             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1439             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1440             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1441
1442             fjx1             = _mm256_setzero_ps();
1443             fjy1             = _mm256_setzero_ps();
1444             fjz1             = _mm256_setzero_ps();
1445             fjx2             = _mm256_setzero_ps();
1446             fjy2             = _mm256_setzero_ps();
1447             fjz2             = _mm256_setzero_ps();
1448             fjx3             = _mm256_setzero_ps();
1449             fjy3             = _mm256_setzero_ps();
1450             fjz3             = _mm256_setzero_ps();
1451
1452             /**************************
1453              * CALCULATE INTERACTIONS *
1454              **************************/
1455
1456             r11              = _mm256_mul_ps(rsq11,rinv11);
1457
1458             /* EWALD ELECTROSTATICS */
1459             
1460             /* Analytical PME correction */
1461             zeta2            = _mm256_mul_ps(beta2,rsq11);
1462             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1463             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1464             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1465             felec            = _mm256_mul_ps(qq11,felec);
1466             
1467             fscal            = felec;
1468
1469             /* Calculate temporary vectorial force */
1470             tx               = _mm256_mul_ps(fscal,dx11);
1471             ty               = _mm256_mul_ps(fscal,dy11);
1472             tz               = _mm256_mul_ps(fscal,dz11);
1473
1474             /* Update vectorial force */
1475             fix1             = _mm256_add_ps(fix1,tx);
1476             fiy1             = _mm256_add_ps(fiy1,ty);
1477             fiz1             = _mm256_add_ps(fiz1,tz);
1478
1479             fjx1             = _mm256_add_ps(fjx1,tx);
1480             fjy1             = _mm256_add_ps(fjy1,ty);
1481             fjz1             = _mm256_add_ps(fjz1,tz);
1482
1483             /**************************
1484              * CALCULATE INTERACTIONS *
1485              **************************/
1486
1487             r12              = _mm256_mul_ps(rsq12,rinv12);
1488
1489             /* EWALD ELECTROSTATICS */
1490             
1491             /* Analytical PME correction */
1492             zeta2            = _mm256_mul_ps(beta2,rsq12);
1493             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1494             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1495             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1496             felec            = _mm256_mul_ps(qq12,felec);
1497             
1498             fscal            = felec;
1499
1500             /* Calculate temporary vectorial force */
1501             tx               = _mm256_mul_ps(fscal,dx12);
1502             ty               = _mm256_mul_ps(fscal,dy12);
1503             tz               = _mm256_mul_ps(fscal,dz12);
1504
1505             /* Update vectorial force */
1506             fix1             = _mm256_add_ps(fix1,tx);
1507             fiy1             = _mm256_add_ps(fiy1,ty);
1508             fiz1             = _mm256_add_ps(fiz1,tz);
1509
1510             fjx2             = _mm256_add_ps(fjx2,tx);
1511             fjy2             = _mm256_add_ps(fjy2,ty);
1512             fjz2             = _mm256_add_ps(fjz2,tz);
1513
1514             /**************************
1515              * CALCULATE INTERACTIONS *
1516              **************************/
1517
1518             r13              = _mm256_mul_ps(rsq13,rinv13);
1519
1520             /* EWALD ELECTROSTATICS */
1521             
1522             /* Analytical PME correction */
1523             zeta2            = _mm256_mul_ps(beta2,rsq13);
1524             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
1525             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1526             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1527             felec            = _mm256_mul_ps(qq13,felec);
1528             
1529             fscal            = felec;
1530
1531             /* Calculate temporary vectorial force */
1532             tx               = _mm256_mul_ps(fscal,dx13);
1533             ty               = _mm256_mul_ps(fscal,dy13);
1534             tz               = _mm256_mul_ps(fscal,dz13);
1535
1536             /* Update vectorial force */
1537             fix1             = _mm256_add_ps(fix1,tx);
1538             fiy1             = _mm256_add_ps(fiy1,ty);
1539             fiz1             = _mm256_add_ps(fiz1,tz);
1540
1541             fjx3             = _mm256_add_ps(fjx3,tx);
1542             fjy3             = _mm256_add_ps(fjy3,ty);
1543             fjz3             = _mm256_add_ps(fjz3,tz);
1544
1545             /**************************
1546              * CALCULATE INTERACTIONS *
1547              **************************/
1548
1549             r21              = _mm256_mul_ps(rsq21,rinv21);
1550
1551             /* EWALD ELECTROSTATICS */
1552             
1553             /* Analytical PME correction */
1554             zeta2            = _mm256_mul_ps(beta2,rsq21);
1555             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1556             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1557             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1558             felec            = _mm256_mul_ps(qq21,felec);
1559             
1560             fscal            = felec;
1561
1562             /* Calculate temporary vectorial force */
1563             tx               = _mm256_mul_ps(fscal,dx21);
1564             ty               = _mm256_mul_ps(fscal,dy21);
1565             tz               = _mm256_mul_ps(fscal,dz21);
1566
1567             /* Update vectorial force */
1568             fix2             = _mm256_add_ps(fix2,tx);
1569             fiy2             = _mm256_add_ps(fiy2,ty);
1570             fiz2             = _mm256_add_ps(fiz2,tz);
1571
1572             fjx1             = _mm256_add_ps(fjx1,tx);
1573             fjy1             = _mm256_add_ps(fjy1,ty);
1574             fjz1             = _mm256_add_ps(fjz1,tz);
1575
1576             /**************************
1577              * CALCULATE INTERACTIONS *
1578              **************************/
1579
1580             r22              = _mm256_mul_ps(rsq22,rinv22);
1581
1582             /* EWALD ELECTROSTATICS */
1583             
1584             /* Analytical PME correction */
1585             zeta2            = _mm256_mul_ps(beta2,rsq22);
1586             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1587             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1588             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1589             felec            = _mm256_mul_ps(qq22,felec);
1590             
1591             fscal            = felec;
1592
1593             /* Calculate temporary vectorial force */
1594             tx               = _mm256_mul_ps(fscal,dx22);
1595             ty               = _mm256_mul_ps(fscal,dy22);
1596             tz               = _mm256_mul_ps(fscal,dz22);
1597
1598             /* Update vectorial force */
1599             fix2             = _mm256_add_ps(fix2,tx);
1600             fiy2             = _mm256_add_ps(fiy2,ty);
1601             fiz2             = _mm256_add_ps(fiz2,tz);
1602
1603             fjx2             = _mm256_add_ps(fjx2,tx);
1604             fjy2             = _mm256_add_ps(fjy2,ty);
1605             fjz2             = _mm256_add_ps(fjz2,tz);
1606
1607             /**************************
1608              * CALCULATE INTERACTIONS *
1609              **************************/
1610
1611             r23              = _mm256_mul_ps(rsq23,rinv23);
1612
1613             /* EWALD ELECTROSTATICS */
1614             
1615             /* Analytical PME correction */
1616             zeta2            = _mm256_mul_ps(beta2,rsq23);
1617             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
1618             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1619             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1620             felec            = _mm256_mul_ps(qq23,felec);
1621             
1622             fscal            = felec;
1623
1624             /* Calculate temporary vectorial force */
1625             tx               = _mm256_mul_ps(fscal,dx23);
1626             ty               = _mm256_mul_ps(fscal,dy23);
1627             tz               = _mm256_mul_ps(fscal,dz23);
1628
1629             /* Update vectorial force */
1630             fix2             = _mm256_add_ps(fix2,tx);
1631             fiy2             = _mm256_add_ps(fiy2,ty);
1632             fiz2             = _mm256_add_ps(fiz2,tz);
1633
1634             fjx3             = _mm256_add_ps(fjx3,tx);
1635             fjy3             = _mm256_add_ps(fjy3,ty);
1636             fjz3             = _mm256_add_ps(fjz3,tz);
1637
1638             /**************************
1639              * CALCULATE INTERACTIONS *
1640              **************************/
1641
1642             r31              = _mm256_mul_ps(rsq31,rinv31);
1643
1644             /* EWALD ELECTROSTATICS */
1645             
1646             /* Analytical PME correction */
1647             zeta2            = _mm256_mul_ps(beta2,rsq31);
1648             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
1649             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1650             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1651             felec            = _mm256_mul_ps(qq31,felec);
1652             
1653             fscal            = felec;
1654
1655             /* Calculate temporary vectorial force */
1656             tx               = _mm256_mul_ps(fscal,dx31);
1657             ty               = _mm256_mul_ps(fscal,dy31);
1658             tz               = _mm256_mul_ps(fscal,dz31);
1659
1660             /* Update vectorial force */
1661             fix3             = _mm256_add_ps(fix3,tx);
1662             fiy3             = _mm256_add_ps(fiy3,ty);
1663             fiz3             = _mm256_add_ps(fiz3,tz);
1664
1665             fjx1             = _mm256_add_ps(fjx1,tx);
1666             fjy1             = _mm256_add_ps(fjy1,ty);
1667             fjz1             = _mm256_add_ps(fjz1,tz);
1668
1669             /**************************
1670              * CALCULATE INTERACTIONS *
1671              **************************/
1672
1673             r32              = _mm256_mul_ps(rsq32,rinv32);
1674
1675             /* EWALD ELECTROSTATICS */
1676             
1677             /* Analytical PME correction */
1678             zeta2            = _mm256_mul_ps(beta2,rsq32);
1679             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
1680             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1681             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1682             felec            = _mm256_mul_ps(qq32,felec);
1683             
1684             fscal            = felec;
1685
1686             /* Calculate temporary vectorial force */
1687             tx               = _mm256_mul_ps(fscal,dx32);
1688             ty               = _mm256_mul_ps(fscal,dy32);
1689             tz               = _mm256_mul_ps(fscal,dz32);
1690
1691             /* Update vectorial force */
1692             fix3             = _mm256_add_ps(fix3,tx);
1693             fiy3             = _mm256_add_ps(fiy3,ty);
1694             fiz3             = _mm256_add_ps(fiz3,tz);
1695
1696             fjx2             = _mm256_add_ps(fjx2,tx);
1697             fjy2             = _mm256_add_ps(fjy2,ty);
1698             fjz2             = _mm256_add_ps(fjz2,tz);
1699
1700             /**************************
1701              * CALCULATE INTERACTIONS *
1702              **************************/
1703
1704             r33              = _mm256_mul_ps(rsq33,rinv33);
1705
1706             /* EWALD ELECTROSTATICS */
1707             
1708             /* Analytical PME correction */
1709             zeta2            = _mm256_mul_ps(beta2,rsq33);
1710             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
1711             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1712             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1713             felec            = _mm256_mul_ps(qq33,felec);
1714             
1715             fscal            = felec;
1716
1717             /* Calculate temporary vectorial force */
1718             tx               = _mm256_mul_ps(fscal,dx33);
1719             ty               = _mm256_mul_ps(fscal,dy33);
1720             tz               = _mm256_mul_ps(fscal,dz33);
1721
1722             /* Update vectorial force */
1723             fix3             = _mm256_add_ps(fix3,tx);
1724             fiy3             = _mm256_add_ps(fiy3,ty);
1725             fiz3             = _mm256_add_ps(fiz3,tz);
1726
1727             fjx3             = _mm256_add_ps(fjx3,tx);
1728             fjy3             = _mm256_add_ps(fjy3,ty);
1729             fjz3             = _mm256_add_ps(fjz3,tz);
1730
1731             fjptrA             = f+j_coord_offsetA;
1732             fjptrB             = f+j_coord_offsetB;
1733             fjptrC             = f+j_coord_offsetC;
1734             fjptrD             = f+j_coord_offsetD;
1735             fjptrE             = f+j_coord_offsetE;
1736             fjptrF             = f+j_coord_offsetF;
1737             fjptrG             = f+j_coord_offsetG;
1738             fjptrH             = f+j_coord_offsetH;
1739
1740             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1741                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1742                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1743
1744             /* Inner loop uses 504 flops */
1745         }
1746
1747         if(jidx<j_index_end)
1748         {
1749
1750             /* Get j neighbor index, and coordinate index */
1751             jnrlistA         = jjnr[jidx];
1752             jnrlistB         = jjnr[jidx+1];
1753             jnrlistC         = jjnr[jidx+2];
1754             jnrlistD         = jjnr[jidx+3];
1755             jnrlistE         = jjnr[jidx+4];
1756             jnrlistF         = jjnr[jidx+5];
1757             jnrlistG         = jjnr[jidx+6];
1758             jnrlistH         = jjnr[jidx+7];
1759             /* Sign of each element will be negative for non-real atoms.
1760              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1761              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1762              */
1763             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1764                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1765                                             
1766             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1767             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1768             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1769             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1770             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1771             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1772             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1773             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1774             j_coord_offsetA  = DIM*jnrA;
1775             j_coord_offsetB  = DIM*jnrB;
1776             j_coord_offsetC  = DIM*jnrC;
1777             j_coord_offsetD  = DIM*jnrD;
1778             j_coord_offsetE  = DIM*jnrE;
1779             j_coord_offsetF  = DIM*jnrF;
1780             j_coord_offsetG  = DIM*jnrG;
1781             j_coord_offsetH  = DIM*jnrH;
1782
1783             /* load j atom coordinates */
1784             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1785                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1786                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1787                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1788                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1789
1790             /* Calculate displacement vector */
1791             dx11             = _mm256_sub_ps(ix1,jx1);
1792             dy11             = _mm256_sub_ps(iy1,jy1);
1793             dz11             = _mm256_sub_ps(iz1,jz1);
1794             dx12             = _mm256_sub_ps(ix1,jx2);
1795             dy12             = _mm256_sub_ps(iy1,jy2);
1796             dz12             = _mm256_sub_ps(iz1,jz2);
1797             dx13             = _mm256_sub_ps(ix1,jx3);
1798             dy13             = _mm256_sub_ps(iy1,jy3);
1799             dz13             = _mm256_sub_ps(iz1,jz3);
1800             dx21             = _mm256_sub_ps(ix2,jx1);
1801             dy21             = _mm256_sub_ps(iy2,jy1);
1802             dz21             = _mm256_sub_ps(iz2,jz1);
1803             dx22             = _mm256_sub_ps(ix2,jx2);
1804             dy22             = _mm256_sub_ps(iy2,jy2);
1805             dz22             = _mm256_sub_ps(iz2,jz2);
1806             dx23             = _mm256_sub_ps(ix2,jx3);
1807             dy23             = _mm256_sub_ps(iy2,jy3);
1808             dz23             = _mm256_sub_ps(iz2,jz3);
1809             dx31             = _mm256_sub_ps(ix3,jx1);
1810             dy31             = _mm256_sub_ps(iy3,jy1);
1811             dz31             = _mm256_sub_ps(iz3,jz1);
1812             dx32             = _mm256_sub_ps(ix3,jx2);
1813             dy32             = _mm256_sub_ps(iy3,jy2);
1814             dz32             = _mm256_sub_ps(iz3,jz2);
1815             dx33             = _mm256_sub_ps(ix3,jx3);
1816             dy33             = _mm256_sub_ps(iy3,jy3);
1817             dz33             = _mm256_sub_ps(iz3,jz3);
1818
1819             /* Calculate squared distance and things based on it */
1820             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1821             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1822             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1823             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1824             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1825             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1826             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1827             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1828             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1829
1830             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1831             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1832             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1833             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1834             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1835             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1836             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1837             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1838             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1839
1840             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1841             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1842             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1843             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1844             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1845             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1846             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1847             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1848             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1849
1850             fjx1             = _mm256_setzero_ps();
1851             fjy1             = _mm256_setzero_ps();
1852             fjz1             = _mm256_setzero_ps();
1853             fjx2             = _mm256_setzero_ps();
1854             fjy2             = _mm256_setzero_ps();
1855             fjz2             = _mm256_setzero_ps();
1856             fjx3             = _mm256_setzero_ps();
1857             fjy3             = _mm256_setzero_ps();
1858             fjz3             = _mm256_setzero_ps();
1859
1860             /**************************
1861              * CALCULATE INTERACTIONS *
1862              **************************/
1863
1864             r11              = _mm256_mul_ps(rsq11,rinv11);
1865             r11              = _mm256_andnot_ps(dummy_mask,r11);
1866
1867             /* EWALD ELECTROSTATICS */
1868             
1869             /* Analytical PME correction */
1870             zeta2            = _mm256_mul_ps(beta2,rsq11);
1871             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1872             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1873             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1874             felec            = _mm256_mul_ps(qq11,felec);
1875             
1876             fscal            = felec;
1877
1878             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1879
1880             /* Calculate temporary vectorial force */
1881             tx               = _mm256_mul_ps(fscal,dx11);
1882             ty               = _mm256_mul_ps(fscal,dy11);
1883             tz               = _mm256_mul_ps(fscal,dz11);
1884
1885             /* Update vectorial force */
1886             fix1             = _mm256_add_ps(fix1,tx);
1887             fiy1             = _mm256_add_ps(fiy1,ty);
1888             fiz1             = _mm256_add_ps(fiz1,tz);
1889
1890             fjx1             = _mm256_add_ps(fjx1,tx);
1891             fjy1             = _mm256_add_ps(fjy1,ty);
1892             fjz1             = _mm256_add_ps(fjz1,tz);
1893
1894             /**************************
1895              * CALCULATE INTERACTIONS *
1896              **************************/
1897
1898             r12              = _mm256_mul_ps(rsq12,rinv12);
1899             r12              = _mm256_andnot_ps(dummy_mask,r12);
1900
1901             /* EWALD ELECTROSTATICS */
1902             
1903             /* Analytical PME correction */
1904             zeta2            = _mm256_mul_ps(beta2,rsq12);
1905             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1906             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1907             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1908             felec            = _mm256_mul_ps(qq12,felec);
1909             
1910             fscal            = felec;
1911
1912             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1913
1914             /* Calculate temporary vectorial force */
1915             tx               = _mm256_mul_ps(fscal,dx12);
1916             ty               = _mm256_mul_ps(fscal,dy12);
1917             tz               = _mm256_mul_ps(fscal,dz12);
1918
1919             /* Update vectorial force */
1920             fix1             = _mm256_add_ps(fix1,tx);
1921             fiy1             = _mm256_add_ps(fiy1,ty);
1922             fiz1             = _mm256_add_ps(fiz1,tz);
1923
1924             fjx2             = _mm256_add_ps(fjx2,tx);
1925             fjy2             = _mm256_add_ps(fjy2,ty);
1926             fjz2             = _mm256_add_ps(fjz2,tz);
1927
1928             /**************************
1929              * CALCULATE INTERACTIONS *
1930              **************************/
1931
1932             r13              = _mm256_mul_ps(rsq13,rinv13);
1933             r13              = _mm256_andnot_ps(dummy_mask,r13);
1934
1935             /* EWALD ELECTROSTATICS */
1936             
1937             /* Analytical PME correction */
1938             zeta2            = _mm256_mul_ps(beta2,rsq13);
1939             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
1940             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1941             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1942             felec            = _mm256_mul_ps(qq13,felec);
1943             
1944             fscal            = felec;
1945
1946             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1947
1948             /* Calculate temporary vectorial force */
1949             tx               = _mm256_mul_ps(fscal,dx13);
1950             ty               = _mm256_mul_ps(fscal,dy13);
1951             tz               = _mm256_mul_ps(fscal,dz13);
1952
1953             /* Update vectorial force */
1954             fix1             = _mm256_add_ps(fix1,tx);
1955             fiy1             = _mm256_add_ps(fiy1,ty);
1956             fiz1             = _mm256_add_ps(fiz1,tz);
1957
1958             fjx3             = _mm256_add_ps(fjx3,tx);
1959             fjy3             = _mm256_add_ps(fjy3,ty);
1960             fjz3             = _mm256_add_ps(fjz3,tz);
1961
1962             /**************************
1963              * CALCULATE INTERACTIONS *
1964              **************************/
1965
1966             r21              = _mm256_mul_ps(rsq21,rinv21);
1967             r21              = _mm256_andnot_ps(dummy_mask,r21);
1968
1969             /* EWALD ELECTROSTATICS */
1970             
1971             /* Analytical PME correction */
1972             zeta2            = _mm256_mul_ps(beta2,rsq21);
1973             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1974             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1975             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1976             felec            = _mm256_mul_ps(qq21,felec);
1977             
1978             fscal            = felec;
1979
1980             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1981
1982             /* Calculate temporary vectorial force */
1983             tx               = _mm256_mul_ps(fscal,dx21);
1984             ty               = _mm256_mul_ps(fscal,dy21);
1985             tz               = _mm256_mul_ps(fscal,dz21);
1986
1987             /* Update vectorial force */
1988             fix2             = _mm256_add_ps(fix2,tx);
1989             fiy2             = _mm256_add_ps(fiy2,ty);
1990             fiz2             = _mm256_add_ps(fiz2,tz);
1991
1992             fjx1             = _mm256_add_ps(fjx1,tx);
1993             fjy1             = _mm256_add_ps(fjy1,ty);
1994             fjz1             = _mm256_add_ps(fjz1,tz);
1995
1996             /**************************
1997              * CALCULATE INTERACTIONS *
1998              **************************/
1999
2000             r22              = _mm256_mul_ps(rsq22,rinv22);
2001             r22              = _mm256_andnot_ps(dummy_mask,r22);
2002
2003             /* EWALD ELECTROSTATICS */
2004             
2005             /* Analytical PME correction */
2006             zeta2            = _mm256_mul_ps(beta2,rsq22);
2007             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2008             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2009             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2010             felec            = _mm256_mul_ps(qq22,felec);
2011             
2012             fscal            = felec;
2013
2014             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2015
2016             /* Calculate temporary vectorial force */
2017             tx               = _mm256_mul_ps(fscal,dx22);
2018             ty               = _mm256_mul_ps(fscal,dy22);
2019             tz               = _mm256_mul_ps(fscal,dz22);
2020
2021             /* Update vectorial force */
2022             fix2             = _mm256_add_ps(fix2,tx);
2023             fiy2             = _mm256_add_ps(fiy2,ty);
2024             fiz2             = _mm256_add_ps(fiz2,tz);
2025
2026             fjx2             = _mm256_add_ps(fjx2,tx);
2027             fjy2             = _mm256_add_ps(fjy2,ty);
2028             fjz2             = _mm256_add_ps(fjz2,tz);
2029
2030             /**************************
2031              * CALCULATE INTERACTIONS *
2032              **************************/
2033
2034             r23              = _mm256_mul_ps(rsq23,rinv23);
2035             r23              = _mm256_andnot_ps(dummy_mask,r23);
2036
2037             /* EWALD ELECTROSTATICS */
2038             
2039             /* Analytical PME correction */
2040             zeta2            = _mm256_mul_ps(beta2,rsq23);
2041             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
2042             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2043             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2044             felec            = _mm256_mul_ps(qq23,felec);
2045             
2046             fscal            = felec;
2047
2048             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2049
2050             /* Calculate temporary vectorial force */
2051             tx               = _mm256_mul_ps(fscal,dx23);
2052             ty               = _mm256_mul_ps(fscal,dy23);
2053             tz               = _mm256_mul_ps(fscal,dz23);
2054
2055             /* Update vectorial force */
2056             fix2             = _mm256_add_ps(fix2,tx);
2057             fiy2             = _mm256_add_ps(fiy2,ty);
2058             fiz2             = _mm256_add_ps(fiz2,tz);
2059
2060             fjx3             = _mm256_add_ps(fjx3,tx);
2061             fjy3             = _mm256_add_ps(fjy3,ty);
2062             fjz3             = _mm256_add_ps(fjz3,tz);
2063
2064             /**************************
2065              * CALCULATE INTERACTIONS *
2066              **************************/
2067
2068             r31              = _mm256_mul_ps(rsq31,rinv31);
2069             r31              = _mm256_andnot_ps(dummy_mask,r31);
2070
2071             /* EWALD ELECTROSTATICS */
2072             
2073             /* Analytical PME correction */
2074             zeta2            = _mm256_mul_ps(beta2,rsq31);
2075             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
2076             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2077             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2078             felec            = _mm256_mul_ps(qq31,felec);
2079             
2080             fscal            = felec;
2081
2082             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2083
2084             /* Calculate temporary vectorial force */
2085             tx               = _mm256_mul_ps(fscal,dx31);
2086             ty               = _mm256_mul_ps(fscal,dy31);
2087             tz               = _mm256_mul_ps(fscal,dz31);
2088
2089             /* Update vectorial force */
2090             fix3             = _mm256_add_ps(fix3,tx);
2091             fiy3             = _mm256_add_ps(fiy3,ty);
2092             fiz3             = _mm256_add_ps(fiz3,tz);
2093
2094             fjx1             = _mm256_add_ps(fjx1,tx);
2095             fjy1             = _mm256_add_ps(fjy1,ty);
2096             fjz1             = _mm256_add_ps(fjz1,tz);
2097
2098             /**************************
2099              * CALCULATE INTERACTIONS *
2100              **************************/
2101
2102             r32              = _mm256_mul_ps(rsq32,rinv32);
2103             r32              = _mm256_andnot_ps(dummy_mask,r32);
2104
2105             /* EWALD ELECTROSTATICS */
2106             
2107             /* Analytical PME correction */
2108             zeta2            = _mm256_mul_ps(beta2,rsq32);
2109             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
2110             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2111             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2112             felec            = _mm256_mul_ps(qq32,felec);
2113             
2114             fscal            = felec;
2115
2116             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2117
2118             /* Calculate temporary vectorial force */
2119             tx               = _mm256_mul_ps(fscal,dx32);
2120             ty               = _mm256_mul_ps(fscal,dy32);
2121             tz               = _mm256_mul_ps(fscal,dz32);
2122
2123             /* Update vectorial force */
2124             fix3             = _mm256_add_ps(fix3,tx);
2125             fiy3             = _mm256_add_ps(fiy3,ty);
2126             fiz3             = _mm256_add_ps(fiz3,tz);
2127
2128             fjx2             = _mm256_add_ps(fjx2,tx);
2129             fjy2             = _mm256_add_ps(fjy2,ty);
2130             fjz2             = _mm256_add_ps(fjz2,tz);
2131
2132             /**************************
2133              * CALCULATE INTERACTIONS *
2134              **************************/
2135
2136             r33              = _mm256_mul_ps(rsq33,rinv33);
2137             r33              = _mm256_andnot_ps(dummy_mask,r33);
2138
2139             /* EWALD ELECTROSTATICS */
2140             
2141             /* Analytical PME correction */
2142             zeta2            = _mm256_mul_ps(beta2,rsq33);
2143             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
2144             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2145             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2146             felec            = _mm256_mul_ps(qq33,felec);
2147             
2148             fscal            = felec;
2149
2150             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2151
2152             /* Calculate temporary vectorial force */
2153             tx               = _mm256_mul_ps(fscal,dx33);
2154             ty               = _mm256_mul_ps(fscal,dy33);
2155             tz               = _mm256_mul_ps(fscal,dz33);
2156
2157             /* Update vectorial force */
2158             fix3             = _mm256_add_ps(fix3,tx);
2159             fiy3             = _mm256_add_ps(fiy3,ty);
2160             fiz3             = _mm256_add_ps(fiz3,tz);
2161
2162             fjx3             = _mm256_add_ps(fjx3,tx);
2163             fjy3             = _mm256_add_ps(fjy3,ty);
2164             fjz3             = _mm256_add_ps(fjz3,tz);
2165
2166             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2167             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2168             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2169             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2170             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2171             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2172             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2173             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2174
2175             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2176                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2177                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2178
2179             /* Inner loop uses 513 flops */
2180         }
2181
2182         /* End of innermost loop */
2183
2184         gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2185                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
2186
2187         /* Increment number of inner iterations */
2188         inneriter                  += j_index_end - j_index_start;
2189
2190         /* Outer loop uses 18 flops */
2191     }
2192
2193     /* Increment number of outer iterations */
2194     outeriter        += nri;
2195
2196     /* Update outer/inner flops */
2197
2198     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*513);
2199 }