3665057d3c46bbd0439f9b1dc3e08c2bfddc8530
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_avx_256_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Particle
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_256_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrE,jnrF,jnrG,jnrH;
62     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
67     real             rcutoff_scalar;
68     real             *shiftvec,*fshift,*x,*f;
69     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
70     real             scratch[4*DIM];
71     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72     real *           vdwioffsetptr0;
73     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74     real *           vdwioffsetptr1;
75     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76     real *           vdwioffsetptr2;
77     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78     real *           vdwioffsetptr3;
79     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
80     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
81     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
82     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
84     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m256           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
86     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
87     real             *charge;
88     int              nvdwtype;
89     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
90     int              *vdwtype;
91     real             *vdwparam;
92     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
93     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
94     __m256i          ewitab;
95     __m128i          ewitab_lo,ewitab_hi;
96     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
97     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
98     real             *ewtab;
99     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
100     real             rswitch_scalar,d_scalar;
101     __m256           dummy_mask,cutoff_mask;
102     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
103     __m256           one     = _mm256_set1_ps(1.0);
104     __m256           two     = _mm256_set1_ps(2.0);
105     x                = xx[0];
106     f                = ff[0];
107
108     nri              = nlist->nri;
109     iinr             = nlist->iinr;
110     jindex           = nlist->jindex;
111     jjnr             = nlist->jjnr;
112     shiftidx         = nlist->shift;
113     gid              = nlist->gid;
114     shiftvec         = fr->shift_vec[0];
115     fshift           = fr->fshift[0];
116     facel            = _mm256_set1_ps(fr->epsfac);
117     charge           = mdatoms->chargeA;
118     nvdwtype         = fr->ntype;
119     vdwparam         = fr->nbfp;
120     vdwtype          = mdatoms->typeA;
121
122     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
123     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
124     beta2            = _mm256_mul_ps(beta,beta);
125     beta3            = _mm256_mul_ps(beta,beta2);
126
127     ewtab            = fr->ic->tabq_coul_FDV0;
128     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
129     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
130
131     /* Setup water-specific parameters */
132     inr              = nlist->iinr[0];
133     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
134     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
135     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
136     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
137
138     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
139     rcutoff_scalar   = fr->rcoulomb;
140     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
141     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
142
143     rswitch_scalar   = fr->rcoulomb_switch;
144     rswitch          = _mm256_set1_ps(rswitch_scalar);
145     /* Setup switch parameters */
146     d_scalar         = rcutoff_scalar-rswitch_scalar;
147     d                = _mm256_set1_ps(d_scalar);
148     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
149     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
150     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
151     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
152     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
153     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
154
155     /* Avoid stupid compiler warnings */
156     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
157     j_coord_offsetA = 0;
158     j_coord_offsetB = 0;
159     j_coord_offsetC = 0;
160     j_coord_offsetD = 0;
161     j_coord_offsetE = 0;
162     j_coord_offsetF = 0;
163     j_coord_offsetG = 0;
164     j_coord_offsetH = 0;
165
166     outeriter        = 0;
167     inneriter        = 0;
168
169     for(iidx=0;iidx<4*DIM;iidx++)
170     {
171         scratch[iidx] = 0.0;
172     }
173
174     /* Start outer loop over neighborlists */
175     for(iidx=0; iidx<nri; iidx++)
176     {
177         /* Load shift vector for this list */
178         i_shift_offset   = DIM*shiftidx[iidx];
179
180         /* Load limits for loop over neighbors */
181         j_index_start    = jindex[iidx];
182         j_index_end      = jindex[iidx+1];
183
184         /* Get outer coordinate index */
185         inr              = iinr[iidx];
186         i_coord_offset   = DIM*inr;
187
188         /* Load i particle coords and add shift vector */
189         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
190                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
191
192         fix0             = _mm256_setzero_ps();
193         fiy0             = _mm256_setzero_ps();
194         fiz0             = _mm256_setzero_ps();
195         fix1             = _mm256_setzero_ps();
196         fiy1             = _mm256_setzero_ps();
197         fiz1             = _mm256_setzero_ps();
198         fix2             = _mm256_setzero_ps();
199         fiy2             = _mm256_setzero_ps();
200         fiz2             = _mm256_setzero_ps();
201         fix3             = _mm256_setzero_ps();
202         fiy3             = _mm256_setzero_ps();
203         fiz3             = _mm256_setzero_ps();
204
205         /* Reset potential sums */
206         velecsum         = _mm256_setzero_ps();
207         vvdwsum          = _mm256_setzero_ps();
208
209         /* Start inner kernel loop */
210         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
211         {
212
213             /* Get j neighbor index, and coordinate index */
214             jnrA             = jjnr[jidx];
215             jnrB             = jjnr[jidx+1];
216             jnrC             = jjnr[jidx+2];
217             jnrD             = jjnr[jidx+3];
218             jnrE             = jjnr[jidx+4];
219             jnrF             = jjnr[jidx+5];
220             jnrG             = jjnr[jidx+6];
221             jnrH             = jjnr[jidx+7];
222             j_coord_offsetA  = DIM*jnrA;
223             j_coord_offsetB  = DIM*jnrB;
224             j_coord_offsetC  = DIM*jnrC;
225             j_coord_offsetD  = DIM*jnrD;
226             j_coord_offsetE  = DIM*jnrE;
227             j_coord_offsetF  = DIM*jnrF;
228             j_coord_offsetG  = DIM*jnrG;
229             j_coord_offsetH  = DIM*jnrH;
230
231             /* load j atom coordinates */
232             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
233                                                  x+j_coord_offsetC,x+j_coord_offsetD,
234                                                  x+j_coord_offsetE,x+j_coord_offsetF,
235                                                  x+j_coord_offsetG,x+j_coord_offsetH,
236                                                  &jx0,&jy0,&jz0);
237
238             /* Calculate displacement vector */
239             dx00             = _mm256_sub_ps(ix0,jx0);
240             dy00             = _mm256_sub_ps(iy0,jy0);
241             dz00             = _mm256_sub_ps(iz0,jz0);
242             dx10             = _mm256_sub_ps(ix1,jx0);
243             dy10             = _mm256_sub_ps(iy1,jy0);
244             dz10             = _mm256_sub_ps(iz1,jz0);
245             dx20             = _mm256_sub_ps(ix2,jx0);
246             dy20             = _mm256_sub_ps(iy2,jy0);
247             dz20             = _mm256_sub_ps(iz2,jz0);
248             dx30             = _mm256_sub_ps(ix3,jx0);
249             dy30             = _mm256_sub_ps(iy3,jy0);
250             dz30             = _mm256_sub_ps(iz3,jz0);
251
252             /* Calculate squared distance and things based on it */
253             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
254             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
255             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
256             rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
257
258             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
259             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
260             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
261             rinv30           = gmx_mm256_invsqrt_ps(rsq30);
262
263             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
264             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
265             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
266             rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
267
268             /* Load parameters for j particles */
269             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
270                                                                  charge+jnrC+0,charge+jnrD+0,
271                                                                  charge+jnrE+0,charge+jnrF+0,
272                                                                  charge+jnrG+0,charge+jnrH+0);
273             vdwjidx0A        = 2*vdwtype[jnrA+0];
274             vdwjidx0B        = 2*vdwtype[jnrB+0];
275             vdwjidx0C        = 2*vdwtype[jnrC+0];
276             vdwjidx0D        = 2*vdwtype[jnrD+0];
277             vdwjidx0E        = 2*vdwtype[jnrE+0];
278             vdwjidx0F        = 2*vdwtype[jnrF+0];
279             vdwjidx0G        = 2*vdwtype[jnrG+0];
280             vdwjidx0H        = 2*vdwtype[jnrH+0];
281
282             fjx0             = _mm256_setzero_ps();
283             fjy0             = _mm256_setzero_ps();
284             fjz0             = _mm256_setzero_ps();
285
286             /**************************
287              * CALCULATE INTERACTIONS *
288              **************************/
289
290             if (gmx_mm256_any_lt(rsq00,rcutoff2))
291             {
292
293             r00              = _mm256_mul_ps(rsq00,rinv00);
294
295             /* Compute parameters for interactions between i and j atoms */
296             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
297                                             vdwioffsetptr0+vdwjidx0B,
298                                             vdwioffsetptr0+vdwjidx0C,
299                                             vdwioffsetptr0+vdwjidx0D,
300                                             vdwioffsetptr0+vdwjidx0E,
301                                             vdwioffsetptr0+vdwjidx0F,
302                                             vdwioffsetptr0+vdwjidx0G,
303                                             vdwioffsetptr0+vdwjidx0H,
304                                             &c6_00,&c12_00);
305
306             /* LENNARD-JONES DISPERSION/REPULSION */
307
308             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
309             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
310             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
311             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
312             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
313
314             d                = _mm256_sub_ps(r00,rswitch);
315             d                = _mm256_max_ps(d,_mm256_setzero_ps());
316             d2               = _mm256_mul_ps(d,d);
317             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
318
319             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
320
321             /* Evaluate switch function */
322             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
323             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
324             vvdw             = _mm256_mul_ps(vvdw,sw);
325             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
326
327             /* Update potential sum for this i atom from the interaction with this j atom. */
328             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
329             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
330
331             fscal            = fvdw;
332
333             fscal            = _mm256_and_ps(fscal,cutoff_mask);
334
335             /* Calculate temporary vectorial force */
336             tx               = _mm256_mul_ps(fscal,dx00);
337             ty               = _mm256_mul_ps(fscal,dy00);
338             tz               = _mm256_mul_ps(fscal,dz00);
339
340             /* Update vectorial force */
341             fix0             = _mm256_add_ps(fix0,tx);
342             fiy0             = _mm256_add_ps(fiy0,ty);
343             fiz0             = _mm256_add_ps(fiz0,tz);
344
345             fjx0             = _mm256_add_ps(fjx0,tx);
346             fjy0             = _mm256_add_ps(fjy0,ty);
347             fjz0             = _mm256_add_ps(fjz0,tz);
348
349             }
350
351             /**************************
352              * CALCULATE INTERACTIONS *
353              **************************/
354
355             if (gmx_mm256_any_lt(rsq10,rcutoff2))
356             {
357
358             r10              = _mm256_mul_ps(rsq10,rinv10);
359
360             /* Compute parameters for interactions between i and j atoms */
361             qq10             = _mm256_mul_ps(iq1,jq0);
362
363             /* EWALD ELECTROSTATICS */
364             
365             /* Analytical PME correction */
366             zeta2            = _mm256_mul_ps(beta2,rsq10);
367             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
368             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
369             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
370             felec            = _mm256_mul_ps(qq10,felec);
371             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
372             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
373             velec            = _mm256_sub_ps(rinv10,pmecorrV);
374             velec            = _mm256_mul_ps(qq10,velec);
375             
376             d                = _mm256_sub_ps(r10,rswitch);
377             d                = _mm256_max_ps(d,_mm256_setzero_ps());
378             d2               = _mm256_mul_ps(d,d);
379             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
380
381             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
382
383             /* Evaluate switch function */
384             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
385             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
386             velec            = _mm256_mul_ps(velec,sw);
387             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velec            = _mm256_and_ps(velec,cutoff_mask);
391             velecsum         = _mm256_add_ps(velecsum,velec);
392
393             fscal            = felec;
394
395             fscal            = _mm256_and_ps(fscal,cutoff_mask);
396
397             /* Calculate temporary vectorial force */
398             tx               = _mm256_mul_ps(fscal,dx10);
399             ty               = _mm256_mul_ps(fscal,dy10);
400             tz               = _mm256_mul_ps(fscal,dz10);
401
402             /* Update vectorial force */
403             fix1             = _mm256_add_ps(fix1,tx);
404             fiy1             = _mm256_add_ps(fiy1,ty);
405             fiz1             = _mm256_add_ps(fiz1,tz);
406
407             fjx0             = _mm256_add_ps(fjx0,tx);
408             fjy0             = _mm256_add_ps(fjy0,ty);
409             fjz0             = _mm256_add_ps(fjz0,tz);
410
411             }
412
413             /**************************
414              * CALCULATE INTERACTIONS *
415              **************************/
416
417             if (gmx_mm256_any_lt(rsq20,rcutoff2))
418             {
419
420             r20              = _mm256_mul_ps(rsq20,rinv20);
421
422             /* Compute parameters for interactions between i and j atoms */
423             qq20             = _mm256_mul_ps(iq2,jq0);
424
425             /* EWALD ELECTROSTATICS */
426             
427             /* Analytical PME correction */
428             zeta2            = _mm256_mul_ps(beta2,rsq20);
429             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
430             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
431             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
432             felec            = _mm256_mul_ps(qq20,felec);
433             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
434             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
435             velec            = _mm256_sub_ps(rinv20,pmecorrV);
436             velec            = _mm256_mul_ps(qq20,velec);
437             
438             d                = _mm256_sub_ps(r20,rswitch);
439             d                = _mm256_max_ps(d,_mm256_setzero_ps());
440             d2               = _mm256_mul_ps(d,d);
441             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
442
443             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
444
445             /* Evaluate switch function */
446             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
447             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
448             velec            = _mm256_mul_ps(velec,sw);
449             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
450
451             /* Update potential sum for this i atom from the interaction with this j atom. */
452             velec            = _mm256_and_ps(velec,cutoff_mask);
453             velecsum         = _mm256_add_ps(velecsum,velec);
454
455             fscal            = felec;
456
457             fscal            = _mm256_and_ps(fscal,cutoff_mask);
458
459             /* Calculate temporary vectorial force */
460             tx               = _mm256_mul_ps(fscal,dx20);
461             ty               = _mm256_mul_ps(fscal,dy20);
462             tz               = _mm256_mul_ps(fscal,dz20);
463
464             /* Update vectorial force */
465             fix2             = _mm256_add_ps(fix2,tx);
466             fiy2             = _mm256_add_ps(fiy2,ty);
467             fiz2             = _mm256_add_ps(fiz2,tz);
468
469             fjx0             = _mm256_add_ps(fjx0,tx);
470             fjy0             = _mm256_add_ps(fjy0,ty);
471             fjz0             = _mm256_add_ps(fjz0,tz);
472
473             }
474
475             /**************************
476              * CALCULATE INTERACTIONS *
477              **************************/
478
479             if (gmx_mm256_any_lt(rsq30,rcutoff2))
480             {
481
482             r30              = _mm256_mul_ps(rsq30,rinv30);
483
484             /* Compute parameters for interactions between i and j atoms */
485             qq30             = _mm256_mul_ps(iq3,jq0);
486
487             /* EWALD ELECTROSTATICS */
488             
489             /* Analytical PME correction */
490             zeta2            = _mm256_mul_ps(beta2,rsq30);
491             rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
492             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
493             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
494             felec            = _mm256_mul_ps(qq30,felec);
495             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
496             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
497             velec            = _mm256_sub_ps(rinv30,pmecorrV);
498             velec            = _mm256_mul_ps(qq30,velec);
499             
500             d                = _mm256_sub_ps(r30,rswitch);
501             d                = _mm256_max_ps(d,_mm256_setzero_ps());
502             d2               = _mm256_mul_ps(d,d);
503             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
504
505             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
506
507             /* Evaluate switch function */
508             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
509             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
510             velec            = _mm256_mul_ps(velec,sw);
511             cutoff_mask      = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
512
513             /* Update potential sum for this i atom from the interaction with this j atom. */
514             velec            = _mm256_and_ps(velec,cutoff_mask);
515             velecsum         = _mm256_add_ps(velecsum,velec);
516
517             fscal            = felec;
518
519             fscal            = _mm256_and_ps(fscal,cutoff_mask);
520
521             /* Calculate temporary vectorial force */
522             tx               = _mm256_mul_ps(fscal,dx30);
523             ty               = _mm256_mul_ps(fscal,dy30);
524             tz               = _mm256_mul_ps(fscal,dz30);
525
526             /* Update vectorial force */
527             fix3             = _mm256_add_ps(fix3,tx);
528             fiy3             = _mm256_add_ps(fiy3,ty);
529             fiz3             = _mm256_add_ps(fiz3,tz);
530
531             fjx0             = _mm256_add_ps(fjx0,tx);
532             fjy0             = _mm256_add_ps(fjy0,ty);
533             fjz0             = _mm256_add_ps(fjz0,tz);
534
535             }
536
537             fjptrA             = f+j_coord_offsetA;
538             fjptrB             = f+j_coord_offsetB;
539             fjptrC             = f+j_coord_offsetC;
540             fjptrD             = f+j_coord_offsetD;
541             fjptrE             = f+j_coord_offsetE;
542             fjptrF             = f+j_coord_offsetF;
543             fjptrG             = f+j_coord_offsetG;
544             fjptrH             = f+j_coord_offsetH;
545
546             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
547
548             /* Inner loop uses 386 flops */
549         }
550
551         if(jidx<j_index_end)
552         {
553
554             /* Get j neighbor index, and coordinate index */
555             jnrlistA         = jjnr[jidx];
556             jnrlistB         = jjnr[jidx+1];
557             jnrlistC         = jjnr[jidx+2];
558             jnrlistD         = jjnr[jidx+3];
559             jnrlistE         = jjnr[jidx+4];
560             jnrlistF         = jjnr[jidx+5];
561             jnrlistG         = jjnr[jidx+6];
562             jnrlistH         = jjnr[jidx+7];
563             /* Sign of each element will be negative for non-real atoms.
564              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
565              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
566              */
567             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
568                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
569                                             
570             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
571             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
572             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
573             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
574             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
575             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
576             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
577             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
578             j_coord_offsetA  = DIM*jnrA;
579             j_coord_offsetB  = DIM*jnrB;
580             j_coord_offsetC  = DIM*jnrC;
581             j_coord_offsetD  = DIM*jnrD;
582             j_coord_offsetE  = DIM*jnrE;
583             j_coord_offsetF  = DIM*jnrF;
584             j_coord_offsetG  = DIM*jnrG;
585             j_coord_offsetH  = DIM*jnrH;
586
587             /* load j atom coordinates */
588             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
589                                                  x+j_coord_offsetC,x+j_coord_offsetD,
590                                                  x+j_coord_offsetE,x+j_coord_offsetF,
591                                                  x+j_coord_offsetG,x+j_coord_offsetH,
592                                                  &jx0,&jy0,&jz0);
593
594             /* Calculate displacement vector */
595             dx00             = _mm256_sub_ps(ix0,jx0);
596             dy00             = _mm256_sub_ps(iy0,jy0);
597             dz00             = _mm256_sub_ps(iz0,jz0);
598             dx10             = _mm256_sub_ps(ix1,jx0);
599             dy10             = _mm256_sub_ps(iy1,jy0);
600             dz10             = _mm256_sub_ps(iz1,jz0);
601             dx20             = _mm256_sub_ps(ix2,jx0);
602             dy20             = _mm256_sub_ps(iy2,jy0);
603             dz20             = _mm256_sub_ps(iz2,jz0);
604             dx30             = _mm256_sub_ps(ix3,jx0);
605             dy30             = _mm256_sub_ps(iy3,jy0);
606             dz30             = _mm256_sub_ps(iz3,jz0);
607
608             /* Calculate squared distance and things based on it */
609             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
610             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
611             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
612             rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
613
614             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
615             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
616             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
617             rinv30           = gmx_mm256_invsqrt_ps(rsq30);
618
619             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
620             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
621             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
622             rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
623
624             /* Load parameters for j particles */
625             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
626                                                                  charge+jnrC+0,charge+jnrD+0,
627                                                                  charge+jnrE+0,charge+jnrF+0,
628                                                                  charge+jnrG+0,charge+jnrH+0);
629             vdwjidx0A        = 2*vdwtype[jnrA+0];
630             vdwjidx0B        = 2*vdwtype[jnrB+0];
631             vdwjidx0C        = 2*vdwtype[jnrC+0];
632             vdwjidx0D        = 2*vdwtype[jnrD+0];
633             vdwjidx0E        = 2*vdwtype[jnrE+0];
634             vdwjidx0F        = 2*vdwtype[jnrF+0];
635             vdwjidx0G        = 2*vdwtype[jnrG+0];
636             vdwjidx0H        = 2*vdwtype[jnrH+0];
637
638             fjx0             = _mm256_setzero_ps();
639             fjy0             = _mm256_setzero_ps();
640             fjz0             = _mm256_setzero_ps();
641
642             /**************************
643              * CALCULATE INTERACTIONS *
644              **************************/
645
646             if (gmx_mm256_any_lt(rsq00,rcutoff2))
647             {
648
649             r00              = _mm256_mul_ps(rsq00,rinv00);
650             r00              = _mm256_andnot_ps(dummy_mask,r00);
651
652             /* Compute parameters for interactions between i and j atoms */
653             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
654                                             vdwioffsetptr0+vdwjidx0B,
655                                             vdwioffsetptr0+vdwjidx0C,
656                                             vdwioffsetptr0+vdwjidx0D,
657                                             vdwioffsetptr0+vdwjidx0E,
658                                             vdwioffsetptr0+vdwjidx0F,
659                                             vdwioffsetptr0+vdwjidx0G,
660                                             vdwioffsetptr0+vdwjidx0H,
661                                             &c6_00,&c12_00);
662
663             /* LENNARD-JONES DISPERSION/REPULSION */
664
665             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
666             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
667             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
668             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
669             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
670
671             d                = _mm256_sub_ps(r00,rswitch);
672             d                = _mm256_max_ps(d,_mm256_setzero_ps());
673             d2               = _mm256_mul_ps(d,d);
674             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
675
676             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
677
678             /* Evaluate switch function */
679             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
680             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
681             vvdw             = _mm256_mul_ps(vvdw,sw);
682             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
683
684             /* Update potential sum for this i atom from the interaction with this j atom. */
685             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
686             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
687             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
688
689             fscal            = fvdw;
690
691             fscal            = _mm256_and_ps(fscal,cutoff_mask);
692
693             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
694
695             /* Calculate temporary vectorial force */
696             tx               = _mm256_mul_ps(fscal,dx00);
697             ty               = _mm256_mul_ps(fscal,dy00);
698             tz               = _mm256_mul_ps(fscal,dz00);
699
700             /* Update vectorial force */
701             fix0             = _mm256_add_ps(fix0,tx);
702             fiy0             = _mm256_add_ps(fiy0,ty);
703             fiz0             = _mm256_add_ps(fiz0,tz);
704
705             fjx0             = _mm256_add_ps(fjx0,tx);
706             fjy0             = _mm256_add_ps(fjy0,ty);
707             fjz0             = _mm256_add_ps(fjz0,tz);
708
709             }
710
711             /**************************
712              * CALCULATE INTERACTIONS *
713              **************************/
714
715             if (gmx_mm256_any_lt(rsq10,rcutoff2))
716             {
717
718             r10              = _mm256_mul_ps(rsq10,rinv10);
719             r10              = _mm256_andnot_ps(dummy_mask,r10);
720
721             /* Compute parameters for interactions between i and j atoms */
722             qq10             = _mm256_mul_ps(iq1,jq0);
723
724             /* EWALD ELECTROSTATICS */
725             
726             /* Analytical PME correction */
727             zeta2            = _mm256_mul_ps(beta2,rsq10);
728             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
729             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
730             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
731             felec            = _mm256_mul_ps(qq10,felec);
732             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
733             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
734             velec            = _mm256_sub_ps(rinv10,pmecorrV);
735             velec            = _mm256_mul_ps(qq10,velec);
736             
737             d                = _mm256_sub_ps(r10,rswitch);
738             d                = _mm256_max_ps(d,_mm256_setzero_ps());
739             d2               = _mm256_mul_ps(d,d);
740             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
741
742             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
743
744             /* Evaluate switch function */
745             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
746             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
747             velec            = _mm256_mul_ps(velec,sw);
748             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
749
750             /* Update potential sum for this i atom from the interaction with this j atom. */
751             velec            = _mm256_and_ps(velec,cutoff_mask);
752             velec            = _mm256_andnot_ps(dummy_mask,velec);
753             velecsum         = _mm256_add_ps(velecsum,velec);
754
755             fscal            = felec;
756
757             fscal            = _mm256_and_ps(fscal,cutoff_mask);
758
759             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
760
761             /* Calculate temporary vectorial force */
762             tx               = _mm256_mul_ps(fscal,dx10);
763             ty               = _mm256_mul_ps(fscal,dy10);
764             tz               = _mm256_mul_ps(fscal,dz10);
765
766             /* Update vectorial force */
767             fix1             = _mm256_add_ps(fix1,tx);
768             fiy1             = _mm256_add_ps(fiy1,ty);
769             fiz1             = _mm256_add_ps(fiz1,tz);
770
771             fjx0             = _mm256_add_ps(fjx0,tx);
772             fjy0             = _mm256_add_ps(fjy0,ty);
773             fjz0             = _mm256_add_ps(fjz0,tz);
774
775             }
776
777             /**************************
778              * CALCULATE INTERACTIONS *
779              **************************/
780
781             if (gmx_mm256_any_lt(rsq20,rcutoff2))
782             {
783
784             r20              = _mm256_mul_ps(rsq20,rinv20);
785             r20              = _mm256_andnot_ps(dummy_mask,r20);
786
787             /* Compute parameters for interactions between i and j atoms */
788             qq20             = _mm256_mul_ps(iq2,jq0);
789
790             /* EWALD ELECTROSTATICS */
791             
792             /* Analytical PME correction */
793             zeta2            = _mm256_mul_ps(beta2,rsq20);
794             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
795             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
796             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
797             felec            = _mm256_mul_ps(qq20,felec);
798             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
799             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
800             velec            = _mm256_sub_ps(rinv20,pmecorrV);
801             velec            = _mm256_mul_ps(qq20,velec);
802             
803             d                = _mm256_sub_ps(r20,rswitch);
804             d                = _mm256_max_ps(d,_mm256_setzero_ps());
805             d2               = _mm256_mul_ps(d,d);
806             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
807
808             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
809
810             /* Evaluate switch function */
811             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
812             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
813             velec            = _mm256_mul_ps(velec,sw);
814             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
815
816             /* Update potential sum for this i atom from the interaction with this j atom. */
817             velec            = _mm256_and_ps(velec,cutoff_mask);
818             velec            = _mm256_andnot_ps(dummy_mask,velec);
819             velecsum         = _mm256_add_ps(velecsum,velec);
820
821             fscal            = felec;
822
823             fscal            = _mm256_and_ps(fscal,cutoff_mask);
824
825             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
826
827             /* Calculate temporary vectorial force */
828             tx               = _mm256_mul_ps(fscal,dx20);
829             ty               = _mm256_mul_ps(fscal,dy20);
830             tz               = _mm256_mul_ps(fscal,dz20);
831
832             /* Update vectorial force */
833             fix2             = _mm256_add_ps(fix2,tx);
834             fiy2             = _mm256_add_ps(fiy2,ty);
835             fiz2             = _mm256_add_ps(fiz2,tz);
836
837             fjx0             = _mm256_add_ps(fjx0,tx);
838             fjy0             = _mm256_add_ps(fjy0,ty);
839             fjz0             = _mm256_add_ps(fjz0,tz);
840
841             }
842
843             /**************************
844              * CALCULATE INTERACTIONS *
845              **************************/
846
847             if (gmx_mm256_any_lt(rsq30,rcutoff2))
848             {
849
850             r30              = _mm256_mul_ps(rsq30,rinv30);
851             r30              = _mm256_andnot_ps(dummy_mask,r30);
852
853             /* Compute parameters for interactions between i and j atoms */
854             qq30             = _mm256_mul_ps(iq3,jq0);
855
856             /* EWALD ELECTROSTATICS */
857             
858             /* Analytical PME correction */
859             zeta2            = _mm256_mul_ps(beta2,rsq30);
860             rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
861             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
862             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
863             felec            = _mm256_mul_ps(qq30,felec);
864             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
865             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
866             velec            = _mm256_sub_ps(rinv30,pmecorrV);
867             velec            = _mm256_mul_ps(qq30,velec);
868             
869             d                = _mm256_sub_ps(r30,rswitch);
870             d                = _mm256_max_ps(d,_mm256_setzero_ps());
871             d2               = _mm256_mul_ps(d,d);
872             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
873
874             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
875
876             /* Evaluate switch function */
877             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
878             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
879             velec            = _mm256_mul_ps(velec,sw);
880             cutoff_mask      = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
881
882             /* Update potential sum for this i atom from the interaction with this j atom. */
883             velec            = _mm256_and_ps(velec,cutoff_mask);
884             velec            = _mm256_andnot_ps(dummy_mask,velec);
885             velecsum         = _mm256_add_ps(velecsum,velec);
886
887             fscal            = felec;
888
889             fscal            = _mm256_and_ps(fscal,cutoff_mask);
890
891             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
892
893             /* Calculate temporary vectorial force */
894             tx               = _mm256_mul_ps(fscal,dx30);
895             ty               = _mm256_mul_ps(fscal,dy30);
896             tz               = _mm256_mul_ps(fscal,dz30);
897
898             /* Update vectorial force */
899             fix3             = _mm256_add_ps(fix3,tx);
900             fiy3             = _mm256_add_ps(fiy3,ty);
901             fiz3             = _mm256_add_ps(fiz3,tz);
902
903             fjx0             = _mm256_add_ps(fjx0,tx);
904             fjy0             = _mm256_add_ps(fjy0,ty);
905             fjz0             = _mm256_add_ps(fjz0,tz);
906
907             }
908
909             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
910             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
911             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
912             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
913             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
914             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
915             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
916             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
917
918             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
919
920             /* Inner loop uses 390 flops */
921         }
922
923         /* End of innermost loop */
924
925         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
926                                                  f+i_coord_offset,fshift+i_shift_offset);
927
928         ggid                        = gid[iidx];
929         /* Update potential energies */
930         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
931         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
932
933         /* Increment number of inner iterations */
934         inneriter                  += j_index_end - j_index_start;
935
936         /* Outer loop uses 26 flops */
937     }
938
939     /* Increment number of outer iterations */
940     outeriter        += nri;
941
942     /* Update outer/inner flops */
943
944     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*390);
945 }
946 /*
947  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_single
948  * Electrostatics interaction: Ewald
949  * VdW interaction:            LennardJones
950  * Geometry:                   Water4-Particle
951  * Calculate force/pot:        Force
952  */
953 void
954 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_256_single
955                     (t_nblist * gmx_restrict                nlist,
956                      rvec * gmx_restrict                    xx,
957                      rvec * gmx_restrict                    ff,
958                      t_forcerec * gmx_restrict              fr,
959                      t_mdatoms * gmx_restrict               mdatoms,
960                      nb_kernel_data_t * gmx_restrict        kernel_data,
961                      t_nrnb * gmx_restrict                  nrnb)
962 {
963     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
964      * just 0 for non-waters.
965      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
966      * jnr indices corresponding to data put in the four positions in the SIMD register.
967      */
968     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
969     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
970     int              jnrA,jnrB,jnrC,jnrD;
971     int              jnrE,jnrF,jnrG,jnrH;
972     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
973     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
974     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
975     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
976     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
977     real             rcutoff_scalar;
978     real             *shiftvec,*fshift,*x,*f;
979     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
980     real             scratch[4*DIM];
981     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
982     real *           vdwioffsetptr0;
983     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
984     real *           vdwioffsetptr1;
985     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
986     real *           vdwioffsetptr2;
987     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
988     real *           vdwioffsetptr3;
989     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
990     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
991     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
992     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
993     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
994     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
995     __m256           dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
996     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
997     real             *charge;
998     int              nvdwtype;
999     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1000     int              *vdwtype;
1001     real             *vdwparam;
1002     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1003     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1004     __m256i          ewitab;
1005     __m128i          ewitab_lo,ewitab_hi;
1006     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1007     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1008     real             *ewtab;
1009     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1010     real             rswitch_scalar,d_scalar;
1011     __m256           dummy_mask,cutoff_mask;
1012     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1013     __m256           one     = _mm256_set1_ps(1.0);
1014     __m256           two     = _mm256_set1_ps(2.0);
1015     x                = xx[0];
1016     f                = ff[0];
1017
1018     nri              = nlist->nri;
1019     iinr             = nlist->iinr;
1020     jindex           = nlist->jindex;
1021     jjnr             = nlist->jjnr;
1022     shiftidx         = nlist->shift;
1023     gid              = nlist->gid;
1024     shiftvec         = fr->shift_vec[0];
1025     fshift           = fr->fshift[0];
1026     facel            = _mm256_set1_ps(fr->epsfac);
1027     charge           = mdatoms->chargeA;
1028     nvdwtype         = fr->ntype;
1029     vdwparam         = fr->nbfp;
1030     vdwtype          = mdatoms->typeA;
1031
1032     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1033     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
1034     beta2            = _mm256_mul_ps(beta,beta);
1035     beta3            = _mm256_mul_ps(beta,beta2);
1036
1037     ewtab            = fr->ic->tabq_coul_FDV0;
1038     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1039     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1040
1041     /* Setup water-specific parameters */
1042     inr              = nlist->iinr[0];
1043     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1044     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1045     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1046     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1047
1048     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1049     rcutoff_scalar   = fr->rcoulomb;
1050     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1051     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1052
1053     rswitch_scalar   = fr->rcoulomb_switch;
1054     rswitch          = _mm256_set1_ps(rswitch_scalar);
1055     /* Setup switch parameters */
1056     d_scalar         = rcutoff_scalar-rswitch_scalar;
1057     d                = _mm256_set1_ps(d_scalar);
1058     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1059     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1060     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1061     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1062     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1063     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1064
1065     /* Avoid stupid compiler warnings */
1066     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1067     j_coord_offsetA = 0;
1068     j_coord_offsetB = 0;
1069     j_coord_offsetC = 0;
1070     j_coord_offsetD = 0;
1071     j_coord_offsetE = 0;
1072     j_coord_offsetF = 0;
1073     j_coord_offsetG = 0;
1074     j_coord_offsetH = 0;
1075
1076     outeriter        = 0;
1077     inneriter        = 0;
1078
1079     for(iidx=0;iidx<4*DIM;iidx++)
1080     {
1081         scratch[iidx] = 0.0;
1082     }
1083
1084     /* Start outer loop over neighborlists */
1085     for(iidx=0; iidx<nri; iidx++)
1086     {
1087         /* Load shift vector for this list */
1088         i_shift_offset   = DIM*shiftidx[iidx];
1089
1090         /* Load limits for loop over neighbors */
1091         j_index_start    = jindex[iidx];
1092         j_index_end      = jindex[iidx+1];
1093
1094         /* Get outer coordinate index */
1095         inr              = iinr[iidx];
1096         i_coord_offset   = DIM*inr;
1097
1098         /* Load i particle coords and add shift vector */
1099         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1100                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1101
1102         fix0             = _mm256_setzero_ps();
1103         fiy0             = _mm256_setzero_ps();
1104         fiz0             = _mm256_setzero_ps();
1105         fix1             = _mm256_setzero_ps();
1106         fiy1             = _mm256_setzero_ps();
1107         fiz1             = _mm256_setzero_ps();
1108         fix2             = _mm256_setzero_ps();
1109         fiy2             = _mm256_setzero_ps();
1110         fiz2             = _mm256_setzero_ps();
1111         fix3             = _mm256_setzero_ps();
1112         fiy3             = _mm256_setzero_ps();
1113         fiz3             = _mm256_setzero_ps();
1114
1115         /* Start inner kernel loop */
1116         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1117         {
1118
1119             /* Get j neighbor index, and coordinate index */
1120             jnrA             = jjnr[jidx];
1121             jnrB             = jjnr[jidx+1];
1122             jnrC             = jjnr[jidx+2];
1123             jnrD             = jjnr[jidx+3];
1124             jnrE             = jjnr[jidx+4];
1125             jnrF             = jjnr[jidx+5];
1126             jnrG             = jjnr[jidx+6];
1127             jnrH             = jjnr[jidx+7];
1128             j_coord_offsetA  = DIM*jnrA;
1129             j_coord_offsetB  = DIM*jnrB;
1130             j_coord_offsetC  = DIM*jnrC;
1131             j_coord_offsetD  = DIM*jnrD;
1132             j_coord_offsetE  = DIM*jnrE;
1133             j_coord_offsetF  = DIM*jnrF;
1134             j_coord_offsetG  = DIM*jnrG;
1135             j_coord_offsetH  = DIM*jnrH;
1136
1137             /* load j atom coordinates */
1138             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1139                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1140                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1141                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1142                                                  &jx0,&jy0,&jz0);
1143
1144             /* Calculate displacement vector */
1145             dx00             = _mm256_sub_ps(ix0,jx0);
1146             dy00             = _mm256_sub_ps(iy0,jy0);
1147             dz00             = _mm256_sub_ps(iz0,jz0);
1148             dx10             = _mm256_sub_ps(ix1,jx0);
1149             dy10             = _mm256_sub_ps(iy1,jy0);
1150             dz10             = _mm256_sub_ps(iz1,jz0);
1151             dx20             = _mm256_sub_ps(ix2,jx0);
1152             dy20             = _mm256_sub_ps(iy2,jy0);
1153             dz20             = _mm256_sub_ps(iz2,jz0);
1154             dx30             = _mm256_sub_ps(ix3,jx0);
1155             dy30             = _mm256_sub_ps(iy3,jy0);
1156             dz30             = _mm256_sub_ps(iz3,jz0);
1157
1158             /* Calculate squared distance and things based on it */
1159             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1160             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1161             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1162             rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1163
1164             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1165             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1166             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1167             rinv30           = gmx_mm256_invsqrt_ps(rsq30);
1168
1169             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1170             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1171             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1172             rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
1173
1174             /* Load parameters for j particles */
1175             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1176                                                                  charge+jnrC+0,charge+jnrD+0,
1177                                                                  charge+jnrE+0,charge+jnrF+0,
1178                                                                  charge+jnrG+0,charge+jnrH+0);
1179             vdwjidx0A        = 2*vdwtype[jnrA+0];
1180             vdwjidx0B        = 2*vdwtype[jnrB+0];
1181             vdwjidx0C        = 2*vdwtype[jnrC+0];
1182             vdwjidx0D        = 2*vdwtype[jnrD+0];
1183             vdwjidx0E        = 2*vdwtype[jnrE+0];
1184             vdwjidx0F        = 2*vdwtype[jnrF+0];
1185             vdwjidx0G        = 2*vdwtype[jnrG+0];
1186             vdwjidx0H        = 2*vdwtype[jnrH+0];
1187
1188             fjx0             = _mm256_setzero_ps();
1189             fjy0             = _mm256_setzero_ps();
1190             fjz0             = _mm256_setzero_ps();
1191
1192             /**************************
1193              * CALCULATE INTERACTIONS *
1194              **************************/
1195
1196             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1197             {
1198
1199             r00              = _mm256_mul_ps(rsq00,rinv00);
1200
1201             /* Compute parameters for interactions between i and j atoms */
1202             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1203                                             vdwioffsetptr0+vdwjidx0B,
1204                                             vdwioffsetptr0+vdwjidx0C,
1205                                             vdwioffsetptr0+vdwjidx0D,
1206                                             vdwioffsetptr0+vdwjidx0E,
1207                                             vdwioffsetptr0+vdwjidx0F,
1208                                             vdwioffsetptr0+vdwjidx0G,
1209                                             vdwioffsetptr0+vdwjidx0H,
1210                                             &c6_00,&c12_00);
1211
1212             /* LENNARD-JONES DISPERSION/REPULSION */
1213
1214             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1215             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
1216             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1217             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1218             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1219
1220             d                = _mm256_sub_ps(r00,rswitch);
1221             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1222             d2               = _mm256_mul_ps(d,d);
1223             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1224
1225             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1226
1227             /* Evaluate switch function */
1228             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1229             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1230             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1231
1232             fscal            = fvdw;
1233
1234             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1235
1236             /* Calculate temporary vectorial force */
1237             tx               = _mm256_mul_ps(fscal,dx00);
1238             ty               = _mm256_mul_ps(fscal,dy00);
1239             tz               = _mm256_mul_ps(fscal,dz00);
1240
1241             /* Update vectorial force */
1242             fix0             = _mm256_add_ps(fix0,tx);
1243             fiy0             = _mm256_add_ps(fiy0,ty);
1244             fiz0             = _mm256_add_ps(fiz0,tz);
1245
1246             fjx0             = _mm256_add_ps(fjx0,tx);
1247             fjy0             = _mm256_add_ps(fjy0,ty);
1248             fjz0             = _mm256_add_ps(fjz0,tz);
1249
1250             }
1251
1252             /**************************
1253              * CALCULATE INTERACTIONS *
1254              **************************/
1255
1256             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1257             {
1258
1259             r10              = _mm256_mul_ps(rsq10,rinv10);
1260
1261             /* Compute parameters for interactions between i and j atoms */
1262             qq10             = _mm256_mul_ps(iq1,jq0);
1263
1264             /* EWALD ELECTROSTATICS */
1265             
1266             /* Analytical PME correction */
1267             zeta2            = _mm256_mul_ps(beta2,rsq10);
1268             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1269             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1270             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1271             felec            = _mm256_mul_ps(qq10,felec);
1272             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1273             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1274             velec            = _mm256_sub_ps(rinv10,pmecorrV);
1275             velec            = _mm256_mul_ps(qq10,velec);
1276             
1277             d                = _mm256_sub_ps(r10,rswitch);
1278             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1279             d2               = _mm256_mul_ps(d,d);
1280             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1281
1282             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1283
1284             /* Evaluate switch function */
1285             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1286             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1287             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1288
1289             fscal            = felec;
1290
1291             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1292
1293             /* Calculate temporary vectorial force */
1294             tx               = _mm256_mul_ps(fscal,dx10);
1295             ty               = _mm256_mul_ps(fscal,dy10);
1296             tz               = _mm256_mul_ps(fscal,dz10);
1297
1298             /* Update vectorial force */
1299             fix1             = _mm256_add_ps(fix1,tx);
1300             fiy1             = _mm256_add_ps(fiy1,ty);
1301             fiz1             = _mm256_add_ps(fiz1,tz);
1302
1303             fjx0             = _mm256_add_ps(fjx0,tx);
1304             fjy0             = _mm256_add_ps(fjy0,ty);
1305             fjz0             = _mm256_add_ps(fjz0,tz);
1306
1307             }
1308
1309             /**************************
1310              * CALCULATE INTERACTIONS *
1311              **************************/
1312
1313             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1314             {
1315
1316             r20              = _mm256_mul_ps(rsq20,rinv20);
1317
1318             /* Compute parameters for interactions between i and j atoms */
1319             qq20             = _mm256_mul_ps(iq2,jq0);
1320
1321             /* EWALD ELECTROSTATICS */
1322             
1323             /* Analytical PME correction */
1324             zeta2            = _mm256_mul_ps(beta2,rsq20);
1325             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1326             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1327             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1328             felec            = _mm256_mul_ps(qq20,felec);
1329             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1330             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1331             velec            = _mm256_sub_ps(rinv20,pmecorrV);
1332             velec            = _mm256_mul_ps(qq20,velec);
1333             
1334             d                = _mm256_sub_ps(r20,rswitch);
1335             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1336             d2               = _mm256_mul_ps(d,d);
1337             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1338
1339             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1340
1341             /* Evaluate switch function */
1342             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1343             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1344             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1345
1346             fscal            = felec;
1347
1348             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1349
1350             /* Calculate temporary vectorial force */
1351             tx               = _mm256_mul_ps(fscal,dx20);
1352             ty               = _mm256_mul_ps(fscal,dy20);
1353             tz               = _mm256_mul_ps(fscal,dz20);
1354
1355             /* Update vectorial force */
1356             fix2             = _mm256_add_ps(fix2,tx);
1357             fiy2             = _mm256_add_ps(fiy2,ty);
1358             fiz2             = _mm256_add_ps(fiz2,tz);
1359
1360             fjx0             = _mm256_add_ps(fjx0,tx);
1361             fjy0             = _mm256_add_ps(fjy0,ty);
1362             fjz0             = _mm256_add_ps(fjz0,tz);
1363
1364             }
1365
1366             /**************************
1367              * CALCULATE INTERACTIONS *
1368              **************************/
1369
1370             if (gmx_mm256_any_lt(rsq30,rcutoff2))
1371             {
1372
1373             r30              = _mm256_mul_ps(rsq30,rinv30);
1374
1375             /* Compute parameters for interactions between i and j atoms */
1376             qq30             = _mm256_mul_ps(iq3,jq0);
1377
1378             /* EWALD ELECTROSTATICS */
1379             
1380             /* Analytical PME correction */
1381             zeta2            = _mm256_mul_ps(beta2,rsq30);
1382             rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
1383             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1384             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1385             felec            = _mm256_mul_ps(qq30,felec);
1386             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1387             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1388             velec            = _mm256_sub_ps(rinv30,pmecorrV);
1389             velec            = _mm256_mul_ps(qq30,velec);
1390             
1391             d                = _mm256_sub_ps(r30,rswitch);
1392             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1393             d2               = _mm256_mul_ps(d,d);
1394             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1395
1396             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1397
1398             /* Evaluate switch function */
1399             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1400             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
1401             cutoff_mask      = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
1402
1403             fscal            = felec;
1404
1405             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1406
1407             /* Calculate temporary vectorial force */
1408             tx               = _mm256_mul_ps(fscal,dx30);
1409             ty               = _mm256_mul_ps(fscal,dy30);
1410             tz               = _mm256_mul_ps(fscal,dz30);
1411
1412             /* Update vectorial force */
1413             fix3             = _mm256_add_ps(fix3,tx);
1414             fiy3             = _mm256_add_ps(fiy3,ty);
1415             fiz3             = _mm256_add_ps(fiz3,tz);
1416
1417             fjx0             = _mm256_add_ps(fjx0,tx);
1418             fjy0             = _mm256_add_ps(fjy0,ty);
1419             fjz0             = _mm256_add_ps(fjz0,tz);
1420
1421             }
1422
1423             fjptrA             = f+j_coord_offsetA;
1424             fjptrB             = f+j_coord_offsetB;
1425             fjptrC             = f+j_coord_offsetC;
1426             fjptrD             = f+j_coord_offsetD;
1427             fjptrE             = f+j_coord_offsetE;
1428             fjptrF             = f+j_coord_offsetF;
1429             fjptrG             = f+j_coord_offsetG;
1430             fjptrH             = f+j_coord_offsetH;
1431
1432             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1433
1434             /* Inner loop uses 374 flops */
1435         }
1436
1437         if(jidx<j_index_end)
1438         {
1439
1440             /* Get j neighbor index, and coordinate index */
1441             jnrlistA         = jjnr[jidx];
1442             jnrlistB         = jjnr[jidx+1];
1443             jnrlistC         = jjnr[jidx+2];
1444             jnrlistD         = jjnr[jidx+3];
1445             jnrlistE         = jjnr[jidx+4];
1446             jnrlistF         = jjnr[jidx+5];
1447             jnrlistG         = jjnr[jidx+6];
1448             jnrlistH         = jjnr[jidx+7];
1449             /* Sign of each element will be negative for non-real atoms.
1450              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1451              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1452              */
1453             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1454                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1455                                             
1456             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1457             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1458             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1459             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1460             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1461             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1462             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1463             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1464             j_coord_offsetA  = DIM*jnrA;
1465             j_coord_offsetB  = DIM*jnrB;
1466             j_coord_offsetC  = DIM*jnrC;
1467             j_coord_offsetD  = DIM*jnrD;
1468             j_coord_offsetE  = DIM*jnrE;
1469             j_coord_offsetF  = DIM*jnrF;
1470             j_coord_offsetG  = DIM*jnrG;
1471             j_coord_offsetH  = DIM*jnrH;
1472
1473             /* load j atom coordinates */
1474             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1475                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1476                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1477                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1478                                                  &jx0,&jy0,&jz0);
1479
1480             /* Calculate displacement vector */
1481             dx00             = _mm256_sub_ps(ix0,jx0);
1482             dy00             = _mm256_sub_ps(iy0,jy0);
1483             dz00             = _mm256_sub_ps(iz0,jz0);
1484             dx10             = _mm256_sub_ps(ix1,jx0);
1485             dy10             = _mm256_sub_ps(iy1,jy0);
1486             dz10             = _mm256_sub_ps(iz1,jz0);
1487             dx20             = _mm256_sub_ps(ix2,jx0);
1488             dy20             = _mm256_sub_ps(iy2,jy0);
1489             dz20             = _mm256_sub_ps(iz2,jz0);
1490             dx30             = _mm256_sub_ps(ix3,jx0);
1491             dy30             = _mm256_sub_ps(iy3,jy0);
1492             dz30             = _mm256_sub_ps(iz3,jz0);
1493
1494             /* Calculate squared distance and things based on it */
1495             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1496             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1497             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1498             rsq30            = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1499
1500             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1501             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1502             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1503             rinv30           = gmx_mm256_invsqrt_ps(rsq30);
1504
1505             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1506             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1507             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1508             rinvsq30         = _mm256_mul_ps(rinv30,rinv30);
1509
1510             /* Load parameters for j particles */
1511             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1512                                                                  charge+jnrC+0,charge+jnrD+0,
1513                                                                  charge+jnrE+0,charge+jnrF+0,
1514                                                                  charge+jnrG+0,charge+jnrH+0);
1515             vdwjidx0A        = 2*vdwtype[jnrA+0];
1516             vdwjidx0B        = 2*vdwtype[jnrB+0];
1517             vdwjidx0C        = 2*vdwtype[jnrC+0];
1518             vdwjidx0D        = 2*vdwtype[jnrD+0];
1519             vdwjidx0E        = 2*vdwtype[jnrE+0];
1520             vdwjidx0F        = 2*vdwtype[jnrF+0];
1521             vdwjidx0G        = 2*vdwtype[jnrG+0];
1522             vdwjidx0H        = 2*vdwtype[jnrH+0];
1523
1524             fjx0             = _mm256_setzero_ps();
1525             fjy0             = _mm256_setzero_ps();
1526             fjz0             = _mm256_setzero_ps();
1527
1528             /**************************
1529              * CALCULATE INTERACTIONS *
1530              **************************/
1531
1532             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1533             {
1534
1535             r00              = _mm256_mul_ps(rsq00,rinv00);
1536             r00              = _mm256_andnot_ps(dummy_mask,r00);
1537
1538             /* Compute parameters for interactions between i and j atoms */
1539             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1540                                             vdwioffsetptr0+vdwjidx0B,
1541                                             vdwioffsetptr0+vdwjidx0C,
1542                                             vdwioffsetptr0+vdwjidx0D,
1543                                             vdwioffsetptr0+vdwjidx0E,
1544                                             vdwioffsetptr0+vdwjidx0F,
1545                                             vdwioffsetptr0+vdwjidx0G,
1546                                             vdwioffsetptr0+vdwjidx0H,
1547                                             &c6_00,&c12_00);
1548
1549             /* LENNARD-JONES DISPERSION/REPULSION */
1550
1551             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1552             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
1553             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1554             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1555             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1556
1557             d                = _mm256_sub_ps(r00,rswitch);
1558             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1559             d2               = _mm256_mul_ps(d,d);
1560             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1561
1562             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1563
1564             /* Evaluate switch function */
1565             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1566             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1567             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1568
1569             fscal            = fvdw;
1570
1571             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1572
1573             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1574
1575             /* Calculate temporary vectorial force */
1576             tx               = _mm256_mul_ps(fscal,dx00);
1577             ty               = _mm256_mul_ps(fscal,dy00);
1578             tz               = _mm256_mul_ps(fscal,dz00);
1579
1580             /* Update vectorial force */
1581             fix0             = _mm256_add_ps(fix0,tx);
1582             fiy0             = _mm256_add_ps(fiy0,ty);
1583             fiz0             = _mm256_add_ps(fiz0,tz);
1584
1585             fjx0             = _mm256_add_ps(fjx0,tx);
1586             fjy0             = _mm256_add_ps(fjy0,ty);
1587             fjz0             = _mm256_add_ps(fjz0,tz);
1588
1589             }
1590
1591             /**************************
1592              * CALCULATE INTERACTIONS *
1593              **************************/
1594
1595             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1596             {
1597
1598             r10              = _mm256_mul_ps(rsq10,rinv10);
1599             r10              = _mm256_andnot_ps(dummy_mask,r10);
1600
1601             /* Compute parameters for interactions between i and j atoms */
1602             qq10             = _mm256_mul_ps(iq1,jq0);
1603
1604             /* EWALD ELECTROSTATICS */
1605             
1606             /* Analytical PME correction */
1607             zeta2            = _mm256_mul_ps(beta2,rsq10);
1608             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1609             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1610             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1611             felec            = _mm256_mul_ps(qq10,felec);
1612             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1613             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1614             velec            = _mm256_sub_ps(rinv10,pmecorrV);
1615             velec            = _mm256_mul_ps(qq10,velec);
1616             
1617             d                = _mm256_sub_ps(r10,rswitch);
1618             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1619             d2               = _mm256_mul_ps(d,d);
1620             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1621
1622             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1623
1624             /* Evaluate switch function */
1625             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1626             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1627             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1628
1629             fscal            = felec;
1630
1631             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1632
1633             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1634
1635             /* Calculate temporary vectorial force */
1636             tx               = _mm256_mul_ps(fscal,dx10);
1637             ty               = _mm256_mul_ps(fscal,dy10);
1638             tz               = _mm256_mul_ps(fscal,dz10);
1639
1640             /* Update vectorial force */
1641             fix1             = _mm256_add_ps(fix1,tx);
1642             fiy1             = _mm256_add_ps(fiy1,ty);
1643             fiz1             = _mm256_add_ps(fiz1,tz);
1644
1645             fjx0             = _mm256_add_ps(fjx0,tx);
1646             fjy0             = _mm256_add_ps(fjy0,ty);
1647             fjz0             = _mm256_add_ps(fjz0,tz);
1648
1649             }
1650
1651             /**************************
1652              * CALCULATE INTERACTIONS *
1653              **************************/
1654
1655             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1656             {
1657
1658             r20              = _mm256_mul_ps(rsq20,rinv20);
1659             r20              = _mm256_andnot_ps(dummy_mask,r20);
1660
1661             /* Compute parameters for interactions between i and j atoms */
1662             qq20             = _mm256_mul_ps(iq2,jq0);
1663
1664             /* EWALD ELECTROSTATICS */
1665             
1666             /* Analytical PME correction */
1667             zeta2            = _mm256_mul_ps(beta2,rsq20);
1668             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1669             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1670             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1671             felec            = _mm256_mul_ps(qq20,felec);
1672             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1673             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1674             velec            = _mm256_sub_ps(rinv20,pmecorrV);
1675             velec            = _mm256_mul_ps(qq20,velec);
1676             
1677             d                = _mm256_sub_ps(r20,rswitch);
1678             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1679             d2               = _mm256_mul_ps(d,d);
1680             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1681
1682             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1683
1684             /* Evaluate switch function */
1685             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1686             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1687             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1688
1689             fscal            = felec;
1690
1691             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1692
1693             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1694
1695             /* Calculate temporary vectorial force */
1696             tx               = _mm256_mul_ps(fscal,dx20);
1697             ty               = _mm256_mul_ps(fscal,dy20);
1698             tz               = _mm256_mul_ps(fscal,dz20);
1699
1700             /* Update vectorial force */
1701             fix2             = _mm256_add_ps(fix2,tx);
1702             fiy2             = _mm256_add_ps(fiy2,ty);
1703             fiz2             = _mm256_add_ps(fiz2,tz);
1704
1705             fjx0             = _mm256_add_ps(fjx0,tx);
1706             fjy0             = _mm256_add_ps(fjy0,ty);
1707             fjz0             = _mm256_add_ps(fjz0,tz);
1708
1709             }
1710
1711             /**************************
1712              * CALCULATE INTERACTIONS *
1713              **************************/
1714
1715             if (gmx_mm256_any_lt(rsq30,rcutoff2))
1716             {
1717
1718             r30              = _mm256_mul_ps(rsq30,rinv30);
1719             r30              = _mm256_andnot_ps(dummy_mask,r30);
1720
1721             /* Compute parameters for interactions between i and j atoms */
1722             qq30             = _mm256_mul_ps(iq3,jq0);
1723
1724             /* EWALD ELECTROSTATICS */
1725             
1726             /* Analytical PME correction */
1727             zeta2            = _mm256_mul_ps(beta2,rsq30);
1728             rinv3            = _mm256_mul_ps(rinvsq30,rinv30);
1729             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1730             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1731             felec            = _mm256_mul_ps(qq30,felec);
1732             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1733             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1734             velec            = _mm256_sub_ps(rinv30,pmecorrV);
1735             velec            = _mm256_mul_ps(qq30,velec);
1736             
1737             d                = _mm256_sub_ps(r30,rswitch);
1738             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1739             d2               = _mm256_mul_ps(d,d);
1740             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1741
1742             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1743
1744             /* Evaluate switch function */
1745             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1746             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv30,_mm256_mul_ps(velec,dsw)) );
1747             cutoff_mask      = _mm256_cmp_ps(rsq30,rcutoff2,_CMP_LT_OQ);
1748
1749             fscal            = felec;
1750
1751             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1752
1753             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1754
1755             /* Calculate temporary vectorial force */
1756             tx               = _mm256_mul_ps(fscal,dx30);
1757             ty               = _mm256_mul_ps(fscal,dy30);
1758             tz               = _mm256_mul_ps(fscal,dz30);
1759
1760             /* Update vectorial force */
1761             fix3             = _mm256_add_ps(fix3,tx);
1762             fiy3             = _mm256_add_ps(fiy3,ty);
1763             fiz3             = _mm256_add_ps(fiz3,tz);
1764
1765             fjx0             = _mm256_add_ps(fjx0,tx);
1766             fjy0             = _mm256_add_ps(fjy0,ty);
1767             fjz0             = _mm256_add_ps(fjz0,tz);
1768
1769             }
1770
1771             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1772             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1773             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1774             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1775             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1776             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1777             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1778             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1779
1780             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1781
1782             /* Inner loop uses 378 flops */
1783         }
1784
1785         /* End of innermost loop */
1786
1787         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1788                                                  f+i_coord_offset,fshift+i_shift_offset);
1789
1790         /* Increment number of inner iterations */
1791         inneriter                  += j_index_end - j_index_start;
1792
1793         /* Outer loop uses 24 flops */
1794     }
1795
1796     /* Increment number of outer iterations */
1797     outeriter        += nri;
1798
1799     /* Update outer/inner flops */
1800
1801     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*378);
1802 }