/*
 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2001-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>

#include "../nb_kernel.h"
#include "types/simple.h"
#include "vec.h"
#include "nrnb.h"

#include "gmx_math_x86_avx_256_single.h"
#include "kernelutil_x86_avx_256_single.h"

/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_single
 * Electrostatics interaction: Ewald
 * VdW interaction:            None
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        PotentialAndForce
 */
void
nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_single
                    (t_nblist * gmx_restrict                nlist,
                     rvec * gmx_restrict                    xx,
                     rvec * gmx_restrict                    ff,
                     t_forcerec * gmx_restrict              fr,
                     t_mdatoms * gmx_restrict               mdatoms,
                     nb_kernel_data_t * gmx_restrict        kernel_data,
                     t_nrnb * gmx_restrict                  nrnb)
{
    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
     * just 0 for non-waters.
     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
     * jnr indices corresponding to data put in the eight positions in the SIMD register.
     */
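    /* Data layout sketch (for illustration; not generator output): each __m256
     * holds one coordinate component for eight j waters at once, e.g.
     *   jx0 = { xA, xB, xC, xD, xE, xF, xG, xH }   (atom 0 of waters A..H)
     * so a single _mm256_sub_ps(ix0,jx0) yields eight dx00 values in one instruction.
     */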
    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
    int              jnrA,jnrB,jnrC,jnrD;
    int              jnrE,jnrF,jnrG,jnrH;
    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
    real             scratch[4*DIM];
    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    real *           vdwioffsetptr1;
    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    real *           vdwioffsetptr2;
    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
    __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m256i          ewitab;
    __m128i          ewitab_lo,ewitab_hi;
    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
    real             *ewtab;
    __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
    real             rswitch_scalar,d_scalar;
    __m256           dummy_mask,cutoff_mask;
    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    __m256           one     = _mm256_set1_ps(1.0);
    __m256           two     = _mm256_set1_ps(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_ps(fr->epsfac);
    charge           = mdatoms->chargeA;

    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
    beta2            = _mm256_mul_ps(beta,beta);
    beta3            = _mm256_mul_ps(beta,beta2);

    ewtab            = fr->ic->tabq_coul_FDV0;
    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);

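    /* Note: this kernel variant evaluates the Ewald real-space term with the
     * analytical PME correction in the loops below; the tabulated-Ewald setup
     * above (ewtab/ewtabscale/ewtabhalfspace) is not referenced further in this
     * kernel and appears to be emitted unconditionally by the generator.
     */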
    /* Setup water-specific parameters */
    inr              = nlist->iinr[0];
    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));

    jq0              = _mm256_set1_ps(charge[inr+0]);
    jq1              = _mm256_set1_ps(charge[inr+1]);
    jq2              = _mm256_set1_ps(charge[inr+2]);
    qq00             = _mm256_mul_ps(iq0,jq0);
    qq01             = _mm256_mul_ps(iq0,jq1);
    qq02             = _mm256_mul_ps(iq0,jq2);
    qq10             = _mm256_mul_ps(iq1,jq0);
    qq11             = _mm256_mul_ps(iq1,jq1);
    qq12             = _mm256_mul_ps(iq1,jq2);
    qq20             = _mm256_mul_ps(iq2,jq0);
    qq21             = _mm256_mul_ps(iq2,jq1);
    qq22             = _mm256_mul_ps(iq2,jq2);
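    /* All nine charge products for the 3x3 atom pairs are formed once, outside
     * both loops: in a Water3-Water3 kernel every i and j molecule shares the
     * same three-site topology, so charge[inr+0..2] is valid for either side.
     */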

    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
    rcutoff_scalar   = fr->rcoulomb;
    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);

    rswitch_scalar   = fr->rcoulomb_switch;
    rswitch          = _mm256_set1_ps(rswitch_scalar);
    /* Setup switch parameters */
    d_scalar         = rcutoff_scalar-rswitch_scalar;
    d                = _mm256_set1_ps(d_scalar);
    swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
    swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
    swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
    swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
    swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
    swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
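    /* With d = r - rswitch and D = rcutoff - rswitch, the coefficients above give
     *   sw(d)  = 1 + d^3*(swV3 + d*(swV4 + d*swV5)) = 1 - 10(d/D)^3 + 15(d/D)^4 - 6(d/D)^5
     *   sw'(d) = d^2*(swF2 + d*(swF3 + d*swF4))
     * so sw falls smoothly from 1 at rswitch to 0 at rcutoff (1 - 10 + 15 - 6 = 0),
     * with vanishing first and second derivatives at both endpoints.
     */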

    /* Avoid stupid compiler warnings */
    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
    j_coord_offsetA = 0;
    j_coord_offsetB = 0;
    j_coord_offsetC = 0;
    j_coord_offsetD = 0;
    j_coord_offsetE = 0;
    j_coord_offsetF = 0;
    j_coord_offsetG = 0;
    j_coord_offsetH = 0;

    outeriter        = 0;
    inneriter        = 0;

    for(iidx=0;iidx<4*DIM;iidx++)
    {
        scratch[iidx] = 0.0;
    }
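    /* scratch is a small dump area for SIMD force stores; in the masked tail
     * iteration further down, force pointers for padded (dummy) j entries can
     * be redirected here so that full-width writes stay within valid memory.
     */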

    /* Start outer loop over neighborlists */
    for(iidx=0; iidx<nri; iidx++)
    {
        /* Load shift vector for this list */
        i_shift_offset   = DIM*shiftidx[iidx];

        /* Load limits for loop over neighbors */
        j_index_start    = jindex[iidx];
        j_index_end      = jindex[iidx+1];

        /* Get outer coordinate index */
        inr              = iinr[iidx];
        i_coord_offset   = DIM*inr;

        /* Load i particle coords and add shift vector */
        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);

        fix0             = _mm256_setzero_ps();
        fiy0             = _mm256_setzero_ps();
        fiz0             = _mm256_setzero_ps();
        fix1             = _mm256_setzero_ps();
        fiy1             = _mm256_setzero_ps();
        fiz1             = _mm256_setzero_ps();
        fix2             = _mm256_setzero_ps();
        fiy2             = _mm256_setzero_ps();
        fiz2             = _mm256_setzero_ps();

        /* Reset potential sums */
        velecsum         = _mm256_setzero_ps();

        /* Start inner kernel loop */
        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
        {
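            /* This loop runs on full 8-wide chunks as long as all eight jjnr
             * entries are real (>= 0); lists are padded with negative indices,
             * and any remainder chunk is handled by the masked tail after the loop.
             */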

            /* Get j neighbor index, and coordinate index */
            jnrA             = jjnr[jidx];
            jnrB             = jjnr[jidx+1];
            jnrC             = jjnr[jidx+2];
            jnrD             = jjnr[jidx+3];
            jnrE             = jjnr[jidx+4];
            jnrF             = jjnr[jidx+5];
            jnrG             = jjnr[jidx+6];
            jnrH             = jjnr[jidx+7];
            j_coord_offsetA  = DIM*jnrA;
            j_coord_offsetB  = DIM*jnrB;
            j_coord_offsetC  = DIM*jnrC;
            j_coord_offsetD  = DIM*jnrD;
            j_coord_offsetE  = DIM*jnrE;
            j_coord_offsetF  = DIM*jnrF;
            j_coord_offsetG  = DIM*jnrG;
            j_coord_offsetH  = DIM*jnrH;

            /* load j atom coordinates */
            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
                                                 x+j_coord_offsetC,x+j_coord_offsetD,
                                                 x+j_coord_offsetE,x+j_coord_offsetF,
                                                 x+j_coord_offsetG,x+j_coord_offsetH,
                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);

            /* Calculate displacement vector */
            dx00             = _mm256_sub_ps(ix0,jx0);
            dy00             = _mm256_sub_ps(iy0,jy0);
            dz00             = _mm256_sub_ps(iz0,jz0);
            dx01             = _mm256_sub_ps(ix0,jx1);
            dy01             = _mm256_sub_ps(iy0,jy1);
            dz01             = _mm256_sub_ps(iz0,jz1);
            dx02             = _mm256_sub_ps(ix0,jx2);
            dy02             = _mm256_sub_ps(iy0,jy2);
            dz02             = _mm256_sub_ps(iz0,jz2);
            dx10             = _mm256_sub_ps(ix1,jx0);
            dy10             = _mm256_sub_ps(iy1,jy0);
            dz10             = _mm256_sub_ps(iz1,jz0);
            dx11             = _mm256_sub_ps(ix1,jx1);
            dy11             = _mm256_sub_ps(iy1,jy1);
            dz11             = _mm256_sub_ps(iz1,jz1);
            dx12             = _mm256_sub_ps(ix1,jx2);
            dy12             = _mm256_sub_ps(iy1,jy2);
            dz12             = _mm256_sub_ps(iz1,jz2);
            dx20             = _mm256_sub_ps(ix2,jx0);
            dy20             = _mm256_sub_ps(iy2,jy0);
            dz20             = _mm256_sub_ps(iz2,jz0);
            dx21             = _mm256_sub_ps(ix2,jx1);
            dy21             = _mm256_sub_ps(iy2,jy1);
            dz21             = _mm256_sub_ps(iz2,jz1);
            dx22             = _mm256_sub_ps(ix2,jx2);
            dy22             = _mm256_sub_ps(iy2,jy2);
            dz22             = _mm256_sub_ps(iz2,jz2);

            /* Calculate squared distance and things based on it */
            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);

            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
            rinv22           = gmx_mm256_invsqrt_ps(rsq22);

            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);

            fjx0             = _mm256_setzero_ps();
            fjy0             = _mm256_setzero_ps();
            fjz0             = _mm256_setzero_ps();
            fjx1             = _mm256_setzero_ps();
            fjy1             = _mm256_setzero_ps();
            fjz1             = _mm256_setzero_ps();
            fjx2             = _mm256_setzero_ps();
            fjy2             = _mm256_setzero_ps();
            fjz2             = _mm256_setzero_ps();

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq00,rcutoff2))
            {

            r00              = _mm256_mul_ps(rsq00,rinv00);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
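            /* Sketch of what the helpers below compute (standard real-space
             * Ewald; the polynomial approximations, evaluated in
             * zeta2 = (beta*r)^2, live in gmx_math_x86_avx_256_single.h):
             *   pmecorrV(zeta2)*beta ~ erf(beta*r)/r, so
             *   velec = qq*(1/r - erf(beta*r)/r) = qq*erfc(beta*r)/r
             * and felec combines pmecorrF with 1/r^3 to form the matching
             * force scalar F/r = -(dV/dr)/r.
             */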
            zeta2            = _mm256_mul_ps(beta2,rsq00);
            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq00,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv00,pmecorrV);
            velec            = _mm256_mul_ps(qq00,velec);

            d                = _mm256_sub_ps(r00,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx00);
            ty               = _mm256_mul_ps(fscal,dy00);
            tz               = _mm256_mul_ps(fscal,dz00);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq01,rcutoff2))
            {

            r01              = _mm256_mul_ps(rsq01,rinv01);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq01);
            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq01,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv01,pmecorrV);
            velec            = _mm256_mul_ps(qq01,velec);

            d                = _mm256_sub_ps(r01,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx01);
            ty               = _mm256_mul_ps(fscal,dy01);
            tz               = _mm256_mul_ps(fscal,dz01);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq02,rcutoff2))
            {

            r02              = _mm256_mul_ps(rsq02,rinv02);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq02);
            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq02,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv02,pmecorrV);
            velec            = _mm256_mul_ps(qq02,velec);

            d                = _mm256_sub_ps(r02,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx02);
            ty               = _mm256_mul_ps(fscal,dy02);
            tz               = _mm256_mul_ps(fscal,dz02);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq10,rcutoff2))
            {

            r10              = _mm256_mul_ps(rsq10,rinv10);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq10);
            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq10,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv10,pmecorrV);
            velec            = _mm256_mul_ps(qq10,velec);

            d                = _mm256_sub_ps(r10,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx10);
            ty               = _mm256_mul_ps(fscal,dy10);
            tz               = _mm256_mul_ps(fscal,dz10);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq11,rcutoff2))
            {

            r11              = _mm256_mul_ps(rsq11,rinv11);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq11);
            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq11,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv11,pmecorrV);
            velec            = _mm256_mul_ps(qq11,velec);

            d                = _mm256_sub_ps(r11,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx11);
            ty               = _mm256_mul_ps(fscal,dy11);
            tz               = _mm256_mul_ps(fscal,dz11);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq12,rcutoff2))
            {

            r12              = _mm256_mul_ps(rsq12,rinv12);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq12);
            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq12,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv12,pmecorrV);
            velec            = _mm256_mul_ps(qq12,velec);

            d                = _mm256_sub_ps(r12,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx12);
            ty               = _mm256_mul_ps(fscal,dy12);
            tz               = _mm256_mul_ps(fscal,dz12);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq20,rcutoff2))
            {

            r20              = _mm256_mul_ps(rsq20,rinv20);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq20);
            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq20,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv20,pmecorrV);
            velec            = _mm256_mul_ps(qq20,velec);

            d                = _mm256_sub_ps(r20,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx20);
            ty               = _mm256_mul_ps(fscal,dy20);
            tz               = _mm256_mul_ps(fscal,dz20);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq21,rcutoff2))
            {

            r21              = _mm256_mul_ps(rsq21,rinv21);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq21);
            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq21,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv21,pmecorrV);
            velec            = _mm256_mul_ps(qq21,velec);

            d                = _mm256_sub_ps(r21,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx21);
            ty               = _mm256_mul_ps(fscal,dy21);
            tz               = _mm256_mul_ps(fscal,dz21);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq22,rcutoff2))
            {

            r22              = _mm256_mul_ps(rsq22,rinv22);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq22);
            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq22,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv22,pmecorrV);
            velec            = _mm256_mul_ps(qq22,velec);

            d                = _mm256_sub_ps(r22,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx22);
            ty               = _mm256_mul_ps(fscal,dy22);
            tz               = _mm256_mul_ps(fscal,dz22);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            fjptrA             = f+j_coord_offsetA;
            fjptrB             = f+j_coord_offsetB;
            fjptrC             = f+j_coord_offsetC;
            fjptrD             = f+j_coord_offsetD;
            fjptrE             = f+j_coord_offsetE;
            fjptrF             = f+j_coord_offsetF;
            fjptrG             = f+j_coord_offsetG;
            fjptrH             = f+j_coord_offsetH;

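            /* The j accumulators carry the same sign as the i-side force
             * (tx = fscal*dx with dx = ri - rj), so the swizzled store below
             * transposes them back to per-atom order and subtracts them from
             * f[] for the eight j waters (Newton's third law).
             */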
            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);

            /* Inner loop uses 972 flops */
        }

        if(jidx<j_index_end)
        {

            /* Get j neighbor index, and coordinate index */
            jnrlistA         = jjnr[jidx];
            jnrlistB         = jjnr[jidx+1];
            jnrlistC         = jjnr[jidx+2];
            jnrlistD         = jjnr[jidx+3];
            jnrlistE         = jjnr[jidx+4];
            jnrlistF         = jjnr[jidx+5];
            jnrlistG         = jjnr[jidx+6];
            jnrlistH         = jjnr[jidx+7];
            /* Sign of each element will be negative for non-real atoms.
             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
             */
            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));

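            /* Scalar sketch of the masking idiom used here (illustration only):
             *   mask = (jnr < 0) ? 0xFFFFFFFF : 0x0;
             *   val  = val & ~mask;   // andnot clears lanes of padded atoms
             * Clamping jnr to 0 below keeps the coordinate loads addressable,
             * while the andnot on r/velec/fscal zeroes any dummy-lane output.
             */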
            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
            j_coord_offsetA  = DIM*jnrA;
            j_coord_offsetB  = DIM*jnrB;
            j_coord_offsetC  = DIM*jnrC;
            j_coord_offsetD  = DIM*jnrD;
            j_coord_offsetE  = DIM*jnrE;
            j_coord_offsetF  = DIM*jnrF;
            j_coord_offsetG  = DIM*jnrG;
            j_coord_offsetH  = DIM*jnrH;

            /* load j atom coordinates */
            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
                                                 x+j_coord_offsetC,x+j_coord_offsetD,
                                                 x+j_coord_offsetE,x+j_coord_offsetF,
                                                 x+j_coord_offsetG,x+j_coord_offsetH,
                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);

            /* Calculate displacement vector */
            dx00             = _mm256_sub_ps(ix0,jx0);
            dy00             = _mm256_sub_ps(iy0,jy0);
            dz00             = _mm256_sub_ps(iz0,jz0);
            dx01             = _mm256_sub_ps(ix0,jx1);
            dy01             = _mm256_sub_ps(iy0,jy1);
            dz01             = _mm256_sub_ps(iz0,jz1);
            dx02             = _mm256_sub_ps(ix0,jx2);
            dy02             = _mm256_sub_ps(iy0,jy2);
            dz02             = _mm256_sub_ps(iz0,jz2);
            dx10             = _mm256_sub_ps(ix1,jx0);
            dy10             = _mm256_sub_ps(iy1,jy0);
            dz10             = _mm256_sub_ps(iz1,jz0);
            dx11             = _mm256_sub_ps(ix1,jx1);
            dy11             = _mm256_sub_ps(iy1,jy1);
            dz11             = _mm256_sub_ps(iz1,jz1);
            dx12             = _mm256_sub_ps(ix1,jx2);
            dy12             = _mm256_sub_ps(iy1,jy2);
            dz12             = _mm256_sub_ps(iz1,jz2);
            dx20             = _mm256_sub_ps(ix2,jx0);
            dy20             = _mm256_sub_ps(iy2,jy0);
            dz20             = _mm256_sub_ps(iz2,jz0);
            dx21             = _mm256_sub_ps(ix2,jx1);
            dy21             = _mm256_sub_ps(iy2,jy1);
            dz21             = _mm256_sub_ps(iz2,jz1);
            dx22             = _mm256_sub_ps(ix2,jx2);
            dy22             = _mm256_sub_ps(iy2,jy2);
            dz22             = _mm256_sub_ps(iz2,jz2);

            /* Calculate squared distance and things based on it */
            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);

            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
            rinv22           = gmx_mm256_invsqrt_ps(rsq22);

            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);

            fjx0             = _mm256_setzero_ps();
            fjy0             = _mm256_setzero_ps();
            fjz0             = _mm256_setzero_ps();
            fjx1             = _mm256_setzero_ps();
            fjy1             = _mm256_setzero_ps();
            fjz1             = _mm256_setzero_ps();
            fjx2             = _mm256_setzero_ps();
            fjy2             = _mm256_setzero_ps();
            fjz2             = _mm256_setzero_ps();

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq00,rcutoff2))
            {

            r00              = _mm256_mul_ps(rsq00,rinv00);
            r00              = _mm256_andnot_ps(dummy_mask,r00);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq00);
            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq00,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv00,pmecorrV);
            velec            = _mm256_mul_ps(qq00,velec);

            d                = _mm256_sub_ps(r00,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx00);
            ty               = _mm256_mul_ps(fscal,dy00);
            tz               = _mm256_mul_ps(fscal,dz00);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq01,rcutoff2))
            {

            r01              = _mm256_mul_ps(rsq01,rinv01);
            r01              = _mm256_andnot_ps(dummy_mask,r01);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq01);
            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq01,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv01,pmecorrV);
            velec            = _mm256_mul_ps(qq01,velec);

            d                = _mm256_sub_ps(r01,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx01);
            ty               = _mm256_mul_ps(fscal,dy01);
            tz               = _mm256_mul_ps(fscal,dz01);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq02,rcutoff2))
            {

            r02              = _mm256_mul_ps(rsq02,rinv02);
            r02              = _mm256_andnot_ps(dummy_mask,r02);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq02);
            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq02,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv02,pmecorrV);
            velec            = _mm256_mul_ps(qq02,velec);

            d                = _mm256_sub_ps(r02,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx02);
            ty               = _mm256_mul_ps(fscal,dy02);
            tz               = _mm256_mul_ps(fscal,dz02);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq10,rcutoff2))
            {

            r10              = _mm256_mul_ps(rsq10,rinv10);
            r10              = _mm256_andnot_ps(dummy_mask,r10);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq10);
            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq10,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1181             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1182             velec            = _mm256_sub_ps(rinv10,pmecorrV);
1183             velec            = _mm256_mul_ps(qq10,velec);
1184             
1185             d                = _mm256_sub_ps(r10,rswitch);
1186             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1187             d2               = _mm256_mul_ps(d,d);
1188             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1189
1190             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1191
1192             /* Evaluate switch function */
1193             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1194             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1195             velec            = _mm256_mul_ps(velec,sw);
1196             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1197
1198             /* Update potential sum for this i atom from the interaction with this j atom. */
1199             velec            = _mm256_and_ps(velec,cutoff_mask);
1200             velec            = _mm256_andnot_ps(dummy_mask,velec);
1201             velecsum         = _mm256_add_ps(velecsum,velec);
1202
1203             fscal            = felec;
1204
1205             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1206
1207             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1208
1209             /* Calculate temporary vectorial force */
1210             tx               = _mm256_mul_ps(fscal,dx10);
1211             ty               = _mm256_mul_ps(fscal,dy10);
1212             tz               = _mm256_mul_ps(fscal,dz10);
1213
1214             /* Update vectorial force */
1215             fix1             = _mm256_add_ps(fix1,tx);
1216             fiy1             = _mm256_add_ps(fiy1,ty);
1217             fiz1             = _mm256_add_ps(fiz1,tz);
1218
1219             fjx0             = _mm256_add_ps(fjx0,tx);
1220             fjy0             = _mm256_add_ps(fjy0,ty);
1221             fjz0             = _mm256_add_ps(fjz0,tz);
1222
1223             }
1224
1225             /**************************
1226              * CALCULATE INTERACTIONS *
1227              **************************/
1228
1229             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1230             {
1231
1232             r11              = _mm256_mul_ps(rsq11,rinv11);
1233             r11              = _mm256_andnot_ps(dummy_mask,r11);
1234
1235             /* EWALD ELECTROSTATICS */
1236             
1237             /* Analytical PME correction */
1238             zeta2            = _mm256_mul_ps(beta2,rsq11);
1239             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1240             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1241             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1242             felec            = _mm256_mul_ps(qq11,felec);
1243             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1244             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1245             velec            = _mm256_sub_ps(rinv11,pmecorrV);
1246             velec            = _mm256_mul_ps(qq11,velec);
1247             
1248             d                = _mm256_sub_ps(r11,rswitch);
1249             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1250             d2               = _mm256_mul_ps(d,d);
1251             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1252
1253             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1254
1255             /* Evaluate switch function */
1256             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1257             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1258             velec            = _mm256_mul_ps(velec,sw);
1259             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1260
1261             /* Update potential sum for this i atom from the interaction with this j atom. */
1262             velec            = _mm256_and_ps(velec,cutoff_mask);
1263             velec            = _mm256_andnot_ps(dummy_mask,velec);
1264             velecsum         = _mm256_add_ps(velecsum,velec);
1265
1266             fscal            = felec;
1267
1268             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1269
1270             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1271
1272             /* Calculate temporary vectorial force */
1273             tx               = _mm256_mul_ps(fscal,dx11);
1274             ty               = _mm256_mul_ps(fscal,dy11);
1275             tz               = _mm256_mul_ps(fscal,dz11);
1276
1277             /* Update vectorial force */
1278             fix1             = _mm256_add_ps(fix1,tx);
1279             fiy1             = _mm256_add_ps(fiy1,ty);
1280             fiz1             = _mm256_add_ps(fiz1,tz);
1281
1282             fjx1             = _mm256_add_ps(fjx1,tx);
1283             fjy1             = _mm256_add_ps(fjy1,ty);
1284             fjz1             = _mm256_add_ps(fjz1,tz);
1285
1286             }
1287
1288             /**************************
1289              * CALCULATE INTERACTIONS *
1290              **************************/
1291
1292             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1293             {
1294
1295             r12              = _mm256_mul_ps(rsq12,rinv12);
1296             r12              = _mm256_andnot_ps(dummy_mask,r12);
1297
1298             /* EWALD ELECTROSTATICS */
1299             
1300             /* Analytical PME correction */
1301             zeta2            = _mm256_mul_ps(beta2,rsq12);
1302             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1303             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1304             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1305             felec            = _mm256_mul_ps(qq12,felec);
1306             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1307             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1308             velec            = _mm256_sub_ps(rinv12,pmecorrV);
1309             velec            = _mm256_mul_ps(qq12,velec);
1310             
1311             d                = _mm256_sub_ps(r12,rswitch);
1312             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1313             d2               = _mm256_mul_ps(d,d);
1314             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1315
1316             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1317
1318             /* Evaluate switch function */
1319             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1320             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1321             velec            = _mm256_mul_ps(velec,sw);
1322             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1323
1324             /* Update potential sum for this i atom from the interaction with this j atom. */
1325             velec            = _mm256_and_ps(velec,cutoff_mask);
1326             velec            = _mm256_andnot_ps(dummy_mask,velec);
1327             velecsum         = _mm256_add_ps(velecsum,velec);
1328
1329             fscal            = felec;
1330
1331             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1332
1333             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1334
1335             /* Calculate temporary vectorial force */
1336             tx               = _mm256_mul_ps(fscal,dx12);
1337             ty               = _mm256_mul_ps(fscal,dy12);
1338             tz               = _mm256_mul_ps(fscal,dz12);
1339
1340             /* Update vectorial force */
1341             fix1             = _mm256_add_ps(fix1,tx);
1342             fiy1             = _mm256_add_ps(fiy1,ty);
1343             fiz1             = _mm256_add_ps(fiz1,tz);
1344
1345             fjx2             = _mm256_add_ps(fjx2,tx);
1346             fjy2             = _mm256_add_ps(fjy2,ty);
1347             fjz2             = _mm256_add_ps(fjz2,tz);
1348
1349             }
1350
1351             /**************************
1352              * CALCULATE INTERACTIONS *
1353              **************************/
1354
1355             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1356             {
1357
1358             r20              = _mm256_mul_ps(rsq20,rinv20);
1359             r20              = _mm256_andnot_ps(dummy_mask,r20);
1360
1361             /* EWALD ELECTROSTATICS */
1362             
1363             /* Analytical PME correction */
1364             zeta2            = _mm256_mul_ps(beta2,rsq20);
1365             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1366             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1367             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1368             felec            = _mm256_mul_ps(qq20,felec);
1369             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1370             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1371             velec            = _mm256_sub_ps(rinv20,pmecorrV);
1372             velec            = _mm256_mul_ps(qq20,velec);
1373             
1374             d                = _mm256_sub_ps(r20,rswitch);
1375             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1376             d2               = _mm256_mul_ps(d,d);
1377             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1378
1379             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1380
1381             /* Evaluate switch function */
1382             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1383             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1384             velec            = _mm256_mul_ps(velec,sw);
1385             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1386
1387             /* Update potential sum for this i atom from the interaction with this j atom. */
1388             velec            = _mm256_and_ps(velec,cutoff_mask);
1389             velec            = _mm256_andnot_ps(dummy_mask,velec);
1390             velecsum         = _mm256_add_ps(velecsum,velec);
1391
1392             fscal            = felec;
1393
1394             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1395
1396             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1397
1398             /* Calculate temporary vectorial force */
1399             tx               = _mm256_mul_ps(fscal,dx20);
1400             ty               = _mm256_mul_ps(fscal,dy20);
1401             tz               = _mm256_mul_ps(fscal,dz20);
1402
1403             /* Update vectorial force */
1404             fix2             = _mm256_add_ps(fix2,tx);
1405             fiy2             = _mm256_add_ps(fiy2,ty);
1406             fiz2             = _mm256_add_ps(fiz2,tz);
1407
1408             fjx0             = _mm256_add_ps(fjx0,tx);
1409             fjy0             = _mm256_add_ps(fjy0,ty);
1410             fjz0             = _mm256_add_ps(fjz0,tz);
1411
1412             }
1413
1414             /**************************
1415              * CALCULATE INTERACTIONS *
1416              **************************/
1417
1418             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1419             {
1420
1421             r21              = _mm256_mul_ps(rsq21,rinv21);
1422             r21              = _mm256_andnot_ps(dummy_mask,r21);
1423
1424             /* EWALD ELECTROSTATICS */
1425             
1426             /* Analytical PME correction */
1427             zeta2            = _mm256_mul_ps(beta2,rsq21);
1428             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1429             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1430             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1431             felec            = _mm256_mul_ps(qq21,felec);
1432             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1433             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1434             velec            = _mm256_sub_ps(rinv21,pmecorrV);
1435             velec            = _mm256_mul_ps(qq21,velec);
1436             
1437             d                = _mm256_sub_ps(r21,rswitch);
1438             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1439             d2               = _mm256_mul_ps(d,d);
1440             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1441
1442             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1443
1444             /* Evaluate switch function */
1445             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1446             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1447             velec            = _mm256_mul_ps(velec,sw);
1448             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1449
1450             /* Update potential sum for this i atom from the interaction with this j atom. */
1451             velec            = _mm256_and_ps(velec,cutoff_mask);
1452             velec            = _mm256_andnot_ps(dummy_mask,velec);
1453             velecsum         = _mm256_add_ps(velecsum,velec);
1454
1455             fscal            = felec;
1456
1457             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1458
1459             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1460
1461             /* Calculate temporary vectorial force */
1462             tx               = _mm256_mul_ps(fscal,dx21);
1463             ty               = _mm256_mul_ps(fscal,dy21);
1464             tz               = _mm256_mul_ps(fscal,dz21);
1465
1466             /* Update vectorial force */
1467             fix2             = _mm256_add_ps(fix2,tx);
1468             fiy2             = _mm256_add_ps(fiy2,ty);
1469             fiz2             = _mm256_add_ps(fiz2,tz);
1470
1471             fjx1             = _mm256_add_ps(fjx1,tx);
1472             fjy1             = _mm256_add_ps(fjy1,ty);
1473             fjz1             = _mm256_add_ps(fjz1,tz);
1474
1475             }
1476
1477             /**************************
1478              * CALCULATE INTERACTIONS *
1479              **************************/
1480
1481             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1482             {
1483
1484             r22              = _mm256_mul_ps(rsq22,rinv22);
1485             r22              = _mm256_andnot_ps(dummy_mask,r22);
1486
1487             /* EWALD ELECTROSTATICS */
1488             
1489             /* Analytical PME correction */
1490             zeta2            = _mm256_mul_ps(beta2,rsq22);
1491             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1492             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1493             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1494             felec            = _mm256_mul_ps(qq22,felec);
1495             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1496             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1497             velec            = _mm256_sub_ps(rinv22,pmecorrV);
1498             velec            = _mm256_mul_ps(qq22,velec);
1499             
1500             d                = _mm256_sub_ps(r22,rswitch);
1501             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1502             d2               = _mm256_mul_ps(d,d);
1503             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1504
1505             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1506
1507             /* Evaluate switch function */
1508             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1509             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1510             velec            = _mm256_mul_ps(velec,sw);
1511             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1512
1513             /* Update potential sum for this i atom from the interaction with this j atom. */
1514             velec            = _mm256_and_ps(velec,cutoff_mask);
1515             velec            = _mm256_andnot_ps(dummy_mask,velec);
1516             velecsum         = _mm256_add_ps(velecsum,velec);
1517
1518             fscal            = felec;
1519
1520             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1521
1522             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1523
1524             /* Calculate temporary vectorial force */
1525             tx               = _mm256_mul_ps(fscal,dx22);
1526             ty               = _mm256_mul_ps(fscal,dy22);
1527             tz               = _mm256_mul_ps(fscal,dz22);
1528
1529             /* Update vectorial force */
1530             fix2             = _mm256_add_ps(fix2,tx);
1531             fiy2             = _mm256_add_ps(fiy2,ty);
1532             fiz2             = _mm256_add_ps(fiz2,tz);
1533
1534             fjx2             = _mm256_add_ps(fjx2,tx);
1535             fjy2             = _mm256_add_ps(fjy2,ty);
1536             fjz2             = _mm256_add_ps(fjz2,tz);
1537
1538             }
1539
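            /* Note (added): padded neighbor entries (jnrlist < 0) get their force
             * pointer aimed at the local scratch buffer, so the swizzled j-force
             * store below stays branch-free without touching real force memory. */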
1540             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1541             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1542             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1543             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1544             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1545             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1546             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1547             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1548
1549             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1550                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1551
1552             /* Inner loop uses 981 flops */
1553         }
1554
1555         /* End of innermost loop */
1556
1557         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1558                                                  f+i_coord_offset,fshift+i_shift_offset);
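        /* Note (added): the call above reduces the nine SIMD force accumulators
         * across lanes and adds the totals to f[] and to the shift-force array
         * used for the virial. */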
1559
1560         ggid                        = gid[iidx];
1561         /* Update potential energies */
1562         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
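        /* Note (added): velecsum is reduced across the eight SIMD lanes and added
         * to the electrostatic energy bin of energy group ggid. */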
1563
1564         /* Increment number of inner iterations */
1565         inneriter                  += j_index_end - j_index_start;
1566
1567         /* Outer loop uses 19 flops */
1568     }
1569
1570     /* Increment number of outer iterations */
1571     outeriter        += nri;
1572
1573     /* Update outer/inner flops */
1574
1575     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*981);
1576 }
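/*
 * Illustrative scalar reference for one charge pair (added; not part of the
 * generated kernel): it spells out the arithmetic the SIMD blocks above
 * implement, i.e. real-space Ewald electrostatics with a quintic potential
 * switch. A minimal sketch under the assumption that erfc()/exp() from
 * <math.h> are acceptable here; the kernels use the gmx_mm256_pmecorr*
 * polynomial approximations instead of erfc().
 */
static void
ref_ewald_switch_pair(real qq, real r, real beta,
                      real rswitch, real rcutoff,
                      real *velec, real *fscal)
{
    const real two_over_sqrtpi = 1.1283791670955126; /* 2/sqrt(pi) */
    real       D,d,sw,dsw,vraw,fraw;

    if (r >= rcutoff)
    {
        *velec = 0;
        *fscal = 0;
        return;
    }
    /* Unswitched real-space Ewald potential and scalar force F/r */
    vraw = qq*erfc(beta*r)/r;
    fraw = qq*(erfc(beta*r)/r + beta*two_over_sqrtpi*exp(-beta*beta*r*r))/(r*r);

    /* Quintic switch, identical to the swV / swF coefficients in the kernels */
    D   = rcutoff - rswitch;
    d   = (r > rswitch) ? r - rswitch : 0;
    sw  = 1 + d*d*d*(-10.0/(D*D*D) + d*(15.0/(D*D*D*D) - d*6.0/(D*D*D*D*D)));
    dsw = d*d*(-30.0/(D*D*D) + d*(60.0/(D*D*D*D) - d*30.0/(D*D*D*D*D)));

    *velec = vraw*sw;
    /* Same identity as the kernel comments: fscal_switched = fscal*sw - v*dsw/r */
    *fscal = fraw*sw - vraw*dsw/r;
}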
1577 /*
1578  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_single
1579  * Electrostatics interaction: Ewald
1580  * VdW interaction:            None
1581  * Geometry:                   Water3-Water3
1582  * Calculate force/pot:        Force
1583  */
1584 void
1585 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_single
1586                     (t_nblist * gmx_restrict                nlist,
1587                      rvec * gmx_restrict                    xx,
1588                      rvec * gmx_restrict                    ff,
1589                      t_forcerec * gmx_restrict              fr,
1590                      t_mdatoms * gmx_restrict               mdatoms,
1591                      nb_kernel_data_t * gmx_restrict        kernel_data,
1592                      t_nrnb * gmx_restrict                  nrnb)
1593 {
1594     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1595      * just 0 for non-waters.
1596      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1597      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1598      */
1599     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1600     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1601     int              jnrA,jnrB,jnrC,jnrD;
1602     int              jnrE,jnrF,jnrG,jnrH;
1603     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1604     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1605     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1606     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1607     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1608     real             rcutoff_scalar;
1609     real             *shiftvec,*fshift,*x,*f;
1610     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1611     real             scratch[4*DIM];
1612     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1613     real *           vdwioffsetptr0;
1614     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1615     real *           vdwioffsetptr1;
1616     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1617     real *           vdwioffsetptr2;
1618     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1619     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1620     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1621     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1622     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1623     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1624     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1625     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1626     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1627     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1628     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1629     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1630     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1631     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1632     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1633     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1634     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1635     real             *charge;
1636     __m256i          ewitab;
1637     __m128i          ewitab_lo,ewitab_hi;
1638     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1639     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1640     real             *ewtab;
1641     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1642     real             rswitch_scalar,d_scalar;
1643     __m256           dummy_mask,cutoff_mask;
1644     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1645     __m256           one     = _mm256_set1_ps(1.0);
1646     __m256           two     = _mm256_set1_ps(2.0);
1647     x                = xx[0];
1648     f                = ff[0];
1649
1650     nri              = nlist->nri;
1651     iinr             = nlist->iinr;
1652     jindex           = nlist->jindex;
1653     jjnr             = nlist->jjnr;
1654     shiftidx         = nlist->shift;
1655     gid              = nlist->gid;
1656     shiftvec         = fr->shift_vec[0];
1657     fshift           = fr->fshift[0];
1658     facel            = _mm256_set1_ps(fr->epsfac);
1659     charge           = mdatoms->chargeA;
1660
1661     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1662     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
1663     beta2            = _mm256_mul_ps(beta,beta);
1664     beta3            = _mm256_mul_ps(beta,beta2);
1665
1666     ewtab            = fr->ic->tabq_coul_FDV0;
1667     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1668     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
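    /* Note (added): the Ewald table pointer and scales are set up for uniformity
     * with the tabulated kernels, but this kernel takes the analytical
     * PME-correction path, so they appear to go unused below (an observation
     * about the generated code, not a contract). */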
1669
1670     /* Setup water-specific parameters */
1671     inr              = nlist->iinr[0];
1672     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1673     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1674     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1675
1676     jq0              = _mm256_set1_ps(charge[inr+0]);
1677     jq1              = _mm256_set1_ps(charge[inr+1]);
1678     jq2              = _mm256_set1_ps(charge[inr+2]);
1679     qq00             = _mm256_mul_ps(iq0,jq0);
1680     qq01             = _mm256_mul_ps(iq0,jq1);
1681     qq02             = _mm256_mul_ps(iq0,jq2);
1682     qq10             = _mm256_mul_ps(iq1,jq0);
1683     qq11             = _mm256_mul_ps(iq1,jq1);
1684     qq12             = _mm256_mul_ps(iq1,jq2);
1685     qq20             = _mm256_mul_ps(iq2,jq0);
1686     qq21             = _mm256_mul_ps(iq2,jq1);
1687     qq22             = _mm256_mul_ps(iq2,jq2);
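    /* Note (added): the i and j waters share one topology, so the nine charge
     * products qq00..qq22 are formed once here instead of inside the loops. */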
1688
1689     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1690     rcutoff_scalar   = fr->rcoulomb;
1691     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1692     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1693
1694     rswitch_scalar   = fr->rcoulomb_switch;
1695     rswitch          = _mm256_set1_ps(rswitch_scalar);
1696     /* Setup switch parameters */
1697     d_scalar         = rcutoff_scalar-rswitch_scalar;
1698     d                = _mm256_set1_ps(d_scalar);
1699     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1700     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1701     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1702     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1703     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1704     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
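    /* Note (added): these coefficients come from the quintic switch
     * S(d) = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5 with S(0)=1, S'(0)=S''(0)=0 and
     * S(D)=S'(D)=S''(D)=0 for D = rcutoff - rswitch; swF2..swF4 are simply
     * 3*swV3, 4*swV4, 5*swV5, the coefficients of S'(d)/d^2. */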
1705
1706     /* Silence spurious compiler warnings about possibly uninitialized variables */
1707     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1708     j_coord_offsetA = 0;
1709     j_coord_offsetB = 0;
1710     j_coord_offsetC = 0;
1711     j_coord_offsetD = 0;
1712     j_coord_offsetE = 0;
1713     j_coord_offsetF = 0;
1714     j_coord_offsetG = 0;
1715     j_coord_offsetH = 0;
1716
1717     outeriter        = 0;
1718     inneriter        = 0;
1719
1720     for(iidx=0;iidx<4*DIM;iidx++)
1721     {
1722         scratch[iidx] = 0.0;
1723     }
1724
1725     /* Start outer loop over neighborlists */
1726     for(iidx=0; iidx<nri; iidx++)
1727     {
1728         /* Load shift vector for this list */
1729         i_shift_offset   = DIM*shiftidx[iidx];
1730
1731         /* Load limits for loop over neighbors */
1732         j_index_start    = jindex[iidx];
1733         j_index_end      = jindex[iidx+1];
1734
1735         /* Get outer coordinate index */
1736         inr              = iinr[iidx];
1737         i_coord_offset   = DIM*inr;
1738
1739         /* Load i particle coords and add shift vector */
1740         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1741                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
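        /* Note (added): the shift vector applies the periodic image for this list,
         * so all distances below are minimum-image without further PBC checks; the
         * matching shift force is accumulated when the i forces are reduced. */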
1742
1743         fix0             = _mm256_setzero_ps();
1744         fiy0             = _mm256_setzero_ps();
1745         fiz0             = _mm256_setzero_ps();
1746         fix1             = _mm256_setzero_ps();
1747         fiy1             = _mm256_setzero_ps();
1748         fiz1             = _mm256_setzero_ps();
1749         fix2             = _mm256_setzero_ps();
1750         fiy2             = _mm256_setzero_ps();
1751         fiz2             = _mm256_setzero_ps();
1752
1753         /* Start inner kernel loop */
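        /* Note (added): this full-width loop runs only while all eight jjnr
         * entries are valid; the generator emits a separate masked epilogue loop
         * (as in the VF kernel above) for a trailing, partially filled batch. */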
1754         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1755         {
1756
1757             /* Get j neighbor indices and coordinate offsets */
1758             jnrA             = jjnr[jidx];
1759             jnrB             = jjnr[jidx+1];
1760             jnrC             = jjnr[jidx+2];
1761             jnrD             = jjnr[jidx+3];
1762             jnrE             = jjnr[jidx+4];
1763             jnrF             = jjnr[jidx+5];
1764             jnrG             = jjnr[jidx+6];
1765             jnrH             = jjnr[jidx+7];
1766             j_coord_offsetA  = DIM*jnrA;
1767             j_coord_offsetB  = DIM*jnrB;
1768             j_coord_offsetC  = DIM*jnrC;
1769             j_coord_offsetD  = DIM*jnrD;
1770             j_coord_offsetE  = DIM*jnrE;
1771             j_coord_offsetF  = DIM*jnrF;
1772             j_coord_offsetG  = DIM*jnrG;
1773             j_coord_offsetH  = DIM*jnrH;
1774
1775             /* Load j atom coordinates */
1776             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1777                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1778                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1779                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1780                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1781
1782             /* Calculate displacement vector */
1783             dx00             = _mm256_sub_ps(ix0,jx0);
1784             dy00             = _mm256_sub_ps(iy0,jy0);
1785             dz00             = _mm256_sub_ps(iz0,jz0);
1786             dx01             = _mm256_sub_ps(ix0,jx1);
1787             dy01             = _mm256_sub_ps(iy0,jy1);
1788             dz01             = _mm256_sub_ps(iz0,jz1);
1789             dx02             = _mm256_sub_ps(ix0,jx2);
1790             dy02             = _mm256_sub_ps(iy0,jy2);
1791             dz02             = _mm256_sub_ps(iz0,jz2);
1792             dx10             = _mm256_sub_ps(ix1,jx0);
1793             dy10             = _mm256_sub_ps(iy1,jy0);
1794             dz10             = _mm256_sub_ps(iz1,jz0);
1795             dx11             = _mm256_sub_ps(ix1,jx1);
1796             dy11             = _mm256_sub_ps(iy1,jy1);
1797             dz11             = _mm256_sub_ps(iz1,jz1);
1798             dx12             = _mm256_sub_ps(ix1,jx2);
1799             dy12             = _mm256_sub_ps(iy1,jy2);
1800             dz12             = _mm256_sub_ps(iz1,jz2);
1801             dx20             = _mm256_sub_ps(ix2,jx0);
1802             dy20             = _mm256_sub_ps(iy2,jy0);
1803             dz20             = _mm256_sub_ps(iz2,jz0);
1804             dx21             = _mm256_sub_ps(ix2,jx1);
1805             dy21             = _mm256_sub_ps(iy2,jy1);
1806             dz21             = _mm256_sub_ps(iz2,jz1);
1807             dx22             = _mm256_sub_ps(ix2,jx2);
1808             dy22             = _mm256_sub_ps(iy2,jy2);
1809             dz22             = _mm256_sub_ps(iz2,jz2);
1810
1811             /* Calculate squared distances and the quantities derived from them */
1812             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1813             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1814             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1815             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1816             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1817             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1818             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1819             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1820             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1821
1822             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1823             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1824             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1825             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1826             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1827             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1828             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1829             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1830             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
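            /* Note (added): gmx_mm256_invsqrt_ps returns 1/sqrt(rsq); in the AVX
             * single-precision helpers this is typically _mm256_rsqrt_ps refined by
             * one Newton-Raphson step (an assumption about the helper, which is
             * defined elsewhere). */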
1831
1832             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1833             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1834             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1835             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1836             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1837             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1838             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1839             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1840             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1841
1842             fjx0             = _mm256_setzero_ps();
1843             fjy0             = _mm256_setzero_ps();
1844             fjz0             = _mm256_setzero_ps();
1845             fjx1             = _mm256_setzero_ps();
1846             fjy1             = _mm256_setzero_ps();
1847             fjz1             = _mm256_setzero_ps();
1848             fjx2             = _mm256_setzero_ps();
1849             fjy2             = _mm256_setzero_ps();
1850             fjz2             = _mm256_setzero_ps();
1851
1852             /**************************
1853              * CALCULATE INTERACTIONS *
1854              **************************/
1855
1856             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1857             {
1858
1859             r00              = _mm256_mul_ps(rsq00,rinv00);
1860
1861             /* EWALD ELECTROSTATICS */
1862             
1863             /* Analytical PME correction */
1864             zeta2            = _mm256_mul_ps(beta2,rsq00);
1865             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1866             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1867             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1868             felec            = _mm256_mul_ps(qq00,felec);
1869             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1870             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1871             velec            = _mm256_sub_ps(rinv00,pmecorrV);
1872             velec            = _mm256_mul_ps(qq00,velec);
1873             
1874             d                = _mm256_sub_ps(r00,rswitch);
1875             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1876             d2               = _mm256_mul_ps(d,d);
1877             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1878
1879             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1880
1881             /* Evaluate switch function */
1882             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1883             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
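            /* Note (added): even in this force-only kernel velec must be
             * evaluated, because the switch contributes the -v*dsw/r term to the
             * force; only the accumulation into a potential sum is omitted. */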
1884             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1885
1886             fscal            = felec;
1887
1888             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1889
1890             /* Calculate temporary vectorial force */
1891             tx               = _mm256_mul_ps(fscal,dx00);
1892             ty               = _mm256_mul_ps(fscal,dy00);
1893             tz               = _mm256_mul_ps(fscal,dz00);
1894
1895             /* Update vectorial force */
1896             fix0             = _mm256_add_ps(fix0,tx);
1897             fiy0             = _mm256_add_ps(fiy0,ty);
1898             fiz0             = _mm256_add_ps(fiz0,tz);
1899
1900             fjx0             = _mm256_add_ps(fjx0,tx);
1901             fjy0             = _mm256_add_ps(fjy0,ty);
1902             fjz0             = _mm256_add_ps(fjz0,tz);
1903
1904             }
1905
1906             /**************************
1907              * CALCULATE INTERACTIONS *
1908              **************************/
1909
1910             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1911             {
1912
1913             r01              = _mm256_mul_ps(rsq01,rinv01);
1914
1915             /* EWALD ELECTROSTATICS */
1916             
1917             /* Analytical PME correction */
1918             zeta2            = _mm256_mul_ps(beta2,rsq01);
1919             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1920             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1921             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1922             felec            = _mm256_mul_ps(qq01,felec);
1923             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1924             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1925             velec            = _mm256_sub_ps(rinv01,pmecorrV);
1926             velec            = _mm256_mul_ps(qq01,velec);
1927             
1928             d                = _mm256_sub_ps(r01,rswitch);
1929             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1930             d2               = _mm256_mul_ps(d,d);
1931             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1932
1933             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1934
1935             /* Evaluate switch function */
1936             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1937             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
1938             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1939
1940             fscal            = felec;
1941
1942             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1943
1944             /* Calculate temporary vectorial force */
1945             tx               = _mm256_mul_ps(fscal,dx01);
1946             ty               = _mm256_mul_ps(fscal,dy01);
1947             tz               = _mm256_mul_ps(fscal,dz01);
1948
1949             /* Update vectorial force */
1950             fix0             = _mm256_add_ps(fix0,tx);
1951             fiy0             = _mm256_add_ps(fiy0,ty);
1952             fiz0             = _mm256_add_ps(fiz0,tz);
1953
1954             fjx1             = _mm256_add_ps(fjx1,tx);
1955             fjy1             = _mm256_add_ps(fjy1,ty);
1956             fjz1             = _mm256_add_ps(fjz1,tz);
1957
1958             }
1959
1960             /**************************
1961              * CALCULATE INTERACTIONS *
1962              **************************/
1963
1964             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1965             {
1966
1967             r02              = _mm256_mul_ps(rsq02,rinv02);
1968
1969             /* EWALD ELECTROSTATICS */
1970             
1971             /* Analytical PME correction */
1972             zeta2            = _mm256_mul_ps(beta2,rsq02);
1973             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1974             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1975             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1976             felec            = _mm256_mul_ps(qq02,felec);
1977             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1978             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1979             velec            = _mm256_sub_ps(rinv02,pmecorrV);
1980             velec            = _mm256_mul_ps(qq02,velec);
1981             
1982             d                = _mm256_sub_ps(r02,rswitch);
1983             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1984             d2               = _mm256_mul_ps(d,d);
1985             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1986
1987             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1988
1989             /* Evaluate switch function */
1990             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1991             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
1992             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1993
1994             fscal            = felec;
1995
1996             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1997
1998             /* Calculate temporary vectorial force */
1999             tx               = _mm256_mul_ps(fscal,dx02);
2000             ty               = _mm256_mul_ps(fscal,dy02);
2001             tz               = _mm256_mul_ps(fscal,dz02);
2002
2003             /* Update vectorial force */
2004             fix0             = _mm256_add_ps(fix0,tx);
2005             fiy0             = _mm256_add_ps(fiy0,ty);
2006             fiz0             = _mm256_add_ps(fiz0,tz);
2007
2008             fjx2             = _mm256_add_ps(fjx2,tx);
2009             fjy2             = _mm256_add_ps(fjy2,ty);
2010             fjz2             = _mm256_add_ps(fjz2,tz);
2011
2012             }
2013
2014             /**************************
2015              * CALCULATE INTERACTIONS *
2016              **************************/
2017
2018             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2019             {
2020
2021             r10              = _mm256_mul_ps(rsq10,rinv10);
2022
2023             /* EWALD ELECTROSTATICS */
2024             
2025             /* Analytical PME correction */
2026             zeta2            = _mm256_mul_ps(beta2,rsq10);
2027             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
2028             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2029             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2030             felec            = _mm256_mul_ps(qq10,felec);
2031             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2032             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2033             velec            = _mm256_sub_ps(rinv10,pmecorrV);
2034             velec            = _mm256_mul_ps(qq10,velec);
2035             
2036             d                = _mm256_sub_ps(r10,rswitch);
2037             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2038             d2               = _mm256_mul_ps(d,d);
2039             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2040
2041             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2042
2043             /* Evaluate switch function */
2044             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2045             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
2046             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2047
2048             fscal            = felec;
2049
2050             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2051
2052             /* Calculate temporary vectorial force */
2053             tx               = _mm256_mul_ps(fscal,dx10);
2054             ty               = _mm256_mul_ps(fscal,dy10);
2055             tz               = _mm256_mul_ps(fscal,dz10);
2056
2057             /* Update vectorial force */
2058             fix1             = _mm256_add_ps(fix1,tx);
2059             fiy1             = _mm256_add_ps(fiy1,ty);
2060             fiz1             = _mm256_add_ps(fiz1,tz);
2061
2062             fjx0             = _mm256_add_ps(fjx0,tx);
2063             fjy0             = _mm256_add_ps(fjy0,ty);
2064             fjz0             = _mm256_add_ps(fjz0,tz);
2065
2066             }
2067
2068             /**************************
2069              * CALCULATE INTERACTIONS *
2070              **************************/
2071
2072             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2073             {
2074
2075             r11              = _mm256_mul_ps(rsq11,rinv11);
2076
2077             /* EWALD ELECTROSTATICS */
2078             
2079             /* Analytical PME correction */
2080             zeta2            = _mm256_mul_ps(beta2,rsq11);
2081             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2082             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2083             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2084             felec            = _mm256_mul_ps(qq11,felec);
2085             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2086             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2087             velec            = _mm256_sub_ps(rinv11,pmecorrV);
2088             velec            = _mm256_mul_ps(qq11,velec);
2089             
2090             d                = _mm256_sub_ps(r11,rswitch);
2091             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2092             d2               = _mm256_mul_ps(d,d);
2093             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2094
2095             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2096
2097             /* Evaluate switch function */
2098             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2099             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2100             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2101
2102             fscal            = felec;
2103
2104             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2105
2106             /* Calculate temporary vectorial force */
2107             tx               = _mm256_mul_ps(fscal,dx11);
2108             ty               = _mm256_mul_ps(fscal,dy11);
2109             tz               = _mm256_mul_ps(fscal,dz11);
2110
2111             /* Update vectorial force */
2112             fix1             = _mm256_add_ps(fix1,tx);
2113             fiy1             = _mm256_add_ps(fiy1,ty);
2114             fiz1             = _mm256_add_ps(fiz1,tz);
2115
2116             fjx1             = _mm256_add_ps(fjx1,tx);
2117             fjy1             = _mm256_add_ps(fjy1,ty);
2118             fjz1             = _mm256_add_ps(fjz1,tz);
2119
2120             }
2121
2122             /**************************
2123              * CALCULATE INTERACTIONS *
2124              **************************/
2125
2126             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2127             {
2128
2129             r12              = _mm256_mul_ps(rsq12,rinv12);
2130
2131             /* EWALD ELECTROSTATICS */
2132             
2133             /* Analytical PME correction */
2134             zeta2            = _mm256_mul_ps(beta2,rsq12);
2135             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2136             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2137             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2138             felec            = _mm256_mul_ps(qq12,felec);
2139             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2140             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2141             velec            = _mm256_sub_ps(rinv12,pmecorrV);
2142             velec            = _mm256_mul_ps(qq12,velec);
2143             
2144             d                = _mm256_sub_ps(r12,rswitch);
2145             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2146             d2               = _mm256_mul_ps(d,d);
2147             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2148
2149             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2150
2151             /* Evaluate switch function */
2152             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2153             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2154             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2155
2156             fscal            = felec;
2157
2158             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2159
2160             /* Calculate temporary vectorial force */
2161             tx               = _mm256_mul_ps(fscal,dx12);
2162             ty               = _mm256_mul_ps(fscal,dy12);
2163             tz               = _mm256_mul_ps(fscal,dz12);
2164
2165             /* Update vectorial force */
2166             fix1             = _mm256_add_ps(fix1,tx);
2167             fiy1             = _mm256_add_ps(fiy1,ty);
2168             fiz1             = _mm256_add_ps(fiz1,tz);
2169
2170             fjx2             = _mm256_add_ps(fjx2,tx);
2171             fjy2             = _mm256_add_ps(fjy2,ty);
2172             fjz2             = _mm256_add_ps(fjz2,tz);
2173
2174             }
2175
2176             /**************************
2177              * CALCULATE INTERACTIONS *
2178              **************************/
2179
2180             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2181             {
2182
2183             r20              = _mm256_mul_ps(rsq20,rinv20);
2184
2185             /* EWALD ELECTROSTATICS */
2186             
2187             /* Analytical PME correction */
2188             zeta2            = _mm256_mul_ps(beta2,rsq20);
2189             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
2190             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2191             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2192             felec            = _mm256_mul_ps(qq20,felec);
2193             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2194             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2195             velec            = _mm256_sub_ps(rinv20,pmecorrV);
2196             velec            = _mm256_mul_ps(qq20,velec);

            d                = _mm256_sub_ps(r20,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx20);
            ty               = _mm256_mul_ps(fscal,dy20);
            tz               = _mm256_mul_ps(fscal,dz20);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq21,rcutoff2))
            {

            r21              = _mm256_mul_ps(rsq21,rinv21);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq21);
            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq21,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv21,pmecorrV);
            velec            = _mm256_mul_ps(qq21,velec);

            d                = _mm256_sub_ps(r21,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx21);
            ty               = _mm256_mul_ps(fscal,dy21);
            tz               = _mm256_mul_ps(fscal,dz21);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq22,rcutoff2))
            {

            r22              = _mm256_mul_ps(rsq22,rinv22);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq22);
            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq22,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv22,pmecorrV);
            velec            = _mm256_mul_ps(qq22,velec);

            d                = _mm256_sub_ps(r22,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx22);
            ty               = _mm256_mul_ps(fscal,dy22);
            tz               = _mm256_mul_ps(fscal,dz22);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            fjptrA             = f+j_coord_offsetA;
            fjptrB             = f+j_coord_offsetB;
            fjptrC             = f+j_coord_offsetC;
            fjptrD             = f+j_coord_offsetD;
            fjptrE             = f+j_coord_offsetE;
            fjptrF             = f+j_coord_offsetF;
            fjptrG             = f+j_coord_offsetG;
            fjptrH             = f+j_coord_offsetH;

            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
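            /* The fj accumulators hold the j forces in SIMD (SoA) layout;
             * the call above transposes them back to per-atom order and
             * subtracts them from the eight j-water force slots, the
             * Newton's-third-law counterpart of the fi updates above.
             */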

            /* Inner loop uses 945 flops */
        }

        if(jidx<j_index_end)
        {

            /* Get j neighbor index, and coordinate index */
            jnrlistA         = jjnr[jidx];
            jnrlistB         = jjnr[jidx+1];
            jnrlistC         = jjnr[jidx+2];
            jnrlistD         = jjnr[jidx+3];
            jnrlistE         = jjnr[jidx+4];
            jnrlistF         = jjnr[jidx+5];
            jnrlistG         = jjnr[jidx+6];
            jnrlistH         = jjnr[jidx+7];
            /* Sign of each element will be negative for non-real atoms.
             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
             */
            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
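            /* In scalar terms, each 32-bit lane k of the mask above is
             *
             *    dummy_mask[k] = (jjnr[jidx+k] < 0) ? 0xFFFFFFFF : 0x0;
             *
             * Padding indices are clamped to 0 below so the coordinate loads
             * stay in bounds, and the padded lanes are cleared with
             * _mm256_andnot_ps(dummy_mask,...) before any force is summed.
             */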
            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
            j_coord_offsetA  = DIM*jnrA;
            j_coord_offsetB  = DIM*jnrB;
            j_coord_offsetC  = DIM*jnrC;
            j_coord_offsetD  = DIM*jnrD;
            j_coord_offsetE  = DIM*jnrE;
            j_coord_offsetF  = DIM*jnrF;
            j_coord_offsetG  = DIM*jnrG;
            j_coord_offsetH  = DIM*jnrH;

            /* load j atom coordinates */
            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
                                                 x+j_coord_offsetC,x+j_coord_offsetD,
                                                 x+j_coord_offsetE,x+j_coord_offsetF,
                                                 x+j_coord_offsetG,x+j_coord_offsetH,
                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
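            /* The gather above reads x/y/z for the three atoms of eight
             * j-waters and transposes them from per-atom (AoS) memory order
             * into nine SIMD registers, one per coordinate component.
             */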

            /* Calculate displacement vector */
            dx00             = _mm256_sub_ps(ix0,jx0);
            dy00             = _mm256_sub_ps(iy0,jy0);
            dz00             = _mm256_sub_ps(iz0,jz0);
            dx01             = _mm256_sub_ps(ix0,jx1);
            dy01             = _mm256_sub_ps(iy0,jy1);
            dz01             = _mm256_sub_ps(iz0,jz1);
            dx02             = _mm256_sub_ps(ix0,jx2);
            dy02             = _mm256_sub_ps(iy0,jy2);
            dz02             = _mm256_sub_ps(iz0,jz2);
            dx10             = _mm256_sub_ps(ix1,jx0);
            dy10             = _mm256_sub_ps(iy1,jy0);
            dz10             = _mm256_sub_ps(iz1,jz0);
            dx11             = _mm256_sub_ps(ix1,jx1);
            dy11             = _mm256_sub_ps(iy1,jy1);
            dz11             = _mm256_sub_ps(iz1,jz1);
            dx12             = _mm256_sub_ps(ix1,jx2);
            dy12             = _mm256_sub_ps(iy1,jy2);
            dz12             = _mm256_sub_ps(iz1,jz2);
            dx20             = _mm256_sub_ps(ix2,jx0);
            dy20             = _mm256_sub_ps(iy2,jy0);
            dz20             = _mm256_sub_ps(iz2,jz0);
            dx21             = _mm256_sub_ps(ix2,jx1);
            dy21             = _mm256_sub_ps(iy2,jy1);
            dz21             = _mm256_sub_ps(iz2,jz1);
            dx22             = _mm256_sub_ps(ix2,jx2);
            dy22             = _mm256_sub_ps(iy2,jy2);
            dz22             = _mm256_sub_ps(iz2,jz2);

            /* Calculate squared distance and things based on it */
            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);

            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
            rinv22           = gmx_mm256_invsqrt_ps(rsq22);

            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);

            fjx0             = _mm256_setzero_ps();
            fjy0             = _mm256_setzero_ps();
            fjz0             = _mm256_setzero_ps();
            fjx1             = _mm256_setzero_ps();
            fjy1             = _mm256_setzero_ps();
            fjz1             = _mm256_setzero_ps();
            fjx2             = _mm256_setzero_ps();
            fjy2             = _mm256_setzero_ps();
            fjz2             = _mm256_setzero_ps();

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq00,rcutoff2))
            {

            r00              = _mm256_mul_ps(rsq00,rinv00);
            r00              = _mm256_andnot_ps(dummy_mask,r00);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq00);
            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq00,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv00,pmecorrV);
            velec            = _mm256_mul_ps(qq00,velec);

            d                = _mm256_sub_ps(r00,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx00);
            ty               = _mm256_mul_ps(fscal,dy00);
            tz               = _mm256_mul_ps(fscal,dz00);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq01,rcutoff2))
            {

            r01              = _mm256_mul_ps(rsq01,rinv01);
            r01              = _mm256_andnot_ps(dummy_mask,r01);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq01);
            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq01,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv01,pmecorrV);
            velec            = _mm256_mul_ps(qq01,velec);

            d                = _mm256_sub_ps(r01,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx01);
            ty               = _mm256_mul_ps(fscal,dy01);
            tz               = _mm256_mul_ps(fscal,dz01);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq02,rcutoff2))
            {

            r02              = _mm256_mul_ps(rsq02,rinv02);
            r02              = _mm256_andnot_ps(dummy_mask,r02);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq02);
            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq02,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv02,pmecorrV);
            velec            = _mm256_mul_ps(qq02,velec);

            d                = _mm256_sub_ps(r02,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx02);
            ty               = _mm256_mul_ps(fscal,dy02);
            tz               = _mm256_mul_ps(fscal,dz02);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq10,rcutoff2))
            {

            r10              = _mm256_mul_ps(rsq10,rinv10);
            r10              = _mm256_andnot_ps(dummy_mask,r10);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq10);
            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq10,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv10,pmecorrV);
            velec            = _mm256_mul_ps(qq10,velec);

            d                = _mm256_sub_ps(r10,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx10);
            ty               = _mm256_mul_ps(fscal,dy10);
            tz               = _mm256_mul_ps(fscal,dz10);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq11,rcutoff2))
            {

            r11              = _mm256_mul_ps(rsq11,rinv11);
            r11              = _mm256_andnot_ps(dummy_mask,r11);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq11);
            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq11,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv11,pmecorrV);
            velec            = _mm256_mul_ps(qq11,velec);

            d                = _mm256_sub_ps(r11,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx11);
            ty               = _mm256_mul_ps(fscal,dy11);
            tz               = _mm256_mul_ps(fscal,dz11);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq12,rcutoff2))
            {

            r12              = _mm256_mul_ps(rsq12,rinv12);
            r12              = _mm256_andnot_ps(dummy_mask,r12);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq12);
            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq12,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv12,pmecorrV);
            velec            = _mm256_mul_ps(qq12,velec);

            d                = _mm256_sub_ps(r12,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx12);
            ty               = _mm256_mul_ps(fscal,dy12);
            tz               = _mm256_mul_ps(fscal,dz12);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq20,rcutoff2))
            {

            r20              = _mm256_mul_ps(rsq20,rinv20);
            r20              = _mm256_andnot_ps(dummy_mask,r20);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq20);
            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq20,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv20,pmecorrV);
            velec            = _mm256_mul_ps(qq20,velec);

            d                = _mm256_sub_ps(r20,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx20);
            ty               = _mm256_mul_ps(fscal,dy20);
            tz               = _mm256_mul_ps(fscal,dz20);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq21,rcutoff2))
            {

            r21              = _mm256_mul_ps(rsq21,rinv21);
            r21              = _mm256_andnot_ps(dummy_mask,r21);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq21);
            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq21,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv21,pmecorrV);
            velec            = _mm256_mul_ps(qq21,velec);

            d                = _mm256_sub_ps(r21,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx21);
            ty               = _mm256_mul_ps(fscal,dy21);
            tz               = _mm256_mul_ps(fscal,dz21);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq22,rcutoff2))
            {

            r22              = _mm256_mul_ps(rsq22,rinv22);
            r22              = _mm256_andnot_ps(dummy_mask,r22);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq22);
            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq22,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv22,pmecorrV);
            velec            = _mm256_mul_ps(qq22,velec);

            d                = _mm256_sub_ps(r22,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx22);
            ty               = _mm256_mul_ps(fscal,dy22);
            tz               = _mm256_mul_ps(fscal,dz22);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
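            /* Padding lanes had their j index clamped to 0, so without this
             * redirect they would alias atom 0's force memory. Their
             * contributions are already zeroed by dummy_mask, but routing
             * them to the local scratch buffer keeps the store away from
             * real atoms entirely.
             */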

            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);

            /* Inner loop uses 954 flops */
        }

        /* End of innermost loop */

        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
                                                 f+i_coord_offset,fshift+i_shift_offset);

        /* Increment number of inner iterations */
        inneriter                  += j_index_end - j_index_start;

        /* Outer loop uses 18 flops */
    }

    /* Increment number of outer iterations */
    outeriter        += nri;

    /* Update outer/inner flops */

    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*954);
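    /* These counts only feed mdrun's flop accounting; they do not affect
     * the computed forces.
     */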
}