made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEwSh_VdwNone_GeomW3W3_avx_256_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            None
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56      * jnr indices corresponding to data put in the eight positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrE,jnrF,jnrG,jnrH;
62     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
67     real             rcutoff_scalar;
68     real             *shiftvec,*fshift,*x,*f;
69     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
70     real             scratch[4*DIM];
71     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72     real *           vdwioffsetptr0;
73     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74     real *           vdwioffsetptr1;
75     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76     real *           vdwioffsetptr2;
77     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
94     real             *charge;
95     __m256i          ewitab;
96     __m128i          ewitab_lo,ewitab_hi;
97     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
98     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
99     real             *ewtab;
100     __m256           dummy_mask,cutoff_mask;
101     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
102     __m256           one     = _mm256_set1_ps(1.0);
103     __m256           two     = _mm256_set1_ps(2.0);
104     x                = xx[0];
105     f                = ff[0];
106
107     nri              = nlist->nri;
108     iinr             = nlist->iinr;
109     jindex           = nlist->jindex;
110     jjnr             = nlist->jjnr;
111     shiftidx         = nlist->shift;
112     gid              = nlist->gid;
113     shiftvec         = fr->shift_vec[0];
114     fshift           = fr->fshift[0];
115     facel            = _mm256_set1_ps(fr->epsfac);
116     charge           = mdatoms->chargeA;
117
118     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
119     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
120     beta2            = _mm256_mul_ps(beta,beta);
121     beta3            = _mm256_mul_ps(beta,beta2);
122
123     ewtab            = fr->ic->tabq_coul_FDV0;
124     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
125     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
126
127     /* Setup water-specific parameters */
128     inr              = nlist->iinr[0];
129     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
130     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
131     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
132
133     jq0              = _mm256_set1_ps(charge[inr+0]);
134     jq1              = _mm256_set1_ps(charge[inr+1]);
135     jq2              = _mm256_set1_ps(charge[inr+2]);
136     qq00             = _mm256_mul_ps(iq0,jq0);
137     qq01             = _mm256_mul_ps(iq0,jq1);
138     qq02             = _mm256_mul_ps(iq0,jq2);
139     qq10             = _mm256_mul_ps(iq1,jq0);
140     qq11             = _mm256_mul_ps(iq1,jq1);
141     qq12             = _mm256_mul_ps(iq1,jq2);
142     qq20             = _mm256_mul_ps(iq2,jq0);
143     qq21             = _mm256_mul_ps(iq2,jq1);
144     qq22             = _mm256_mul_ps(iq2,jq2);
145
146     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
147     rcutoff_scalar   = fr->rcoulomb;
148     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
149     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
150
151     /* Zero the j indices and coordinate offsets to avoid possibly-uninitialized compiler warnings */
152     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
153     j_coord_offsetA = 0;
154     j_coord_offsetB = 0;
155     j_coord_offsetC = 0;
156     j_coord_offsetD = 0;
157     j_coord_offsetE = 0;
158     j_coord_offsetF = 0;
159     j_coord_offsetG = 0;
160     j_coord_offsetH = 0;
161
162     outeriter        = 0;
163     inneriter        = 0;
164
165     for(iidx=0;iidx<4*DIM;iidx++)
166     {
167         scratch[iidx] = 0.0;
168     }
169
170     /* Start outer loop over neighborlists */
171     for(iidx=0; iidx<nri; iidx++)
172     {
173         /* Load shift vector for this list */
174         i_shift_offset   = DIM*shiftidx[iidx];
175
176         /* Load limits for loop over neighbors */
177         j_index_start    = jindex[iidx];
178         j_index_end      = jindex[iidx+1];
179
180         /* Get outer coordinate index */
181         inr              = iinr[iidx];
182         i_coord_offset   = DIM*inr;
183
184         /* Load i particle coords and add shift vector */
185         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
186                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
187
188         fix0             = _mm256_setzero_ps();
189         fiy0             = _mm256_setzero_ps();
190         fiz0             = _mm256_setzero_ps();
191         fix1             = _mm256_setzero_ps();
192         fiy1             = _mm256_setzero_ps();
193         fiz1             = _mm256_setzero_ps();
194         fix2             = _mm256_setzero_ps();
195         fiy2             = _mm256_setzero_ps();
196         fiz2             = _mm256_setzero_ps();
197
198         /* Reset potential sums */
199         velecsum         = _mm256_setzero_ps();
200
201         /* Start inner kernel loop */
202         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
203         {
204
205             /* Get j neighbor index, and coordinate index */
206             jnrA             = jjnr[jidx];
207             jnrB             = jjnr[jidx+1];
208             jnrC             = jjnr[jidx+2];
209             jnrD             = jjnr[jidx+3];
210             jnrE             = jjnr[jidx+4];
211             jnrF             = jjnr[jidx+5];
212             jnrG             = jjnr[jidx+6];
213             jnrH             = jjnr[jidx+7];
214             j_coord_offsetA  = DIM*jnrA;
215             j_coord_offsetB  = DIM*jnrB;
216             j_coord_offsetC  = DIM*jnrC;
217             j_coord_offsetD  = DIM*jnrD;
218             j_coord_offsetE  = DIM*jnrE;
219             j_coord_offsetF  = DIM*jnrF;
220             j_coord_offsetG  = DIM*jnrG;
221             j_coord_offsetH  = DIM*jnrH;
222
223             /* load j atom coordinates */
224             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
225                                                  x+j_coord_offsetC,x+j_coord_offsetD,
226                                                  x+j_coord_offsetE,x+j_coord_offsetF,
227                                                  x+j_coord_offsetG,x+j_coord_offsetH,
228                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
229
230             /* Calculate displacement vector */
231             dx00             = _mm256_sub_ps(ix0,jx0);
232             dy00             = _mm256_sub_ps(iy0,jy0);
233             dz00             = _mm256_sub_ps(iz0,jz0);
234             dx01             = _mm256_sub_ps(ix0,jx1);
235             dy01             = _mm256_sub_ps(iy0,jy1);
236             dz01             = _mm256_sub_ps(iz0,jz1);
237             dx02             = _mm256_sub_ps(ix0,jx2);
238             dy02             = _mm256_sub_ps(iy0,jy2);
239             dz02             = _mm256_sub_ps(iz0,jz2);
240             dx10             = _mm256_sub_ps(ix1,jx0);
241             dy10             = _mm256_sub_ps(iy1,jy0);
242             dz10             = _mm256_sub_ps(iz1,jz0);
243             dx11             = _mm256_sub_ps(ix1,jx1);
244             dy11             = _mm256_sub_ps(iy1,jy1);
245             dz11             = _mm256_sub_ps(iz1,jz1);
246             dx12             = _mm256_sub_ps(ix1,jx2);
247             dy12             = _mm256_sub_ps(iy1,jy2);
248             dz12             = _mm256_sub_ps(iz1,jz2);
249             dx20             = _mm256_sub_ps(ix2,jx0);
250             dy20             = _mm256_sub_ps(iy2,jy0);
251             dz20             = _mm256_sub_ps(iz2,jz0);
252             dx21             = _mm256_sub_ps(ix2,jx1);
253             dy21             = _mm256_sub_ps(iy2,jy1);
254             dz21             = _mm256_sub_ps(iz2,jz1);
255             dx22             = _mm256_sub_ps(ix2,jx2);
256             dy22             = _mm256_sub_ps(iy2,jy2);
257             dz22             = _mm256_sub_ps(iz2,jz2);
258
259             /* Calculate squared distance and things based on it */
260             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
261             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
262             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
263             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
264             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
265             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
266             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
267             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
268             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
269
270             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
271             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
272             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
273             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
274             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
275             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
276             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
277             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
278             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
279
280             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
281             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
282             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
283             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
284             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
285             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
286             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
287             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
288             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
289
290             fjx0             = _mm256_setzero_ps();
291             fjy0             = _mm256_setzero_ps();
292             fjz0             = _mm256_setzero_ps();
293             fjx1             = _mm256_setzero_ps();
294             fjy1             = _mm256_setzero_ps();
295             fjz1             = _mm256_setzero_ps();
296             fjx2             = _mm256_setzero_ps();
297             fjy2             = _mm256_setzero_ps();
298             fjz2             = _mm256_setzero_ps();
299
300             /**************************
301              * CALCULATE INTERACTIONS *
302              **************************/
303
304             if (gmx_mm256_any_lt(rsq00,rcutoff2))
305             {
306
307             r00              = _mm256_mul_ps(rsq00,rinv00);
308
309             /* EWALD ELECTROSTATICS */
310             
311             /* Analytical PME correction */
312             zeta2            = _mm256_mul_ps(beta2,rsq00);
313             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
314             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
315             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
316             felec            = _mm256_mul_ps(qq00,felec);
317             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
318             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
319             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
320             velec            = _mm256_mul_ps(qq00,velec);
321             
322             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
323
324             /* Update potential sum for this i atom from the interaction with this j atom. */
325             velec            = _mm256_and_ps(velec,cutoff_mask);
326             velecsum         = _mm256_add_ps(velecsum,velec);
327
328             fscal            = felec;
329
330             fscal            = _mm256_and_ps(fscal,cutoff_mask);
331
332             /* Calculate temporary vectorial force */
333             tx               = _mm256_mul_ps(fscal,dx00);
334             ty               = _mm256_mul_ps(fscal,dy00);
335             tz               = _mm256_mul_ps(fscal,dz00);
336
337             /* Update vectorial force */
338             fix0             = _mm256_add_ps(fix0,tx);
339             fiy0             = _mm256_add_ps(fiy0,ty);
340             fiz0             = _mm256_add_ps(fiz0,tz);
341
342             fjx0             = _mm256_add_ps(fjx0,tx);
343             fjy0             = _mm256_add_ps(fjy0,ty);
344             fjz0             = _mm256_add_ps(fjz0,tz);
345
346             }
347
348             /**************************
349              * CALCULATE INTERACTIONS *
350              **************************/
351
352             if (gmx_mm256_any_lt(rsq01,rcutoff2))
353             {
354
355             r01              = _mm256_mul_ps(rsq01,rinv01);
356
357             /* EWALD ELECTROSTATICS */
358             
359             /* Analytical PME correction */
360             zeta2            = _mm256_mul_ps(beta2,rsq01);
361             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
362             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
363             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
364             felec            = _mm256_mul_ps(qq01,felec);
365             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
366             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
367             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv01,sh_ewald),pmecorrV);
368             velec            = _mm256_mul_ps(qq01,velec);
369             
370             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
371
372             /* Update potential sum for this i atom from the interaction with this j atom. */
373             velec            = _mm256_and_ps(velec,cutoff_mask);
374             velecsum         = _mm256_add_ps(velecsum,velec);
375
376             fscal            = felec;
377
378             fscal            = _mm256_and_ps(fscal,cutoff_mask);
379
380             /* Calculate temporary vectorial force */
381             tx               = _mm256_mul_ps(fscal,dx01);
382             ty               = _mm256_mul_ps(fscal,dy01);
383             tz               = _mm256_mul_ps(fscal,dz01);
384
385             /* Update vectorial force */
386             fix0             = _mm256_add_ps(fix0,tx);
387             fiy0             = _mm256_add_ps(fiy0,ty);
388             fiz0             = _mm256_add_ps(fiz0,tz);
389
390             fjx1             = _mm256_add_ps(fjx1,tx);
391             fjy1             = _mm256_add_ps(fjy1,ty);
392             fjz1             = _mm256_add_ps(fjz1,tz);
393
394             }
395
396             /**************************
397              * CALCULATE INTERACTIONS *
398              **************************/
399
400             if (gmx_mm256_any_lt(rsq02,rcutoff2))
401             {
402
403             r02              = _mm256_mul_ps(rsq02,rinv02);
404
405             /* EWALD ELECTROSTATICS */
406             
407             /* Analytical PME correction */
408             zeta2            = _mm256_mul_ps(beta2,rsq02);
409             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
410             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
411             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
412             felec            = _mm256_mul_ps(qq02,felec);
413             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
414             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
415             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv02,sh_ewald),pmecorrV);
416             velec            = _mm256_mul_ps(qq02,velec);
417             
418             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
419
420             /* Update potential sum for this i atom from the interaction with this j atom. */
421             velec            = _mm256_and_ps(velec,cutoff_mask);
422             velecsum         = _mm256_add_ps(velecsum,velec);
423
424             fscal            = felec;
425
426             fscal            = _mm256_and_ps(fscal,cutoff_mask);
427
428             /* Calculate temporary vectorial force */
429             tx               = _mm256_mul_ps(fscal,dx02);
430             ty               = _mm256_mul_ps(fscal,dy02);
431             tz               = _mm256_mul_ps(fscal,dz02);
432
433             /* Update vectorial force */
434             fix0             = _mm256_add_ps(fix0,tx);
435             fiy0             = _mm256_add_ps(fiy0,ty);
436             fiz0             = _mm256_add_ps(fiz0,tz);
437
438             fjx2             = _mm256_add_ps(fjx2,tx);
439             fjy2             = _mm256_add_ps(fjy2,ty);
440             fjz2             = _mm256_add_ps(fjz2,tz);
441
442             }
443
444             /**************************
445              * CALCULATE INTERACTIONS *
446              **************************/
447
448             if (gmx_mm256_any_lt(rsq10,rcutoff2))
449             {
450
451             r10              = _mm256_mul_ps(rsq10,rinv10);
452
453             /* EWALD ELECTROSTATICS */
454             
455             /* Analytical PME correction */
456             zeta2            = _mm256_mul_ps(beta2,rsq10);
457             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
458             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
459             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
460             felec            = _mm256_mul_ps(qq10,felec);
461             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
462             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
463             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
464             velec            = _mm256_mul_ps(qq10,velec);
465             
466             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
467
468             /* Update potential sum for this i atom from the interaction with this j atom. */
469             velec            = _mm256_and_ps(velec,cutoff_mask);
470             velecsum         = _mm256_add_ps(velecsum,velec);
471
472             fscal            = felec;
473
474             fscal            = _mm256_and_ps(fscal,cutoff_mask);
475
476             /* Calculate temporary vectorial force */
477             tx               = _mm256_mul_ps(fscal,dx10);
478             ty               = _mm256_mul_ps(fscal,dy10);
479             tz               = _mm256_mul_ps(fscal,dz10);
480
481             /* Update vectorial force */
482             fix1             = _mm256_add_ps(fix1,tx);
483             fiy1             = _mm256_add_ps(fiy1,ty);
484             fiz1             = _mm256_add_ps(fiz1,tz);
485
486             fjx0             = _mm256_add_ps(fjx0,tx);
487             fjy0             = _mm256_add_ps(fjy0,ty);
488             fjz0             = _mm256_add_ps(fjz0,tz);
489
490             }
491
492             /**************************
493              * CALCULATE INTERACTIONS *
494              **************************/
495
496             if (gmx_mm256_any_lt(rsq11,rcutoff2))
497             {
498
499             r11              = _mm256_mul_ps(rsq11,rinv11);
500
501             /* EWALD ELECTROSTATICS */
502             
503             /* Analytical PME correction */
504             zeta2            = _mm256_mul_ps(beta2,rsq11);
505             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
506             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
507             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
508             felec            = _mm256_mul_ps(qq11,felec);
509             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
510             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
511             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
512             velec            = _mm256_mul_ps(qq11,velec);
513             
514             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
515
516             /* Update potential sum for this i atom from the interaction with this j atom. */
517             velec            = _mm256_and_ps(velec,cutoff_mask);
518             velecsum         = _mm256_add_ps(velecsum,velec);
519
520             fscal            = felec;
521
522             fscal            = _mm256_and_ps(fscal,cutoff_mask);
523
524             /* Calculate temporary vectorial force */
525             tx               = _mm256_mul_ps(fscal,dx11);
526             ty               = _mm256_mul_ps(fscal,dy11);
527             tz               = _mm256_mul_ps(fscal,dz11);
528
529             /* Update vectorial force */
530             fix1             = _mm256_add_ps(fix1,tx);
531             fiy1             = _mm256_add_ps(fiy1,ty);
532             fiz1             = _mm256_add_ps(fiz1,tz);
533
534             fjx1             = _mm256_add_ps(fjx1,tx);
535             fjy1             = _mm256_add_ps(fjy1,ty);
536             fjz1             = _mm256_add_ps(fjz1,tz);
537
538             }
539
540             /**************************
541              * CALCULATE INTERACTIONS *
542              **************************/
543
544             if (gmx_mm256_any_lt(rsq12,rcutoff2))
545             {
546
547             r12              = _mm256_mul_ps(rsq12,rinv12);
548
549             /* EWALD ELECTROSTATICS */
550             
551             /* Analytical PME correction */
552             zeta2            = _mm256_mul_ps(beta2,rsq12);
553             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
554             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
555             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
556             felec            = _mm256_mul_ps(qq12,felec);
557             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
558             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
559             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
560             velec            = _mm256_mul_ps(qq12,velec);
561             
562             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
563
564             /* Update potential sum for this i atom from the interaction with this j atom. */
565             velec            = _mm256_and_ps(velec,cutoff_mask);
566             velecsum         = _mm256_add_ps(velecsum,velec);
567
568             fscal            = felec;
569
570             fscal            = _mm256_and_ps(fscal,cutoff_mask);
571
572             /* Calculate temporary vectorial force */
573             tx               = _mm256_mul_ps(fscal,dx12);
574             ty               = _mm256_mul_ps(fscal,dy12);
575             tz               = _mm256_mul_ps(fscal,dz12);
576
577             /* Update vectorial force */
578             fix1             = _mm256_add_ps(fix1,tx);
579             fiy1             = _mm256_add_ps(fiy1,ty);
580             fiz1             = _mm256_add_ps(fiz1,tz);
581
582             fjx2             = _mm256_add_ps(fjx2,tx);
583             fjy2             = _mm256_add_ps(fjy2,ty);
584             fjz2             = _mm256_add_ps(fjz2,tz);
585
586             }
587
588             /**************************
589              * CALCULATE INTERACTIONS *
590              **************************/
591
592             if (gmx_mm256_any_lt(rsq20,rcutoff2))
593             {
594
595             r20              = _mm256_mul_ps(rsq20,rinv20);
596
597             /* EWALD ELECTROSTATICS */
598             
599             /* Analytical PME correction */
600             zeta2            = _mm256_mul_ps(beta2,rsq20);
601             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
602             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
603             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
604             felec            = _mm256_mul_ps(qq20,felec);
605             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
606             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
607             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
608             velec            = _mm256_mul_ps(qq20,velec);
609             
610             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
611
612             /* Update potential sum for this i atom from the interaction with this j atom. */
613             velec            = _mm256_and_ps(velec,cutoff_mask);
614             velecsum         = _mm256_add_ps(velecsum,velec);
615
616             fscal            = felec;
617
618             fscal            = _mm256_and_ps(fscal,cutoff_mask);
619
620             /* Calculate temporary vectorial force */
621             tx               = _mm256_mul_ps(fscal,dx20);
622             ty               = _mm256_mul_ps(fscal,dy20);
623             tz               = _mm256_mul_ps(fscal,dz20);
624
625             /* Update vectorial force */
626             fix2             = _mm256_add_ps(fix2,tx);
627             fiy2             = _mm256_add_ps(fiy2,ty);
628             fiz2             = _mm256_add_ps(fiz2,tz);
629
630             fjx0             = _mm256_add_ps(fjx0,tx);
631             fjy0             = _mm256_add_ps(fjy0,ty);
632             fjz0             = _mm256_add_ps(fjz0,tz);
633
634             }
635
636             /**************************
637              * CALCULATE INTERACTIONS *
638              **************************/
639
640             if (gmx_mm256_any_lt(rsq21,rcutoff2))
641             {
642
643             r21              = _mm256_mul_ps(rsq21,rinv21);
644
645             /* EWALD ELECTROSTATICS */
646             
647             /* Analytical PME correction */
648             zeta2            = _mm256_mul_ps(beta2,rsq21);
649             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
650             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
651             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
652             felec            = _mm256_mul_ps(qq21,felec);
653             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
654             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
655             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
656             velec            = _mm256_mul_ps(qq21,velec);
657             
658             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
659
660             /* Update potential sum for this i atom from the interaction with this j atom. */
661             velec            = _mm256_and_ps(velec,cutoff_mask);
662             velecsum         = _mm256_add_ps(velecsum,velec);
663
664             fscal            = felec;
665
666             fscal            = _mm256_and_ps(fscal,cutoff_mask);
667
668             /* Calculate temporary vectorial force */
669             tx               = _mm256_mul_ps(fscal,dx21);
670             ty               = _mm256_mul_ps(fscal,dy21);
671             tz               = _mm256_mul_ps(fscal,dz21);
672
673             /* Update vectorial force */
674             fix2             = _mm256_add_ps(fix2,tx);
675             fiy2             = _mm256_add_ps(fiy2,ty);
676             fiz2             = _mm256_add_ps(fiz2,tz);
677
678             fjx1             = _mm256_add_ps(fjx1,tx);
679             fjy1             = _mm256_add_ps(fjy1,ty);
680             fjz1             = _mm256_add_ps(fjz1,tz);
681
682             }
683
684             /**************************
685              * CALCULATE INTERACTIONS *
686              **************************/
687
688             if (gmx_mm256_any_lt(rsq22,rcutoff2))
689             {
690
691             r22              = _mm256_mul_ps(rsq22,rinv22);
692
693             /* EWALD ELECTROSTATICS */
694             
695             /* Analytical PME correction */
696             zeta2            = _mm256_mul_ps(beta2,rsq22);
697             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
698             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
699             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
700             felec            = _mm256_mul_ps(qq22,felec);
701             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
702             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
703             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
704             velec            = _mm256_mul_ps(qq22,velec);
705             
706             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
707
708             /* Update potential sum for this i atom from the interaction with this j atom. */
709             velec            = _mm256_and_ps(velec,cutoff_mask);
710             velecsum         = _mm256_add_ps(velecsum,velec);
711
712             fscal            = felec;
713
714             fscal            = _mm256_and_ps(fscal,cutoff_mask);
715
716             /* Calculate temporary vectorial force */
717             tx               = _mm256_mul_ps(fscal,dx22);
718             ty               = _mm256_mul_ps(fscal,dy22);
719             tz               = _mm256_mul_ps(fscal,dz22);
720
721             /* Update vectorial force */
722             fix2             = _mm256_add_ps(fix2,tx);
723             fiy2             = _mm256_add_ps(fiy2,ty);
724             fiz2             = _mm256_add_ps(fiz2,tz);
725
726             fjx2             = _mm256_add_ps(fjx2,tx);
727             fjy2             = _mm256_add_ps(fjy2,ty);
728             fjz2             = _mm256_add_ps(fjz2,tz);
729
730             }
731
732             fjptrA             = f+j_coord_offsetA;
733             fjptrB             = f+j_coord_offsetB;
734             fjptrC             = f+j_coord_offsetC;
735             fjptrD             = f+j_coord_offsetD;
736             fjptrE             = f+j_coord_offsetE;
737             fjptrF             = f+j_coord_offsetF;
738             fjptrG             = f+j_coord_offsetG;
739             fjptrH             = f+j_coord_offsetH;
740
741             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
742                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
743
744             /* Inner loop uses 981 flops */
745         }
746
747         if(jidx<j_index_end)
748         {
749
750             /* Get j neighbor index, and coordinate index */
751             jnrlistA         = jjnr[jidx];
752             jnrlistB         = jjnr[jidx+1];
753             jnrlistC         = jjnr[jidx+2];
754             jnrlistD         = jjnr[jidx+3];
755             jnrlistE         = jjnr[jidx+4];
756             jnrlistF         = jjnr[jidx+5];
757             jnrlistG         = jjnr[jidx+6];
758             jnrlistH         = jjnr[jidx+7];
759             /* Sign of each element will be negative for non-real atoms.
760              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
761              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
762              */
763             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
764                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
765                                             
766             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
767             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
768             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
769             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
770             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
771             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
772             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
773             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
774             j_coord_offsetA  = DIM*jnrA;
775             j_coord_offsetB  = DIM*jnrB;
776             j_coord_offsetC  = DIM*jnrC;
777             j_coord_offsetD  = DIM*jnrD;
778             j_coord_offsetE  = DIM*jnrE;
779             j_coord_offsetF  = DIM*jnrF;
780             j_coord_offsetG  = DIM*jnrG;
781             j_coord_offsetH  = DIM*jnrH;
782
783             /* load j atom coordinates */
784             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
785                                                  x+j_coord_offsetC,x+j_coord_offsetD,
786                                                  x+j_coord_offsetE,x+j_coord_offsetF,
787                                                  x+j_coord_offsetG,x+j_coord_offsetH,
788                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
789
790             /* Calculate displacement vector */
791             dx00             = _mm256_sub_ps(ix0,jx0);
792             dy00             = _mm256_sub_ps(iy0,jy0);
793             dz00             = _mm256_sub_ps(iz0,jz0);
794             dx01             = _mm256_sub_ps(ix0,jx1);
795             dy01             = _mm256_sub_ps(iy0,jy1);
796             dz01             = _mm256_sub_ps(iz0,jz1);
797             dx02             = _mm256_sub_ps(ix0,jx2);
798             dy02             = _mm256_sub_ps(iy0,jy2);
799             dz02             = _mm256_sub_ps(iz0,jz2);
800             dx10             = _mm256_sub_ps(ix1,jx0);
801             dy10             = _mm256_sub_ps(iy1,jy0);
802             dz10             = _mm256_sub_ps(iz1,jz0);
803             dx11             = _mm256_sub_ps(ix1,jx1);
804             dy11             = _mm256_sub_ps(iy1,jy1);
805             dz11             = _mm256_sub_ps(iz1,jz1);
806             dx12             = _mm256_sub_ps(ix1,jx2);
807             dy12             = _mm256_sub_ps(iy1,jy2);
808             dz12             = _mm256_sub_ps(iz1,jz2);
809             dx20             = _mm256_sub_ps(ix2,jx0);
810             dy20             = _mm256_sub_ps(iy2,jy0);
811             dz20             = _mm256_sub_ps(iz2,jz0);
812             dx21             = _mm256_sub_ps(ix2,jx1);
813             dy21             = _mm256_sub_ps(iy2,jy1);
814             dz21             = _mm256_sub_ps(iz2,jz1);
815             dx22             = _mm256_sub_ps(ix2,jx2);
816             dy22             = _mm256_sub_ps(iy2,jy2);
817             dz22             = _mm256_sub_ps(iz2,jz2);
818
819             /* Calculate squared distance and things based on it */
820             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
821             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
822             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
823             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
824             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
825             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
826             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
827             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
828             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
829
830             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
831             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
832             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
833             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
834             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
835             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
836             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
837             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
838             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
839
840             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
841             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
842             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
843             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
844             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
845             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
846             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
847             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
848             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
849
850             fjx0             = _mm256_setzero_ps();
851             fjy0             = _mm256_setzero_ps();
852             fjz0             = _mm256_setzero_ps();
853             fjx1             = _mm256_setzero_ps();
854             fjy1             = _mm256_setzero_ps();
855             fjz1             = _mm256_setzero_ps();
856             fjx2             = _mm256_setzero_ps();
857             fjy2             = _mm256_setzero_ps();
858             fjz2             = _mm256_setzero_ps();
859
860             /**************************
861              * CALCULATE INTERACTIONS *
862              **************************/
863
864             if (gmx_mm256_any_lt(rsq00,rcutoff2))
865             {
866
867             r00              = _mm256_mul_ps(rsq00,rinv00);
868             r00              = _mm256_andnot_ps(dummy_mask,r00);
869
870             /* EWALD ELECTROSTATICS */
871             
872             /* Analytical PME correction */
873             zeta2            = _mm256_mul_ps(beta2,rsq00);
874             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
875             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
876             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
877             felec            = _mm256_mul_ps(qq00,felec);
878             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
879             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
880             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
881             velec            = _mm256_mul_ps(qq00,velec);
882             
883             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
884
885             /* Update potential sum for this i atom from the interaction with this j atom. */
886             velec            = _mm256_and_ps(velec,cutoff_mask);
887             velec            = _mm256_andnot_ps(dummy_mask,velec);
888             velecsum         = _mm256_add_ps(velecsum,velec);
889
890             fscal            = felec;
891
892             fscal            = _mm256_and_ps(fscal,cutoff_mask);
893
894             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
895
896             /* Calculate temporary vectorial force */
897             tx               = _mm256_mul_ps(fscal,dx00);
898             ty               = _mm256_mul_ps(fscal,dy00);
899             tz               = _mm256_mul_ps(fscal,dz00);
900
901             /* Update vectorial force */
902             fix0             = _mm256_add_ps(fix0,tx);
903             fiy0             = _mm256_add_ps(fiy0,ty);
904             fiz0             = _mm256_add_ps(fiz0,tz);
905
906             fjx0             = _mm256_add_ps(fjx0,tx);
907             fjy0             = _mm256_add_ps(fjy0,ty);
908             fjz0             = _mm256_add_ps(fjz0,tz);
909
910             }
911
912             /**************************
913              * CALCULATE INTERACTIONS *
914              **************************/
915
916             if (gmx_mm256_any_lt(rsq01,rcutoff2))
917             {
918
919             r01              = _mm256_mul_ps(rsq01,rinv01);
920             r01              = _mm256_andnot_ps(dummy_mask,r01);
921
922             /* EWALD ELECTROSTATICS */
923             
924             /* Analytical PME correction */
925             zeta2            = _mm256_mul_ps(beta2,rsq01);
926             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
927             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
928             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
929             felec            = _mm256_mul_ps(qq01,felec);
930             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
931             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
932             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv01,sh_ewald),pmecorrV);
933             velec            = _mm256_mul_ps(qq01,velec);
934             
935             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
936
937             /* Update potential sum for this i atom from the interaction with this j atom. */
938             velec            = _mm256_and_ps(velec,cutoff_mask);
939             velec            = _mm256_andnot_ps(dummy_mask,velec);
940             velecsum         = _mm256_add_ps(velecsum,velec);
941
942             fscal            = felec;
943
944             fscal            = _mm256_and_ps(fscal,cutoff_mask);
945
946             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
947
948             /* Calculate temporary vectorial force */
949             tx               = _mm256_mul_ps(fscal,dx01);
950             ty               = _mm256_mul_ps(fscal,dy01);
951             tz               = _mm256_mul_ps(fscal,dz01);
952
953             /* Update vectorial force */
954             fix0             = _mm256_add_ps(fix0,tx);
955             fiy0             = _mm256_add_ps(fiy0,ty);
956             fiz0             = _mm256_add_ps(fiz0,tz);
957
958             fjx1             = _mm256_add_ps(fjx1,tx);
959             fjy1             = _mm256_add_ps(fjy1,ty);
960             fjz1             = _mm256_add_ps(fjz1,tz);
961
962             }
963
964             /**************************
965              * CALCULATE INTERACTIONS *
966              **************************/
967
968             if (gmx_mm256_any_lt(rsq02,rcutoff2))
969             {
970
971             r02              = _mm256_mul_ps(rsq02,rinv02);
972             r02              = _mm256_andnot_ps(dummy_mask,r02);
973
974             /* EWALD ELECTROSTATICS */
975             
976             /* Analytical PME correction */
977             zeta2            = _mm256_mul_ps(beta2,rsq02);
978             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
979             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
980             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
981             felec            = _mm256_mul_ps(qq02,felec);
982             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
983             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
984             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv02,sh_ewald),pmecorrV);
985             velec            = _mm256_mul_ps(qq02,velec);
986             
987             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
988
989             /* Update potential sum for this i atom from the interaction with this j atom. */
990             velec            = _mm256_and_ps(velec,cutoff_mask);
991             velec            = _mm256_andnot_ps(dummy_mask,velec);
992             velecsum         = _mm256_add_ps(velecsum,velec);
993
994             fscal            = felec;
995
996             fscal            = _mm256_and_ps(fscal,cutoff_mask);
997
998             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
999
1000             /* Calculate temporary vectorial force */
1001             tx               = _mm256_mul_ps(fscal,dx02);
1002             ty               = _mm256_mul_ps(fscal,dy02);
1003             tz               = _mm256_mul_ps(fscal,dz02);
1004
1005             /* Update vectorial force */
1006             fix0             = _mm256_add_ps(fix0,tx);
1007             fiy0             = _mm256_add_ps(fiy0,ty);
1008             fiz0             = _mm256_add_ps(fiz0,tz);
1009
1010             fjx2             = _mm256_add_ps(fjx2,tx);
1011             fjy2             = _mm256_add_ps(fjy2,ty);
1012             fjz2             = _mm256_add_ps(fjz2,tz);
1013
1014             }
1015
1016             /**************************
1017              * CALCULATE INTERACTIONS *
1018              **************************/
1019
1020             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1021             {
1022
1023             r10              = _mm256_mul_ps(rsq10,rinv10);
1024             r10              = _mm256_andnot_ps(dummy_mask,r10);
1025
1026             /* EWALD ELECTROSTATICS */
1027             
1028             /* Analytical PME correction */
1029             zeta2            = _mm256_mul_ps(beta2,rsq10);
1030             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1031             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1032             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1033             felec            = _mm256_mul_ps(qq10,felec);
1034             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1035             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1036             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
1037             velec            = _mm256_mul_ps(qq10,velec);
1038             
1039             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1040
1041             /* Update potential sum for this i atom from the interaction with this j atom. */
1042             velec            = _mm256_and_ps(velec,cutoff_mask);
1043             velec            = _mm256_andnot_ps(dummy_mask,velec);
1044             velecsum         = _mm256_add_ps(velecsum,velec);
1045
1046             fscal            = felec;
1047
1048             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1049
1050             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1051
1052             /* Calculate temporary vectorial force */
1053             tx               = _mm256_mul_ps(fscal,dx10);
1054             ty               = _mm256_mul_ps(fscal,dy10);
1055             tz               = _mm256_mul_ps(fscal,dz10);
1056
1057             /* Update vectorial force */
1058             fix1             = _mm256_add_ps(fix1,tx);
1059             fiy1             = _mm256_add_ps(fiy1,ty);
1060             fiz1             = _mm256_add_ps(fiz1,tz);
1061
1062             fjx0             = _mm256_add_ps(fjx0,tx);
1063             fjy0             = _mm256_add_ps(fjy0,ty);
1064             fjz0             = _mm256_add_ps(fjz0,tz);
1065
1066             }
1067
1068             /**************************
1069              * CALCULATE INTERACTIONS *
1070              **************************/
1071
1072             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1073             {
1074
1075             r11              = _mm256_mul_ps(rsq11,rinv11);
1076             r11              = _mm256_andnot_ps(dummy_mask,r11);
1077
1078             /* EWALD ELECTROSTATICS */
1079             
1080             /* Analytical PME correction */
1081             zeta2            = _mm256_mul_ps(beta2,rsq11);
1082             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1083             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1084             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1085             felec            = _mm256_mul_ps(qq11,felec);
1086             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1087             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1088             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
1089             velec            = _mm256_mul_ps(qq11,velec);
1090             
1091             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1092
1093             /* Update potential sum for this i atom from the interaction with this j atom. */
1094             velec            = _mm256_and_ps(velec,cutoff_mask);
1095             velec            = _mm256_andnot_ps(dummy_mask,velec);
1096             velecsum         = _mm256_add_ps(velecsum,velec);
1097
1098             fscal            = felec;
1099
1100             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1101
1102             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1103
1104             /* Calculate temporary vectorial force */
1105             tx               = _mm256_mul_ps(fscal,dx11);
1106             ty               = _mm256_mul_ps(fscal,dy11);
1107             tz               = _mm256_mul_ps(fscal,dz11);
1108
1109             /* Update vectorial force */
1110             fix1             = _mm256_add_ps(fix1,tx);
1111             fiy1             = _mm256_add_ps(fiy1,ty);
1112             fiz1             = _mm256_add_ps(fiz1,tz);
1113
1114             fjx1             = _mm256_add_ps(fjx1,tx);
1115             fjy1             = _mm256_add_ps(fjy1,ty);
1116             fjz1             = _mm256_add_ps(fjz1,tz);
1117
1118             }
1119
1120             /**************************
1121              * CALCULATE INTERACTIONS *
1122              **************************/
1123
1124             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1125             {
1126
1127             r12              = _mm256_mul_ps(rsq12,rinv12);
1128             r12              = _mm256_andnot_ps(dummy_mask,r12);
1129
1130             /* EWALD ELECTROSTATICS */
1131             
1132             /* Analytical PME correction */
1133             zeta2            = _mm256_mul_ps(beta2,rsq12);
1134             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1135             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1136             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1137             felec            = _mm256_mul_ps(qq12,felec);
1138             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1139             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1140             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
1141             velec            = _mm256_mul_ps(qq12,velec);
1142             
1143             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1144
1145             /* Update potential sum for this i atom from the interaction with this j atom. */
1146             velec            = _mm256_and_ps(velec,cutoff_mask);
1147             velec            = _mm256_andnot_ps(dummy_mask,velec);
1148             velecsum         = _mm256_add_ps(velecsum,velec);
1149
1150             fscal            = felec;
1151
1152             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1153
1154             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1155
1156             /* Calculate temporary vectorial force */
1157             tx               = _mm256_mul_ps(fscal,dx12);
1158             ty               = _mm256_mul_ps(fscal,dy12);
1159             tz               = _mm256_mul_ps(fscal,dz12);
1160
1161             /* Update vectorial force */
1162             fix1             = _mm256_add_ps(fix1,tx);
1163             fiy1             = _mm256_add_ps(fiy1,ty);
1164             fiz1             = _mm256_add_ps(fiz1,tz);
1165
1166             fjx2             = _mm256_add_ps(fjx2,tx);
1167             fjy2             = _mm256_add_ps(fjy2,ty);
1168             fjz2             = _mm256_add_ps(fjz2,tz);
1169
1170             }
1171
1172             /**************************
1173              * CALCULATE INTERACTIONS *
1174              **************************/
1175
1176             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1177             {
1178
1179             r20              = _mm256_mul_ps(rsq20,rinv20);
1180             r20              = _mm256_andnot_ps(dummy_mask,r20);
1181
1182             /* EWALD ELECTROSTATICS */
1183             
1184             /* Analytical PME correction */
1185             zeta2            = _mm256_mul_ps(beta2,rsq20);
1186             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1187             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1188             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1189             felec            = _mm256_mul_ps(qq20,felec);
1190             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1191             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1192             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
1193             velec            = _mm256_mul_ps(qq20,velec);
1194             
1195             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1196
1197             /* Update potential sum for this i atom from the interaction with this j atom. */
1198             velec            = _mm256_and_ps(velec,cutoff_mask);
1199             velec            = _mm256_andnot_ps(dummy_mask,velec);
1200             velecsum         = _mm256_add_ps(velecsum,velec);
1201
1202             fscal            = felec;
1203
1204             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1205
1206             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1207
1208             /* Calculate temporary vectorial force */
1209             tx               = _mm256_mul_ps(fscal,dx20);
1210             ty               = _mm256_mul_ps(fscal,dy20);
1211             tz               = _mm256_mul_ps(fscal,dz20);
1212
1213             /* Update vectorial force */
1214             fix2             = _mm256_add_ps(fix2,tx);
1215             fiy2             = _mm256_add_ps(fiy2,ty);
1216             fiz2             = _mm256_add_ps(fiz2,tz);
1217
1218             fjx0             = _mm256_add_ps(fjx0,tx);
1219             fjy0             = _mm256_add_ps(fjy0,ty);
1220             fjz0             = _mm256_add_ps(fjz0,tz);
1221
1222             }
1223
1224             /**************************
1225              * CALCULATE INTERACTIONS *
1226              **************************/
1227
1228             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1229             {
1230
1231             r21              = _mm256_mul_ps(rsq21,rinv21);
1232             r21              = _mm256_andnot_ps(dummy_mask,r21);
1233
1234             /* EWALD ELECTROSTATICS */
1235             
1236             /* Analytical PME correction */
1237             zeta2            = _mm256_mul_ps(beta2,rsq21);
1238             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1239             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1240             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1241             felec            = _mm256_mul_ps(qq21,felec);
1242             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1243             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1244             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
1245             velec            = _mm256_mul_ps(qq21,velec);
1246             
1247             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1248
1249             /* Update potential sum for this i atom from the interaction with this j atom. */
1250             velec            = _mm256_and_ps(velec,cutoff_mask);
1251             velec            = _mm256_andnot_ps(dummy_mask,velec);
1252             velecsum         = _mm256_add_ps(velecsum,velec);
1253
1254             fscal            = felec;
1255
1256             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1257
1258             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1259
1260             /* Calculate temporary vectorial force */
1261             tx               = _mm256_mul_ps(fscal,dx21);
1262             ty               = _mm256_mul_ps(fscal,dy21);
1263             tz               = _mm256_mul_ps(fscal,dz21);
1264
1265             /* Update vectorial force */
1266             fix2             = _mm256_add_ps(fix2,tx);
1267             fiy2             = _mm256_add_ps(fiy2,ty);
1268             fiz2             = _mm256_add_ps(fiz2,tz);
1269
1270             fjx1             = _mm256_add_ps(fjx1,tx);
1271             fjy1             = _mm256_add_ps(fjy1,ty);
1272             fjz1             = _mm256_add_ps(fjz1,tz);
1273
1274             }
1275
1276             /**************************
1277              * CALCULATE INTERACTIONS *
1278              **************************/
1279
1280             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1281             {
1282
1283             r22              = _mm256_mul_ps(rsq22,rinv22);
1284             r22              = _mm256_andnot_ps(dummy_mask,r22);
1285
1286             /* EWALD ELECTROSTATICS */
1287             
1288             /* Analytical PME correction */
1289             zeta2            = _mm256_mul_ps(beta2,rsq22);
1290             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1291             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1292             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1293             felec            = _mm256_mul_ps(qq22,felec);
1294             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1295             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1296             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
1297             velec            = _mm256_mul_ps(qq22,velec);
1298             
1299             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1300
1301             /* Update potential sum for this i atom from the interaction with this j atom. */
1302             velec            = _mm256_and_ps(velec,cutoff_mask);
1303             velec            = _mm256_andnot_ps(dummy_mask,velec);
1304             velecsum         = _mm256_add_ps(velecsum,velec);
1305
1306             fscal            = felec;
1307
1308             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1309
1310             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1311
1312             /* Calculate temporary vectorial force */
1313             tx               = _mm256_mul_ps(fscal,dx22);
1314             ty               = _mm256_mul_ps(fscal,dy22);
1315             tz               = _mm256_mul_ps(fscal,dz22);
1316
1317             /* Update vectorial force */
1318             fix2             = _mm256_add_ps(fix2,tx);
1319             fiy2             = _mm256_add_ps(fiy2,ty);
1320             fiz2             = _mm256_add_ps(fiz2,tz);
1321
1322             fjx2             = _mm256_add_ps(fjx2,tx);
1323             fjy2             = _mm256_add_ps(fjy2,ty);
1324             fjz2             = _mm256_add_ps(fjz2,tz);
1325
1326             }
1327
1328             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1329             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1330             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1331             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1332             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1333             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1334             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1335             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1336
1337             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1338                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1339
1340             /* Inner loop uses 990 flops */
1341         }
1342
1343         /* End of innermost loop */
1344
1345         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1346                                                  f+i_coord_offset,fshift+i_shift_offset);
1347
1348         ggid                        = gid[iidx];
1349         /* Update potential energies */
1350         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1351
1352         /* Increment number of inner iterations */
1353         inneriter                  += j_index_end - j_index_start;
1354
1355         /* Outer loop uses 19 flops */
1356     }
1357
1358     /* Increment number of outer iterations */
1359     outeriter        += nri;
1360
1361     /* Update outer/inner flops */
1362
1363     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*990);
1364 }
1365 /*
1366  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_single
1367  * Electrostatics interaction: Ewald
1368  * VdW interaction:            None
1369  * Geometry:                   Water3-Water3
1370  * Calculate force/pot:        Force
1371  */
1372 void
1373 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_single
1374                     (t_nblist * gmx_restrict                nlist,
1375                      rvec * gmx_restrict                    xx,
1376                      rvec * gmx_restrict                    ff,
1377                      t_forcerec * gmx_restrict              fr,
1378                      t_mdatoms * gmx_restrict               mdatoms,
1379                      nb_kernel_data_t * gmx_restrict        kernel_data,
1380                      t_nrnb * gmx_restrict                  nrnb)
1381 {
1382     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1383      * just 0 for non-waters.
1384      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1385      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1386      */
1387     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1388     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1389     int              jnrA,jnrB,jnrC,jnrD;
1390     int              jnrE,jnrF,jnrG,jnrH;
1391     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1392     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1393     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1394     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1395     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1396     real             rcutoff_scalar;
1397     real             *shiftvec,*fshift,*x,*f;
1398     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1399     real             scratch[4*DIM];
1400     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1401     real *           vdwioffsetptr0;
1402     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1403     real *           vdwioffsetptr1;
1404     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1405     real *           vdwioffsetptr2;
1406     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1407     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1408     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1409     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1410     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1411     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1412     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1413     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1414     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1415     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1416     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1417     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1418     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1419     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1420     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1421     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1422     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1423     real             *charge;
1424     __m256i          ewitab;
1425     __m128i          ewitab_lo,ewitab_hi;
1426     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1427     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1428     real             *ewtab;
1429     __m256           dummy_mask,cutoff_mask;
1430     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1431     __m256           one     = _mm256_set1_ps(1.0);
1432     __m256           two     = _mm256_set1_ps(2.0);
1433     x                = xx[0];
1434     f                = ff[0];
1435
1436     nri              = nlist->nri;
1437     iinr             = nlist->iinr;
1438     jindex           = nlist->jindex;
1439     jjnr             = nlist->jjnr;
1440     shiftidx         = nlist->shift;
1441     gid              = nlist->gid;
1442     shiftvec         = fr->shift_vec[0];
1443     fshift           = fr->fshift[0];
1444     facel            = _mm256_set1_ps(fr->epsfac);
1445     charge           = mdatoms->chargeA;
1446
1447     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1448     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
1449     beta2            = _mm256_mul_ps(beta,beta);
1450     beta3            = _mm256_mul_ps(beta,beta2);
1451
1452     ewtab            = fr->ic->tabq_coul_F;
1453     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1454     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1455
1456     /* Setup water-specific parameters */
1457     inr              = nlist->iinr[0];
1458     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1459     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1460     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1461
1462     jq0              = _mm256_set1_ps(charge[inr+0]);
1463     jq1              = _mm256_set1_ps(charge[inr+1]);
1464     jq2              = _mm256_set1_ps(charge[inr+2]);
1465     qq00             = _mm256_mul_ps(iq0,jq0);
1466     qq01             = _mm256_mul_ps(iq0,jq1);
1467     qq02             = _mm256_mul_ps(iq0,jq2);
1468     qq10             = _mm256_mul_ps(iq1,jq0);
1469     qq11             = _mm256_mul_ps(iq1,jq1);
1470     qq12             = _mm256_mul_ps(iq1,jq2);
1471     qq20             = _mm256_mul_ps(iq2,jq0);
1472     qq21             = _mm256_mul_ps(iq2,jq1);
1473     qq22             = _mm256_mul_ps(iq2,jq2);
1474
1475     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1476     rcutoff_scalar   = fr->rcoulomb;
1477     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1478     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1479
1480     /* Avoid stupid compiler warnings */
1481     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1482     j_coord_offsetA = 0;
1483     j_coord_offsetB = 0;
1484     j_coord_offsetC = 0;
1485     j_coord_offsetD = 0;
1486     j_coord_offsetE = 0;
1487     j_coord_offsetF = 0;
1488     j_coord_offsetG = 0;
1489     j_coord_offsetH = 0;
1490
1491     outeriter        = 0;
1492     inneriter        = 0;
1493
1494     for(iidx=0;iidx<4*DIM;iidx++)
1495     {
1496         scratch[iidx] = 0.0;
1497     }
1498
1499     /* Start outer loop over neighborlists */
1500     for(iidx=0; iidx<nri; iidx++)
1501     {
1502         /* Load shift vector for this list */
1503         i_shift_offset   = DIM*shiftidx[iidx];
1504
1505         /* Load limits for loop over neighbors */
1506         j_index_start    = jindex[iidx];
1507         j_index_end      = jindex[iidx+1];
1508
1509         /* Get outer coordinate index */
1510         inr              = iinr[iidx];
1511         i_coord_offset   = DIM*inr;
1512
1513         /* Load i particle coords and add shift vector */
1514         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1515                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1516
1517         fix0             = _mm256_setzero_ps();
1518         fiy0             = _mm256_setzero_ps();
1519         fiz0             = _mm256_setzero_ps();
1520         fix1             = _mm256_setzero_ps();
1521         fiy1             = _mm256_setzero_ps();
1522         fiz1             = _mm256_setzero_ps();
1523         fix2             = _mm256_setzero_ps();
1524         fiy2             = _mm256_setzero_ps();
1525         fiz2             = _mm256_setzero_ps();
1526
1527         /* Start inner kernel loop */
1528         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1529         {
1530
1531             /* Get j neighbor index, and coordinate index */
1532             jnrA             = jjnr[jidx];
1533             jnrB             = jjnr[jidx+1];
1534             jnrC             = jjnr[jidx+2];
1535             jnrD             = jjnr[jidx+3];
1536             jnrE             = jjnr[jidx+4];
1537             jnrF             = jjnr[jidx+5];
1538             jnrG             = jjnr[jidx+6];
1539             jnrH             = jjnr[jidx+7];
1540             j_coord_offsetA  = DIM*jnrA;
1541             j_coord_offsetB  = DIM*jnrB;
1542             j_coord_offsetC  = DIM*jnrC;
1543             j_coord_offsetD  = DIM*jnrD;
1544             j_coord_offsetE  = DIM*jnrE;
1545             j_coord_offsetF  = DIM*jnrF;
1546             j_coord_offsetG  = DIM*jnrG;
1547             j_coord_offsetH  = DIM*jnrH;
1548
1549             /* load j atom coordinates */
1550             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1551                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1552                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1553                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1554                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1555
1556             /* Calculate displacement vector */
1557             dx00             = _mm256_sub_ps(ix0,jx0);
1558             dy00             = _mm256_sub_ps(iy0,jy0);
1559             dz00             = _mm256_sub_ps(iz0,jz0);
1560             dx01             = _mm256_sub_ps(ix0,jx1);
1561             dy01             = _mm256_sub_ps(iy0,jy1);
1562             dz01             = _mm256_sub_ps(iz0,jz1);
1563             dx02             = _mm256_sub_ps(ix0,jx2);
1564             dy02             = _mm256_sub_ps(iy0,jy2);
1565             dz02             = _mm256_sub_ps(iz0,jz2);
1566             dx10             = _mm256_sub_ps(ix1,jx0);
1567             dy10             = _mm256_sub_ps(iy1,jy0);
1568             dz10             = _mm256_sub_ps(iz1,jz0);
1569             dx11             = _mm256_sub_ps(ix1,jx1);
1570             dy11             = _mm256_sub_ps(iy1,jy1);
1571             dz11             = _mm256_sub_ps(iz1,jz1);
1572             dx12             = _mm256_sub_ps(ix1,jx2);
1573             dy12             = _mm256_sub_ps(iy1,jy2);
1574             dz12             = _mm256_sub_ps(iz1,jz2);
1575             dx20             = _mm256_sub_ps(ix2,jx0);
1576             dy20             = _mm256_sub_ps(iy2,jy0);
1577             dz20             = _mm256_sub_ps(iz2,jz0);
1578             dx21             = _mm256_sub_ps(ix2,jx1);
1579             dy21             = _mm256_sub_ps(iy2,jy1);
1580             dz21             = _mm256_sub_ps(iz2,jz1);
1581             dx22             = _mm256_sub_ps(ix2,jx2);
1582             dy22             = _mm256_sub_ps(iy2,jy2);
1583             dz22             = _mm256_sub_ps(iz2,jz2);
1584
1585             /* Calculate squared distance and things based on it */
1586             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1587             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1588             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1589             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1590             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1591             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1592             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1593             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1594             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1595
1596             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1597             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1598             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1599             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1600             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1601             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1602             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1603             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1604             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1605
1606             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1607             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1608             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1609             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1610             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1611             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1612             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1613             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1614             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1615
1616             fjx0             = _mm256_setzero_ps();
1617             fjy0             = _mm256_setzero_ps();
1618             fjz0             = _mm256_setzero_ps();
1619             fjx1             = _mm256_setzero_ps();
1620             fjy1             = _mm256_setzero_ps();
1621             fjz1             = _mm256_setzero_ps();
1622             fjx2             = _mm256_setzero_ps();
1623             fjy2             = _mm256_setzero_ps();
1624             fjz2             = _mm256_setzero_ps();
1625
1626             /**************************
1627              * CALCULATE INTERACTIONS *
1628              **************************/
1629
1630             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1631             {
1632
1633             r00              = _mm256_mul_ps(rsq00,rinv00);
1634
1635             /* EWALD ELECTROSTATICS */
1636             
1637             /* Analytical PME correction */
1638             zeta2            = _mm256_mul_ps(beta2,rsq00);
1639             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1640             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1641             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1642             felec            = _mm256_mul_ps(qq00,felec);
1643             
1644             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1645
1646             fscal            = felec;
1647
1648             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1649
1650             /* Calculate temporary vectorial force */
1651             tx               = _mm256_mul_ps(fscal,dx00);
1652             ty               = _mm256_mul_ps(fscal,dy00);
1653             tz               = _mm256_mul_ps(fscal,dz00);
1654
1655             /* Update vectorial force */
1656             fix0             = _mm256_add_ps(fix0,tx);
1657             fiy0             = _mm256_add_ps(fiy0,ty);
1658             fiz0             = _mm256_add_ps(fiz0,tz);
1659
1660             fjx0             = _mm256_add_ps(fjx0,tx);
1661             fjy0             = _mm256_add_ps(fjy0,ty);
1662             fjz0             = _mm256_add_ps(fjz0,tz);
1663
1664             }
1665
1666             /**************************
1667              * CALCULATE INTERACTIONS *
1668              **************************/
1669
1670             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1671             {
1672
1673             r01              = _mm256_mul_ps(rsq01,rinv01);
1674
1675             /* EWALD ELECTROSTATICS */
1676             
1677             /* Analytical PME correction */
1678             zeta2            = _mm256_mul_ps(beta2,rsq01);
1679             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1680             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1681             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1682             felec            = _mm256_mul_ps(qq01,felec);
1683             
1684             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1685
1686             fscal            = felec;
1687
1688             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1689
1690             /* Calculate temporary vectorial force */
1691             tx               = _mm256_mul_ps(fscal,dx01);
1692             ty               = _mm256_mul_ps(fscal,dy01);
1693             tz               = _mm256_mul_ps(fscal,dz01);
1694
1695             /* Update vectorial force */
1696             fix0             = _mm256_add_ps(fix0,tx);
1697             fiy0             = _mm256_add_ps(fiy0,ty);
1698             fiz0             = _mm256_add_ps(fiz0,tz);
1699
1700             fjx1             = _mm256_add_ps(fjx1,tx);
1701             fjy1             = _mm256_add_ps(fjy1,ty);
1702             fjz1             = _mm256_add_ps(fjz1,tz);
1703
1704             }
1705
1706             /**************************
1707              * CALCULATE INTERACTIONS *
1708              **************************/
1709
1710             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1711             {
1712
1713             r02              = _mm256_mul_ps(rsq02,rinv02);
1714
1715             /* EWALD ELECTROSTATICS */
1716             
1717             /* Analytical PME correction */
1718             zeta2            = _mm256_mul_ps(beta2,rsq02);
1719             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1720             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1721             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1722             felec            = _mm256_mul_ps(qq02,felec);
1723             
1724             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1725
1726             fscal            = felec;
1727
1728             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1729
1730             /* Calculate temporary vectorial force */
1731             tx               = _mm256_mul_ps(fscal,dx02);
1732             ty               = _mm256_mul_ps(fscal,dy02);
1733             tz               = _mm256_mul_ps(fscal,dz02);
1734
1735             /* Update vectorial force */
1736             fix0             = _mm256_add_ps(fix0,tx);
1737             fiy0             = _mm256_add_ps(fiy0,ty);
1738             fiz0             = _mm256_add_ps(fiz0,tz);
1739
1740             fjx2             = _mm256_add_ps(fjx2,tx);
1741             fjy2             = _mm256_add_ps(fjy2,ty);
1742             fjz2             = _mm256_add_ps(fjz2,tz);
1743
1744             }
1745
1746             /**************************
1747              * CALCULATE INTERACTIONS *
1748              **************************/
1749
1750             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1751             {
1752
1753             r10              = _mm256_mul_ps(rsq10,rinv10);
1754
1755             /* EWALD ELECTROSTATICS */
1756             
1757             /* Analytical PME correction */
1758             zeta2            = _mm256_mul_ps(beta2,rsq10);
1759             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1760             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1761             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1762             felec            = _mm256_mul_ps(qq10,felec);
1763             
1764             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1765
1766             fscal            = felec;
1767
1768             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1769
1770             /* Calculate temporary vectorial force */
1771             tx               = _mm256_mul_ps(fscal,dx10);
1772             ty               = _mm256_mul_ps(fscal,dy10);
1773             tz               = _mm256_mul_ps(fscal,dz10);
1774
1775             /* Update vectorial force */
1776             fix1             = _mm256_add_ps(fix1,tx);
1777             fiy1             = _mm256_add_ps(fiy1,ty);
1778             fiz1             = _mm256_add_ps(fiz1,tz);
1779
1780             fjx0             = _mm256_add_ps(fjx0,tx);
1781             fjy0             = _mm256_add_ps(fjy0,ty);
1782             fjz0             = _mm256_add_ps(fjz0,tz);
1783
1784             }
1785
1786             /**************************
1787              * CALCULATE INTERACTIONS *
1788              **************************/
1789
1790             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1791             {
1792
1793             r11              = _mm256_mul_ps(rsq11,rinv11);
1794
1795             /* EWALD ELECTROSTATICS */
1796             
1797             /* Analytical PME correction */
1798             zeta2            = _mm256_mul_ps(beta2,rsq11);
1799             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1800             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1801             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1802             felec            = _mm256_mul_ps(qq11,felec);
1803             
1804             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1805
1806             fscal            = felec;
1807
1808             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1809
1810             /* Calculate temporary vectorial force */
1811             tx               = _mm256_mul_ps(fscal,dx11);
1812             ty               = _mm256_mul_ps(fscal,dy11);
1813             tz               = _mm256_mul_ps(fscal,dz11);
1814
1815             /* Update vectorial force */
1816             fix1             = _mm256_add_ps(fix1,tx);
1817             fiy1             = _mm256_add_ps(fiy1,ty);
1818             fiz1             = _mm256_add_ps(fiz1,tz);
1819
1820             fjx1             = _mm256_add_ps(fjx1,tx);
1821             fjy1             = _mm256_add_ps(fjy1,ty);
1822             fjz1             = _mm256_add_ps(fjz1,tz);
1823
1824             }
1825
1826             /**************************
1827              * CALCULATE INTERACTIONS *
1828              **************************/
1829
1830             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1831             {
1832
1833             r12              = _mm256_mul_ps(rsq12,rinv12);
1834
1835             /* EWALD ELECTROSTATICS */
1836             
1837             /* Analytical PME correction */
1838             zeta2            = _mm256_mul_ps(beta2,rsq12);
1839             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1840             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1841             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1842             felec            = _mm256_mul_ps(qq12,felec);
1843             
1844             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1845
1846             fscal            = felec;
1847
1848             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1849
1850             /* Calculate temporary vectorial force */
1851             tx               = _mm256_mul_ps(fscal,dx12);
1852             ty               = _mm256_mul_ps(fscal,dy12);
1853             tz               = _mm256_mul_ps(fscal,dz12);
1854
1855             /* Update vectorial force */
1856             fix1             = _mm256_add_ps(fix1,tx);
1857             fiy1             = _mm256_add_ps(fiy1,ty);
1858             fiz1             = _mm256_add_ps(fiz1,tz);
1859
1860             fjx2             = _mm256_add_ps(fjx2,tx);
1861             fjy2             = _mm256_add_ps(fjy2,ty);
1862             fjz2             = _mm256_add_ps(fjz2,tz);
1863
1864             }
1865
1866             /**************************
1867              * CALCULATE INTERACTIONS *
1868              **************************/
1869
1870             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1871             {
1872
1873             r20              = _mm256_mul_ps(rsq20,rinv20);
1874
1875             /* EWALD ELECTROSTATICS */
1876             
1877             /* Analytical PME correction */
1878             zeta2            = _mm256_mul_ps(beta2,rsq20);
1879             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1880             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1881             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1882             felec            = _mm256_mul_ps(qq20,felec);
1883             
1884             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1885
1886             fscal            = felec;
1887
1888             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1889
1890             /* Calculate temporary vectorial force */
1891             tx               = _mm256_mul_ps(fscal,dx20);
1892             ty               = _mm256_mul_ps(fscal,dy20);
1893             tz               = _mm256_mul_ps(fscal,dz20);
1894
1895             /* Update vectorial force */
1896             fix2             = _mm256_add_ps(fix2,tx);
1897             fiy2             = _mm256_add_ps(fiy2,ty);
1898             fiz2             = _mm256_add_ps(fiz2,tz);
1899
1900             fjx0             = _mm256_add_ps(fjx0,tx);
1901             fjy0             = _mm256_add_ps(fjy0,ty);
1902             fjz0             = _mm256_add_ps(fjz0,tz);
1903
1904             }
1905
1906             /**************************
1907              * CALCULATE INTERACTIONS *
1908              **************************/
1909
1910             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1911             {
1912
1913             r21              = _mm256_mul_ps(rsq21,rinv21);
1914
1915             /* EWALD ELECTROSTATICS */
1916             
1917             /* Analytical PME correction */
1918             zeta2            = _mm256_mul_ps(beta2,rsq21);
1919             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1920             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1921             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1922             felec            = _mm256_mul_ps(qq21,felec);
1923             
1924             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1925
1926             fscal            = felec;
1927
1928             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1929
1930             /* Calculate temporary vectorial force */
1931             tx               = _mm256_mul_ps(fscal,dx21);
1932             ty               = _mm256_mul_ps(fscal,dy21);
1933             tz               = _mm256_mul_ps(fscal,dz21);
1934
1935             /* Update vectorial force */
1936             fix2             = _mm256_add_ps(fix2,tx);
1937             fiy2             = _mm256_add_ps(fiy2,ty);
1938             fiz2             = _mm256_add_ps(fiz2,tz);
1939
1940             fjx1             = _mm256_add_ps(fjx1,tx);
1941             fjy1             = _mm256_add_ps(fjy1,ty);
1942             fjz1             = _mm256_add_ps(fjz1,tz);
1943
1944             }
1945
1946             /**************************
1947              * CALCULATE INTERACTIONS *
1948              **************************/
1949
1950             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1951             {
1952
1953             r22              = _mm256_mul_ps(rsq22,rinv22);
1954
1955             /* EWALD ELECTROSTATICS */
1956             
1957             /* Analytical PME correction */
1958             zeta2            = _mm256_mul_ps(beta2,rsq22);
1959             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1960             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1961             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1962             felec            = _mm256_mul_ps(qq22,felec);
1963             
1964             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1965
1966             fscal            = felec;
1967
1968             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1969
1970             /* Calculate temporary vectorial force */
1971             tx               = _mm256_mul_ps(fscal,dx22);
1972             ty               = _mm256_mul_ps(fscal,dy22);
1973             tz               = _mm256_mul_ps(fscal,dz22);
1974
1975             /* Update vectorial force */
1976             fix2             = _mm256_add_ps(fix2,tx);
1977             fiy2             = _mm256_add_ps(fiy2,ty);
1978             fiz2             = _mm256_add_ps(fiz2,tz);
1979
1980             fjx2             = _mm256_add_ps(fjx2,tx);
1981             fjy2             = _mm256_add_ps(fjy2,ty);
1982             fjz2             = _mm256_add_ps(fjz2,tz);
1983
1984             }
1985
1986             fjptrA             = f+j_coord_offsetA;
1987             fjptrB             = f+j_coord_offsetB;
1988             fjptrC             = f+j_coord_offsetC;
1989             fjptrD             = f+j_coord_offsetD;
1990             fjptrE             = f+j_coord_offsetE;
1991             fjptrF             = f+j_coord_offsetF;
1992             fjptrG             = f+j_coord_offsetG;
1993             fjptrH             = f+j_coord_offsetH;
1994
1995             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1996                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1997
1998             /* Inner loop uses 531 flops */
1999         }
2000
2001         if(jidx<j_index_end)
2002         {
2003
2004             /* Get j neighbor index, and coordinate index */
2005             jnrlistA         = jjnr[jidx];
2006             jnrlistB         = jjnr[jidx+1];
2007             jnrlistC         = jjnr[jidx+2];
2008             jnrlistD         = jjnr[jidx+3];
2009             jnrlistE         = jjnr[jidx+4];
2010             jnrlistF         = jjnr[jidx+5];
2011             jnrlistG         = jjnr[jidx+6];
2012             jnrlistH         = jjnr[jidx+7];
2013             /* Sign of each element will be negative for non-real atoms.
2014              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2015              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2016              */
2017             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2018                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2019                                             
2020             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2021             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2022             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2023             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2024             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2025             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2026             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2027             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2028             j_coord_offsetA  = DIM*jnrA;
2029             j_coord_offsetB  = DIM*jnrB;
2030             j_coord_offsetC  = DIM*jnrC;
2031             j_coord_offsetD  = DIM*jnrD;
2032             j_coord_offsetE  = DIM*jnrE;
2033             j_coord_offsetF  = DIM*jnrF;
2034             j_coord_offsetG  = DIM*jnrG;
2035             j_coord_offsetH  = DIM*jnrH;
2036
2037             /* load j atom coordinates */
2038             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2039                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2040                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2041                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2042                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2043
2044             /* Calculate displacement vector */
2045             dx00             = _mm256_sub_ps(ix0,jx0);
2046             dy00             = _mm256_sub_ps(iy0,jy0);
2047             dz00             = _mm256_sub_ps(iz0,jz0);
2048             dx01             = _mm256_sub_ps(ix0,jx1);
2049             dy01             = _mm256_sub_ps(iy0,jy1);
2050             dz01             = _mm256_sub_ps(iz0,jz1);
2051             dx02             = _mm256_sub_ps(ix0,jx2);
2052             dy02             = _mm256_sub_ps(iy0,jy2);
2053             dz02             = _mm256_sub_ps(iz0,jz2);
2054             dx10             = _mm256_sub_ps(ix1,jx0);
2055             dy10             = _mm256_sub_ps(iy1,jy0);
2056             dz10             = _mm256_sub_ps(iz1,jz0);
2057             dx11             = _mm256_sub_ps(ix1,jx1);
2058             dy11             = _mm256_sub_ps(iy1,jy1);
2059             dz11             = _mm256_sub_ps(iz1,jz1);
2060             dx12             = _mm256_sub_ps(ix1,jx2);
2061             dy12             = _mm256_sub_ps(iy1,jy2);
2062             dz12             = _mm256_sub_ps(iz1,jz2);
2063             dx20             = _mm256_sub_ps(ix2,jx0);
2064             dy20             = _mm256_sub_ps(iy2,jy0);
2065             dz20             = _mm256_sub_ps(iz2,jz0);
2066             dx21             = _mm256_sub_ps(ix2,jx1);
2067             dy21             = _mm256_sub_ps(iy2,jy1);
2068             dz21             = _mm256_sub_ps(iz2,jz1);
2069             dx22             = _mm256_sub_ps(ix2,jx2);
2070             dy22             = _mm256_sub_ps(iy2,jy2);
2071             dz22             = _mm256_sub_ps(iz2,jz2);
2072
2073             /* Calculate squared distance and things based on it */
2074             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2075             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2076             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2077             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2078             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2079             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2080             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2081             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2082             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2083
2084             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
2085             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
2086             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
2087             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
2088             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2089             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2090             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
2091             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2092             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2093
2094             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
2095             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
2096             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
2097             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
2098             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
2099             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
2100             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
2101             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
2102             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
2103
2104             fjx0             = _mm256_setzero_ps();
2105             fjy0             = _mm256_setzero_ps();
2106             fjz0             = _mm256_setzero_ps();
2107             fjx1             = _mm256_setzero_ps();
2108             fjy1             = _mm256_setzero_ps();
2109             fjz1             = _mm256_setzero_ps();
2110             fjx2             = _mm256_setzero_ps();
2111             fjy2             = _mm256_setzero_ps();
2112             fjz2             = _mm256_setzero_ps();
2113
2114             /**************************
2115              * CALCULATE INTERACTIONS *
2116              **************************/
2117
2118             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2119             {
2120
2121             r00              = _mm256_mul_ps(rsq00,rinv00);
2122             r00              = _mm256_andnot_ps(dummy_mask,r00);
2123
2124             /* EWALD ELECTROSTATICS */
2125             
2126             /* Analytical PME correction */
2127             zeta2            = _mm256_mul_ps(beta2,rsq00);
2128             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
2129             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2130             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2131             felec            = _mm256_mul_ps(qq00,felec);
2132             
2133             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2134
2135             fscal            = felec;
2136
2137             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2138
2139             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2140
2141             /* Calculate temporary vectorial force */
2142             tx               = _mm256_mul_ps(fscal,dx00);
2143             ty               = _mm256_mul_ps(fscal,dy00);
2144             tz               = _mm256_mul_ps(fscal,dz00);
2145
2146             /* Update vectorial force */
2147             fix0             = _mm256_add_ps(fix0,tx);
2148             fiy0             = _mm256_add_ps(fiy0,ty);
2149             fiz0             = _mm256_add_ps(fiz0,tz);
2150
2151             fjx0             = _mm256_add_ps(fjx0,tx);
2152             fjy0             = _mm256_add_ps(fjy0,ty);
2153             fjz0             = _mm256_add_ps(fjz0,tz);
2154
2155             }
2156
2157             /**************************
2158              * CALCULATE INTERACTIONS *
2159              **************************/
2160
2161             if (gmx_mm256_any_lt(rsq01,rcutoff2))
2162             {
2163
2164             r01              = _mm256_mul_ps(rsq01,rinv01);
2165             r01              = _mm256_andnot_ps(dummy_mask,r01);
2166
2167             /* EWALD ELECTROSTATICS */
2168             
2169             /* Analytical PME correction */
2170             zeta2            = _mm256_mul_ps(beta2,rsq01);
2171             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
2172             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2173             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2174             felec            = _mm256_mul_ps(qq01,felec);
2175             
2176             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2177
2178             fscal            = felec;
2179
2180             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2181
2182             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2183
2184             /* Calculate temporary vectorial force */
2185             tx               = _mm256_mul_ps(fscal,dx01);
2186             ty               = _mm256_mul_ps(fscal,dy01);
2187             tz               = _mm256_mul_ps(fscal,dz01);
2188
2189             /* Update vectorial force */
2190             fix0             = _mm256_add_ps(fix0,tx);
2191             fiy0             = _mm256_add_ps(fiy0,ty);
2192             fiz0             = _mm256_add_ps(fiz0,tz);
2193
2194             fjx1             = _mm256_add_ps(fjx1,tx);
2195             fjy1             = _mm256_add_ps(fjy1,ty);
2196             fjz1             = _mm256_add_ps(fjz1,tz);
2197
2198             }
2199
2200             /**************************
2201              * CALCULATE INTERACTIONS *
2202              **************************/
2203
2204             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2205             {
2206
2207             r02              = _mm256_mul_ps(rsq02,rinv02);
2208             r02              = _mm256_andnot_ps(dummy_mask,r02);
2209
2210             /* EWALD ELECTROSTATICS */
2211             
2212             /* Analytical PME correction */
2213             zeta2            = _mm256_mul_ps(beta2,rsq02);
2214             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
2215             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2216             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2217             felec            = _mm256_mul_ps(qq02,felec);
2218             
2219             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2220
2221             fscal            = felec;
2222
2223             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2224
2225             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2226
2227             /* Calculate temporary vectorial force */
2228             tx               = _mm256_mul_ps(fscal,dx02);
2229             ty               = _mm256_mul_ps(fscal,dy02);
2230             tz               = _mm256_mul_ps(fscal,dz02);
2231
2232             /* Update vectorial force */
2233             fix0             = _mm256_add_ps(fix0,tx);
2234             fiy0             = _mm256_add_ps(fiy0,ty);
2235             fiz0             = _mm256_add_ps(fiz0,tz);
2236
2237             fjx2             = _mm256_add_ps(fjx2,tx);
2238             fjy2             = _mm256_add_ps(fjy2,ty);
2239             fjz2             = _mm256_add_ps(fjz2,tz);
2240
2241             }
2242
2243             /**************************
2244              * CALCULATE INTERACTIONS *
2245              **************************/
2246
2247             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2248             {
2249
2250             r10              = _mm256_mul_ps(rsq10,rinv10);
2251             r10              = _mm256_andnot_ps(dummy_mask,r10);
2252
2253             /* EWALD ELECTROSTATICS */
2254             
2255             /* Analytical PME correction */
2256             zeta2            = _mm256_mul_ps(beta2,rsq10);
2257             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
2258             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2259             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2260             felec            = _mm256_mul_ps(qq10,felec);
2261             
2262             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2263
2264             fscal            = felec;
2265
2266             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2267
2268             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2269
2270             /* Calculate temporary vectorial force */
2271             tx               = _mm256_mul_ps(fscal,dx10);
2272             ty               = _mm256_mul_ps(fscal,dy10);
2273             tz               = _mm256_mul_ps(fscal,dz10);
2274
2275             /* Update vectorial force */
2276             fix1             = _mm256_add_ps(fix1,tx);
2277             fiy1             = _mm256_add_ps(fiy1,ty);
2278             fiz1             = _mm256_add_ps(fiz1,tz);
2279
2280             fjx0             = _mm256_add_ps(fjx0,tx);
2281             fjy0             = _mm256_add_ps(fjy0,ty);
2282             fjz0             = _mm256_add_ps(fjz0,tz);
2283
2284             }
2285
2286             /**************************
2287              * CALCULATE INTERACTIONS *
2288              **************************/
2289
2290             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2291             {
2292
2293             r11              = _mm256_mul_ps(rsq11,rinv11);
2294             r11              = _mm256_andnot_ps(dummy_mask,r11);
2295
2296             /* EWALD ELECTROSTATICS */
2297             
2298             /* Analytical PME correction */
2299             zeta2            = _mm256_mul_ps(beta2,rsq11);
2300             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2301             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2302             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2303             felec            = _mm256_mul_ps(qq11,felec);
2304             
2305             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2306
2307             fscal            = felec;
2308
2309             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2310
2311             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2312
2313             /* Calculate temporary vectorial force */
2314             tx               = _mm256_mul_ps(fscal,dx11);
2315             ty               = _mm256_mul_ps(fscal,dy11);
2316             tz               = _mm256_mul_ps(fscal,dz11);
2317
2318             /* Update vectorial force */
2319             fix1             = _mm256_add_ps(fix1,tx);
2320             fiy1             = _mm256_add_ps(fiy1,ty);
2321             fiz1             = _mm256_add_ps(fiz1,tz);
2322
2323             fjx1             = _mm256_add_ps(fjx1,tx);
2324             fjy1             = _mm256_add_ps(fjy1,ty);
2325             fjz1             = _mm256_add_ps(fjz1,tz);
2326
2327             }
2328
2329             /**************************
2330              * CALCULATE INTERACTIONS *
2331              **************************/
2332
2333             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2334             {
2335
2336             r12              = _mm256_mul_ps(rsq12,rinv12);
2337             r12              = _mm256_andnot_ps(dummy_mask,r12);
2338
2339             /* EWALD ELECTROSTATICS */
2340             
2341             /* Analytical PME correction */
2342             zeta2            = _mm256_mul_ps(beta2,rsq12);
2343             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2344             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2345             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2346             felec            = _mm256_mul_ps(qq12,felec);
2347             
2348             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2349
2350             fscal            = felec;
2351
2352             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2353
2354             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2355
2356             /* Calculate temporary vectorial force */
2357             tx               = _mm256_mul_ps(fscal,dx12);
2358             ty               = _mm256_mul_ps(fscal,dy12);
2359             tz               = _mm256_mul_ps(fscal,dz12);
2360
2361             /* Update vectorial force */
2362             fix1             = _mm256_add_ps(fix1,tx);
2363             fiy1             = _mm256_add_ps(fiy1,ty);
2364             fiz1             = _mm256_add_ps(fiz1,tz);
2365
2366             fjx2             = _mm256_add_ps(fjx2,tx);
2367             fjy2             = _mm256_add_ps(fjy2,ty);
2368             fjz2             = _mm256_add_ps(fjz2,tz);
2369
2370             }
2371
2372             /**************************
2373              * CALCULATE INTERACTIONS *
2374              **************************/
2375
2376             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2377             {
2378
2379             r20              = _mm256_mul_ps(rsq20,rinv20);
2380             r20              = _mm256_andnot_ps(dummy_mask,r20);
2381
2382             /* EWALD ELECTROSTATICS */
2383             
2384             /* Analytical PME correction */
2385             zeta2            = _mm256_mul_ps(beta2,rsq20);
2386             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
2387             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2388             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2389             felec            = _mm256_mul_ps(qq20,felec);
2390             
2391             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2392
2393             fscal            = felec;
2394
2395             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2396
2397             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2398
2399             /* Calculate temporary vectorial force */
2400             tx               = _mm256_mul_ps(fscal,dx20);
2401             ty               = _mm256_mul_ps(fscal,dy20);
2402             tz               = _mm256_mul_ps(fscal,dz20);
2403
2404             /* Update vectorial force */
2405             fix2             = _mm256_add_ps(fix2,tx);
2406             fiy2             = _mm256_add_ps(fiy2,ty);
2407             fiz2             = _mm256_add_ps(fiz2,tz);
2408
2409             fjx0             = _mm256_add_ps(fjx0,tx);
2410             fjy0             = _mm256_add_ps(fjy0,ty);
2411             fjz0             = _mm256_add_ps(fjz0,tz);
2412
2413             }
2414
2415             /**************************
2416              * CALCULATE INTERACTIONS *
2417              **************************/
2418
2419             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2420             {
2421
2422             r21              = _mm256_mul_ps(rsq21,rinv21);
2423             r21              = _mm256_andnot_ps(dummy_mask,r21);
2424
2425             /* EWALD ELECTROSTATICS */
2426             
2427             /* Analytical PME correction */
2428             zeta2            = _mm256_mul_ps(beta2,rsq21);
2429             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2430             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2431             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2432             felec            = _mm256_mul_ps(qq21,felec);
2433             
2434             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2435
2436             fscal            = felec;
2437
2438             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2439
2440             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2441
2442             /* Calculate temporary vectorial force */
2443             tx               = _mm256_mul_ps(fscal,dx21);
2444             ty               = _mm256_mul_ps(fscal,dy21);
2445             tz               = _mm256_mul_ps(fscal,dz21);
2446
2447             /* Update vectorial force */
2448             fix2             = _mm256_add_ps(fix2,tx);
2449             fiy2             = _mm256_add_ps(fiy2,ty);
2450             fiz2             = _mm256_add_ps(fiz2,tz);
2451
2452             fjx1             = _mm256_add_ps(fjx1,tx);
2453             fjy1             = _mm256_add_ps(fjy1,ty);
2454             fjz1             = _mm256_add_ps(fjz1,tz);
2455
2456             }
2457
2458             /**************************
2459              * CALCULATE INTERACTIONS *
2460              **************************/
2461
2462             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2463             {
2464
2465             r22              = _mm256_mul_ps(rsq22,rinv22);
2466             r22              = _mm256_andnot_ps(dummy_mask,r22);
2467
2468             /* EWALD ELECTROSTATICS */
2469             
2470             /* Analytical PME correction */
2471             zeta2            = _mm256_mul_ps(beta2,rsq22);
2472             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2473             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2474             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2475             felec            = _mm256_mul_ps(qq22,felec);
2476             
2477             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2478
2479             fscal            = felec;
2480
2481             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2482
2483             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2484
2485             /* Calculate temporary vectorial force */
2486             tx               = _mm256_mul_ps(fscal,dx22);
2487             ty               = _mm256_mul_ps(fscal,dy22);
2488             tz               = _mm256_mul_ps(fscal,dz22);
2489
2490             /* Update vectorial force */
2491             fix2             = _mm256_add_ps(fix2,tx);
2492             fiy2             = _mm256_add_ps(fiy2,ty);
2493             fiz2             = _mm256_add_ps(fiz2,tz);
2494
2495             fjx2             = _mm256_add_ps(fjx2,tx);
2496             fjy2             = _mm256_add_ps(fjy2,ty);
2497             fjz2             = _mm256_add_ps(fjz2,tz);
2498
2499             }
2500
2501             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2502             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2503             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2504             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2505             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2506             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2507             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2508             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2509
2510             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2511                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2512
2513             /* Inner loop uses 540 flops */
2514         }
2515
2516         /* End of innermost loop */
2517
2518         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2519                                                  f+i_coord_offset,fshift+i_shift_offset);
2520
2521         /* Increment number of inner iterations */
2522         inneriter                  += j_index_end - j_index_start;
2523
2524         /* Outer loop uses 18 flops */
2525     }
2526
2527     /* Increment number of outer iterations */
2528     outeriter        += nri;
2529
2530     /* Update outer/inner flops */
2531
2532     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*540);
2533 }