Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_avx_256_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_single
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56      * jnr indices corresponding to data put in the eight positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrE,jnrF,jnrG,jnrH;
62     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
67     real             rcutoff_scalar;
68     real             *shiftvec,*fshift,*x,*f;
69     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
70     real             scratch[4*DIM];
71     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72     real *           vdwioffsetptr0;
73     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74     real *           vdwioffsetptr1;
75     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76     real *           vdwioffsetptr2;
77     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
94     real             *charge;
95     int              nvdwtype;
96     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97     int              *vdwtype;
98     real             *vdwparam;
99     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
100     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
101     __m256           dummy_mask,cutoff_mask;
102     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
103     __m256           one     = _mm256_set1_ps(1.0);
104     __m256           two     = _mm256_set1_ps(2.0);
105     x                = xx[0];
106     f                = ff[0];
107
108     nri              = nlist->nri;
109     iinr             = nlist->iinr;
110     jindex           = nlist->jindex;
111     jjnr             = nlist->jjnr;
112     shiftidx         = nlist->shift;
113     gid              = nlist->gid;
114     shiftvec         = fr->shift_vec[0];
115     fshift           = fr->fshift[0];
116     facel            = _mm256_set1_ps(fr->epsfac);
117     charge           = mdatoms->chargeA;
118     krf              = _mm256_set1_ps(fr->ic->k_rf);
119     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
120     crf              = _mm256_set1_ps(fr->ic->c_rf);
121     nvdwtype         = fr->ntype;
122     vdwparam         = fr->nbfp;
123     vdwtype          = mdatoms->typeA;
124
125     /* Setup water-specific parameters */
126     inr              = nlist->iinr[0];
127     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
128     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
129     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
130     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
131
132     jq0              = _mm256_set1_ps(charge[inr+0]);
133     jq1              = _mm256_set1_ps(charge[inr+1]);
134     jq2              = _mm256_set1_ps(charge[inr+2]);
135     vdwjidx0A        = 2*vdwtype[inr+0];
136     qq00             = _mm256_mul_ps(iq0,jq0);
137     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
138     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
139     qq01             = _mm256_mul_ps(iq0,jq1);
140     qq02             = _mm256_mul_ps(iq0,jq2);
141     qq10             = _mm256_mul_ps(iq1,jq0);
142     qq11             = _mm256_mul_ps(iq1,jq1);
143     qq12             = _mm256_mul_ps(iq1,jq2);
144     qq20             = _mm256_mul_ps(iq2,jq0);
145     qq21             = _mm256_mul_ps(iq2,jq1);
146     qq22             = _mm256_mul_ps(iq2,jq2);
147
148     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
149     rcutoff_scalar   = fr->rcoulomb;
150     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
151     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
152
153     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
154     rvdw             = _mm256_set1_ps(fr->rvdw);
155
156     /* Avoid stupid compiler warnings */
157     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
158     j_coord_offsetA = 0;
159     j_coord_offsetB = 0;
160     j_coord_offsetC = 0;
161     j_coord_offsetD = 0;
162     j_coord_offsetE = 0;
163     j_coord_offsetF = 0;
164     j_coord_offsetG = 0;
165     j_coord_offsetH = 0;
166
167     outeriter        = 0;
168     inneriter        = 0;
169
170     for(iidx=0;iidx<4*DIM;iidx++)
171     {
172         scratch[iidx] = 0.0;
173     }
174
175     /* Start outer loop over neighborlists */
176     for(iidx=0; iidx<nri; iidx++)
177     {
178         /* Load shift vector for this list */
179         i_shift_offset   = DIM*shiftidx[iidx];
180
181         /* Load limits for loop over neighbors */
182         j_index_start    = jindex[iidx];
183         j_index_end      = jindex[iidx+1];
184
185         /* Get outer coordinate index */
186         inr              = iinr[iidx];
187         i_coord_offset   = DIM*inr;
188
189         /* Load i particle coords and add shift vector */
190         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
191                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
192
193         fix0             = _mm256_setzero_ps();
194         fiy0             = _mm256_setzero_ps();
195         fiz0             = _mm256_setzero_ps();
196         fix1             = _mm256_setzero_ps();
197         fiy1             = _mm256_setzero_ps();
198         fiz1             = _mm256_setzero_ps();
199         fix2             = _mm256_setzero_ps();
200         fiy2             = _mm256_setzero_ps();
201         fiz2             = _mm256_setzero_ps();
202
203         /* Reset potential sums */
204         velecsum         = _mm256_setzero_ps();
205         vvdwsum          = _mm256_setzero_ps();
206
207         /* Start inner kernel loop */
208         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
209         {
210
211             /* Get j neighbor index, and coordinate index */
212             jnrA             = jjnr[jidx];
213             jnrB             = jjnr[jidx+1];
214             jnrC             = jjnr[jidx+2];
215             jnrD             = jjnr[jidx+3];
216             jnrE             = jjnr[jidx+4];
217             jnrF             = jjnr[jidx+5];
218             jnrG             = jjnr[jidx+6];
219             jnrH             = jjnr[jidx+7];
220             j_coord_offsetA  = DIM*jnrA;
221             j_coord_offsetB  = DIM*jnrB;
222             j_coord_offsetC  = DIM*jnrC;
223             j_coord_offsetD  = DIM*jnrD;
224             j_coord_offsetE  = DIM*jnrE;
225             j_coord_offsetF  = DIM*jnrF;
226             j_coord_offsetG  = DIM*jnrG;
227             j_coord_offsetH  = DIM*jnrH;
228
229             /* load j atom coordinates */
230             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
231                                                  x+j_coord_offsetC,x+j_coord_offsetD,
232                                                  x+j_coord_offsetE,x+j_coord_offsetF,
233                                                  x+j_coord_offsetG,x+j_coord_offsetH,
234                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
235
236             /* Calculate displacement vector */
237             dx00             = _mm256_sub_ps(ix0,jx0);
238             dy00             = _mm256_sub_ps(iy0,jy0);
239             dz00             = _mm256_sub_ps(iz0,jz0);
240             dx01             = _mm256_sub_ps(ix0,jx1);
241             dy01             = _mm256_sub_ps(iy0,jy1);
242             dz01             = _mm256_sub_ps(iz0,jz1);
243             dx02             = _mm256_sub_ps(ix0,jx2);
244             dy02             = _mm256_sub_ps(iy0,jy2);
245             dz02             = _mm256_sub_ps(iz0,jz2);
246             dx10             = _mm256_sub_ps(ix1,jx0);
247             dy10             = _mm256_sub_ps(iy1,jy0);
248             dz10             = _mm256_sub_ps(iz1,jz0);
249             dx11             = _mm256_sub_ps(ix1,jx1);
250             dy11             = _mm256_sub_ps(iy1,jy1);
251             dz11             = _mm256_sub_ps(iz1,jz1);
252             dx12             = _mm256_sub_ps(ix1,jx2);
253             dy12             = _mm256_sub_ps(iy1,jy2);
254             dz12             = _mm256_sub_ps(iz1,jz2);
255             dx20             = _mm256_sub_ps(ix2,jx0);
256             dy20             = _mm256_sub_ps(iy2,jy0);
257             dz20             = _mm256_sub_ps(iz2,jz0);
258             dx21             = _mm256_sub_ps(ix2,jx1);
259             dy21             = _mm256_sub_ps(iy2,jy1);
260             dz21             = _mm256_sub_ps(iz2,jz1);
261             dx22             = _mm256_sub_ps(ix2,jx2);
262             dy22             = _mm256_sub_ps(iy2,jy2);
263             dz22             = _mm256_sub_ps(iz2,jz2);
264
265             /* Calculate squared distance and things based on it */
266             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
267             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
268             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
269             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
270             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
271             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
272             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
273             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
274             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
275
276             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
277             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
278             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
279             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
280             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
281             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
282             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
283             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
284             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
285
286             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
287             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
288             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
289             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
290             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
291             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
292             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
293             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
294             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
295
296             fjx0             = _mm256_setzero_ps();
297             fjy0             = _mm256_setzero_ps();
298             fjz0             = _mm256_setzero_ps();
299             fjx1             = _mm256_setzero_ps();
300             fjy1             = _mm256_setzero_ps();
301             fjz1             = _mm256_setzero_ps();
302             fjx2             = _mm256_setzero_ps();
303             fjy2             = _mm256_setzero_ps();
304             fjz2             = _mm256_setzero_ps();
305
306             /**************************
307              * CALCULATE INTERACTIONS *
308              **************************/
309
310             if (gmx_mm256_any_lt(rsq00,rcutoff2))
311             {
312
313             /* REACTION-FIELD ELECTROSTATICS */
314             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
315             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
316
317             /* LENNARD-JONES DISPERSION/REPULSION */
318
319             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
320             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
321             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
322             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
323                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
324             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
325
326             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
327
328             /* Update potential sum for this i atom from the interaction with this j atom. */
329             velec            = _mm256_and_ps(velec,cutoff_mask);
330             velecsum         = _mm256_add_ps(velecsum,velec);
331             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
332             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
333
334             fscal            = _mm256_add_ps(felec,fvdw);
335
336             fscal            = _mm256_and_ps(fscal,cutoff_mask);
337
338             /* Calculate temporary vectorial force */
339             tx               = _mm256_mul_ps(fscal,dx00);
340             ty               = _mm256_mul_ps(fscal,dy00);
341             tz               = _mm256_mul_ps(fscal,dz00);
342
343             /* Update vectorial force */
344             fix0             = _mm256_add_ps(fix0,tx);
345             fiy0             = _mm256_add_ps(fiy0,ty);
346             fiz0             = _mm256_add_ps(fiz0,tz);
347
348             fjx0             = _mm256_add_ps(fjx0,tx);
349             fjy0             = _mm256_add_ps(fjy0,ty);
350             fjz0             = _mm256_add_ps(fjz0,tz);
351
352             }
353
354             /**************************
355              * CALCULATE INTERACTIONS *
356              **************************/
357
358             if (gmx_mm256_any_lt(rsq01,rcutoff2))
359             {
360
361             /* REACTION-FIELD ELECTROSTATICS */
362             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
363             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
364
365             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
366
367             /* Update potential sum for this i atom from the interaction with this j atom. */
368             velec            = _mm256_and_ps(velec,cutoff_mask);
369             velecsum         = _mm256_add_ps(velecsum,velec);
370
371             fscal            = felec;
372
373             fscal            = _mm256_and_ps(fscal,cutoff_mask);
374
375             /* Calculate temporary vectorial force */
376             tx               = _mm256_mul_ps(fscal,dx01);
377             ty               = _mm256_mul_ps(fscal,dy01);
378             tz               = _mm256_mul_ps(fscal,dz01);
379
380             /* Update vectorial force */
381             fix0             = _mm256_add_ps(fix0,tx);
382             fiy0             = _mm256_add_ps(fiy0,ty);
383             fiz0             = _mm256_add_ps(fiz0,tz);
384
385             fjx1             = _mm256_add_ps(fjx1,tx);
386             fjy1             = _mm256_add_ps(fjy1,ty);
387             fjz1             = _mm256_add_ps(fjz1,tz);
388
389             }
390
391             /**************************
392              * CALCULATE INTERACTIONS *
393              **************************/
394
395             if (gmx_mm256_any_lt(rsq02,rcutoff2))
396             {
397
398             /* REACTION-FIELD ELECTROSTATICS */
399             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
400             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
401
402             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
403
404             /* Update potential sum for this i atom from the interaction with this j atom. */
405             velec            = _mm256_and_ps(velec,cutoff_mask);
406             velecsum         = _mm256_add_ps(velecsum,velec);
407
408             fscal            = felec;
409
410             fscal            = _mm256_and_ps(fscal,cutoff_mask);
411
412             /* Calculate temporary vectorial force */
413             tx               = _mm256_mul_ps(fscal,dx02);
414             ty               = _mm256_mul_ps(fscal,dy02);
415             tz               = _mm256_mul_ps(fscal,dz02);
416
417             /* Update vectorial force */
418             fix0             = _mm256_add_ps(fix0,tx);
419             fiy0             = _mm256_add_ps(fiy0,ty);
420             fiz0             = _mm256_add_ps(fiz0,tz);
421
422             fjx2             = _mm256_add_ps(fjx2,tx);
423             fjy2             = _mm256_add_ps(fjy2,ty);
424             fjz2             = _mm256_add_ps(fjz2,tz);
425
426             }
427
428             /**************************
429              * CALCULATE INTERACTIONS *
430              **************************/
431
432             if (gmx_mm256_any_lt(rsq10,rcutoff2))
433             {
434
435             /* REACTION-FIELD ELECTROSTATICS */
436             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
437             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
438
439             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
440
441             /* Update potential sum for this i atom from the interaction with this j atom. */
442             velec            = _mm256_and_ps(velec,cutoff_mask);
443             velecsum         = _mm256_add_ps(velecsum,velec);
444
445             fscal            = felec;
446
447             fscal            = _mm256_and_ps(fscal,cutoff_mask);
448
449             /* Calculate temporary vectorial force */
450             tx               = _mm256_mul_ps(fscal,dx10);
451             ty               = _mm256_mul_ps(fscal,dy10);
452             tz               = _mm256_mul_ps(fscal,dz10);
453
454             /* Update vectorial force */
455             fix1             = _mm256_add_ps(fix1,tx);
456             fiy1             = _mm256_add_ps(fiy1,ty);
457             fiz1             = _mm256_add_ps(fiz1,tz);
458
459             fjx0             = _mm256_add_ps(fjx0,tx);
460             fjy0             = _mm256_add_ps(fjy0,ty);
461             fjz0             = _mm256_add_ps(fjz0,tz);
462
463             }
464
465             /**************************
466              * CALCULATE INTERACTIONS *
467              **************************/
468
469             if (gmx_mm256_any_lt(rsq11,rcutoff2))
470             {
471
472             /* REACTION-FIELD ELECTROSTATICS */
473             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
474             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
475
476             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
477
478             /* Update potential sum for this i atom from the interaction with this j atom. */
479             velec            = _mm256_and_ps(velec,cutoff_mask);
480             velecsum         = _mm256_add_ps(velecsum,velec);
481
482             fscal            = felec;
483
484             fscal            = _mm256_and_ps(fscal,cutoff_mask);
485
486             /* Calculate temporary vectorial force */
487             tx               = _mm256_mul_ps(fscal,dx11);
488             ty               = _mm256_mul_ps(fscal,dy11);
489             tz               = _mm256_mul_ps(fscal,dz11);
490
491             /* Update vectorial force */
492             fix1             = _mm256_add_ps(fix1,tx);
493             fiy1             = _mm256_add_ps(fiy1,ty);
494             fiz1             = _mm256_add_ps(fiz1,tz);
495
496             fjx1             = _mm256_add_ps(fjx1,tx);
497             fjy1             = _mm256_add_ps(fjy1,ty);
498             fjz1             = _mm256_add_ps(fjz1,tz);
499
500             }
501
502             /**************************
503              * CALCULATE INTERACTIONS *
504              **************************/
505
506             if (gmx_mm256_any_lt(rsq12,rcutoff2))
507             {
508
509             /* REACTION-FIELD ELECTROSTATICS */
510             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
511             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
512
513             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
514
515             /* Update potential sum for this i atom from the interaction with this j atom. */
516             velec            = _mm256_and_ps(velec,cutoff_mask);
517             velecsum         = _mm256_add_ps(velecsum,velec);
518
519             fscal            = felec;
520
521             fscal            = _mm256_and_ps(fscal,cutoff_mask);
522
523             /* Calculate temporary vectorial force */
524             tx               = _mm256_mul_ps(fscal,dx12);
525             ty               = _mm256_mul_ps(fscal,dy12);
526             tz               = _mm256_mul_ps(fscal,dz12);
527
528             /* Update vectorial force */
529             fix1             = _mm256_add_ps(fix1,tx);
530             fiy1             = _mm256_add_ps(fiy1,ty);
531             fiz1             = _mm256_add_ps(fiz1,tz);
532
533             fjx2             = _mm256_add_ps(fjx2,tx);
534             fjy2             = _mm256_add_ps(fjy2,ty);
535             fjz2             = _mm256_add_ps(fjz2,tz);
536
537             }
538
539             /**************************
540              * CALCULATE INTERACTIONS *
541              **************************/
542
543             if (gmx_mm256_any_lt(rsq20,rcutoff2))
544             {
545
546             /* REACTION-FIELD ELECTROSTATICS */
547             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
548             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
549
550             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
551
552             /* Update potential sum for this i atom from the interaction with this j atom. */
553             velec            = _mm256_and_ps(velec,cutoff_mask);
554             velecsum         = _mm256_add_ps(velecsum,velec);
555
556             fscal            = felec;
557
558             fscal            = _mm256_and_ps(fscal,cutoff_mask);
559
560             /* Calculate temporary vectorial force */
561             tx               = _mm256_mul_ps(fscal,dx20);
562             ty               = _mm256_mul_ps(fscal,dy20);
563             tz               = _mm256_mul_ps(fscal,dz20);
564
565             /* Update vectorial force */
566             fix2             = _mm256_add_ps(fix2,tx);
567             fiy2             = _mm256_add_ps(fiy2,ty);
568             fiz2             = _mm256_add_ps(fiz2,tz);
569
570             fjx0             = _mm256_add_ps(fjx0,tx);
571             fjy0             = _mm256_add_ps(fjy0,ty);
572             fjz0             = _mm256_add_ps(fjz0,tz);
573
574             }
575
576             /**************************
577              * CALCULATE INTERACTIONS *
578              **************************/
579
580             if (gmx_mm256_any_lt(rsq21,rcutoff2))
581             {
582
583             /* REACTION-FIELD ELECTROSTATICS */
584             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
585             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
586
587             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
588
589             /* Update potential sum for this i atom from the interaction with this j atom. */
590             velec            = _mm256_and_ps(velec,cutoff_mask);
591             velecsum         = _mm256_add_ps(velecsum,velec);
592
593             fscal            = felec;
594
595             fscal            = _mm256_and_ps(fscal,cutoff_mask);
596
597             /* Calculate temporary vectorial force */
598             tx               = _mm256_mul_ps(fscal,dx21);
599             ty               = _mm256_mul_ps(fscal,dy21);
600             tz               = _mm256_mul_ps(fscal,dz21);
601
602             /* Update vectorial force */
603             fix2             = _mm256_add_ps(fix2,tx);
604             fiy2             = _mm256_add_ps(fiy2,ty);
605             fiz2             = _mm256_add_ps(fiz2,tz);
606
607             fjx1             = _mm256_add_ps(fjx1,tx);
608             fjy1             = _mm256_add_ps(fjy1,ty);
609             fjz1             = _mm256_add_ps(fjz1,tz);
610
611             }
612
613             /**************************
614              * CALCULATE INTERACTIONS *
615              **************************/
616
617             if (gmx_mm256_any_lt(rsq22,rcutoff2))
618             {
619
620             /* REACTION-FIELD ELECTROSTATICS */
621             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
622             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
623
624             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
625
626             /* Update potential sum for this i atom from the interaction with this j atom. */
627             velec            = _mm256_and_ps(velec,cutoff_mask);
628             velecsum         = _mm256_add_ps(velecsum,velec);
629
630             fscal            = felec;
631
632             fscal            = _mm256_and_ps(fscal,cutoff_mask);
633
634             /* Calculate temporary vectorial force */
635             tx               = _mm256_mul_ps(fscal,dx22);
636             ty               = _mm256_mul_ps(fscal,dy22);
637             tz               = _mm256_mul_ps(fscal,dz22);
638
639             /* Update vectorial force */
640             fix2             = _mm256_add_ps(fix2,tx);
641             fiy2             = _mm256_add_ps(fiy2,ty);
642             fiz2             = _mm256_add_ps(fiz2,tz);
643
644             fjx2             = _mm256_add_ps(fjx2,tx);
645             fjy2             = _mm256_add_ps(fjy2,ty);
646             fjz2             = _mm256_add_ps(fjz2,tz);
647
648             }
649
650             fjptrA             = f+j_coord_offsetA;
651             fjptrB             = f+j_coord_offsetB;
652             fjptrC             = f+j_coord_offsetC;
653             fjptrD             = f+j_coord_offsetD;
654             fjptrE             = f+j_coord_offsetE;
655             fjptrF             = f+j_coord_offsetF;
656             fjptrG             = f+j_coord_offsetG;
657             fjptrH             = f+j_coord_offsetH;
658
659             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
660                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
661
662             /* Inner loop uses 342 flops */
663         }
664
665         if(jidx<j_index_end)
666         {
667
668             /* Get j neighbor index, and coordinate index */
669             jnrlistA         = jjnr[jidx];
670             jnrlistB         = jjnr[jidx+1];
671             jnrlistC         = jjnr[jidx+2];
672             jnrlistD         = jjnr[jidx+3];
673             jnrlistE         = jjnr[jidx+4];
674             jnrlistF         = jjnr[jidx+5];
675             jnrlistG         = jjnr[jidx+6];
676             jnrlistH         = jjnr[jidx+7];
677             /* Sign of each element will be negative for non-real atoms.
678              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
679              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
680              */
681             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
682                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
683                                             
684             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
685             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
686             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
687             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
688             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
689             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
690             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
691             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
692             j_coord_offsetA  = DIM*jnrA;
693             j_coord_offsetB  = DIM*jnrB;
694             j_coord_offsetC  = DIM*jnrC;
695             j_coord_offsetD  = DIM*jnrD;
696             j_coord_offsetE  = DIM*jnrE;
697             j_coord_offsetF  = DIM*jnrF;
698             j_coord_offsetG  = DIM*jnrG;
699             j_coord_offsetH  = DIM*jnrH;
700
701             /* load j atom coordinates */
702             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
703                                                  x+j_coord_offsetC,x+j_coord_offsetD,
704                                                  x+j_coord_offsetE,x+j_coord_offsetF,
705                                                  x+j_coord_offsetG,x+j_coord_offsetH,
706                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
707
708             /* Calculate displacement vector */
709             dx00             = _mm256_sub_ps(ix0,jx0);
710             dy00             = _mm256_sub_ps(iy0,jy0);
711             dz00             = _mm256_sub_ps(iz0,jz0);
712             dx01             = _mm256_sub_ps(ix0,jx1);
713             dy01             = _mm256_sub_ps(iy0,jy1);
714             dz01             = _mm256_sub_ps(iz0,jz1);
715             dx02             = _mm256_sub_ps(ix0,jx2);
716             dy02             = _mm256_sub_ps(iy0,jy2);
717             dz02             = _mm256_sub_ps(iz0,jz2);
718             dx10             = _mm256_sub_ps(ix1,jx0);
719             dy10             = _mm256_sub_ps(iy1,jy0);
720             dz10             = _mm256_sub_ps(iz1,jz0);
721             dx11             = _mm256_sub_ps(ix1,jx1);
722             dy11             = _mm256_sub_ps(iy1,jy1);
723             dz11             = _mm256_sub_ps(iz1,jz1);
724             dx12             = _mm256_sub_ps(ix1,jx2);
725             dy12             = _mm256_sub_ps(iy1,jy2);
726             dz12             = _mm256_sub_ps(iz1,jz2);
727             dx20             = _mm256_sub_ps(ix2,jx0);
728             dy20             = _mm256_sub_ps(iy2,jy0);
729             dz20             = _mm256_sub_ps(iz2,jz0);
730             dx21             = _mm256_sub_ps(ix2,jx1);
731             dy21             = _mm256_sub_ps(iy2,jy1);
732             dz21             = _mm256_sub_ps(iz2,jz1);
733             dx22             = _mm256_sub_ps(ix2,jx2);
734             dy22             = _mm256_sub_ps(iy2,jy2);
735             dz22             = _mm256_sub_ps(iz2,jz2);
736
737             /* Calculate squared distance and things based on it */
738             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
739             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
740             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
741             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
742             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
743             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
744             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
745             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
746             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
747
748             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
749             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
750             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
751             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
752             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
753             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
754             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
755             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
756             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
757
758             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
759             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
760             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
761             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
762             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
763             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
764             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
765             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
766             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
767
768             fjx0             = _mm256_setzero_ps();
769             fjy0             = _mm256_setzero_ps();
770             fjz0             = _mm256_setzero_ps();
771             fjx1             = _mm256_setzero_ps();
772             fjy1             = _mm256_setzero_ps();
773             fjz1             = _mm256_setzero_ps();
774             fjx2             = _mm256_setzero_ps();
775             fjy2             = _mm256_setzero_ps();
776             fjz2             = _mm256_setzero_ps();
777
778             /**************************
779              * CALCULATE INTERACTIONS *
780              **************************/
781
782             if (gmx_mm256_any_lt(rsq00,rcutoff2))
783             {
784
785             /* REACTION-FIELD ELECTROSTATICS */
786             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
787             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
788
789             /* LENNARD-JONES DISPERSION/REPULSION */
790
791             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
792             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
793             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
794             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
795                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
796             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
797
798             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
799
800             /* Update potential sum for this i atom from the interaction with this j atom. */
801             velec            = _mm256_and_ps(velec,cutoff_mask);
802             velec            = _mm256_andnot_ps(dummy_mask,velec);
803             velecsum         = _mm256_add_ps(velecsum,velec);
804             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
805             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
806             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
807
808             fscal            = _mm256_add_ps(felec,fvdw);
809
810             fscal            = _mm256_and_ps(fscal,cutoff_mask);
811
812             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
813
814             /* Calculate temporary vectorial force */
815             tx               = _mm256_mul_ps(fscal,dx00);
816             ty               = _mm256_mul_ps(fscal,dy00);
817             tz               = _mm256_mul_ps(fscal,dz00);
818
819             /* Update vectorial force */
820             fix0             = _mm256_add_ps(fix0,tx);
821             fiy0             = _mm256_add_ps(fiy0,ty);
822             fiz0             = _mm256_add_ps(fiz0,tz);
823
824             fjx0             = _mm256_add_ps(fjx0,tx);
825             fjy0             = _mm256_add_ps(fjy0,ty);
826             fjz0             = _mm256_add_ps(fjz0,tz);
827
828             }
829
830             /**************************
831              * CALCULATE INTERACTIONS *
832              **************************/
833
834             if (gmx_mm256_any_lt(rsq01,rcutoff2))
835             {
836
837             /* REACTION-FIELD ELECTROSTATICS */
838             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
839             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
840
841             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
842
843             /* Update potential sum for this i atom from the interaction with this j atom. */
844             velec            = _mm256_and_ps(velec,cutoff_mask);
845             velec            = _mm256_andnot_ps(dummy_mask,velec);
846             velecsum         = _mm256_add_ps(velecsum,velec);
847
848             fscal            = felec;
849
850             fscal            = _mm256_and_ps(fscal,cutoff_mask);
851
852             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
853
854             /* Calculate temporary vectorial force */
855             tx               = _mm256_mul_ps(fscal,dx01);
856             ty               = _mm256_mul_ps(fscal,dy01);
857             tz               = _mm256_mul_ps(fscal,dz01);
858
859             /* Update vectorial force */
860             fix0             = _mm256_add_ps(fix0,tx);
861             fiy0             = _mm256_add_ps(fiy0,ty);
862             fiz0             = _mm256_add_ps(fiz0,tz);
863
864             fjx1             = _mm256_add_ps(fjx1,tx);
865             fjy1             = _mm256_add_ps(fjy1,ty);
866             fjz1             = _mm256_add_ps(fjz1,tz);
867
868             }
869
870             /**************************
871              * CALCULATE INTERACTIONS *
872              **************************/
873
874             if (gmx_mm256_any_lt(rsq02,rcutoff2))
875             {
876
877             /* REACTION-FIELD ELECTROSTATICS */
878             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
879             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
880
881             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
882
883             /* Update potential sum for this i atom from the interaction with this j atom. */
884             velec            = _mm256_and_ps(velec,cutoff_mask);
885             velec            = _mm256_andnot_ps(dummy_mask,velec);
886             velecsum         = _mm256_add_ps(velecsum,velec);
887
888             fscal            = felec;
889
890             fscal            = _mm256_and_ps(fscal,cutoff_mask);
891
892             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
893
894             /* Calculate temporary vectorial force */
895             tx               = _mm256_mul_ps(fscal,dx02);
896             ty               = _mm256_mul_ps(fscal,dy02);
897             tz               = _mm256_mul_ps(fscal,dz02);
898
899             /* Update vectorial force */
900             fix0             = _mm256_add_ps(fix0,tx);
901             fiy0             = _mm256_add_ps(fiy0,ty);
902             fiz0             = _mm256_add_ps(fiz0,tz);
903
904             fjx2             = _mm256_add_ps(fjx2,tx);
905             fjy2             = _mm256_add_ps(fjy2,ty);
906             fjz2             = _mm256_add_ps(fjz2,tz);
907
908             }
909
910             /**************************
911              * CALCULATE INTERACTIONS *
912              **************************/
913
914             if (gmx_mm256_any_lt(rsq10,rcutoff2))
915             {
916
917             /* REACTION-FIELD ELECTROSTATICS */
918             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
919             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
920
921             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
922
923             /* Update potential sum for this i atom from the interaction with this j atom. */
924             velec            = _mm256_and_ps(velec,cutoff_mask);
925             velec            = _mm256_andnot_ps(dummy_mask,velec);
926             velecsum         = _mm256_add_ps(velecsum,velec);
927
928             fscal            = felec;
929
930             fscal            = _mm256_and_ps(fscal,cutoff_mask);
931
932             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
933
934             /* Calculate temporary vectorial force */
935             tx               = _mm256_mul_ps(fscal,dx10);
936             ty               = _mm256_mul_ps(fscal,dy10);
937             tz               = _mm256_mul_ps(fscal,dz10);
938
939             /* Update vectorial force */
940             fix1             = _mm256_add_ps(fix1,tx);
941             fiy1             = _mm256_add_ps(fiy1,ty);
942             fiz1             = _mm256_add_ps(fiz1,tz);
943
944             fjx0             = _mm256_add_ps(fjx0,tx);
945             fjy0             = _mm256_add_ps(fjy0,ty);
946             fjz0             = _mm256_add_ps(fjz0,tz);
947
948             }
949
950             /**************************
951              * CALCULATE INTERACTIONS *
952              **************************/
953
954             if (gmx_mm256_any_lt(rsq11,rcutoff2))
955             {
956
957             /* REACTION-FIELD ELECTROSTATICS */
958             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
959             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
960
961             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
962
963             /* Update potential sum for this i atom from the interaction with this j atom. */
964             velec            = _mm256_and_ps(velec,cutoff_mask);
965             velec            = _mm256_andnot_ps(dummy_mask,velec);
966             velecsum         = _mm256_add_ps(velecsum,velec);
967
968             fscal            = felec;
969
970             fscal            = _mm256_and_ps(fscal,cutoff_mask);
971
972             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
973
974             /* Calculate temporary vectorial force */
975             tx               = _mm256_mul_ps(fscal,dx11);
976             ty               = _mm256_mul_ps(fscal,dy11);
977             tz               = _mm256_mul_ps(fscal,dz11);
978
979             /* Update vectorial force */
980             fix1             = _mm256_add_ps(fix1,tx);
981             fiy1             = _mm256_add_ps(fiy1,ty);
982             fiz1             = _mm256_add_ps(fiz1,tz);
983
984             fjx1             = _mm256_add_ps(fjx1,tx);
985             fjy1             = _mm256_add_ps(fjy1,ty);
986             fjz1             = _mm256_add_ps(fjz1,tz);
987
988             }
989
990             /**************************
991              * CALCULATE INTERACTIONS *
992              **************************/
993
994             if (gmx_mm256_any_lt(rsq12,rcutoff2))
995             {
996
997             /* REACTION-FIELD ELECTROSTATICS */
998             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
999             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1000
1001             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1002
1003             /* Update potential sum for this i atom from the interaction with this j atom. */
1004             velec            = _mm256_and_ps(velec,cutoff_mask);
1005             velec            = _mm256_andnot_ps(dummy_mask,velec);
1006             velecsum         = _mm256_add_ps(velecsum,velec);
1007
1008             fscal            = felec;
1009
1010             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1011
1012             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1013
1014             /* Calculate temporary vectorial force */
1015             tx               = _mm256_mul_ps(fscal,dx12);
1016             ty               = _mm256_mul_ps(fscal,dy12);
1017             tz               = _mm256_mul_ps(fscal,dz12);
1018
1019             /* Update vectorial force */
1020             fix1             = _mm256_add_ps(fix1,tx);
1021             fiy1             = _mm256_add_ps(fiy1,ty);
1022             fiz1             = _mm256_add_ps(fiz1,tz);
1023
1024             fjx2             = _mm256_add_ps(fjx2,tx);
1025             fjy2             = _mm256_add_ps(fjy2,ty);
1026             fjz2             = _mm256_add_ps(fjz2,tz);
1027
1028             }
1029
1030             /**************************
1031              * CALCULATE INTERACTIONS *
1032              **************************/
1033
1034             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1035             {
1036
1037             /* REACTION-FIELD ELECTROSTATICS */
1038             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
1039             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1040
1041             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1042
1043             /* Update potential sum for this i atom from the interaction with this j atom. */
1044             velec            = _mm256_and_ps(velec,cutoff_mask);
1045             velec            = _mm256_andnot_ps(dummy_mask,velec);
1046             velecsum         = _mm256_add_ps(velecsum,velec);
1047
1048             fscal            = felec;
1049
1050             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1051
1052             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1053
1054             /* Calculate temporary vectorial force */
1055             tx               = _mm256_mul_ps(fscal,dx20);
1056             ty               = _mm256_mul_ps(fscal,dy20);
1057             tz               = _mm256_mul_ps(fscal,dz20);
1058
1059             /* Update vectorial force */
1060             fix2             = _mm256_add_ps(fix2,tx);
1061             fiy2             = _mm256_add_ps(fiy2,ty);
1062             fiz2             = _mm256_add_ps(fiz2,tz);
1063
1064             fjx0             = _mm256_add_ps(fjx0,tx);
1065             fjy0             = _mm256_add_ps(fjy0,ty);
1066             fjz0             = _mm256_add_ps(fjz0,tz);
1067
1068             }
1069
1070             /**************************
1071              * CALCULATE INTERACTIONS *
1072              **************************/
1073
1074             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1075             {
1076
1077             /* REACTION-FIELD ELECTROSTATICS */
1078             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
1079             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1080
1081             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1082
1083             /* Update potential sum for this i atom from the interaction with this j atom. */
1084             velec            = _mm256_and_ps(velec,cutoff_mask);
1085             velec            = _mm256_andnot_ps(dummy_mask,velec);
1086             velecsum         = _mm256_add_ps(velecsum,velec);
1087
1088             fscal            = felec;
1089
1090             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1091
1092             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1093
1094             /* Calculate temporary vectorial force */
1095             tx               = _mm256_mul_ps(fscal,dx21);
1096             ty               = _mm256_mul_ps(fscal,dy21);
1097             tz               = _mm256_mul_ps(fscal,dz21);
1098
1099             /* Update vectorial force */
1100             fix2             = _mm256_add_ps(fix2,tx);
1101             fiy2             = _mm256_add_ps(fiy2,ty);
1102             fiz2             = _mm256_add_ps(fiz2,tz);
1103
1104             fjx1             = _mm256_add_ps(fjx1,tx);
1105             fjy1             = _mm256_add_ps(fjy1,ty);
1106             fjz1             = _mm256_add_ps(fjz1,tz);
1107
1108             }
1109
1110             /**************************
1111              * CALCULATE INTERACTIONS *
1112              **************************/
1113
1114             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1115             {
1116
1117             /* REACTION-FIELD ELECTROSTATICS */
1118             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
1119             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1120
1121             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1122
1123             /* Update potential sum for this i atom from the interaction with this j atom. */
1124             velec            = _mm256_and_ps(velec,cutoff_mask);
1125             velec            = _mm256_andnot_ps(dummy_mask,velec);
1126             velecsum         = _mm256_add_ps(velecsum,velec);
1127
1128             fscal            = felec;
1129
1130             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1131
1132             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1133
1134             /* Calculate temporary vectorial force */
1135             tx               = _mm256_mul_ps(fscal,dx22);
1136             ty               = _mm256_mul_ps(fscal,dy22);
1137             tz               = _mm256_mul_ps(fscal,dz22);
1138
1139             /* Update vectorial force */
1140             fix2             = _mm256_add_ps(fix2,tx);
1141             fiy2             = _mm256_add_ps(fiy2,ty);
1142             fiz2             = _mm256_add_ps(fiz2,tz);
1143
1144             fjx2             = _mm256_add_ps(fjx2,tx);
1145             fjy2             = _mm256_add_ps(fjy2,ty);
1146             fjz2             = _mm256_add_ps(fjz2,tz);
1147
1148             }
1149
1150             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1151             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1152             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1153             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1154             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1155             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1156             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1157             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1158
1159             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1160                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1161
1162             /* Inner loop uses 342 flops */
1163         }
1164
1165         /* End of innermost loop */
1166
1167         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1168                                                  f+i_coord_offset,fshift+i_shift_offset);
1169
1170         ggid                        = gid[iidx];
1171         /* Update potential energies */
1172         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1173         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1174
1175         /* Increment number of inner iterations */
1176         inneriter                  += j_index_end - j_index_start;
1177
1178         /* Outer loop uses 20 flops */
1179     }
1180
1181     /* Increment number of outer iterations */
1182     outeriter        += nri;
1183
1184     /* Update outer/inner flops */
1185
1186     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*342);
1187 }
1188 /*
1189  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_single
1190  * Electrostatics interaction: ReactionField
1191  * VdW interaction:            LennardJones
1192  * Geometry:                   Water3-Water3
1193  * Calculate force/pot:        Force
1194  */
1195 void
1196 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_single
1197                     (t_nblist * gmx_restrict                nlist,
1198                      rvec * gmx_restrict                    xx,
1199                      rvec * gmx_restrict                    ff,
1200                      t_forcerec * gmx_restrict              fr,
1201                      t_mdatoms * gmx_restrict               mdatoms,
1202                      nb_kernel_data_t * gmx_restrict        kernel_data,
1203                      t_nrnb * gmx_restrict                  nrnb)
1204 {
1205     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1206      * just 0 for non-waters.
1207      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1208      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1209      */
1210     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1211     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1212     int              jnrA,jnrB,jnrC,jnrD;
1213     int              jnrE,jnrF,jnrG,jnrH;
1214     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1215     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1216     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1217     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1218     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1219     real             rcutoff_scalar;
1220     real             *shiftvec,*fshift,*x,*f;
1221     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1222     real             scratch[4*DIM];
1223     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1224     real *           vdwioffsetptr0;
1225     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1226     real *           vdwioffsetptr1;
1227     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1228     real *           vdwioffsetptr2;
1229     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1230     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1231     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1232     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1233     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1234     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1235     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1236     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1237     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1238     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1239     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1240     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1241     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1242     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1243     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1244     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1245     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1246     real             *charge;
1247     int              nvdwtype;
1248     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1249     int              *vdwtype;
1250     real             *vdwparam;
1251     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1252     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1253     __m256           dummy_mask,cutoff_mask;
1254     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1255     __m256           one     = _mm256_set1_ps(1.0);
1256     __m256           two     = _mm256_set1_ps(2.0);
1257     x                = xx[0];
1258     f                = ff[0];
1259
1260     nri              = nlist->nri;
1261     iinr             = nlist->iinr;
1262     jindex           = nlist->jindex;
1263     jjnr             = nlist->jjnr;
1264     shiftidx         = nlist->shift;
1265     gid              = nlist->gid;
1266     shiftvec         = fr->shift_vec[0];
1267     fshift           = fr->fshift[0];
1268     facel            = _mm256_set1_ps(fr->epsfac);
1269     charge           = mdatoms->chargeA;
1270     krf              = _mm256_set1_ps(fr->ic->k_rf);
1271     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1272     crf              = _mm256_set1_ps(fr->ic->c_rf);
1273     nvdwtype         = fr->ntype;
1274     vdwparam         = fr->nbfp;
1275     vdwtype          = mdatoms->typeA;
1276
1277     /* Setup water-specific parameters */
1278     inr              = nlist->iinr[0];
1279     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1280     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1281     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1282     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1283
1284     jq0              = _mm256_set1_ps(charge[inr+0]);
1285     jq1              = _mm256_set1_ps(charge[inr+1]);
1286     jq2              = _mm256_set1_ps(charge[inr+2]);
1287     vdwjidx0A        = 2*vdwtype[inr+0];
1288     qq00             = _mm256_mul_ps(iq0,jq0);
1289     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1290     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1291     qq01             = _mm256_mul_ps(iq0,jq1);
1292     qq02             = _mm256_mul_ps(iq0,jq2);
1293     qq10             = _mm256_mul_ps(iq1,jq0);
1294     qq11             = _mm256_mul_ps(iq1,jq1);
1295     qq12             = _mm256_mul_ps(iq1,jq2);
1296     qq20             = _mm256_mul_ps(iq2,jq0);
1297     qq21             = _mm256_mul_ps(iq2,jq1);
1298     qq22             = _mm256_mul_ps(iq2,jq2);
1299
1300     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1301     rcutoff_scalar   = fr->rcoulomb;
1302     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1303     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1304
1305     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
1306     rvdw             = _mm256_set1_ps(fr->rvdw);
1307
1308     /* Avoid stupid compiler warnings */
1309     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1310     j_coord_offsetA = 0;
1311     j_coord_offsetB = 0;
1312     j_coord_offsetC = 0;
1313     j_coord_offsetD = 0;
1314     j_coord_offsetE = 0;
1315     j_coord_offsetF = 0;
1316     j_coord_offsetG = 0;
1317     j_coord_offsetH = 0;
1318
1319     outeriter        = 0;
1320     inneriter        = 0;
1321
1322     for(iidx=0;iidx<4*DIM;iidx++)
1323     {
1324         scratch[iidx] = 0.0;
1325     }
1326
1327     /* Start outer loop over neighborlists */
1328     for(iidx=0; iidx<nri; iidx++)
1329     {
1330         /* Load shift vector for this list */
1331         i_shift_offset   = DIM*shiftidx[iidx];
1332
1333         /* Load limits for loop over neighbors */
1334         j_index_start    = jindex[iidx];
1335         j_index_end      = jindex[iidx+1];
1336
1337         /* Get outer coordinate index */
1338         inr              = iinr[iidx];
1339         i_coord_offset   = DIM*inr;
1340
1341         /* Load i particle coords and add shift vector */
1342         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1343                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1344
1345         fix0             = _mm256_setzero_ps();
1346         fiy0             = _mm256_setzero_ps();
1347         fiz0             = _mm256_setzero_ps();
1348         fix1             = _mm256_setzero_ps();
1349         fiy1             = _mm256_setzero_ps();
1350         fiz1             = _mm256_setzero_ps();
1351         fix2             = _mm256_setzero_ps();
1352         fiy2             = _mm256_setzero_ps();
1353         fiz2             = _mm256_setzero_ps();
1354
1355         /* Start inner kernel loop */
1356         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1357         {
1358
1359             /* Get j neighbor index, and coordinate index */
1360             jnrA             = jjnr[jidx];
1361             jnrB             = jjnr[jidx+1];
1362             jnrC             = jjnr[jidx+2];
1363             jnrD             = jjnr[jidx+3];
1364             jnrE             = jjnr[jidx+4];
1365             jnrF             = jjnr[jidx+5];
1366             jnrG             = jjnr[jidx+6];
1367             jnrH             = jjnr[jidx+7];
1368             j_coord_offsetA  = DIM*jnrA;
1369             j_coord_offsetB  = DIM*jnrB;
1370             j_coord_offsetC  = DIM*jnrC;
1371             j_coord_offsetD  = DIM*jnrD;
1372             j_coord_offsetE  = DIM*jnrE;
1373             j_coord_offsetF  = DIM*jnrF;
1374             j_coord_offsetG  = DIM*jnrG;
1375             j_coord_offsetH  = DIM*jnrH;
1376
1377             /* load j atom coordinates */
1378             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1379                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1380                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1381                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1382                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1383
1384             /* Calculate displacement vector */
1385             dx00             = _mm256_sub_ps(ix0,jx0);
1386             dy00             = _mm256_sub_ps(iy0,jy0);
1387             dz00             = _mm256_sub_ps(iz0,jz0);
1388             dx01             = _mm256_sub_ps(ix0,jx1);
1389             dy01             = _mm256_sub_ps(iy0,jy1);
1390             dz01             = _mm256_sub_ps(iz0,jz1);
1391             dx02             = _mm256_sub_ps(ix0,jx2);
1392             dy02             = _mm256_sub_ps(iy0,jy2);
1393             dz02             = _mm256_sub_ps(iz0,jz2);
1394             dx10             = _mm256_sub_ps(ix1,jx0);
1395             dy10             = _mm256_sub_ps(iy1,jy0);
1396             dz10             = _mm256_sub_ps(iz1,jz0);
1397             dx11             = _mm256_sub_ps(ix1,jx1);
1398             dy11             = _mm256_sub_ps(iy1,jy1);
1399             dz11             = _mm256_sub_ps(iz1,jz1);
1400             dx12             = _mm256_sub_ps(ix1,jx2);
1401             dy12             = _mm256_sub_ps(iy1,jy2);
1402             dz12             = _mm256_sub_ps(iz1,jz2);
1403             dx20             = _mm256_sub_ps(ix2,jx0);
1404             dy20             = _mm256_sub_ps(iy2,jy0);
1405             dz20             = _mm256_sub_ps(iz2,jz0);
1406             dx21             = _mm256_sub_ps(ix2,jx1);
1407             dy21             = _mm256_sub_ps(iy2,jy1);
1408             dz21             = _mm256_sub_ps(iz2,jz1);
1409             dx22             = _mm256_sub_ps(ix2,jx2);
1410             dy22             = _mm256_sub_ps(iy2,jy2);
1411             dz22             = _mm256_sub_ps(iz2,jz2);
1412
1413             /* Calculate squared distance and things based on it */
1414             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1415             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1416             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1417             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1418             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1419             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1420             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1421             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1422             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1423
1424             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1425             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1426             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1427             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1428             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1429             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1430             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1431             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1432             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1433
1434             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1435             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1436             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1437             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1438             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1439             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1440             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1441             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1442             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1443
1444             fjx0             = _mm256_setzero_ps();
1445             fjy0             = _mm256_setzero_ps();
1446             fjz0             = _mm256_setzero_ps();
1447             fjx1             = _mm256_setzero_ps();
1448             fjy1             = _mm256_setzero_ps();
1449             fjz1             = _mm256_setzero_ps();
1450             fjx2             = _mm256_setzero_ps();
1451             fjy2             = _mm256_setzero_ps();
1452             fjz2             = _mm256_setzero_ps();
1453
1454             /**************************
1455              * CALCULATE INTERACTIONS *
1456              **************************/
1457
1458             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1459             {
1460
1461             /* REACTION-FIELD ELECTROSTATICS */
1462             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1463
1464             /* LENNARD-JONES DISPERSION/REPULSION */
1465
1466             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1467             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1468
1469             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1470
1471             fscal            = _mm256_add_ps(felec,fvdw);
1472
1473             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1474
1475             /* Calculate temporary vectorial force */
1476             tx               = _mm256_mul_ps(fscal,dx00);
1477             ty               = _mm256_mul_ps(fscal,dy00);
1478             tz               = _mm256_mul_ps(fscal,dz00);
1479
1480             /* Update vectorial force */
1481             fix0             = _mm256_add_ps(fix0,tx);
1482             fiy0             = _mm256_add_ps(fiy0,ty);
1483             fiz0             = _mm256_add_ps(fiz0,tz);
1484
1485             fjx0             = _mm256_add_ps(fjx0,tx);
1486             fjy0             = _mm256_add_ps(fjy0,ty);
1487             fjz0             = _mm256_add_ps(fjz0,tz);
1488
1489             }
1490
1491             /**************************
1492              * CALCULATE INTERACTIONS *
1493              **************************/
1494
1495             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1496             {
1497
1498             /* REACTION-FIELD ELECTROSTATICS */
1499             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1500
1501             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1502
1503             fscal            = felec;
1504
1505             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1506
1507             /* Calculate temporary vectorial force */
1508             tx               = _mm256_mul_ps(fscal,dx01);
1509             ty               = _mm256_mul_ps(fscal,dy01);
1510             tz               = _mm256_mul_ps(fscal,dz01);
1511
1512             /* Update vectorial force */
1513             fix0             = _mm256_add_ps(fix0,tx);
1514             fiy0             = _mm256_add_ps(fiy0,ty);
1515             fiz0             = _mm256_add_ps(fiz0,tz);
1516
1517             fjx1             = _mm256_add_ps(fjx1,tx);
1518             fjy1             = _mm256_add_ps(fjy1,ty);
1519             fjz1             = _mm256_add_ps(fjz1,tz);
1520
1521             }
1522
1523             /**************************
1524              * CALCULATE INTERACTIONS *
1525              **************************/
1526
1527             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1528             {
1529
1530             /* REACTION-FIELD ELECTROSTATICS */
1531             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1532
1533             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1534
1535             fscal            = felec;
1536
1537             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1538
1539             /* Calculate temporary vectorial force */
1540             tx               = _mm256_mul_ps(fscal,dx02);
1541             ty               = _mm256_mul_ps(fscal,dy02);
1542             tz               = _mm256_mul_ps(fscal,dz02);
1543
1544             /* Update vectorial force */
1545             fix0             = _mm256_add_ps(fix0,tx);
1546             fiy0             = _mm256_add_ps(fiy0,ty);
1547             fiz0             = _mm256_add_ps(fiz0,tz);
1548
1549             fjx2             = _mm256_add_ps(fjx2,tx);
1550             fjy2             = _mm256_add_ps(fjy2,ty);
1551             fjz2             = _mm256_add_ps(fjz2,tz);
1552
1553             }
1554
1555             /**************************
1556              * CALCULATE INTERACTIONS *
1557              **************************/
1558
1559             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1560             {
1561
1562             /* REACTION-FIELD ELECTROSTATICS */
1563             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1564
1565             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1566
1567             fscal            = felec;
1568
1569             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1570
1571             /* Calculate temporary vectorial force */
1572             tx               = _mm256_mul_ps(fscal,dx10);
1573             ty               = _mm256_mul_ps(fscal,dy10);
1574             tz               = _mm256_mul_ps(fscal,dz10);
1575
1576             /* Update vectorial force */
1577             fix1             = _mm256_add_ps(fix1,tx);
1578             fiy1             = _mm256_add_ps(fiy1,ty);
1579             fiz1             = _mm256_add_ps(fiz1,tz);
1580
1581             fjx0             = _mm256_add_ps(fjx0,tx);
1582             fjy0             = _mm256_add_ps(fjy0,ty);
1583             fjz0             = _mm256_add_ps(fjz0,tz);
1584
1585             }
1586
1587             /**************************
1588              * CALCULATE INTERACTIONS *
1589              **************************/
1590
1591             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1592             {
1593
1594             /* REACTION-FIELD ELECTROSTATICS */
1595             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1596
1597             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1598
1599             fscal            = felec;
1600
1601             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1602
1603             /* Calculate temporary vectorial force */
1604             tx               = _mm256_mul_ps(fscal,dx11);
1605             ty               = _mm256_mul_ps(fscal,dy11);
1606             tz               = _mm256_mul_ps(fscal,dz11);
1607
1608             /* Update vectorial force */
1609             fix1             = _mm256_add_ps(fix1,tx);
1610             fiy1             = _mm256_add_ps(fiy1,ty);
1611             fiz1             = _mm256_add_ps(fiz1,tz);
1612
1613             fjx1             = _mm256_add_ps(fjx1,tx);
1614             fjy1             = _mm256_add_ps(fjy1,ty);
1615             fjz1             = _mm256_add_ps(fjz1,tz);
1616
1617             }
1618
1619             /**************************
1620              * CALCULATE INTERACTIONS *
1621              **************************/
1622
1623             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1624             {
1625
1626             /* REACTION-FIELD ELECTROSTATICS */
1627             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1628
1629             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1630
1631             fscal            = felec;
1632
1633             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1634
1635             /* Calculate temporary vectorial force */
1636             tx               = _mm256_mul_ps(fscal,dx12);
1637             ty               = _mm256_mul_ps(fscal,dy12);
1638             tz               = _mm256_mul_ps(fscal,dz12);
1639
1640             /* Update vectorial force */
1641             fix1             = _mm256_add_ps(fix1,tx);
1642             fiy1             = _mm256_add_ps(fiy1,ty);
1643             fiz1             = _mm256_add_ps(fiz1,tz);
1644
1645             fjx2             = _mm256_add_ps(fjx2,tx);
1646             fjy2             = _mm256_add_ps(fjy2,ty);
1647             fjz2             = _mm256_add_ps(fjz2,tz);
1648
1649             }
1650
1651             /**************************
1652              * CALCULATE INTERACTIONS *
1653              **************************/
1654
1655             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1656             {
1657
1658             /* REACTION-FIELD ELECTROSTATICS */
1659             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1660
1661             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1662
1663             fscal            = felec;
1664
1665             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1666
1667             /* Calculate temporary vectorial force */
1668             tx               = _mm256_mul_ps(fscal,dx20);
1669             ty               = _mm256_mul_ps(fscal,dy20);
1670             tz               = _mm256_mul_ps(fscal,dz20);
1671
1672             /* Update vectorial force */
1673             fix2             = _mm256_add_ps(fix2,tx);
1674             fiy2             = _mm256_add_ps(fiy2,ty);
1675             fiz2             = _mm256_add_ps(fiz2,tz);
1676
1677             fjx0             = _mm256_add_ps(fjx0,tx);
1678             fjy0             = _mm256_add_ps(fjy0,ty);
1679             fjz0             = _mm256_add_ps(fjz0,tz);
1680
1681             }
1682
1683             /**************************
1684              * CALCULATE INTERACTIONS *
1685              **************************/
1686
1687             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1688             {
1689
1690             /* REACTION-FIELD ELECTROSTATICS */
1691             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1692
1693             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1694
1695             fscal            = felec;
1696
1697             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1698
1699             /* Calculate temporary vectorial force */
1700             tx               = _mm256_mul_ps(fscal,dx21);
1701             ty               = _mm256_mul_ps(fscal,dy21);
1702             tz               = _mm256_mul_ps(fscal,dz21);
1703
1704             /* Update vectorial force */
1705             fix2             = _mm256_add_ps(fix2,tx);
1706             fiy2             = _mm256_add_ps(fiy2,ty);
1707             fiz2             = _mm256_add_ps(fiz2,tz);
1708
1709             fjx1             = _mm256_add_ps(fjx1,tx);
1710             fjy1             = _mm256_add_ps(fjy1,ty);
1711             fjz1             = _mm256_add_ps(fjz1,tz);
1712
1713             }
1714
1715             /**************************
1716              * CALCULATE INTERACTIONS *
1717              **************************/
1718
1719             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1720             {
1721
1722             /* REACTION-FIELD ELECTROSTATICS */
1723             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1724
1725             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1726
1727             fscal            = felec;
1728
1729             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1730
1731             /* Calculate temporary vectorial force */
1732             tx               = _mm256_mul_ps(fscal,dx22);
1733             ty               = _mm256_mul_ps(fscal,dy22);
1734             tz               = _mm256_mul_ps(fscal,dz22);
1735
1736             /* Update vectorial force */
1737             fix2             = _mm256_add_ps(fix2,tx);
1738             fiy2             = _mm256_add_ps(fiy2,ty);
1739             fiz2             = _mm256_add_ps(fiz2,tz);
1740
1741             fjx2             = _mm256_add_ps(fjx2,tx);
1742             fjy2             = _mm256_add_ps(fjy2,ty);
1743             fjz2             = _mm256_add_ps(fjz2,tz);
1744
1745             }
1746
1747             fjptrA             = f+j_coord_offsetA;
1748             fjptrB             = f+j_coord_offsetB;
1749             fjptrC             = f+j_coord_offsetC;
1750             fjptrD             = f+j_coord_offsetD;
1751             fjptrE             = f+j_coord_offsetE;
1752             fjptrF             = f+j_coord_offsetF;
1753             fjptrG             = f+j_coord_offsetG;
1754             fjptrH             = f+j_coord_offsetH;
1755
1756             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1757                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1758
1759             /* Inner loop uses 277 flops */
1760         }
1761
1762         if(jidx<j_index_end)
1763         {
1764
1765             /* Get j neighbor index, and coordinate index */
1766             jnrlistA         = jjnr[jidx];
1767             jnrlistB         = jjnr[jidx+1];
1768             jnrlistC         = jjnr[jidx+2];
1769             jnrlistD         = jjnr[jidx+3];
1770             jnrlistE         = jjnr[jidx+4];
1771             jnrlistF         = jjnr[jidx+5];
1772             jnrlistG         = jjnr[jidx+6];
1773             jnrlistH         = jjnr[jidx+7];
1774             /* Sign of each element will be negative for non-real atoms.
1775              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1776              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1777              */
1778             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1779                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1780                                             
1781             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1782             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1783             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1784             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1785             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1786             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1787             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1788             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1789             j_coord_offsetA  = DIM*jnrA;
1790             j_coord_offsetB  = DIM*jnrB;
1791             j_coord_offsetC  = DIM*jnrC;
1792             j_coord_offsetD  = DIM*jnrD;
1793             j_coord_offsetE  = DIM*jnrE;
1794             j_coord_offsetF  = DIM*jnrF;
1795             j_coord_offsetG  = DIM*jnrG;
1796             j_coord_offsetH  = DIM*jnrH;
1797
1798             /* load j atom coordinates */
1799             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1800                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1801                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1802                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1803                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1804
1805             /* Calculate displacement vector */
1806             dx00             = _mm256_sub_ps(ix0,jx0);
1807             dy00             = _mm256_sub_ps(iy0,jy0);
1808             dz00             = _mm256_sub_ps(iz0,jz0);
1809             dx01             = _mm256_sub_ps(ix0,jx1);
1810             dy01             = _mm256_sub_ps(iy0,jy1);
1811             dz01             = _mm256_sub_ps(iz0,jz1);
1812             dx02             = _mm256_sub_ps(ix0,jx2);
1813             dy02             = _mm256_sub_ps(iy0,jy2);
1814             dz02             = _mm256_sub_ps(iz0,jz2);
1815             dx10             = _mm256_sub_ps(ix1,jx0);
1816             dy10             = _mm256_sub_ps(iy1,jy0);
1817             dz10             = _mm256_sub_ps(iz1,jz0);
1818             dx11             = _mm256_sub_ps(ix1,jx1);
1819             dy11             = _mm256_sub_ps(iy1,jy1);
1820             dz11             = _mm256_sub_ps(iz1,jz1);
1821             dx12             = _mm256_sub_ps(ix1,jx2);
1822             dy12             = _mm256_sub_ps(iy1,jy2);
1823             dz12             = _mm256_sub_ps(iz1,jz2);
1824             dx20             = _mm256_sub_ps(ix2,jx0);
1825             dy20             = _mm256_sub_ps(iy2,jy0);
1826             dz20             = _mm256_sub_ps(iz2,jz0);
1827             dx21             = _mm256_sub_ps(ix2,jx1);
1828             dy21             = _mm256_sub_ps(iy2,jy1);
1829             dz21             = _mm256_sub_ps(iz2,jz1);
1830             dx22             = _mm256_sub_ps(ix2,jx2);
1831             dy22             = _mm256_sub_ps(iy2,jy2);
1832             dz22             = _mm256_sub_ps(iz2,jz2);
1833
1834             /* Calculate squared distance and things based on it */
1835             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1836             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1837             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1838             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1839             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1840             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1841             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1842             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1843             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1844
1845             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1846             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1847             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1848             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1849             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1850             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1851             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1852             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1853             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1854
1855             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1856             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1857             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1858             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1859             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1860             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1861             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1862             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1863             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1864
1865             fjx0             = _mm256_setzero_ps();
1866             fjy0             = _mm256_setzero_ps();
1867             fjz0             = _mm256_setzero_ps();
1868             fjx1             = _mm256_setzero_ps();
1869             fjy1             = _mm256_setzero_ps();
1870             fjz1             = _mm256_setzero_ps();
1871             fjx2             = _mm256_setzero_ps();
1872             fjy2             = _mm256_setzero_ps();
1873             fjz2             = _mm256_setzero_ps();
1874
1875             /**************************
1876              * CALCULATE INTERACTIONS *
1877              **************************/
1878
1879             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1880             {
1881
1882             /* REACTION-FIELD ELECTROSTATICS */
1883             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1884
1885             /* LENNARD-JONES DISPERSION/REPULSION */
1886
1887             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1888             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1889
1890             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1891
1892             fscal            = _mm256_add_ps(felec,fvdw);
1893
1894             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1895
1896             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1897
1898             /* Calculate temporary vectorial force */
1899             tx               = _mm256_mul_ps(fscal,dx00);
1900             ty               = _mm256_mul_ps(fscal,dy00);
1901             tz               = _mm256_mul_ps(fscal,dz00);
1902
1903             /* Update vectorial force */
1904             fix0             = _mm256_add_ps(fix0,tx);
1905             fiy0             = _mm256_add_ps(fiy0,ty);
1906             fiz0             = _mm256_add_ps(fiz0,tz);
1907
1908             fjx0             = _mm256_add_ps(fjx0,tx);
1909             fjy0             = _mm256_add_ps(fjy0,ty);
1910             fjz0             = _mm256_add_ps(fjz0,tz);
1911
1912             }
1913
1914             /**************************
1915              * CALCULATE INTERACTIONS *
1916              **************************/
1917
1918             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1919             {
1920
1921             /* REACTION-FIELD ELECTROSTATICS */
1922             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1923
1924             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1925
1926             fscal            = felec;
1927
1928             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1929
1930             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1931
1932             /* Calculate temporary vectorial force */
1933             tx               = _mm256_mul_ps(fscal,dx01);
1934             ty               = _mm256_mul_ps(fscal,dy01);
1935             tz               = _mm256_mul_ps(fscal,dz01);
1936
1937             /* Update vectorial force */
1938             fix0             = _mm256_add_ps(fix0,tx);
1939             fiy0             = _mm256_add_ps(fiy0,ty);
1940             fiz0             = _mm256_add_ps(fiz0,tz);
1941
1942             fjx1             = _mm256_add_ps(fjx1,tx);
1943             fjy1             = _mm256_add_ps(fjy1,ty);
1944             fjz1             = _mm256_add_ps(fjz1,tz);
1945
1946             }
1947
1948             /**************************
1949              * CALCULATE INTERACTIONS *
1950              **************************/
1951
1952             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1953             {
1954
1955             /* REACTION-FIELD ELECTROSTATICS */
1956             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1957
1958             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1959
1960             fscal            = felec;
1961
1962             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1963
1964             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1965
1966             /* Calculate temporary vectorial force */
1967             tx               = _mm256_mul_ps(fscal,dx02);
1968             ty               = _mm256_mul_ps(fscal,dy02);
1969             tz               = _mm256_mul_ps(fscal,dz02);
1970
1971             /* Update vectorial force */
1972             fix0             = _mm256_add_ps(fix0,tx);
1973             fiy0             = _mm256_add_ps(fiy0,ty);
1974             fiz0             = _mm256_add_ps(fiz0,tz);
1975
1976             fjx2             = _mm256_add_ps(fjx2,tx);
1977             fjy2             = _mm256_add_ps(fjy2,ty);
1978             fjz2             = _mm256_add_ps(fjz2,tz);
1979
1980             }
1981
1982             /**************************
1983              * CALCULATE INTERACTIONS *
1984              **************************/
1985
1986             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1987             {
1988
1989             /* REACTION-FIELD ELECTROSTATICS */
1990             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1991
1992             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1993
1994             fscal            = felec;
1995
1996             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1997
1998             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1999
2000             /* Calculate temporary vectorial force */
2001             tx               = _mm256_mul_ps(fscal,dx10);
2002             ty               = _mm256_mul_ps(fscal,dy10);
2003             tz               = _mm256_mul_ps(fscal,dz10);
2004
2005             /* Update vectorial force */
2006             fix1             = _mm256_add_ps(fix1,tx);
2007             fiy1             = _mm256_add_ps(fiy1,ty);
2008             fiz1             = _mm256_add_ps(fiz1,tz);
2009
2010             fjx0             = _mm256_add_ps(fjx0,tx);
2011             fjy0             = _mm256_add_ps(fjy0,ty);
2012             fjz0             = _mm256_add_ps(fjz0,tz);
2013
2014             }
2015
2016             /**************************
2017              * CALCULATE INTERACTIONS *
2018              **************************/
2019
2020             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2021             {
2022
2023             /* REACTION-FIELD ELECTROSTATICS */
2024             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
2025
2026             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2027
2028             fscal            = felec;
2029
2030             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2031
2032             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2033
2034             /* Calculate temporary vectorial force */
2035             tx               = _mm256_mul_ps(fscal,dx11);
2036             ty               = _mm256_mul_ps(fscal,dy11);
2037             tz               = _mm256_mul_ps(fscal,dz11);
2038
2039             /* Update vectorial force */
2040             fix1             = _mm256_add_ps(fix1,tx);
2041             fiy1             = _mm256_add_ps(fiy1,ty);
2042             fiz1             = _mm256_add_ps(fiz1,tz);
2043
2044             fjx1             = _mm256_add_ps(fjx1,tx);
2045             fjy1             = _mm256_add_ps(fjy1,ty);
2046             fjz1             = _mm256_add_ps(fjz1,tz);
2047
2048             }
2049
2050             /**************************
2051              * CALCULATE INTERACTIONS *
2052              **************************/
2053
2054             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2055             {
2056
2057             /* REACTION-FIELD ELECTROSTATICS */
2058             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
2059
2060             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2061
2062             fscal            = felec;
2063
2064             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2065
2066             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2067
2068             /* Calculate temporary vectorial force */
2069             tx               = _mm256_mul_ps(fscal,dx12);
2070             ty               = _mm256_mul_ps(fscal,dy12);
2071             tz               = _mm256_mul_ps(fscal,dz12);
2072
2073             /* Update vectorial force */
2074             fix1             = _mm256_add_ps(fix1,tx);
2075             fiy1             = _mm256_add_ps(fiy1,ty);
2076             fiz1             = _mm256_add_ps(fiz1,tz);
2077
2078             fjx2             = _mm256_add_ps(fjx2,tx);
2079             fjy2             = _mm256_add_ps(fjy2,ty);
2080             fjz2             = _mm256_add_ps(fjz2,tz);
2081
2082             }
2083
2084             /**************************
2085              * CALCULATE INTERACTIONS *
2086              **************************/
2087
2088             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2089             {
2090
2091             /* REACTION-FIELD ELECTROSTATICS */
2092             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
2093
2094             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2095
2096             fscal            = felec;
2097
2098             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2099
2100             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2101
2102             /* Calculate temporary vectorial force */
2103             tx               = _mm256_mul_ps(fscal,dx20);
2104             ty               = _mm256_mul_ps(fscal,dy20);
2105             tz               = _mm256_mul_ps(fscal,dz20);
2106
2107             /* Update vectorial force */
2108             fix2             = _mm256_add_ps(fix2,tx);
2109             fiy2             = _mm256_add_ps(fiy2,ty);
2110             fiz2             = _mm256_add_ps(fiz2,tz);
2111
2112             fjx0             = _mm256_add_ps(fjx0,tx);
2113             fjy0             = _mm256_add_ps(fjy0,ty);
2114             fjz0             = _mm256_add_ps(fjz0,tz);
2115
2116             }
2117
2118             /**************************
2119              * CALCULATE INTERACTIONS *
2120              **************************/
2121
2122             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2123             {
2124
2125             /* REACTION-FIELD ELECTROSTATICS */
2126             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2127
2128             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2129
2130             fscal            = felec;
2131
2132             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2133
2134             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2135
2136             /* Calculate temporary vectorial force */
2137             tx               = _mm256_mul_ps(fscal,dx21);
2138             ty               = _mm256_mul_ps(fscal,dy21);
2139             tz               = _mm256_mul_ps(fscal,dz21);
2140
2141             /* Update vectorial force */
2142             fix2             = _mm256_add_ps(fix2,tx);
2143             fiy2             = _mm256_add_ps(fiy2,ty);
2144             fiz2             = _mm256_add_ps(fiz2,tz);
2145
2146             fjx1             = _mm256_add_ps(fjx1,tx);
2147             fjy1             = _mm256_add_ps(fjy1,ty);
2148             fjz1             = _mm256_add_ps(fjz1,tz);
2149
2150             }
2151
2152             /**************************
2153              * CALCULATE INTERACTIONS *
2154              **************************/
2155
2156             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2157             {
2158
2159             /* REACTION-FIELD ELECTROSTATICS */
2160             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2161
2162             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2163
2164             fscal            = felec;
2165
2166             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2167
2168             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2169
2170             /* Calculate temporary vectorial force */
2171             tx               = _mm256_mul_ps(fscal,dx22);
2172             ty               = _mm256_mul_ps(fscal,dy22);
2173             tz               = _mm256_mul_ps(fscal,dz22);
2174
2175             /* Update vectorial force */
2176             fix2             = _mm256_add_ps(fix2,tx);
2177             fiy2             = _mm256_add_ps(fiy2,ty);
2178             fiz2             = _mm256_add_ps(fiz2,tz);
2179
2180             fjx2             = _mm256_add_ps(fjx2,tx);
2181             fjy2             = _mm256_add_ps(fjy2,ty);
2182             fjz2             = _mm256_add_ps(fjz2,tz);
2183
2184             }
2185
2186             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2187             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2188             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2189             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2190             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2191             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2192             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2193             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2194
2195             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2196                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2197
2198             /* Inner loop uses 277 flops */
2199         }
2200
2201         /* End of innermost loop */
2202
2203         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2204                                                  f+i_coord_offset,fshift+i_shift_offset);
2205
2206         /* Increment number of inner iterations */
2207         inneriter                  += j_index_end - j_index_start;
2208
2209         /* Outer loop uses 18 flops */
2210     }
2211
2212     /* Increment number of outer iterations */
2213     outeriter        += nri;
2214
2215     /* Update outer/inner flops */
2216
2217     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
2218 }