Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_avx_256_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_double
38  * Electrostatics interaction: ReactionField
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_256_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
65     real             rcutoff_scalar;
66     real             *shiftvec,*fshift,*x,*f;
67     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68     real             scratch[4*DIM];
69     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70     real *           vdwioffsetptr0;
71     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72     real *           vdwioffsetptr1;
73     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74     real *           vdwioffsetptr2;
75     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
92     real             *charge;
93     int              nvdwtype;
94     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
95     int              *vdwtype;
96     real             *vdwparam;
97     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
98     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
99     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
100     real             rswitch_scalar,d_scalar;
101     __m256d          dummy_mask,cutoff_mask;
102     __m128           tmpmask0,tmpmask1;
103     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
104     __m256d          one     = _mm256_set1_pd(1.0);
105     __m256d          two     = _mm256_set1_pd(2.0);
106     x                = xx[0];
107     f                = ff[0];
108
109     nri              = nlist->nri;
110     iinr             = nlist->iinr;
111     jindex           = nlist->jindex;
112     jjnr             = nlist->jjnr;
113     shiftidx         = nlist->shift;
114     gid              = nlist->gid;
115     shiftvec         = fr->shift_vec[0];
116     fshift           = fr->fshift[0];
117     facel            = _mm256_set1_pd(fr->epsfac);
118     charge           = mdatoms->chargeA;
119     krf              = _mm256_set1_pd(fr->ic->k_rf);
120     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
121     crf              = _mm256_set1_pd(fr->ic->c_rf);
122     nvdwtype         = fr->ntype;
123     vdwparam         = fr->nbfp;
124     vdwtype          = mdatoms->typeA;
125
126     /* Setup water-specific parameters */
127     inr              = nlist->iinr[0];
128     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
129     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
130     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
131     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
132
133     jq0              = _mm256_set1_pd(charge[inr+0]);
134     jq1              = _mm256_set1_pd(charge[inr+1]);
135     jq2              = _mm256_set1_pd(charge[inr+2]);
136     vdwjidx0A        = 2*vdwtype[inr+0];
137     qq00             = _mm256_mul_pd(iq0,jq0);
138     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
139     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
140     qq01             = _mm256_mul_pd(iq0,jq1);
141     qq02             = _mm256_mul_pd(iq0,jq2);
142     qq10             = _mm256_mul_pd(iq1,jq0);
143     qq11             = _mm256_mul_pd(iq1,jq1);
144     qq12             = _mm256_mul_pd(iq1,jq2);
145     qq20             = _mm256_mul_pd(iq2,jq0);
146     qq21             = _mm256_mul_pd(iq2,jq1);
147     qq22             = _mm256_mul_pd(iq2,jq2);
148
149     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150     rcutoff_scalar   = fr->rcoulomb;
151     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
152     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
153
154     rswitch_scalar   = fr->rvdw_switch;
155     rswitch          = _mm256_set1_pd(rswitch_scalar);
156     /* Setup switch parameters */
157     d_scalar         = rcutoff_scalar-rswitch_scalar;
158     d                = _mm256_set1_pd(d_scalar);
159     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
160     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
161     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
162     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
163     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
165
166     /* Avoid stupid compiler warnings */
167     jnrA = jnrB = jnrC = jnrD = 0;
168     j_coord_offsetA = 0;
169     j_coord_offsetB = 0;
170     j_coord_offsetC = 0;
171     j_coord_offsetD = 0;
172
173     outeriter        = 0;
174     inneriter        = 0;
175
176     for(iidx=0;iidx<4*DIM;iidx++)
177     {
178         scratch[iidx] = 0.0;
179     }
180
181     /* Start outer loop over neighborlists */
182     for(iidx=0; iidx<nri; iidx++)
183     {
184         /* Load shift vector for this list */
185         i_shift_offset   = DIM*shiftidx[iidx];
186
187         /* Load limits for loop over neighbors */
188         j_index_start    = jindex[iidx];
189         j_index_end      = jindex[iidx+1];
190
191         /* Get outer coordinate index */
192         inr              = iinr[iidx];
193         i_coord_offset   = DIM*inr;
194
195         /* Load i particle coords and add shift vector */
196         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
197                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
198
199         fix0             = _mm256_setzero_pd();
200         fiy0             = _mm256_setzero_pd();
201         fiz0             = _mm256_setzero_pd();
202         fix1             = _mm256_setzero_pd();
203         fiy1             = _mm256_setzero_pd();
204         fiz1             = _mm256_setzero_pd();
205         fix2             = _mm256_setzero_pd();
206         fiy2             = _mm256_setzero_pd();
207         fiz2             = _mm256_setzero_pd();
208
209         /* Reset potential sums */
210         velecsum         = _mm256_setzero_pd();
211         vvdwsum          = _mm256_setzero_pd();
212
213         /* Start inner kernel loop */
214         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215         {
216
217             /* Get j neighbor index, and coordinate index */
218             jnrA             = jjnr[jidx];
219             jnrB             = jjnr[jidx+1];
220             jnrC             = jjnr[jidx+2];
221             jnrD             = jjnr[jidx+3];
222             j_coord_offsetA  = DIM*jnrA;
223             j_coord_offsetB  = DIM*jnrB;
224             j_coord_offsetC  = DIM*jnrC;
225             j_coord_offsetD  = DIM*jnrD;
226
227             /* load j atom coordinates */
228             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
229                                                  x+j_coord_offsetC,x+j_coord_offsetD,
230                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
231
232             /* Calculate displacement vector */
233             dx00             = _mm256_sub_pd(ix0,jx0);
234             dy00             = _mm256_sub_pd(iy0,jy0);
235             dz00             = _mm256_sub_pd(iz0,jz0);
236             dx01             = _mm256_sub_pd(ix0,jx1);
237             dy01             = _mm256_sub_pd(iy0,jy1);
238             dz01             = _mm256_sub_pd(iz0,jz1);
239             dx02             = _mm256_sub_pd(ix0,jx2);
240             dy02             = _mm256_sub_pd(iy0,jy2);
241             dz02             = _mm256_sub_pd(iz0,jz2);
242             dx10             = _mm256_sub_pd(ix1,jx0);
243             dy10             = _mm256_sub_pd(iy1,jy0);
244             dz10             = _mm256_sub_pd(iz1,jz0);
245             dx11             = _mm256_sub_pd(ix1,jx1);
246             dy11             = _mm256_sub_pd(iy1,jy1);
247             dz11             = _mm256_sub_pd(iz1,jz1);
248             dx12             = _mm256_sub_pd(ix1,jx2);
249             dy12             = _mm256_sub_pd(iy1,jy2);
250             dz12             = _mm256_sub_pd(iz1,jz2);
251             dx20             = _mm256_sub_pd(ix2,jx0);
252             dy20             = _mm256_sub_pd(iy2,jy0);
253             dz20             = _mm256_sub_pd(iz2,jz0);
254             dx21             = _mm256_sub_pd(ix2,jx1);
255             dy21             = _mm256_sub_pd(iy2,jy1);
256             dz21             = _mm256_sub_pd(iz2,jz1);
257             dx22             = _mm256_sub_pd(ix2,jx2);
258             dy22             = _mm256_sub_pd(iy2,jy2);
259             dz22             = _mm256_sub_pd(iz2,jz2);
260
261             /* Calculate squared distance and things based on it */
262             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
263             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
264             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
265             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
266             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
267             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
268             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
269             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
270             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
271
272             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
273             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
274             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
275             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
276             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
277             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
278             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
279             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
280             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
281
282             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
283             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
284             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
285             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
286             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
287             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
288             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
289             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
290             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
291
292             fjx0             = _mm256_setzero_pd();
293             fjy0             = _mm256_setzero_pd();
294             fjz0             = _mm256_setzero_pd();
295             fjx1             = _mm256_setzero_pd();
296             fjy1             = _mm256_setzero_pd();
297             fjz1             = _mm256_setzero_pd();
298             fjx2             = _mm256_setzero_pd();
299             fjy2             = _mm256_setzero_pd();
300             fjz2             = _mm256_setzero_pd();
301
302             /**************************
303              * CALCULATE INTERACTIONS *
304              **************************/
305
306             if (gmx_mm256_any_lt(rsq00,rcutoff2))
307             {
308
309             r00              = _mm256_mul_pd(rsq00,rinv00);
310
311             /* REACTION-FIELD ELECTROSTATICS */
312             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
313             felec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
314
315             /* LENNARD-JONES DISPERSION/REPULSION */
316
317             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
318             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
319             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
320             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
321             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
322
323             d                = _mm256_sub_pd(r00,rswitch);
324             d                = _mm256_max_pd(d,_mm256_setzero_pd());
325             d2               = _mm256_mul_pd(d,d);
326             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
327
328             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
329
330             /* Evaluate switch function */
331             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
332             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
333             vvdw             = _mm256_mul_pd(vvdw,sw);
334             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
335
336             /* Update potential sum for this i atom from the interaction with this j atom. */
337             velec            = _mm256_and_pd(velec,cutoff_mask);
338             velecsum         = _mm256_add_pd(velecsum,velec);
339             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
340             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
341
342             fscal            = _mm256_add_pd(felec,fvdw);
343
344             fscal            = _mm256_and_pd(fscal,cutoff_mask);
345
346             /* Calculate temporary vectorial force */
347             tx               = _mm256_mul_pd(fscal,dx00);
348             ty               = _mm256_mul_pd(fscal,dy00);
349             tz               = _mm256_mul_pd(fscal,dz00);
350
351             /* Update vectorial force */
352             fix0             = _mm256_add_pd(fix0,tx);
353             fiy0             = _mm256_add_pd(fiy0,ty);
354             fiz0             = _mm256_add_pd(fiz0,tz);
355
356             fjx0             = _mm256_add_pd(fjx0,tx);
357             fjy0             = _mm256_add_pd(fjy0,ty);
358             fjz0             = _mm256_add_pd(fjz0,tz);
359
360             }
361
362             /**************************
363              * CALCULATE INTERACTIONS *
364              **************************/
365
366             if (gmx_mm256_any_lt(rsq01,rcutoff2))
367             {
368
369             /* REACTION-FIELD ELECTROSTATICS */
370             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
371             felec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
372
373             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
374
375             /* Update potential sum for this i atom from the interaction with this j atom. */
376             velec            = _mm256_and_pd(velec,cutoff_mask);
377             velecsum         = _mm256_add_pd(velecsum,velec);
378
379             fscal            = felec;
380
381             fscal            = _mm256_and_pd(fscal,cutoff_mask);
382
383             /* Calculate temporary vectorial force */
384             tx               = _mm256_mul_pd(fscal,dx01);
385             ty               = _mm256_mul_pd(fscal,dy01);
386             tz               = _mm256_mul_pd(fscal,dz01);
387
388             /* Update vectorial force */
389             fix0             = _mm256_add_pd(fix0,tx);
390             fiy0             = _mm256_add_pd(fiy0,ty);
391             fiz0             = _mm256_add_pd(fiz0,tz);
392
393             fjx1             = _mm256_add_pd(fjx1,tx);
394             fjy1             = _mm256_add_pd(fjy1,ty);
395             fjz1             = _mm256_add_pd(fjz1,tz);
396
397             }
398
399             /**************************
400              * CALCULATE INTERACTIONS *
401              **************************/
402
403             if (gmx_mm256_any_lt(rsq02,rcutoff2))
404             {
405
406             /* REACTION-FIELD ELECTROSTATICS */
407             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
408             felec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
409
410             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
411
412             /* Update potential sum for this i atom from the interaction with this j atom. */
413             velec            = _mm256_and_pd(velec,cutoff_mask);
414             velecsum         = _mm256_add_pd(velecsum,velec);
415
416             fscal            = felec;
417
418             fscal            = _mm256_and_pd(fscal,cutoff_mask);
419
420             /* Calculate temporary vectorial force */
421             tx               = _mm256_mul_pd(fscal,dx02);
422             ty               = _mm256_mul_pd(fscal,dy02);
423             tz               = _mm256_mul_pd(fscal,dz02);
424
425             /* Update vectorial force */
426             fix0             = _mm256_add_pd(fix0,tx);
427             fiy0             = _mm256_add_pd(fiy0,ty);
428             fiz0             = _mm256_add_pd(fiz0,tz);
429
430             fjx2             = _mm256_add_pd(fjx2,tx);
431             fjy2             = _mm256_add_pd(fjy2,ty);
432             fjz2             = _mm256_add_pd(fjz2,tz);
433
434             }
435
436             /**************************
437              * CALCULATE INTERACTIONS *
438              **************************/
439
440             if (gmx_mm256_any_lt(rsq10,rcutoff2))
441             {
442
443             /* REACTION-FIELD ELECTROSTATICS */
444             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
445             felec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
446
447             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
448
449             /* Update potential sum for this i atom from the interaction with this j atom. */
450             velec            = _mm256_and_pd(velec,cutoff_mask);
451             velecsum         = _mm256_add_pd(velecsum,velec);
452
453             fscal            = felec;
454
455             fscal            = _mm256_and_pd(fscal,cutoff_mask);
456
457             /* Calculate temporary vectorial force */
458             tx               = _mm256_mul_pd(fscal,dx10);
459             ty               = _mm256_mul_pd(fscal,dy10);
460             tz               = _mm256_mul_pd(fscal,dz10);
461
462             /* Update vectorial force */
463             fix1             = _mm256_add_pd(fix1,tx);
464             fiy1             = _mm256_add_pd(fiy1,ty);
465             fiz1             = _mm256_add_pd(fiz1,tz);
466
467             fjx0             = _mm256_add_pd(fjx0,tx);
468             fjy0             = _mm256_add_pd(fjy0,ty);
469             fjz0             = _mm256_add_pd(fjz0,tz);
470
471             }
472
473             /**************************
474              * CALCULATE INTERACTIONS *
475              **************************/
476
477             if (gmx_mm256_any_lt(rsq11,rcutoff2))
478             {
479
480             /* REACTION-FIELD ELECTROSTATICS */
481             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
482             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
483
484             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
485
486             /* Update potential sum for this i atom from the interaction with this j atom. */
487             velec            = _mm256_and_pd(velec,cutoff_mask);
488             velecsum         = _mm256_add_pd(velecsum,velec);
489
490             fscal            = felec;
491
492             fscal            = _mm256_and_pd(fscal,cutoff_mask);
493
494             /* Calculate temporary vectorial force */
495             tx               = _mm256_mul_pd(fscal,dx11);
496             ty               = _mm256_mul_pd(fscal,dy11);
497             tz               = _mm256_mul_pd(fscal,dz11);
498
499             /* Update vectorial force */
500             fix1             = _mm256_add_pd(fix1,tx);
501             fiy1             = _mm256_add_pd(fiy1,ty);
502             fiz1             = _mm256_add_pd(fiz1,tz);
503
504             fjx1             = _mm256_add_pd(fjx1,tx);
505             fjy1             = _mm256_add_pd(fjy1,ty);
506             fjz1             = _mm256_add_pd(fjz1,tz);
507
508             }
509
510             /**************************
511              * CALCULATE INTERACTIONS *
512              **************************/
513
514             if (gmx_mm256_any_lt(rsq12,rcutoff2))
515             {
516
517             /* REACTION-FIELD ELECTROSTATICS */
518             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
519             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
520
521             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
522
523             /* Update potential sum for this i atom from the interaction with this j atom. */
524             velec            = _mm256_and_pd(velec,cutoff_mask);
525             velecsum         = _mm256_add_pd(velecsum,velec);
526
527             fscal            = felec;
528
529             fscal            = _mm256_and_pd(fscal,cutoff_mask);
530
531             /* Calculate temporary vectorial force */
532             tx               = _mm256_mul_pd(fscal,dx12);
533             ty               = _mm256_mul_pd(fscal,dy12);
534             tz               = _mm256_mul_pd(fscal,dz12);
535
536             /* Update vectorial force */
537             fix1             = _mm256_add_pd(fix1,tx);
538             fiy1             = _mm256_add_pd(fiy1,ty);
539             fiz1             = _mm256_add_pd(fiz1,tz);
540
541             fjx2             = _mm256_add_pd(fjx2,tx);
542             fjy2             = _mm256_add_pd(fjy2,ty);
543             fjz2             = _mm256_add_pd(fjz2,tz);
544
545             }
546
547             /**************************
548              * CALCULATE INTERACTIONS *
549              **************************/
550
551             if (gmx_mm256_any_lt(rsq20,rcutoff2))
552             {
553
554             /* REACTION-FIELD ELECTROSTATICS */
555             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
556             felec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
557
558             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
559
560             /* Update potential sum for this i atom from the interaction with this j atom. */
561             velec            = _mm256_and_pd(velec,cutoff_mask);
562             velecsum         = _mm256_add_pd(velecsum,velec);
563
564             fscal            = felec;
565
566             fscal            = _mm256_and_pd(fscal,cutoff_mask);
567
568             /* Calculate temporary vectorial force */
569             tx               = _mm256_mul_pd(fscal,dx20);
570             ty               = _mm256_mul_pd(fscal,dy20);
571             tz               = _mm256_mul_pd(fscal,dz20);
572
573             /* Update vectorial force */
574             fix2             = _mm256_add_pd(fix2,tx);
575             fiy2             = _mm256_add_pd(fiy2,ty);
576             fiz2             = _mm256_add_pd(fiz2,tz);
577
578             fjx0             = _mm256_add_pd(fjx0,tx);
579             fjy0             = _mm256_add_pd(fjy0,ty);
580             fjz0             = _mm256_add_pd(fjz0,tz);
581
582             }
583
584             /**************************
585              * CALCULATE INTERACTIONS *
586              **************************/
587
588             if (gmx_mm256_any_lt(rsq21,rcutoff2))
589             {
590
591             /* REACTION-FIELD ELECTROSTATICS */
592             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
593             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
594
595             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
596
597             /* Update potential sum for this i atom from the interaction with this j atom. */
598             velec            = _mm256_and_pd(velec,cutoff_mask);
599             velecsum         = _mm256_add_pd(velecsum,velec);
600
601             fscal            = felec;
602
603             fscal            = _mm256_and_pd(fscal,cutoff_mask);
604
605             /* Calculate temporary vectorial force */
606             tx               = _mm256_mul_pd(fscal,dx21);
607             ty               = _mm256_mul_pd(fscal,dy21);
608             tz               = _mm256_mul_pd(fscal,dz21);
609
610             /* Update vectorial force */
611             fix2             = _mm256_add_pd(fix2,tx);
612             fiy2             = _mm256_add_pd(fiy2,ty);
613             fiz2             = _mm256_add_pd(fiz2,tz);
614
615             fjx1             = _mm256_add_pd(fjx1,tx);
616             fjy1             = _mm256_add_pd(fjy1,ty);
617             fjz1             = _mm256_add_pd(fjz1,tz);
618
619             }
620
621             /**************************
622              * CALCULATE INTERACTIONS *
623              **************************/
624
625             if (gmx_mm256_any_lt(rsq22,rcutoff2))
626             {
627
628             /* REACTION-FIELD ELECTROSTATICS */
629             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
630             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
631
632             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
633
634             /* Update potential sum for this i atom from the interaction with this j atom. */
635             velec            = _mm256_and_pd(velec,cutoff_mask);
636             velecsum         = _mm256_add_pd(velecsum,velec);
637
638             fscal            = felec;
639
640             fscal            = _mm256_and_pd(fscal,cutoff_mask);
641
642             /* Calculate temporary vectorial force */
643             tx               = _mm256_mul_pd(fscal,dx22);
644             ty               = _mm256_mul_pd(fscal,dy22);
645             tz               = _mm256_mul_pd(fscal,dz22);
646
647             /* Update vectorial force */
648             fix2             = _mm256_add_pd(fix2,tx);
649             fiy2             = _mm256_add_pd(fiy2,ty);
650             fiz2             = _mm256_add_pd(fiz2,tz);
651
652             fjx2             = _mm256_add_pd(fjx2,tx);
653             fjy2             = _mm256_add_pd(fjy2,ty);
654             fjz2             = _mm256_add_pd(fjz2,tz);
655
656             }
657
658             fjptrA             = f+j_coord_offsetA;
659             fjptrB             = f+j_coord_offsetB;
660             fjptrC             = f+j_coord_offsetC;
661             fjptrD             = f+j_coord_offsetD;
662
663             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
664                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
665
666             /* Inner loop uses 358 flops */
667         }
668
669         if(jidx<j_index_end)
670         {
671
672             /* Get j neighbor index, and coordinate index */
673             jnrlistA         = jjnr[jidx];
674             jnrlistB         = jjnr[jidx+1];
675             jnrlistC         = jjnr[jidx+2];
676             jnrlistD         = jjnr[jidx+3];
677             /* Sign of each element will be negative for non-real atoms.
678              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
679              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
680              */
681             tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
682
683             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
684             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
685             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
686
687             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
688             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
689             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
690             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
691             j_coord_offsetA  = DIM*jnrA;
692             j_coord_offsetB  = DIM*jnrB;
693             j_coord_offsetC  = DIM*jnrC;
694             j_coord_offsetD  = DIM*jnrD;
695
696             /* load j atom coordinates */
697             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
698                                                  x+j_coord_offsetC,x+j_coord_offsetD,
699                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
700
701             /* Calculate displacement vector */
702             dx00             = _mm256_sub_pd(ix0,jx0);
703             dy00             = _mm256_sub_pd(iy0,jy0);
704             dz00             = _mm256_sub_pd(iz0,jz0);
705             dx01             = _mm256_sub_pd(ix0,jx1);
706             dy01             = _mm256_sub_pd(iy0,jy1);
707             dz01             = _mm256_sub_pd(iz0,jz1);
708             dx02             = _mm256_sub_pd(ix0,jx2);
709             dy02             = _mm256_sub_pd(iy0,jy2);
710             dz02             = _mm256_sub_pd(iz0,jz2);
711             dx10             = _mm256_sub_pd(ix1,jx0);
712             dy10             = _mm256_sub_pd(iy1,jy0);
713             dz10             = _mm256_sub_pd(iz1,jz0);
714             dx11             = _mm256_sub_pd(ix1,jx1);
715             dy11             = _mm256_sub_pd(iy1,jy1);
716             dz11             = _mm256_sub_pd(iz1,jz1);
717             dx12             = _mm256_sub_pd(ix1,jx2);
718             dy12             = _mm256_sub_pd(iy1,jy2);
719             dz12             = _mm256_sub_pd(iz1,jz2);
720             dx20             = _mm256_sub_pd(ix2,jx0);
721             dy20             = _mm256_sub_pd(iy2,jy0);
722             dz20             = _mm256_sub_pd(iz2,jz0);
723             dx21             = _mm256_sub_pd(ix2,jx1);
724             dy21             = _mm256_sub_pd(iy2,jy1);
725             dz21             = _mm256_sub_pd(iz2,jz1);
726             dx22             = _mm256_sub_pd(ix2,jx2);
727             dy22             = _mm256_sub_pd(iy2,jy2);
728             dz22             = _mm256_sub_pd(iz2,jz2);
729
730             /* Calculate squared distance and things based on it */
731             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
732             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
733             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
734             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
735             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
736             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
737             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
738             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
739             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
740
741             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
742             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
743             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
744             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
745             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
746             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
747             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
748             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
749             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
750
751             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
752             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
753             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
754             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
755             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
756             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
757             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
758             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
759             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
760
761             fjx0             = _mm256_setzero_pd();
762             fjy0             = _mm256_setzero_pd();
763             fjz0             = _mm256_setzero_pd();
764             fjx1             = _mm256_setzero_pd();
765             fjy1             = _mm256_setzero_pd();
766             fjz1             = _mm256_setzero_pd();
767             fjx2             = _mm256_setzero_pd();
768             fjy2             = _mm256_setzero_pd();
769             fjz2             = _mm256_setzero_pd();
770
771             /**************************
772              * CALCULATE INTERACTIONS *
773              **************************/
774
775             if (gmx_mm256_any_lt(rsq00,rcutoff2))
776             {
777
778             r00              = _mm256_mul_pd(rsq00,rinv00);
779             r00              = _mm256_andnot_pd(dummy_mask,r00);
780
781             /* REACTION-FIELD ELECTROSTATICS */
782             velec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
783             felec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
784
785             /* LENNARD-JONES DISPERSION/REPULSION */
786
787             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
788             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
789             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
790             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
791             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
792
793             d                = _mm256_sub_pd(r00,rswitch);
794             d                = _mm256_max_pd(d,_mm256_setzero_pd());
795             d2               = _mm256_mul_pd(d,d);
796             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
797
798             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
799
800             /* Evaluate switch function */
801             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
802             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
803             vvdw             = _mm256_mul_pd(vvdw,sw);
804             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
805
806             /* Update potential sum for this i atom from the interaction with this j atom. */
807             velec            = _mm256_and_pd(velec,cutoff_mask);
808             velec            = _mm256_andnot_pd(dummy_mask,velec);
809             velecsum         = _mm256_add_pd(velecsum,velec);
810             vvdw             = _mm256_and_pd(vvdw,cutoff_mask);
811             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
812             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
813
814             fscal            = _mm256_add_pd(felec,fvdw);
815
816             fscal            = _mm256_and_pd(fscal,cutoff_mask);
817
818             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
819
820             /* Calculate temporary vectorial force */
821             tx               = _mm256_mul_pd(fscal,dx00);
822             ty               = _mm256_mul_pd(fscal,dy00);
823             tz               = _mm256_mul_pd(fscal,dz00);
824
825             /* Update vectorial force */
826             fix0             = _mm256_add_pd(fix0,tx);
827             fiy0             = _mm256_add_pd(fiy0,ty);
828             fiz0             = _mm256_add_pd(fiz0,tz);
829
830             fjx0             = _mm256_add_pd(fjx0,tx);
831             fjy0             = _mm256_add_pd(fjy0,ty);
832             fjz0             = _mm256_add_pd(fjz0,tz);
833
834             }
835
836             /**************************
837              * CALCULATE INTERACTIONS *
838              **************************/
839
840             if (gmx_mm256_any_lt(rsq01,rcutoff2))
841             {
842
843             /* REACTION-FIELD ELECTROSTATICS */
844             velec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
845             felec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
846
847             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
848
849             /* Update potential sum for this i atom from the interaction with this j atom. */
850             velec            = _mm256_and_pd(velec,cutoff_mask);
851             velec            = _mm256_andnot_pd(dummy_mask,velec);
852             velecsum         = _mm256_add_pd(velecsum,velec);
853
854             fscal            = felec;
855
856             fscal            = _mm256_and_pd(fscal,cutoff_mask);
857
858             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
859
860             /* Calculate temporary vectorial force */
861             tx               = _mm256_mul_pd(fscal,dx01);
862             ty               = _mm256_mul_pd(fscal,dy01);
863             tz               = _mm256_mul_pd(fscal,dz01);
864
865             /* Update vectorial force */
866             fix0             = _mm256_add_pd(fix0,tx);
867             fiy0             = _mm256_add_pd(fiy0,ty);
868             fiz0             = _mm256_add_pd(fiz0,tz);
869
870             fjx1             = _mm256_add_pd(fjx1,tx);
871             fjy1             = _mm256_add_pd(fjy1,ty);
872             fjz1             = _mm256_add_pd(fjz1,tz);
873
874             }
875
876             /**************************
877              * CALCULATE INTERACTIONS *
878              **************************/
879
880             if (gmx_mm256_any_lt(rsq02,rcutoff2))
881             {
882
883             /* REACTION-FIELD ELECTROSTATICS */
884             velec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
885             felec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
886
887             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
888
889             /* Update potential sum for this i atom from the interaction with this j atom. */
890             velec            = _mm256_and_pd(velec,cutoff_mask);
891             velec            = _mm256_andnot_pd(dummy_mask,velec);
892             velecsum         = _mm256_add_pd(velecsum,velec);
893
894             fscal            = felec;
895
896             fscal            = _mm256_and_pd(fscal,cutoff_mask);
897
898             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
899
900             /* Calculate temporary vectorial force */
901             tx               = _mm256_mul_pd(fscal,dx02);
902             ty               = _mm256_mul_pd(fscal,dy02);
903             tz               = _mm256_mul_pd(fscal,dz02);
904
905             /* Update vectorial force */
906             fix0             = _mm256_add_pd(fix0,tx);
907             fiy0             = _mm256_add_pd(fiy0,ty);
908             fiz0             = _mm256_add_pd(fiz0,tz);
909
910             fjx2             = _mm256_add_pd(fjx2,tx);
911             fjy2             = _mm256_add_pd(fjy2,ty);
912             fjz2             = _mm256_add_pd(fjz2,tz);
913
914             }
915
916             /**************************
917              * CALCULATE INTERACTIONS *
918              **************************/
919
920             if (gmx_mm256_any_lt(rsq10,rcutoff2))
921             {
922
923             /* REACTION-FIELD ELECTROSTATICS */
924             velec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
925             felec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
926
927             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
928
929             /* Update potential sum for this i atom from the interaction with this j atom. */
930             velec            = _mm256_and_pd(velec,cutoff_mask);
931             velec            = _mm256_andnot_pd(dummy_mask,velec);
932             velecsum         = _mm256_add_pd(velecsum,velec);
933
934             fscal            = felec;
935
936             fscal            = _mm256_and_pd(fscal,cutoff_mask);
937
938             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
939
940             /* Calculate temporary vectorial force */
941             tx               = _mm256_mul_pd(fscal,dx10);
942             ty               = _mm256_mul_pd(fscal,dy10);
943             tz               = _mm256_mul_pd(fscal,dz10);
944
945             /* Update vectorial force */
946             fix1             = _mm256_add_pd(fix1,tx);
947             fiy1             = _mm256_add_pd(fiy1,ty);
948             fiz1             = _mm256_add_pd(fiz1,tz);
949
950             fjx0             = _mm256_add_pd(fjx0,tx);
951             fjy0             = _mm256_add_pd(fjy0,ty);
952             fjz0             = _mm256_add_pd(fjz0,tz);
953
954             }
955
956             /**************************
957              * CALCULATE INTERACTIONS *
958              **************************/
959
960             if (gmx_mm256_any_lt(rsq11,rcutoff2))
961             {
962
963             /* REACTION-FIELD ELECTROSTATICS */
964             velec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
965             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
966
967             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
968
969             /* Update potential sum for this i atom from the interaction with this j atom. */
970             velec            = _mm256_and_pd(velec,cutoff_mask);
971             velec            = _mm256_andnot_pd(dummy_mask,velec);
972             velecsum         = _mm256_add_pd(velecsum,velec);
973
974             fscal            = felec;
975
976             fscal            = _mm256_and_pd(fscal,cutoff_mask);
977
978             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
979
980             /* Calculate temporary vectorial force */
981             tx               = _mm256_mul_pd(fscal,dx11);
982             ty               = _mm256_mul_pd(fscal,dy11);
983             tz               = _mm256_mul_pd(fscal,dz11);
984
985             /* Update vectorial force */
986             fix1             = _mm256_add_pd(fix1,tx);
987             fiy1             = _mm256_add_pd(fiy1,ty);
988             fiz1             = _mm256_add_pd(fiz1,tz);
989
990             fjx1             = _mm256_add_pd(fjx1,tx);
991             fjy1             = _mm256_add_pd(fjy1,ty);
992             fjz1             = _mm256_add_pd(fjz1,tz);
993
994             }
995
996             /**************************
997              * CALCULATE INTERACTIONS *
998              **************************/
999
1000             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1001             {
1002
1003             /* REACTION-FIELD ELECTROSTATICS */
1004             velec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
1005             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1006
1007             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1008
1009             /* Update potential sum for this i atom from the interaction with this j atom. */
1010             velec            = _mm256_and_pd(velec,cutoff_mask);
1011             velec            = _mm256_andnot_pd(dummy_mask,velec);
1012             velecsum         = _mm256_add_pd(velecsum,velec);
1013
1014             fscal            = felec;
1015
1016             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1017
1018             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1019
1020             /* Calculate temporary vectorial force */
1021             tx               = _mm256_mul_pd(fscal,dx12);
1022             ty               = _mm256_mul_pd(fscal,dy12);
1023             tz               = _mm256_mul_pd(fscal,dz12);
1024
1025             /* Update vectorial force */
1026             fix1             = _mm256_add_pd(fix1,tx);
1027             fiy1             = _mm256_add_pd(fiy1,ty);
1028             fiz1             = _mm256_add_pd(fiz1,tz);
1029
1030             fjx2             = _mm256_add_pd(fjx2,tx);
1031             fjy2             = _mm256_add_pd(fjy2,ty);
1032             fjz2             = _mm256_add_pd(fjz2,tz);
1033
1034             }
1035
1036             /**************************
1037              * CALCULATE INTERACTIONS *
1038              **************************/
1039
1040             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1041             {
1042
1043             /* REACTION-FIELD ELECTROSTATICS */
1044             velec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
1045             felec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1046
1047             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1048
1049             /* Update potential sum for this i atom from the interaction with this j atom. */
1050             velec            = _mm256_and_pd(velec,cutoff_mask);
1051             velec            = _mm256_andnot_pd(dummy_mask,velec);
1052             velecsum         = _mm256_add_pd(velecsum,velec);
1053
1054             fscal            = felec;
1055
1056             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1057
1058             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1059
1060             /* Calculate temporary vectorial force */
1061             tx               = _mm256_mul_pd(fscal,dx20);
1062             ty               = _mm256_mul_pd(fscal,dy20);
1063             tz               = _mm256_mul_pd(fscal,dz20);
1064
1065             /* Update vectorial force */
1066             fix2             = _mm256_add_pd(fix2,tx);
1067             fiy2             = _mm256_add_pd(fiy2,ty);
1068             fiz2             = _mm256_add_pd(fiz2,tz);
1069
1070             fjx0             = _mm256_add_pd(fjx0,tx);
1071             fjy0             = _mm256_add_pd(fjy0,ty);
1072             fjz0             = _mm256_add_pd(fjz0,tz);
1073
1074             }
1075
1076             /**************************
1077              * CALCULATE INTERACTIONS *
1078              **************************/
1079
1080             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1081             {
1082
1083             /* REACTION-FIELD ELECTROSTATICS */
1084             velec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1085             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1086
1087             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1088
1089             /* Update potential sum for this i atom from the interaction with this j atom. */
1090             velec            = _mm256_and_pd(velec,cutoff_mask);
1091             velec            = _mm256_andnot_pd(dummy_mask,velec);
1092             velecsum         = _mm256_add_pd(velecsum,velec);
1093
1094             fscal            = felec;
1095
1096             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1097
1098             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1099
1100             /* Calculate temporary vectorial force */
1101             tx               = _mm256_mul_pd(fscal,dx21);
1102             ty               = _mm256_mul_pd(fscal,dy21);
1103             tz               = _mm256_mul_pd(fscal,dz21);
1104
1105             /* Update vectorial force */
1106             fix2             = _mm256_add_pd(fix2,tx);
1107             fiy2             = _mm256_add_pd(fiy2,ty);
1108             fiz2             = _mm256_add_pd(fiz2,tz);
1109
1110             fjx1             = _mm256_add_pd(fjx1,tx);
1111             fjy1             = _mm256_add_pd(fjy1,ty);
1112             fjz1             = _mm256_add_pd(fjz1,tz);
1113
1114             }
1115
1116             /**************************
1117              * CALCULATE INTERACTIONS *
1118              **************************/
1119
1120             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1121             {
1122
1123             /* REACTION-FIELD ELECTROSTATICS */
1124             velec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1125             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1126
1127             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1128
1129             /* Update potential sum for this i atom from the interaction with this j atom. */
1130             velec            = _mm256_and_pd(velec,cutoff_mask);
1131             velec            = _mm256_andnot_pd(dummy_mask,velec);
1132             velecsum         = _mm256_add_pd(velecsum,velec);
1133
1134             fscal            = felec;
1135
1136             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1137
1138             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1139
1140             /* Calculate temporary vectorial force */
1141             tx               = _mm256_mul_pd(fscal,dx22);
1142             ty               = _mm256_mul_pd(fscal,dy22);
1143             tz               = _mm256_mul_pd(fscal,dz22);
1144
1145             /* Update vectorial force */
1146             fix2             = _mm256_add_pd(fix2,tx);
1147             fiy2             = _mm256_add_pd(fiy2,ty);
1148             fiz2             = _mm256_add_pd(fiz2,tz);
1149
1150             fjx2             = _mm256_add_pd(fjx2,tx);
1151             fjy2             = _mm256_add_pd(fjy2,ty);
1152             fjz2             = _mm256_add_pd(fjz2,tz);
1153
1154             }
1155
1156             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1157             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1158             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1159             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1160
1161             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1162                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1163
1164             /* Inner loop uses 359 flops */
1165         }
1166
1167         /* End of innermost loop */
1168
1169         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1170                                                  f+i_coord_offset,fshift+i_shift_offset);
1171
1172         ggid                        = gid[iidx];
1173         /* Update potential energies */
1174         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1175         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1176
1177         /* Increment number of inner iterations */
1178         inneriter                  += j_index_end - j_index_start;
1179
1180         /* Outer loop uses 20 flops */
1181     }
1182
1183     /* Increment number of outer iterations */
1184     outeriter        += nri;
1185
1186     /* Update outer/inner flops */
1187
1188     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*359);
1189 }
1190 /*
1191  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_double
1192  * Electrostatics interaction: ReactionField
1193  * VdW interaction:            LennardJones
1194  * Geometry:                   Water3-Water3
1195  * Calculate force/pot:        Force
1196  */
1197 void
1198 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_256_double
1199                     (t_nblist * gmx_restrict                nlist,
1200                      rvec * gmx_restrict                    xx,
1201                      rvec * gmx_restrict                    ff,
1202                      t_forcerec * gmx_restrict              fr,
1203                      t_mdatoms * gmx_restrict               mdatoms,
1204                      nb_kernel_data_t * gmx_restrict        kernel_data,
1205                      t_nrnb * gmx_restrict                  nrnb)
1206 {
1207     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1208      * just 0 for non-waters.
1209      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1210      * jnr indices corresponding to data put in the four positions in the SIMD register.
1211      */
1212     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1213     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1214     int              jnrA,jnrB,jnrC,jnrD;
1215     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1216     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1217     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1218     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1219     real             rcutoff_scalar;
1220     real             *shiftvec,*fshift,*x,*f;
1221     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1222     real             scratch[4*DIM];
1223     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1224     real *           vdwioffsetptr0;
1225     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1226     real *           vdwioffsetptr1;
1227     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1228     real *           vdwioffsetptr2;
1229     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1230     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1231     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1232     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1233     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1234     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1235     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1236     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1237     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1238     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1239     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1240     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1241     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1242     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1243     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1244     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1245     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1246     real             *charge;
1247     int              nvdwtype;
1248     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1249     int              *vdwtype;
1250     real             *vdwparam;
1251     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1252     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1253     __m256d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1254     real             rswitch_scalar,d_scalar;
1255     __m256d          dummy_mask,cutoff_mask;
1256     __m128           tmpmask0,tmpmask1;
1257     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1258     __m256d          one     = _mm256_set1_pd(1.0);
1259     __m256d          two     = _mm256_set1_pd(2.0);
1260     x                = xx[0];
1261     f                = ff[0];
1262
1263     nri              = nlist->nri;
1264     iinr             = nlist->iinr;
1265     jindex           = nlist->jindex;
1266     jjnr             = nlist->jjnr;
1267     shiftidx         = nlist->shift;
1268     gid              = nlist->gid;
1269     shiftvec         = fr->shift_vec[0];
1270     fshift           = fr->fshift[0];
1271     facel            = _mm256_set1_pd(fr->epsfac);
1272     charge           = mdatoms->chargeA;
1273     krf              = _mm256_set1_pd(fr->ic->k_rf);
1274     krf2             = _mm256_set1_pd(fr->ic->k_rf*2.0);
1275     crf              = _mm256_set1_pd(fr->ic->c_rf);
1276     nvdwtype         = fr->ntype;
1277     vdwparam         = fr->nbfp;
1278     vdwtype          = mdatoms->typeA;
1279
1280     /* Setup water-specific parameters */
1281     inr              = nlist->iinr[0];
1282     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1283     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1284     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1285     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1286
1287     jq0              = _mm256_set1_pd(charge[inr+0]);
1288     jq1              = _mm256_set1_pd(charge[inr+1]);
1289     jq2              = _mm256_set1_pd(charge[inr+2]);
1290     vdwjidx0A        = 2*vdwtype[inr+0];
1291     qq00             = _mm256_mul_pd(iq0,jq0);
1292     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1293     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1294     qq01             = _mm256_mul_pd(iq0,jq1);
1295     qq02             = _mm256_mul_pd(iq0,jq2);
1296     qq10             = _mm256_mul_pd(iq1,jq0);
1297     qq11             = _mm256_mul_pd(iq1,jq1);
1298     qq12             = _mm256_mul_pd(iq1,jq2);
1299     qq20             = _mm256_mul_pd(iq2,jq0);
1300     qq21             = _mm256_mul_pd(iq2,jq1);
1301     qq22             = _mm256_mul_pd(iq2,jq2);
1302
1303     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1304     rcutoff_scalar   = fr->rcoulomb;
1305     rcutoff          = _mm256_set1_pd(rcutoff_scalar);
1306     rcutoff2         = _mm256_mul_pd(rcutoff,rcutoff);
1307
1308     rswitch_scalar   = fr->rvdw_switch;
1309     rswitch          = _mm256_set1_pd(rswitch_scalar);
1310     /* Setup switch parameters */
1311     d_scalar         = rcutoff_scalar-rswitch_scalar;
1312     d                = _mm256_set1_pd(d_scalar);
1313     swV3             = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1314     swV4             = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1315     swV5             = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1316     swF2             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1317     swF3             = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1318     swF4             = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1319
1320     /* Avoid stupid compiler warnings */
1321     jnrA = jnrB = jnrC = jnrD = 0;
1322     j_coord_offsetA = 0;
1323     j_coord_offsetB = 0;
1324     j_coord_offsetC = 0;
1325     j_coord_offsetD = 0;
1326
1327     outeriter        = 0;
1328     inneriter        = 0;
1329
1330     for(iidx=0;iidx<4*DIM;iidx++)
1331     {
1332         scratch[iidx] = 0.0;
1333     }
1334
1335     /* Start outer loop over neighborlists */
1336     for(iidx=0; iidx<nri; iidx++)
1337     {
1338         /* Load shift vector for this list */
1339         i_shift_offset   = DIM*shiftidx[iidx];
1340
1341         /* Load limits for loop over neighbors */
1342         j_index_start    = jindex[iidx];
1343         j_index_end      = jindex[iidx+1];
1344
1345         /* Get outer coordinate index */
1346         inr              = iinr[iidx];
1347         i_coord_offset   = DIM*inr;
1348
1349         /* Load i particle coords and add shift vector */
1350         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1351                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1352
1353         fix0             = _mm256_setzero_pd();
1354         fiy0             = _mm256_setzero_pd();
1355         fiz0             = _mm256_setzero_pd();
1356         fix1             = _mm256_setzero_pd();
1357         fiy1             = _mm256_setzero_pd();
1358         fiz1             = _mm256_setzero_pd();
1359         fix2             = _mm256_setzero_pd();
1360         fiy2             = _mm256_setzero_pd();
1361         fiz2             = _mm256_setzero_pd();
1362
1363         /* Start inner kernel loop */
1364         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1365         {
1366
1367             /* Get j neighbor index, and coordinate index */
1368             jnrA             = jjnr[jidx];
1369             jnrB             = jjnr[jidx+1];
1370             jnrC             = jjnr[jidx+2];
1371             jnrD             = jjnr[jidx+3];
1372             j_coord_offsetA  = DIM*jnrA;
1373             j_coord_offsetB  = DIM*jnrB;
1374             j_coord_offsetC  = DIM*jnrC;
1375             j_coord_offsetD  = DIM*jnrD;
1376
1377             /* load j atom coordinates */
1378             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1379                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1380                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1381
1382             /* Calculate displacement vector */
1383             dx00             = _mm256_sub_pd(ix0,jx0);
1384             dy00             = _mm256_sub_pd(iy0,jy0);
1385             dz00             = _mm256_sub_pd(iz0,jz0);
1386             dx01             = _mm256_sub_pd(ix0,jx1);
1387             dy01             = _mm256_sub_pd(iy0,jy1);
1388             dz01             = _mm256_sub_pd(iz0,jz1);
1389             dx02             = _mm256_sub_pd(ix0,jx2);
1390             dy02             = _mm256_sub_pd(iy0,jy2);
1391             dz02             = _mm256_sub_pd(iz0,jz2);
1392             dx10             = _mm256_sub_pd(ix1,jx0);
1393             dy10             = _mm256_sub_pd(iy1,jy0);
1394             dz10             = _mm256_sub_pd(iz1,jz0);
1395             dx11             = _mm256_sub_pd(ix1,jx1);
1396             dy11             = _mm256_sub_pd(iy1,jy1);
1397             dz11             = _mm256_sub_pd(iz1,jz1);
1398             dx12             = _mm256_sub_pd(ix1,jx2);
1399             dy12             = _mm256_sub_pd(iy1,jy2);
1400             dz12             = _mm256_sub_pd(iz1,jz2);
1401             dx20             = _mm256_sub_pd(ix2,jx0);
1402             dy20             = _mm256_sub_pd(iy2,jy0);
1403             dz20             = _mm256_sub_pd(iz2,jz0);
1404             dx21             = _mm256_sub_pd(ix2,jx1);
1405             dy21             = _mm256_sub_pd(iy2,jy1);
1406             dz21             = _mm256_sub_pd(iz2,jz1);
1407             dx22             = _mm256_sub_pd(ix2,jx2);
1408             dy22             = _mm256_sub_pd(iy2,jy2);
1409             dz22             = _mm256_sub_pd(iz2,jz2);
1410
1411             /* Calculate squared distance and things based on it */
1412             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1413             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1414             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1415             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1416             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1417             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1418             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1419             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1420             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1421
1422             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1423             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1424             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1425             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1426             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1427             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1428             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1429             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1430             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1431
1432             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1433             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1434             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1435             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1436             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1437             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1438             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1439             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1440             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1441
1442             fjx0             = _mm256_setzero_pd();
1443             fjy0             = _mm256_setzero_pd();
1444             fjz0             = _mm256_setzero_pd();
1445             fjx1             = _mm256_setzero_pd();
1446             fjy1             = _mm256_setzero_pd();
1447             fjz1             = _mm256_setzero_pd();
1448             fjx2             = _mm256_setzero_pd();
1449             fjy2             = _mm256_setzero_pd();
1450             fjz2             = _mm256_setzero_pd();
1451
1452             /**************************
1453              * CALCULATE INTERACTIONS *
1454              **************************/
1455
1456             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1457             {
1458
1459             r00              = _mm256_mul_pd(rsq00,rinv00);
1460
1461             /* REACTION-FIELD ELECTROSTATICS */
1462             felec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1463
1464             /* LENNARD-JONES DISPERSION/REPULSION */
1465
1466             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1467             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1468             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1469             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1470             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1471
1472             d                = _mm256_sub_pd(r00,rswitch);
1473             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1474             d2               = _mm256_mul_pd(d,d);
1475             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1476
1477             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1478
1479             /* Evaluate switch function */
1480             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1481             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1482             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1483
1484             fscal            = _mm256_add_pd(felec,fvdw);
1485
1486             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1487
1488             /* Calculate temporary vectorial force */
1489             tx               = _mm256_mul_pd(fscal,dx00);
1490             ty               = _mm256_mul_pd(fscal,dy00);
1491             tz               = _mm256_mul_pd(fscal,dz00);
1492
1493             /* Update vectorial force */
1494             fix0             = _mm256_add_pd(fix0,tx);
1495             fiy0             = _mm256_add_pd(fiy0,ty);
1496             fiz0             = _mm256_add_pd(fiz0,tz);
1497
1498             fjx0             = _mm256_add_pd(fjx0,tx);
1499             fjy0             = _mm256_add_pd(fjy0,ty);
1500             fjz0             = _mm256_add_pd(fjz0,tz);
1501
1502             }
1503
1504             /**************************
1505              * CALCULATE INTERACTIONS *
1506              **************************/
1507
1508             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1509             {
1510
1511             /* REACTION-FIELD ELECTROSTATICS */
1512             felec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1513
1514             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1515
1516             fscal            = felec;
1517
1518             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1519
1520             /* Calculate temporary vectorial force */
1521             tx               = _mm256_mul_pd(fscal,dx01);
1522             ty               = _mm256_mul_pd(fscal,dy01);
1523             tz               = _mm256_mul_pd(fscal,dz01);
1524
1525             /* Update vectorial force */
1526             fix0             = _mm256_add_pd(fix0,tx);
1527             fiy0             = _mm256_add_pd(fiy0,ty);
1528             fiz0             = _mm256_add_pd(fiz0,tz);
1529
1530             fjx1             = _mm256_add_pd(fjx1,tx);
1531             fjy1             = _mm256_add_pd(fjy1,ty);
1532             fjz1             = _mm256_add_pd(fjz1,tz);
1533
1534             }
1535
1536             /**************************
1537              * CALCULATE INTERACTIONS *
1538              **************************/
1539
1540             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1541             {
1542
1543             /* REACTION-FIELD ELECTROSTATICS */
1544             felec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1545
1546             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1547
1548             fscal            = felec;
1549
1550             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1551
1552             /* Calculate temporary vectorial force */
1553             tx               = _mm256_mul_pd(fscal,dx02);
1554             ty               = _mm256_mul_pd(fscal,dy02);
1555             tz               = _mm256_mul_pd(fscal,dz02);
1556
1557             /* Update vectorial force */
1558             fix0             = _mm256_add_pd(fix0,tx);
1559             fiy0             = _mm256_add_pd(fiy0,ty);
1560             fiz0             = _mm256_add_pd(fiz0,tz);
1561
1562             fjx2             = _mm256_add_pd(fjx2,tx);
1563             fjy2             = _mm256_add_pd(fjy2,ty);
1564             fjz2             = _mm256_add_pd(fjz2,tz);
1565
1566             }
1567
1568             /**************************
1569              * CALCULATE INTERACTIONS *
1570              **************************/
1571
1572             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1573             {
1574
1575             /* REACTION-FIELD ELECTROSTATICS */
1576             felec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1577
1578             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1579
1580             fscal            = felec;
1581
1582             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1583
1584             /* Calculate temporary vectorial force */
1585             tx               = _mm256_mul_pd(fscal,dx10);
1586             ty               = _mm256_mul_pd(fscal,dy10);
1587             tz               = _mm256_mul_pd(fscal,dz10);
1588
1589             /* Update vectorial force */
1590             fix1             = _mm256_add_pd(fix1,tx);
1591             fiy1             = _mm256_add_pd(fiy1,ty);
1592             fiz1             = _mm256_add_pd(fiz1,tz);
1593
1594             fjx0             = _mm256_add_pd(fjx0,tx);
1595             fjy0             = _mm256_add_pd(fjy0,ty);
1596             fjz0             = _mm256_add_pd(fjz0,tz);
1597
1598             }
1599
1600             /**************************
1601              * CALCULATE INTERACTIONS *
1602              **************************/
1603
1604             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1605             {
1606
1607             /* REACTION-FIELD ELECTROSTATICS */
1608             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1609
1610             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1611
1612             fscal            = felec;
1613
1614             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1615
1616             /* Calculate temporary vectorial force */
1617             tx               = _mm256_mul_pd(fscal,dx11);
1618             ty               = _mm256_mul_pd(fscal,dy11);
1619             tz               = _mm256_mul_pd(fscal,dz11);
1620
1621             /* Update vectorial force */
1622             fix1             = _mm256_add_pd(fix1,tx);
1623             fiy1             = _mm256_add_pd(fiy1,ty);
1624             fiz1             = _mm256_add_pd(fiz1,tz);
1625
1626             fjx1             = _mm256_add_pd(fjx1,tx);
1627             fjy1             = _mm256_add_pd(fjy1,ty);
1628             fjz1             = _mm256_add_pd(fjz1,tz);
1629
1630             }
1631
1632             /**************************
1633              * CALCULATE INTERACTIONS *
1634              **************************/
1635
1636             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1637             {
1638
1639             /* REACTION-FIELD ELECTROSTATICS */
1640             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1641
1642             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1643
1644             fscal            = felec;
1645
1646             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1647
1648             /* Calculate temporary vectorial force */
1649             tx               = _mm256_mul_pd(fscal,dx12);
1650             ty               = _mm256_mul_pd(fscal,dy12);
1651             tz               = _mm256_mul_pd(fscal,dz12);
1652
1653             /* Update vectorial force */
1654             fix1             = _mm256_add_pd(fix1,tx);
1655             fiy1             = _mm256_add_pd(fiy1,ty);
1656             fiz1             = _mm256_add_pd(fiz1,tz);
1657
1658             fjx2             = _mm256_add_pd(fjx2,tx);
1659             fjy2             = _mm256_add_pd(fjy2,ty);
1660             fjz2             = _mm256_add_pd(fjz2,tz);
1661
1662             }
1663
1664             /**************************
1665              * CALCULATE INTERACTIONS *
1666              **************************/
1667
1668             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1669             {
1670
1671             /* REACTION-FIELD ELECTROSTATICS */
1672             felec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1673
1674             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1675
1676             fscal            = felec;
1677
1678             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1679
1680             /* Calculate temporary vectorial force */
1681             tx               = _mm256_mul_pd(fscal,dx20);
1682             ty               = _mm256_mul_pd(fscal,dy20);
1683             tz               = _mm256_mul_pd(fscal,dz20);
1684
1685             /* Update vectorial force */
1686             fix2             = _mm256_add_pd(fix2,tx);
1687             fiy2             = _mm256_add_pd(fiy2,ty);
1688             fiz2             = _mm256_add_pd(fiz2,tz);
1689
1690             fjx0             = _mm256_add_pd(fjx0,tx);
1691             fjy0             = _mm256_add_pd(fjy0,ty);
1692             fjz0             = _mm256_add_pd(fjz0,tz);
1693
1694             }
1695
1696             /**************************
1697              * CALCULATE INTERACTIONS *
1698              **************************/
1699
1700             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1701             {
1702
1703             /* REACTION-FIELD ELECTROSTATICS */
1704             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1705
1706             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1707
1708             fscal            = felec;
1709
1710             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1711
1712             /* Calculate temporary vectorial force */
1713             tx               = _mm256_mul_pd(fscal,dx21);
1714             ty               = _mm256_mul_pd(fscal,dy21);
1715             tz               = _mm256_mul_pd(fscal,dz21);
1716
1717             /* Update vectorial force */
1718             fix2             = _mm256_add_pd(fix2,tx);
1719             fiy2             = _mm256_add_pd(fiy2,ty);
1720             fiz2             = _mm256_add_pd(fiz2,tz);
1721
1722             fjx1             = _mm256_add_pd(fjx1,tx);
1723             fjy1             = _mm256_add_pd(fjy1,ty);
1724             fjz1             = _mm256_add_pd(fjz1,tz);
1725
1726             }
1727
1728             /**************************
1729              * CALCULATE INTERACTIONS *
1730              **************************/
1731
1732             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1733             {
1734
1735             /* REACTION-FIELD ELECTROSTATICS */
1736             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1737
1738             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1739
1740             fscal            = felec;
1741
1742             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1743
1744             /* Calculate temporary vectorial force */
1745             tx               = _mm256_mul_pd(fscal,dx22);
1746             ty               = _mm256_mul_pd(fscal,dy22);
1747             tz               = _mm256_mul_pd(fscal,dz22);
1748
1749             /* Update vectorial force */
1750             fix2             = _mm256_add_pd(fix2,tx);
1751             fiy2             = _mm256_add_pd(fiy2,ty);
1752             fiz2             = _mm256_add_pd(fiz2,tz);
1753
1754             fjx2             = _mm256_add_pd(fjx2,tx);
1755             fjy2             = _mm256_add_pd(fjy2,ty);
1756             fjz2             = _mm256_add_pd(fjz2,tz);
1757
1758             }
1759
1760             fjptrA             = f+j_coord_offsetA;
1761             fjptrB             = f+j_coord_offsetB;
1762             fjptrC             = f+j_coord_offsetC;
1763             fjptrD             = f+j_coord_offsetD;
1764
1765             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1766                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1767
1768             /* Inner loop uses 301 flops */
1769         }
1770
1771         if(jidx<j_index_end)
1772         {
1773
1774             /* Get j neighbor index, and coordinate index */
1775             jnrlistA         = jjnr[jidx];
1776             jnrlistB         = jjnr[jidx+1];
1777             jnrlistC         = jjnr[jidx+2];
1778             jnrlistD         = jjnr[jidx+3];
1779             /* Sign of each element will be negative for non-real atoms.
1780              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1781              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1782              */
1783             tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1784
1785             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1786             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1787             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1788
1789             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1790             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1791             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1792             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1793             j_coord_offsetA  = DIM*jnrA;
1794             j_coord_offsetB  = DIM*jnrB;
1795             j_coord_offsetC  = DIM*jnrC;
1796             j_coord_offsetD  = DIM*jnrD;
1797
1798             /* load j atom coordinates */
1799             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1800                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1801                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1802
1803             /* Calculate displacement vector */
1804             dx00             = _mm256_sub_pd(ix0,jx0);
1805             dy00             = _mm256_sub_pd(iy0,jy0);
1806             dz00             = _mm256_sub_pd(iz0,jz0);
1807             dx01             = _mm256_sub_pd(ix0,jx1);
1808             dy01             = _mm256_sub_pd(iy0,jy1);
1809             dz01             = _mm256_sub_pd(iz0,jz1);
1810             dx02             = _mm256_sub_pd(ix0,jx2);
1811             dy02             = _mm256_sub_pd(iy0,jy2);
1812             dz02             = _mm256_sub_pd(iz0,jz2);
1813             dx10             = _mm256_sub_pd(ix1,jx0);
1814             dy10             = _mm256_sub_pd(iy1,jy0);
1815             dz10             = _mm256_sub_pd(iz1,jz0);
1816             dx11             = _mm256_sub_pd(ix1,jx1);
1817             dy11             = _mm256_sub_pd(iy1,jy1);
1818             dz11             = _mm256_sub_pd(iz1,jz1);
1819             dx12             = _mm256_sub_pd(ix1,jx2);
1820             dy12             = _mm256_sub_pd(iy1,jy2);
1821             dz12             = _mm256_sub_pd(iz1,jz2);
1822             dx20             = _mm256_sub_pd(ix2,jx0);
1823             dy20             = _mm256_sub_pd(iy2,jy0);
1824             dz20             = _mm256_sub_pd(iz2,jz0);
1825             dx21             = _mm256_sub_pd(ix2,jx1);
1826             dy21             = _mm256_sub_pd(iy2,jy1);
1827             dz21             = _mm256_sub_pd(iz2,jz1);
1828             dx22             = _mm256_sub_pd(ix2,jx2);
1829             dy22             = _mm256_sub_pd(iy2,jy2);
1830             dz22             = _mm256_sub_pd(iz2,jz2);
1831
1832             /* Calculate squared distance and things based on it */
1833             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1834             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1835             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1836             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1837             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1838             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1839             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1840             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1841             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1842
1843             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1844             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1845             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1846             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1847             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1848             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1849             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1850             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1851             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1852
1853             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1854             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1855             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1856             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1857             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1858             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1859             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1860             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1861             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1862
1863             fjx0             = _mm256_setzero_pd();
1864             fjy0             = _mm256_setzero_pd();
1865             fjz0             = _mm256_setzero_pd();
1866             fjx1             = _mm256_setzero_pd();
1867             fjy1             = _mm256_setzero_pd();
1868             fjz1             = _mm256_setzero_pd();
1869             fjx2             = _mm256_setzero_pd();
1870             fjy2             = _mm256_setzero_pd();
1871             fjz2             = _mm256_setzero_pd();
1872
1873             /**************************
1874              * CALCULATE INTERACTIONS *
1875              **************************/
1876
1877             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1878             {
1879
1880             r00              = _mm256_mul_pd(rsq00,rinv00);
1881             r00              = _mm256_andnot_pd(dummy_mask,r00);
1882
1883             /* REACTION-FIELD ELECTROSTATICS */
1884             felec            = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1885
1886             /* LENNARD-JONES DISPERSION/REPULSION */
1887
1888             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1889             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
1890             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1891             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1892             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1893
1894             d                = _mm256_sub_pd(r00,rswitch);
1895             d                = _mm256_max_pd(d,_mm256_setzero_pd());
1896             d2               = _mm256_mul_pd(d,d);
1897             sw               = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1898
1899             dsw              = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1900
1901             /* Evaluate switch function */
1902             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1903             fvdw             = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1904             cutoff_mask      = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1905
1906             fscal            = _mm256_add_pd(felec,fvdw);
1907
1908             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1909
1910             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1911
1912             /* Calculate temporary vectorial force */
1913             tx               = _mm256_mul_pd(fscal,dx00);
1914             ty               = _mm256_mul_pd(fscal,dy00);
1915             tz               = _mm256_mul_pd(fscal,dz00);
1916
1917             /* Update vectorial force */
1918             fix0             = _mm256_add_pd(fix0,tx);
1919             fiy0             = _mm256_add_pd(fiy0,ty);
1920             fiz0             = _mm256_add_pd(fiz0,tz);
1921
1922             fjx0             = _mm256_add_pd(fjx0,tx);
1923             fjy0             = _mm256_add_pd(fjy0,ty);
1924             fjz0             = _mm256_add_pd(fjz0,tz);
1925
1926             }
1927
1928             /**************************
1929              * CALCULATE INTERACTIONS *
1930              **************************/
1931
1932             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1933             {
1934
1935             /* REACTION-FIELD ELECTROSTATICS */
1936             felec            = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1937
1938             cutoff_mask      = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1939
1940             fscal            = felec;
1941
1942             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1943
1944             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1945
1946             /* Calculate temporary vectorial force */
1947             tx               = _mm256_mul_pd(fscal,dx01);
1948             ty               = _mm256_mul_pd(fscal,dy01);
1949             tz               = _mm256_mul_pd(fscal,dz01);
1950
1951             /* Update vectorial force */
1952             fix0             = _mm256_add_pd(fix0,tx);
1953             fiy0             = _mm256_add_pd(fiy0,ty);
1954             fiz0             = _mm256_add_pd(fiz0,tz);
1955
1956             fjx1             = _mm256_add_pd(fjx1,tx);
1957             fjy1             = _mm256_add_pd(fjy1,ty);
1958             fjz1             = _mm256_add_pd(fjz1,tz);
1959
1960             }
1961
1962             /**************************
1963              * CALCULATE INTERACTIONS *
1964              **************************/
1965
1966             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1967             {
1968
1969             /* REACTION-FIELD ELECTROSTATICS */
1970             felec            = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1971
1972             cutoff_mask      = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1973
1974             fscal            = felec;
1975
1976             fscal            = _mm256_and_pd(fscal,cutoff_mask);
1977
1978             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1979
1980             /* Calculate temporary vectorial force */
1981             tx               = _mm256_mul_pd(fscal,dx02);
1982             ty               = _mm256_mul_pd(fscal,dy02);
1983             tz               = _mm256_mul_pd(fscal,dz02);
1984
1985             /* Update vectorial force */
1986             fix0             = _mm256_add_pd(fix0,tx);
1987             fiy0             = _mm256_add_pd(fiy0,ty);
1988             fiz0             = _mm256_add_pd(fiz0,tz);
1989
1990             fjx2             = _mm256_add_pd(fjx2,tx);
1991             fjy2             = _mm256_add_pd(fjy2,ty);
1992             fjz2             = _mm256_add_pd(fjz2,tz);
1993
1994             }
1995
1996             /**************************
1997              * CALCULATE INTERACTIONS *
1998              **************************/
1999
2000             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2001             {
2002
2003             /* REACTION-FIELD ELECTROSTATICS */
2004             felec            = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
2005
2006             cutoff_mask      = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
2007
2008             fscal            = felec;
2009
2010             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2011
2012             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2013
2014             /* Calculate temporary vectorial force */
2015             tx               = _mm256_mul_pd(fscal,dx10);
2016             ty               = _mm256_mul_pd(fscal,dy10);
2017             tz               = _mm256_mul_pd(fscal,dz10);
2018
2019             /* Update vectorial force */
2020             fix1             = _mm256_add_pd(fix1,tx);
2021             fiy1             = _mm256_add_pd(fiy1,ty);
2022             fiz1             = _mm256_add_pd(fiz1,tz);
2023
2024             fjx0             = _mm256_add_pd(fjx0,tx);
2025             fjy0             = _mm256_add_pd(fjy0,ty);
2026             fjz0             = _mm256_add_pd(fjz0,tz);
2027
2028             }
2029
2030             /**************************
2031              * CALCULATE INTERACTIONS *
2032              **************************/
2033
2034             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2035             {
2036
2037             /* REACTION-FIELD ELECTROSTATICS */
2038             felec            = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2039
2040             cutoff_mask      = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2041
2042             fscal            = felec;
2043
2044             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2045
2046             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2047
2048             /* Calculate temporary vectorial force */
2049             tx               = _mm256_mul_pd(fscal,dx11);
2050             ty               = _mm256_mul_pd(fscal,dy11);
2051             tz               = _mm256_mul_pd(fscal,dz11);
2052
2053             /* Update vectorial force */
2054             fix1             = _mm256_add_pd(fix1,tx);
2055             fiy1             = _mm256_add_pd(fiy1,ty);
2056             fiz1             = _mm256_add_pd(fiz1,tz);
2057
2058             fjx1             = _mm256_add_pd(fjx1,tx);
2059             fjy1             = _mm256_add_pd(fjy1,ty);
2060             fjz1             = _mm256_add_pd(fjz1,tz);
2061
2062             }
2063
2064             /**************************
2065              * CALCULATE INTERACTIONS *
2066              **************************/
2067
2068             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2069             {
2070
2071             /* REACTION-FIELD ELECTROSTATICS */
2072             felec            = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2073
2074             cutoff_mask      = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2075
2076             fscal            = felec;
2077
2078             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2079
2080             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2081
2082             /* Calculate temporary vectorial force */
2083             tx               = _mm256_mul_pd(fscal,dx12);
2084             ty               = _mm256_mul_pd(fscal,dy12);
2085             tz               = _mm256_mul_pd(fscal,dz12);
2086
2087             /* Update vectorial force */
2088             fix1             = _mm256_add_pd(fix1,tx);
2089             fiy1             = _mm256_add_pd(fiy1,ty);
2090             fiz1             = _mm256_add_pd(fiz1,tz);
2091
2092             fjx2             = _mm256_add_pd(fjx2,tx);
2093             fjy2             = _mm256_add_pd(fjy2,ty);
2094             fjz2             = _mm256_add_pd(fjz2,tz);
2095
2096             }
2097
2098             /**************************
2099              * CALCULATE INTERACTIONS *
2100              **************************/
2101
2102             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2103             {
2104
2105             /* REACTION-FIELD ELECTROSTATICS */
2106             felec            = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
2107
2108             cutoff_mask      = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2109
2110             fscal            = felec;
2111
2112             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2113
2114             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2115
2116             /* Calculate temporary vectorial force */
2117             tx               = _mm256_mul_pd(fscal,dx20);
2118             ty               = _mm256_mul_pd(fscal,dy20);
2119             tz               = _mm256_mul_pd(fscal,dz20);
2120
2121             /* Update vectorial force */
2122             fix2             = _mm256_add_pd(fix2,tx);
2123             fiy2             = _mm256_add_pd(fiy2,ty);
2124             fiz2             = _mm256_add_pd(fiz2,tz);
2125
2126             fjx0             = _mm256_add_pd(fjx0,tx);
2127             fjy0             = _mm256_add_pd(fjy0,ty);
2128             fjz0             = _mm256_add_pd(fjz0,tz);
2129
2130             }
2131
2132             /**************************
2133              * CALCULATE INTERACTIONS *
2134              **************************/
2135
2136             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2137             {
2138
2139             /* REACTION-FIELD ELECTROSTATICS */
2140             felec            = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2141
2142             cutoff_mask      = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2143
2144             fscal            = felec;
2145
2146             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2147
2148             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2149
2150             /* Calculate temporary vectorial force */
2151             tx               = _mm256_mul_pd(fscal,dx21);
2152             ty               = _mm256_mul_pd(fscal,dy21);
2153             tz               = _mm256_mul_pd(fscal,dz21);
2154
2155             /* Update vectorial force */
2156             fix2             = _mm256_add_pd(fix2,tx);
2157             fiy2             = _mm256_add_pd(fiy2,ty);
2158             fiz2             = _mm256_add_pd(fiz2,tz);
2159
2160             fjx1             = _mm256_add_pd(fjx1,tx);
2161             fjy1             = _mm256_add_pd(fjy1,ty);
2162             fjz1             = _mm256_add_pd(fjz1,tz);
2163
2164             }
2165
2166             /**************************
2167              * CALCULATE INTERACTIONS *
2168              **************************/
2169
2170             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2171             {
2172
2173             /* REACTION-FIELD ELECTROSTATICS */
2174             felec            = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2175
2176             cutoff_mask      = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2177
2178             fscal            = felec;
2179
2180             fscal            = _mm256_and_pd(fscal,cutoff_mask);
2181
2182             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
2183
2184             /* Calculate temporary vectorial force */
2185             tx               = _mm256_mul_pd(fscal,dx22);
2186             ty               = _mm256_mul_pd(fscal,dy22);
2187             tz               = _mm256_mul_pd(fscal,dz22);
2188
2189             /* Update vectorial force */
2190             fix2             = _mm256_add_pd(fix2,tx);
2191             fiy2             = _mm256_add_pd(fiy2,ty);
2192             fiz2             = _mm256_add_pd(fiz2,tz);
2193
2194             fjx2             = _mm256_add_pd(fjx2,tx);
2195             fjy2             = _mm256_add_pd(fjy2,ty);
2196             fjz2             = _mm256_add_pd(fjz2,tz);
2197
2198             }
2199
2200             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2201             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2202             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2203             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2204
2205             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2206                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2207
2208             /* Inner loop uses 302 flops */
2209         }
2210
2211         /* End of innermost loop */
2212
2213         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2214                                                  f+i_coord_offset,fshift+i_shift_offset);
2215
2216         /* Increment number of inner iterations */
2217         inneriter                  += j_index_end - j_index_start;
2218
2219         /* Outer loop uses 18 flops */
2220     }
2221
2222     /* Increment number of outer iterations */
2223     outeriter        += nri;
2224
2225     /* Update outer/inner flops */
2226
2227     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*302);
2228 }