made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecCoul_VdwLJ_GeomW3W3_avx_256_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_double
38  * Electrostatics interaction: Coulomb
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
65     real             rcutoff_scalar;
66     real             *shiftvec,*fshift,*x,*f;
67     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68     real             scratch[4*DIM];
69     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70     real *           vdwioffsetptr0;
71     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72     real *           vdwioffsetptr1;
73     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74     real *           vdwioffsetptr2;
75     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
92     real             *charge;
93     int              nvdwtype;
94     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
95     int              *vdwtype;
96     real             *vdwparam;
97     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
98     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
99     __m256d          dummy_mask,cutoff_mask;
100     __m128           tmpmask0,tmpmask1;
101     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
102     __m256d          one     = _mm256_set1_pd(1.0);
103     __m256d          two     = _mm256_set1_pd(2.0);
104     x                = xx[0];
105     f                = ff[0];
106
107     nri              = nlist->nri;
108     iinr             = nlist->iinr;
109     jindex           = nlist->jindex;
110     jjnr             = nlist->jjnr;
111     shiftidx         = nlist->shift;
112     gid              = nlist->gid;
113     shiftvec         = fr->shift_vec[0];
114     fshift           = fr->fshift[0];
115     facel            = _mm256_set1_pd(fr->epsfac);
116     charge           = mdatoms->chargeA;
117     nvdwtype         = fr->ntype;
118     vdwparam         = fr->nbfp;
119     vdwtype          = mdatoms->typeA;
120
121     /* Setup water-specific parameters */
122     inr              = nlist->iinr[0];
123     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
124     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
125     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
126     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
127
128     jq0              = _mm256_set1_pd(charge[inr+0]);
129     jq1              = _mm256_set1_pd(charge[inr+1]);
130     jq2              = _mm256_set1_pd(charge[inr+2]);
131     vdwjidx0A        = 2*vdwtype[inr+0];
132     qq00             = _mm256_mul_pd(iq0,jq0);
133     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
134     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
135     qq01             = _mm256_mul_pd(iq0,jq1);
136     qq02             = _mm256_mul_pd(iq0,jq2);
137     qq10             = _mm256_mul_pd(iq1,jq0);
138     qq11             = _mm256_mul_pd(iq1,jq1);
139     qq12             = _mm256_mul_pd(iq1,jq2);
140     qq20             = _mm256_mul_pd(iq2,jq0);
141     qq21             = _mm256_mul_pd(iq2,jq1);
142     qq22             = _mm256_mul_pd(iq2,jq2);
143
144     /* Avoid stupid compiler warnings */
145     jnrA = jnrB = jnrC = jnrD = 0;
146     j_coord_offsetA = 0;
147     j_coord_offsetB = 0;
148     j_coord_offsetC = 0;
149     j_coord_offsetD = 0;
150
151     outeriter        = 0;
152     inneriter        = 0;
153
154     for(iidx=0;iidx<4*DIM;iidx++)
155     {
156         scratch[iidx] = 0.0;
157     }
158
159     /* Start outer loop over neighborlists */
160     for(iidx=0; iidx<nri; iidx++)
161     {
162         /* Load shift vector for this list */
163         i_shift_offset   = DIM*shiftidx[iidx];
164
165         /* Load limits for loop over neighbors */
166         j_index_start    = jindex[iidx];
167         j_index_end      = jindex[iidx+1];
168
169         /* Get outer coordinate index */
170         inr              = iinr[iidx];
171         i_coord_offset   = DIM*inr;
172
173         /* Load i particle coords and add shift vector */
174         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
175                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
176
177         fix0             = _mm256_setzero_pd();
178         fiy0             = _mm256_setzero_pd();
179         fiz0             = _mm256_setzero_pd();
180         fix1             = _mm256_setzero_pd();
181         fiy1             = _mm256_setzero_pd();
182         fiz1             = _mm256_setzero_pd();
183         fix2             = _mm256_setzero_pd();
184         fiy2             = _mm256_setzero_pd();
185         fiz2             = _mm256_setzero_pd();
186
187         /* Reset potential sums */
188         velecsum         = _mm256_setzero_pd();
189         vvdwsum          = _mm256_setzero_pd();
190
191         /* Start inner kernel loop */
192         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
193         {
194
195             /* Get j neighbor index, and coordinate index */
196             jnrA             = jjnr[jidx];
197             jnrB             = jjnr[jidx+1];
198             jnrC             = jjnr[jidx+2];
199             jnrD             = jjnr[jidx+3];
200             j_coord_offsetA  = DIM*jnrA;
201             j_coord_offsetB  = DIM*jnrB;
202             j_coord_offsetC  = DIM*jnrC;
203             j_coord_offsetD  = DIM*jnrD;
204
205             /* load j atom coordinates */
206             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
207                                                  x+j_coord_offsetC,x+j_coord_offsetD,
208                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
209
210             /* Calculate displacement vector */
211             dx00             = _mm256_sub_pd(ix0,jx0);
212             dy00             = _mm256_sub_pd(iy0,jy0);
213             dz00             = _mm256_sub_pd(iz0,jz0);
214             dx01             = _mm256_sub_pd(ix0,jx1);
215             dy01             = _mm256_sub_pd(iy0,jy1);
216             dz01             = _mm256_sub_pd(iz0,jz1);
217             dx02             = _mm256_sub_pd(ix0,jx2);
218             dy02             = _mm256_sub_pd(iy0,jy2);
219             dz02             = _mm256_sub_pd(iz0,jz2);
220             dx10             = _mm256_sub_pd(ix1,jx0);
221             dy10             = _mm256_sub_pd(iy1,jy0);
222             dz10             = _mm256_sub_pd(iz1,jz0);
223             dx11             = _mm256_sub_pd(ix1,jx1);
224             dy11             = _mm256_sub_pd(iy1,jy1);
225             dz11             = _mm256_sub_pd(iz1,jz1);
226             dx12             = _mm256_sub_pd(ix1,jx2);
227             dy12             = _mm256_sub_pd(iy1,jy2);
228             dz12             = _mm256_sub_pd(iz1,jz2);
229             dx20             = _mm256_sub_pd(ix2,jx0);
230             dy20             = _mm256_sub_pd(iy2,jy0);
231             dz20             = _mm256_sub_pd(iz2,jz0);
232             dx21             = _mm256_sub_pd(ix2,jx1);
233             dy21             = _mm256_sub_pd(iy2,jy1);
234             dz21             = _mm256_sub_pd(iz2,jz1);
235             dx22             = _mm256_sub_pd(ix2,jx2);
236             dy22             = _mm256_sub_pd(iy2,jy2);
237             dz22             = _mm256_sub_pd(iz2,jz2);
238
239             /* Calculate squared distance and things based on it */
240             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
241             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
242             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
243             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
244             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
245             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
246             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
247             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
248             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
249
250             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
251             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
252             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
253             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
254             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
255             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
256             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
257             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
258             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
259
260             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
261             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
262             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
263             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
264             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
265             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
266             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
267             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
268             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
269
270             fjx0             = _mm256_setzero_pd();
271             fjy0             = _mm256_setzero_pd();
272             fjz0             = _mm256_setzero_pd();
273             fjx1             = _mm256_setzero_pd();
274             fjy1             = _mm256_setzero_pd();
275             fjz1             = _mm256_setzero_pd();
276             fjx2             = _mm256_setzero_pd();
277             fjy2             = _mm256_setzero_pd();
278             fjz2             = _mm256_setzero_pd();
279
280             /**************************
281              * CALCULATE INTERACTIONS *
282              **************************/
283
284             /* COULOMB ELECTROSTATICS */
285             velec            = _mm256_mul_pd(qq00,rinv00);
286             felec            = _mm256_mul_pd(velec,rinvsq00);
287
288             /* LENNARD-JONES DISPERSION/REPULSION */
289
290             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
291             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
292             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
293             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
294             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
295
296             /* Update potential sum for this i atom from the interaction with this j atom. */
297             velecsum         = _mm256_add_pd(velecsum,velec);
298             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
299
300             fscal            = _mm256_add_pd(felec,fvdw);
301
302             /* Calculate temporary vectorial force */
303             tx               = _mm256_mul_pd(fscal,dx00);
304             ty               = _mm256_mul_pd(fscal,dy00);
305             tz               = _mm256_mul_pd(fscal,dz00);
306
307             /* Update vectorial force */
308             fix0             = _mm256_add_pd(fix0,tx);
309             fiy0             = _mm256_add_pd(fiy0,ty);
310             fiz0             = _mm256_add_pd(fiz0,tz);
311
312             fjx0             = _mm256_add_pd(fjx0,tx);
313             fjy0             = _mm256_add_pd(fjy0,ty);
314             fjz0             = _mm256_add_pd(fjz0,tz);
315
316             /**************************
317              * CALCULATE INTERACTIONS *
318              **************************/
319
320             /* COULOMB ELECTROSTATICS */
321             velec            = _mm256_mul_pd(qq01,rinv01);
322             felec            = _mm256_mul_pd(velec,rinvsq01);
323
324             /* Update potential sum for this i atom from the interaction with this j atom. */
325             velecsum         = _mm256_add_pd(velecsum,velec);
326
327             fscal            = felec;
328
329             /* Calculate temporary vectorial force */
330             tx               = _mm256_mul_pd(fscal,dx01);
331             ty               = _mm256_mul_pd(fscal,dy01);
332             tz               = _mm256_mul_pd(fscal,dz01);
333
334             /* Update vectorial force */
335             fix0             = _mm256_add_pd(fix0,tx);
336             fiy0             = _mm256_add_pd(fiy0,ty);
337             fiz0             = _mm256_add_pd(fiz0,tz);
338
339             fjx1             = _mm256_add_pd(fjx1,tx);
340             fjy1             = _mm256_add_pd(fjy1,ty);
341             fjz1             = _mm256_add_pd(fjz1,tz);
342
343             /**************************
344              * CALCULATE INTERACTIONS *
345              **************************/
346
347             /* COULOMB ELECTROSTATICS */
348             velec            = _mm256_mul_pd(qq02,rinv02);
349             felec            = _mm256_mul_pd(velec,rinvsq02);
350
351             /* Update potential sum for this i atom from the interaction with this j atom. */
352             velecsum         = _mm256_add_pd(velecsum,velec);
353
354             fscal            = felec;
355
356             /* Calculate temporary vectorial force */
357             tx               = _mm256_mul_pd(fscal,dx02);
358             ty               = _mm256_mul_pd(fscal,dy02);
359             tz               = _mm256_mul_pd(fscal,dz02);
360
361             /* Update vectorial force */
362             fix0             = _mm256_add_pd(fix0,tx);
363             fiy0             = _mm256_add_pd(fiy0,ty);
364             fiz0             = _mm256_add_pd(fiz0,tz);
365
366             fjx2             = _mm256_add_pd(fjx2,tx);
367             fjy2             = _mm256_add_pd(fjy2,ty);
368             fjz2             = _mm256_add_pd(fjz2,tz);
369
370             /**************************
371              * CALCULATE INTERACTIONS *
372              **************************/
373
374             /* COULOMB ELECTROSTATICS */
375             velec            = _mm256_mul_pd(qq10,rinv10);
376             felec            = _mm256_mul_pd(velec,rinvsq10);
377
378             /* Update potential sum for this i atom from the interaction with this j atom. */
379             velecsum         = _mm256_add_pd(velecsum,velec);
380
381             fscal            = felec;
382
383             /* Calculate temporary vectorial force */
384             tx               = _mm256_mul_pd(fscal,dx10);
385             ty               = _mm256_mul_pd(fscal,dy10);
386             tz               = _mm256_mul_pd(fscal,dz10);
387
388             /* Update vectorial force */
389             fix1             = _mm256_add_pd(fix1,tx);
390             fiy1             = _mm256_add_pd(fiy1,ty);
391             fiz1             = _mm256_add_pd(fiz1,tz);
392
393             fjx0             = _mm256_add_pd(fjx0,tx);
394             fjy0             = _mm256_add_pd(fjy0,ty);
395             fjz0             = _mm256_add_pd(fjz0,tz);
396
397             /**************************
398              * CALCULATE INTERACTIONS *
399              **************************/
400
401             /* COULOMB ELECTROSTATICS */
402             velec            = _mm256_mul_pd(qq11,rinv11);
403             felec            = _mm256_mul_pd(velec,rinvsq11);
404
405             /* Update potential sum for this i atom from the interaction with this j atom. */
406             velecsum         = _mm256_add_pd(velecsum,velec);
407
408             fscal            = felec;
409
410             /* Calculate temporary vectorial force */
411             tx               = _mm256_mul_pd(fscal,dx11);
412             ty               = _mm256_mul_pd(fscal,dy11);
413             tz               = _mm256_mul_pd(fscal,dz11);
414
415             /* Update vectorial force */
416             fix1             = _mm256_add_pd(fix1,tx);
417             fiy1             = _mm256_add_pd(fiy1,ty);
418             fiz1             = _mm256_add_pd(fiz1,tz);
419
420             fjx1             = _mm256_add_pd(fjx1,tx);
421             fjy1             = _mm256_add_pd(fjy1,ty);
422             fjz1             = _mm256_add_pd(fjz1,tz);
423
424             /**************************
425              * CALCULATE INTERACTIONS *
426              **************************/
427
428             /* COULOMB ELECTROSTATICS */
429             velec            = _mm256_mul_pd(qq12,rinv12);
430             felec            = _mm256_mul_pd(velec,rinvsq12);
431
432             /* Update potential sum for this i atom from the interaction with this j atom. */
433             velecsum         = _mm256_add_pd(velecsum,velec);
434
435             fscal            = felec;
436
437             /* Calculate temporary vectorial force */
438             tx               = _mm256_mul_pd(fscal,dx12);
439             ty               = _mm256_mul_pd(fscal,dy12);
440             tz               = _mm256_mul_pd(fscal,dz12);
441
442             /* Update vectorial force */
443             fix1             = _mm256_add_pd(fix1,tx);
444             fiy1             = _mm256_add_pd(fiy1,ty);
445             fiz1             = _mm256_add_pd(fiz1,tz);
446
447             fjx2             = _mm256_add_pd(fjx2,tx);
448             fjy2             = _mm256_add_pd(fjy2,ty);
449             fjz2             = _mm256_add_pd(fjz2,tz);
450
451             /**************************
452              * CALCULATE INTERACTIONS *
453              **************************/
454
455             /* COULOMB ELECTROSTATICS */
456             velec            = _mm256_mul_pd(qq20,rinv20);
457             felec            = _mm256_mul_pd(velec,rinvsq20);
458
459             /* Update potential sum for this i atom from the interaction with this j atom. */
460             velecsum         = _mm256_add_pd(velecsum,velec);
461
462             fscal            = felec;
463
464             /* Calculate temporary vectorial force */
465             tx               = _mm256_mul_pd(fscal,dx20);
466             ty               = _mm256_mul_pd(fscal,dy20);
467             tz               = _mm256_mul_pd(fscal,dz20);
468
469             /* Update vectorial force */
470             fix2             = _mm256_add_pd(fix2,tx);
471             fiy2             = _mm256_add_pd(fiy2,ty);
472             fiz2             = _mm256_add_pd(fiz2,tz);
473
474             fjx0             = _mm256_add_pd(fjx0,tx);
475             fjy0             = _mm256_add_pd(fjy0,ty);
476             fjz0             = _mm256_add_pd(fjz0,tz);
477
478             /**************************
479              * CALCULATE INTERACTIONS *
480              **************************/
481
482             /* COULOMB ELECTROSTATICS */
483             velec            = _mm256_mul_pd(qq21,rinv21);
484             felec            = _mm256_mul_pd(velec,rinvsq21);
485
486             /* Update potential sum for this i atom from the interaction with this j atom. */
487             velecsum         = _mm256_add_pd(velecsum,velec);
488
489             fscal            = felec;
490
491             /* Calculate temporary vectorial force */
492             tx               = _mm256_mul_pd(fscal,dx21);
493             ty               = _mm256_mul_pd(fscal,dy21);
494             tz               = _mm256_mul_pd(fscal,dz21);
495
496             /* Update vectorial force */
497             fix2             = _mm256_add_pd(fix2,tx);
498             fiy2             = _mm256_add_pd(fiy2,ty);
499             fiz2             = _mm256_add_pd(fiz2,tz);
500
501             fjx1             = _mm256_add_pd(fjx1,tx);
502             fjy1             = _mm256_add_pd(fjy1,ty);
503             fjz1             = _mm256_add_pd(fjz1,tz);
504
505             /**************************
506              * CALCULATE INTERACTIONS *
507              **************************/
508
509             /* COULOMB ELECTROSTATICS */
510             velec            = _mm256_mul_pd(qq22,rinv22);
511             felec            = _mm256_mul_pd(velec,rinvsq22);
512
513             /* Update potential sum for this i atom from the interaction with this j atom. */
514             velecsum         = _mm256_add_pd(velecsum,velec);
515
516             fscal            = felec;
517
518             /* Calculate temporary vectorial force */
519             tx               = _mm256_mul_pd(fscal,dx22);
520             ty               = _mm256_mul_pd(fscal,dy22);
521             tz               = _mm256_mul_pd(fscal,dz22);
522
523             /* Update vectorial force */
524             fix2             = _mm256_add_pd(fix2,tx);
525             fiy2             = _mm256_add_pd(fiy2,ty);
526             fiz2             = _mm256_add_pd(fiz2,tz);
527
528             fjx2             = _mm256_add_pd(fjx2,tx);
529             fjy2             = _mm256_add_pd(fjy2,ty);
530             fjz2             = _mm256_add_pd(fjz2,tz);
531
532             fjptrA             = f+j_coord_offsetA;
533             fjptrB             = f+j_coord_offsetB;
534             fjptrC             = f+j_coord_offsetC;
535             fjptrD             = f+j_coord_offsetD;
536
537             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
538                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
539
540             /* Inner loop uses 255 flops */
541         }
542
543         if(jidx<j_index_end)
544         {
545
546             /* Get j neighbor index, and coordinate index */
547             jnrlistA         = jjnr[jidx];
548             jnrlistB         = jjnr[jidx+1];
549             jnrlistC         = jjnr[jidx+2];
550             jnrlistD         = jjnr[jidx+3];
551             /* Sign of each element will be negative for non-real atoms.
552              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
553              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
554              */
555             tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
556
557             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
558             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
559             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
560
561             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
562             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
563             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
564             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
565             j_coord_offsetA  = DIM*jnrA;
566             j_coord_offsetB  = DIM*jnrB;
567             j_coord_offsetC  = DIM*jnrC;
568             j_coord_offsetD  = DIM*jnrD;
569
570             /* load j atom coordinates */
571             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
572                                                  x+j_coord_offsetC,x+j_coord_offsetD,
573                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
574
575             /* Calculate displacement vector */
576             dx00             = _mm256_sub_pd(ix0,jx0);
577             dy00             = _mm256_sub_pd(iy0,jy0);
578             dz00             = _mm256_sub_pd(iz0,jz0);
579             dx01             = _mm256_sub_pd(ix0,jx1);
580             dy01             = _mm256_sub_pd(iy0,jy1);
581             dz01             = _mm256_sub_pd(iz0,jz1);
582             dx02             = _mm256_sub_pd(ix0,jx2);
583             dy02             = _mm256_sub_pd(iy0,jy2);
584             dz02             = _mm256_sub_pd(iz0,jz2);
585             dx10             = _mm256_sub_pd(ix1,jx0);
586             dy10             = _mm256_sub_pd(iy1,jy0);
587             dz10             = _mm256_sub_pd(iz1,jz0);
588             dx11             = _mm256_sub_pd(ix1,jx1);
589             dy11             = _mm256_sub_pd(iy1,jy1);
590             dz11             = _mm256_sub_pd(iz1,jz1);
591             dx12             = _mm256_sub_pd(ix1,jx2);
592             dy12             = _mm256_sub_pd(iy1,jy2);
593             dz12             = _mm256_sub_pd(iz1,jz2);
594             dx20             = _mm256_sub_pd(ix2,jx0);
595             dy20             = _mm256_sub_pd(iy2,jy0);
596             dz20             = _mm256_sub_pd(iz2,jz0);
597             dx21             = _mm256_sub_pd(ix2,jx1);
598             dy21             = _mm256_sub_pd(iy2,jy1);
599             dz21             = _mm256_sub_pd(iz2,jz1);
600             dx22             = _mm256_sub_pd(ix2,jx2);
601             dy22             = _mm256_sub_pd(iy2,jy2);
602             dz22             = _mm256_sub_pd(iz2,jz2);
603
604             /* Calculate squared distance and things based on it */
605             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
606             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
607             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
608             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
609             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
610             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
611             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
612             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
613             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
614
615             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
616             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
617             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
618             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
619             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
620             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
621             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
622             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
623             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
624
625             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
626             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
627             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
628             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
629             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
630             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
631             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
632             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
633             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
634
635             fjx0             = _mm256_setzero_pd();
636             fjy0             = _mm256_setzero_pd();
637             fjz0             = _mm256_setzero_pd();
638             fjx1             = _mm256_setzero_pd();
639             fjy1             = _mm256_setzero_pd();
640             fjz1             = _mm256_setzero_pd();
641             fjx2             = _mm256_setzero_pd();
642             fjy2             = _mm256_setzero_pd();
643             fjz2             = _mm256_setzero_pd();
644
645             /**************************
646              * CALCULATE INTERACTIONS *
647              **************************/
648
649             /* COULOMB ELECTROSTATICS */
650             velec            = _mm256_mul_pd(qq00,rinv00);
651             felec            = _mm256_mul_pd(velec,rinvsq00);
652
653             /* LENNARD-JONES DISPERSION/REPULSION */
654
655             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
656             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
657             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
658             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
659             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
660
661             /* Update potential sum for this i atom from the interaction with this j atom. */
662             velec            = _mm256_andnot_pd(dummy_mask,velec);
663             velecsum         = _mm256_add_pd(velecsum,velec);
664             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
665             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
666
667             fscal            = _mm256_add_pd(felec,fvdw);
668
669             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
670
671             /* Calculate temporary vectorial force */
672             tx               = _mm256_mul_pd(fscal,dx00);
673             ty               = _mm256_mul_pd(fscal,dy00);
674             tz               = _mm256_mul_pd(fscal,dz00);
675
676             /* Update vectorial force */
677             fix0             = _mm256_add_pd(fix0,tx);
678             fiy0             = _mm256_add_pd(fiy0,ty);
679             fiz0             = _mm256_add_pd(fiz0,tz);
680
681             fjx0             = _mm256_add_pd(fjx0,tx);
682             fjy0             = _mm256_add_pd(fjy0,ty);
683             fjz0             = _mm256_add_pd(fjz0,tz);
684
685             /**************************
686              * CALCULATE INTERACTIONS *
687              **************************/
688
689             /* COULOMB ELECTROSTATICS */
690             velec            = _mm256_mul_pd(qq01,rinv01);
691             felec            = _mm256_mul_pd(velec,rinvsq01);
692
693             /* Update potential sum for this i atom from the interaction with this j atom. */
694             velec            = _mm256_andnot_pd(dummy_mask,velec);
695             velecsum         = _mm256_add_pd(velecsum,velec);
696
697             fscal            = felec;
698
699             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
700
701             /* Calculate temporary vectorial force */
702             tx               = _mm256_mul_pd(fscal,dx01);
703             ty               = _mm256_mul_pd(fscal,dy01);
704             tz               = _mm256_mul_pd(fscal,dz01);
705
706             /* Update vectorial force */
707             fix0             = _mm256_add_pd(fix0,tx);
708             fiy0             = _mm256_add_pd(fiy0,ty);
709             fiz0             = _mm256_add_pd(fiz0,tz);
710
711             fjx1             = _mm256_add_pd(fjx1,tx);
712             fjy1             = _mm256_add_pd(fjy1,ty);
713             fjz1             = _mm256_add_pd(fjz1,tz);
714
715             /**************************
716              * CALCULATE INTERACTIONS *
717              **************************/
718
719             /* COULOMB ELECTROSTATICS */
720             velec            = _mm256_mul_pd(qq02,rinv02);
721             felec            = _mm256_mul_pd(velec,rinvsq02);
722
723             /* Update potential sum for this i atom from the interaction with this j atom. */
724             velec            = _mm256_andnot_pd(dummy_mask,velec);
725             velecsum         = _mm256_add_pd(velecsum,velec);
726
727             fscal            = felec;
728
729             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
730
731             /* Calculate temporary vectorial force */
732             tx               = _mm256_mul_pd(fscal,dx02);
733             ty               = _mm256_mul_pd(fscal,dy02);
734             tz               = _mm256_mul_pd(fscal,dz02);
735
736             /* Update vectorial force */
737             fix0             = _mm256_add_pd(fix0,tx);
738             fiy0             = _mm256_add_pd(fiy0,ty);
739             fiz0             = _mm256_add_pd(fiz0,tz);
740
741             fjx2             = _mm256_add_pd(fjx2,tx);
742             fjy2             = _mm256_add_pd(fjy2,ty);
743             fjz2             = _mm256_add_pd(fjz2,tz);
744
745             /**************************
746              * CALCULATE INTERACTIONS *
747              **************************/
748
749             /* COULOMB ELECTROSTATICS */
750             velec            = _mm256_mul_pd(qq10,rinv10);
751             felec            = _mm256_mul_pd(velec,rinvsq10);
752
753             /* Update potential sum for this i atom from the interaction with this j atom. */
754             velec            = _mm256_andnot_pd(dummy_mask,velec);
755             velecsum         = _mm256_add_pd(velecsum,velec);
756
757             fscal            = felec;
758
759             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
760
761             /* Calculate temporary vectorial force */
762             tx               = _mm256_mul_pd(fscal,dx10);
763             ty               = _mm256_mul_pd(fscal,dy10);
764             tz               = _mm256_mul_pd(fscal,dz10);
765
766             /* Update vectorial force */
767             fix1             = _mm256_add_pd(fix1,tx);
768             fiy1             = _mm256_add_pd(fiy1,ty);
769             fiz1             = _mm256_add_pd(fiz1,tz);
770
771             fjx0             = _mm256_add_pd(fjx0,tx);
772             fjy0             = _mm256_add_pd(fjy0,ty);
773             fjz0             = _mm256_add_pd(fjz0,tz);
774
775             /**************************
776              * CALCULATE INTERACTIONS *
777              **************************/
778
779             /* COULOMB ELECTROSTATICS */
780             velec            = _mm256_mul_pd(qq11,rinv11);
781             felec            = _mm256_mul_pd(velec,rinvsq11);
782
783             /* Update potential sum for this i atom from the interaction with this j atom. */
784             velec            = _mm256_andnot_pd(dummy_mask,velec);
785             velecsum         = _mm256_add_pd(velecsum,velec);
786
787             fscal            = felec;
788
789             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
790
791             /* Calculate temporary vectorial force */
792             tx               = _mm256_mul_pd(fscal,dx11);
793             ty               = _mm256_mul_pd(fscal,dy11);
794             tz               = _mm256_mul_pd(fscal,dz11);
795
796             /* Update vectorial force */
797             fix1             = _mm256_add_pd(fix1,tx);
798             fiy1             = _mm256_add_pd(fiy1,ty);
799             fiz1             = _mm256_add_pd(fiz1,tz);
800
801             fjx1             = _mm256_add_pd(fjx1,tx);
802             fjy1             = _mm256_add_pd(fjy1,ty);
803             fjz1             = _mm256_add_pd(fjz1,tz);
804
805             /**************************
806              * CALCULATE INTERACTIONS *
807              **************************/
808
809             /* COULOMB ELECTROSTATICS */
810             velec            = _mm256_mul_pd(qq12,rinv12);
811             felec            = _mm256_mul_pd(velec,rinvsq12);
812
813             /* Update potential sum for this i atom from the interaction with this j atom. */
814             velec            = _mm256_andnot_pd(dummy_mask,velec);
815             velecsum         = _mm256_add_pd(velecsum,velec);
816
817             fscal            = felec;
818
819             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
820
821             /* Calculate temporary vectorial force */
822             tx               = _mm256_mul_pd(fscal,dx12);
823             ty               = _mm256_mul_pd(fscal,dy12);
824             tz               = _mm256_mul_pd(fscal,dz12);
825
826             /* Update vectorial force */
827             fix1             = _mm256_add_pd(fix1,tx);
828             fiy1             = _mm256_add_pd(fiy1,ty);
829             fiz1             = _mm256_add_pd(fiz1,tz);
830
831             fjx2             = _mm256_add_pd(fjx2,tx);
832             fjy2             = _mm256_add_pd(fjy2,ty);
833             fjz2             = _mm256_add_pd(fjz2,tz);
834
835             /**************************
836              * CALCULATE INTERACTIONS *
837              **************************/
838
839             /* COULOMB ELECTROSTATICS */
840             velec            = _mm256_mul_pd(qq20,rinv20);
841             felec            = _mm256_mul_pd(velec,rinvsq20);
842
843             /* Update potential sum for this i atom from the interaction with this j atom. */
844             velec            = _mm256_andnot_pd(dummy_mask,velec);
845             velecsum         = _mm256_add_pd(velecsum,velec);
846
847             fscal            = felec;
848
849             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
850
851             /* Calculate temporary vectorial force */
852             tx               = _mm256_mul_pd(fscal,dx20);
853             ty               = _mm256_mul_pd(fscal,dy20);
854             tz               = _mm256_mul_pd(fscal,dz20);
855
856             /* Update vectorial force */
857             fix2             = _mm256_add_pd(fix2,tx);
858             fiy2             = _mm256_add_pd(fiy2,ty);
859             fiz2             = _mm256_add_pd(fiz2,tz);
860
861             fjx0             = _mm256_add_pd(fjx0,tx);
862             fjy0             = _mm256_add_pd(fjy0,ty);
863             fjz0             = _mm256_add_pd(fjz0,tz);
864
865             /**************************
866              * CALCULATE INTERACTIONS *
867              **************************/
868
869             /* COULOMB ELECTROSTATICS */
870             velec            = _mm256_mul_pd(qq21,rinv21);
871             felec            = _mm256_mul_pd(velec,rinvsq21);
872
873             /* Update potential sum for this i atom from the interaction with this j atom. */
874             velec            = _mm256_andnot_pd(dummy_mask,velec);
875             velecsum         = _mm256_add_pd(velecsum,velec);
876
877             fscal            = felec;
878
879             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
880
881             /* Calculate temporary vectorial force */
882             tx               = _mm256_mul_pd(fscal,dx21);
883             ty               = _mm256_mul_pd(fscal,dy21);
884             tz               = _mm256_mul_pd(fscal,dz21);
885
886             /* Update vectorial force */
887             fix2             = _mm256_add_pd(fix2,tx);
888             fiy2             = _mm256_add_pd(fiy2,ty);
889             fiz2             = _mm256_add_pd(fiz2,tz);
890
891             fjx1             = _mm256_add_pd(fjx1,tx);
892             fjy1             = _mm256_add_pd(fjy1,ty);
893             fjz1             = _mm256_add_pd(fjz1,tz);
894
895             /**************************
896              * CALCULATE INTERACTIONS *
897              **************************/
898
899             /* COULOMB ELECTROSTATICS */
900             velec            = _mm256_mul_pd(qq22,rinv22);
901             felec            = _mm256_mul_pd(velec,rinvsq22);
902
903             /* Update potential sum for this i atom from the interaction with this j atom. */
904             velec            = _mm256_andnot_pd(dummy_mask,velec);
905             velecsum         = _mm256_add_pd(velecsum,velec);
906
907             fscal            = felec;
908
909             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
910
911             /* Calculate temporary vectorial force */
912             tx               = _mm256_mul_pd(fscal,dx22);
913             ty               = _mm256_mul_pd(fscal,dy22);
914             tz               = _mm256_mul_pd(fscal,dz22);
915
916             /* Update vectorial force */
917             fix2             = _mm256_add_pd(fix2,tx);
918             fiy2             = _mm256_add_pd(fiy2,ty);
919             fiz2             = _mm256_add_pd(fiz2,tz);
920
921             fjx2             = _mm256_add_pd(fjx2,tx);
922             fjy2             = _mm256_add_pd(fjy2,ty);
923             fjz2             = _mm256_add_pd(fjz2,tz);
924
925             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
926             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
927             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
928             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
929
930             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
931                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
932
933             /* Inner loop uses 255 flops */
934         }
935
936         /* End of innermost loop */
937
938         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
939                                                  f+i_coord_offset,fshift+i_shift_offset);
940
941         ggid                        = gid[iidx];
942         /* Update potential energies */
943         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
944         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
945
946         /* Increment number of inner iterations */
947         inneriter                  += j_index_end - j_index_start;
948
949         /* Outer loop uses 20 flops */
950     }
951
952     /* Increment number of outer iterations */
953     outeriter        += nri;
954
955     /* Update outer/inner flops */
956
957     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*255);
958 }
959 /*
960  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_double
961  * Electrostatics interaction: Coulomb
962  * VdW interaction:            LennardJones
963  * Geometry:                   Water3-Water3
964  * Calculate force/pot:        Force
965  */
966 void
967 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_double
968                     (t_nblist * gmx_restrict                nlist,
969                      rvec * gmx_restrict                    xx,
970                      rvec * gmx_restrict                    ff,
971                      t_forcerec * gmx_restrict              fr,
972                      t_mdatoms * gmx_restrict               mdatoms,
973                      nb_kernel_data_t * gmx_restrict        kernel_data,
974                      t_nrnb * gmx_restrict                  nrnb)
975 {
976     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
977      * just 0 for non-waters.
978      * Suffixes A,B,C,D refer to the j-loop unrolling done with AVX, i.e. the four different
979      * jnr indices whose data are placed in the four positions of the SIMD register.
980      */
981     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
982     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
983     int              jnrA,jnrB,jnrC,jnrD;
984     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
985     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
986     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
987     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
988     real             rcutoff_scalar;
989     real             *shiftvec,*fshift,*x,*f;
990     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
991     real             scratch[4*DIM];
992     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
993     real *           vdwioffsetptr0;
994     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
995     real *           vdwioffsetptr1;
996     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
997     real *           vdwioffsetptr2;
998     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
999     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1000     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1001     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1002     __m256d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1003     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1004     __m256d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1005     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1006     __m256d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1007     __m256d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1008     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1009     __m256d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1010     __m256d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1011     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1012     __m256d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1013     __m256d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1014     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
1015     real             *charge;
1016     int              nvdwtype;
1017     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1018     int              *vdwtype;
1019     real             *vdwparam;
1020     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
1021     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
1022     __m256d          dummy_mask,cutoff_mask;
1023     __m128           tmpmask0,tmpmask1;
1024     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1025     __m256d          one     = _mm256_set1_pd(1.0);
1026     __m256d          two     = _mm256_set1_pd(2.0);
1027     x                = xx[0];
1028     f                = ff[0];
1029
1030     nri              = nlist->nri;
1031     iinr             = nlist->iinr;
1032     jindex           = nlist->jindex;
1033     jjnr             = nlist->jjnr;
1034     shiftidx         = nlist->shift;
1035     gid              = nlist->gid;
1036     shiftvec         = fr->shift_vec[0];
1037     fshift           = fr->fshift[0];
1038     facel            = _mm256_set1_pd(fr->epsfac);
1039     charge           = mdatoms->chargeA;
1040     nvdwtype         = fr->ntype;
1041     vdwparam         = fr->nbfp;
1042     vdwtype          = mdatoms->typeA;
1043
1044     /* Setup water-specific parameters */
1045     inr              = nlist->iinr[0];
1046     iq0              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1047     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1048     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1049     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1050
1051     jq0              = _mm256_set1_pd(charge[inr+0]);
1052     jq1              = _mm256_set1_pd(charge[inr+1]);
1053     jq2              = _mm256_set1_pd(charge[inr+2]);
1054     vdwjidx0A        = 2*vdwtype[inr+0];
1055     qq00             = _mm256_mul_pd(iq0,jq0);
1056     c6_00            = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1057     c12_00           = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1058     qq01             = _mm256_mul_pd(iq0,jq1);
1059     qq02             = _mm256_mul_pd(iq0,jq2);
1060     qq10             = _mm256_mul_pd(iq1,jq0);
1061     qq11             = _mm256_mul_pd(iq1,jq1);
1062     qq12             = _mm256_mul_pd(iq1,jq2);
1063     qq20             = _mm256_mul_pd(iq2,jq0);
1064     qq21             = _mm256_mul_pd(iq2,jq1);
1065     qq22             = _mm256_mul_pd(iq2,jq2);
1066
1067     /* Avoid stupid compiler warnings */
1068     jnrA = jnrB = jnrC = jnrD = 0;
1069     j_coord_offsetA = 0;
1070     j_coord_offsetB = 0;
1071     j_coord_offsetC = 0;
1072     j_coord_offsetD = 0;
1073
1074     outeriter        = 0;
1075     inneriter        = 0;
1076
1077     for(iidx=0;iidx<4*DIM;iidx++)
1078     {
1079         scratch[iidx] = 0.0;
1080     }
1081
1082     /* Start outer loop over neighborlists */
1083     for(iidx=0; iidx<nri; iidx++)
1084     {
1085         /* Load shift vector for this list */
1086         i_shift_offset   = DIM*shiftidx[iidx];
1087
1088         /* Load limits for loop over neighbors */
1089         j_index_start    = jindex[iidx];
1090         j_index_end      = jindex[iidx+1];
1091
1092         /* Get outer coordinate index */
1093         inr              = iinr[iidx];
1094         i_coord_offset   = DIM*inr;
1095
1096         /* Load i particle coords and add shift vector */
1097         gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1098                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1099
1100         fix0             = _mm256_setzero_pd();
1101         fiy0             = _mm256_setzero_pd();
1102         fiz0             = _mm256_setzero_pd();
1103         fix1             = _mm256_setzero_pd();
1104         fiy1             = _mm256_setzero_pd();
1105         fiz1             = _mm256_setzero_pd();
1106         fix2             = _mm256_setzero_pd();
1107         fiy2             = _mm256_setzero_pd();
1108         fiz2             = _mm256_setzero_pd();
1109
1110         /* Start inner kernel loop */
1111         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1112         {
1113
1114             /* Get j neighbor index, and coordinate index */
1115             jnrA             = jjnr[jidx];
1116             jnrB             = jjnr[jidx+1];
1117             jnrC             = jjnr[jidx+2];
1118             jnrD             = jjnr[jidx+3];
1119             j_coord_offsetA  = DIM*jnrA;
1120             j_coord_offsetB  = DIM*jnrB;
1121             j_coord_offsetC  = DIM*jnrC;
1122             j_coord_offsetD  = DIM*jnrD;
1123
1124             /* load j atom coordinates */
1125             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1126                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1127                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1128
1129             /* Calculate displacement vector */
1130             dx00             = _mm256_sub_pd(ix0,jx0);
1131             dy00             = _mm256_sub_pd(iy0,jy0);
1132             dz00             = _mm256_sub_pd(iz0,jz0);
1133             dx01             = _mm256_sub_pd(ix0,jx1);
1134             dy01             = _mm256_sub_pd(iy0,jy1);
1135             dz01             = _mm256_sub_pd(iz0,jz1);
1136             dx02             = _mm256_sub_pd(ix0,jx2);
1137             dy02             = _mm256_sub_pd(iy0,jy2);
1138             dz02             = _mm256_sub_pd(iz0,jz2);
1139             dx10             = _mm256_sub_pd(ix1,jx0);
1140             dy10             = _mm256_sub_pd(iy1,jy0);
1141             dz10             = _mm256_sub_pd(iz1,jz0);
1142             dx11             = _mm256_sub_pd(ix1,jx1);
1143             dy11             = _mm256_sub_pd(iy1,jy1);
1144             dz11             = _mm256_sub_pd(iz1,jz1);
1145             dx12             = _mm256_sub_pd(ix1,jx2);
1146             dy12             = _mm256_sub_pd(iy1,jy2);
1147             dz12             = _mm256_sub_pd(iz1,jz2);
1148             dx20             = _mm256_sub_pd(ix2,jx0);
1149             dy20             = _mm256_sub_pd(iy2,jy0);
1150             dz20             = _mm256_sub_pd(iz2,jz0);
1151             dx21             = _mm256_sub_pd(ix2,jx1);
1152             dy21             = _mm256_sub_pd(iy2,jy1);
1153             dz21             = _mm256_sub_pd(iz2,jz1);
1154             dx22             = _mm256_sub_pd(ix2,jx2);
1155             dy22             = _mm256_sub_pd(iy2,jy2);
1156             dz22             = _mm256_sub_pd(iz2,jz2);
1157
1158             /* Calculate squared distance and things based on it */
1159             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1160             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1161             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1162             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1163             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1164             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1165             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1166             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1167             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1168
1169             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1170             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1171             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1172             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1173             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1174             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1175             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1176             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1177             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1178
1179             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1180             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1181             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1182             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1183             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1184             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1185             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1186             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1187             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1188
1189             fjx0             = _mm256_setzero_pd();
1190             fjy0             = _mm256_setzero_pd();
1191             fjz0             = _mm256_setzero_pd();
1192             fjx1             = _mm256_setzero_pd();
1193             fjy1             = _mm256_setzero_pd();
1194             fjz1             = _mm256_setzero_pd();
1195             fjx2             = _mm256_setzero_pd();
1196             fjy2             = _mm256_setzero_pd();
1197             fjz2             = _mm256_setzero_pd();
1198
1199             /**************************
1200              * CALCULATE INTERACTIONS *
1201              **************************/
1202
1203             /* COULOMB ELECTROSTATICS */
1204             velec            = _mm256_mul_pd(qq00,rinv00);
1205             felec            = _mm256_mul_pd(velec,rinvsq00);
1206
1207             /* LENNARD-JONES DISPERSION/REPULSION */
1208
1209             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1210             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1211
1212             fscal            = _mm256_add_pd(felec,fvdw);
1213
1214             /* Calculate temporary vectorial force */
1215             tx               = _mm256_mul_pd(fscal,dx00);
1216             ty               = _mm256_mul_pd(fscal,dy00);
1217             tz               = _mm256_mul_pd(fscal,dz00);
1218
1219             /* Update vectorial force */
1220             fix0             = _mm256_add_pd(fix0,tx);
1221             fiy0             = _mm256_add_pd(fiy0,ty);
1222             fiz0             = _mm256_add_pd(fiz0,tz);
1223
1224             fjx0             = _mm256_add_pd(fjx0,tx);
1225             fjy0             = _mm256_add_pd(fjy0,ty);
1226             fjz0             = _mm256_add_pd(fjz0,tz);
1227
1228             /**************************
1229              * CALCULATE INTERACTIONS *
1230              **************************/
1231
1232             /* COULOMB ELECTROSTATICS */
1233             velec            = _mm256_mul_pd(qq01,rinv01);
1234             felec            = _mm256_mul_pd(velec,rinvsq01);
1235
1236             fscal            = felec;
1237
1238             /* Calculate temporary vectorial force */
1239             tx               = _mm256_mul_pd(fscal,dx01);
1240             ty               = _mm256_mul_pd(fscal,dy01);
1241             tz               = _mm256_mul_pd(fscal,dz01);
1242
1243             /* Update vectorial force */
1244             fix0             = _mm256_add_pd(fix0,tx);
1245             fiy0             = _mm256_add_pd(fiy0,ty);
1246             fiz0             = _mm256_add_pd(fiz0,tz);
1247
1248             fjx1             = _mm256_add_pd(fjx1,tx);
1249             fjy1             = _mm256_add_pd(fjy1,ty);
1250             fjz1             = _mm256_add_pd(fjz1,tz);
1251
1252             /**************************
1253              * CALCULATE INTERACTIONS *
1254              **************************/
1255
1256             /* COULOMB ELECTROSTATICS */
1257             velec            = _mm256_mul_pd(qq02,rinv02);
1258             felec            = _mm256_mul_pd(velec,rinvsq02);
1259
1260             fscal            = felec;
1261
1262             /* Calculate temporary vectorial force */
1263             tx               = _mm256_mul_pd(fscal,dx02);
1264             ty               = _mm256_mul_pd(fscal,dy02);
1265             tz               = _mm256_mul_pd(fscal,dz02);
1266
1267             /* Update vectorial force */
1268             fix0             = _mm256_add_pd(fix0,tx);
1269             fiy0             = _mm256_add_pd(fiy0,ty);
1270             fiz0             = _mm256_add_pd(fiz0,tz);
1271
1272             fjx2             = _mm256_add_pd(fjx2,tx);
1273             fjy2             = _mm256_add_pd(fjy2,ty);
1274             fjz2             = _mm256_add_pd(fjz2,tz);
1275
1276             /**************************
1277              * CALCULATE INTERACTIONS *
1278              **************************/
1279
1280             /* COULOMB ELECTROSTATICS */
1281             velec            = _mm256_mul_pd(qq10,rinv10);
1282             felec            = _mm256_mul_pd(velec,rinvsq10);
1283
1284             fscal            = felec;
1285
1286             /* Calculate temporary vectorial force */
1287             tx               = _mm256_mul_pd(fscal,dx10);
1288             ty               = _mm256_mul_pd(fscal,dy10);
1289             tz               = _mm256_mul_pd(fscal,dz10);
1290
1291             /* Update vectorial force */
1292             fix1             = _mm256_add_pd(fix1,tx);
1293             fiy1             = _mm256_add_pd(fiy1,ty);
1294             fiz1             = _mm256_add_pd(fiz1,tz);
1295
1296             fjx0             = _mm256_add_pd(fjx0,tx);
1297             fjy0             = _mm256_add_pd(fjy0,ty);
1298             fjz0             = _mm256_add_pd(fjz0,tz);
1299
1300             /**************************
1301              * CALCULATE INTERACTIONS *
1302              **************************/
1303
1304             /* COULOMB ELECTROSTATICS */
1305             velec            = _mm256_mul_pd(qq11,rinv11);
1306             felec            = _mm256_mul_pd(velec,rinvsq11);
1307
1308             fscal            = felec;
1309
1310             /* Calculate temporary vectorial force */
1311             tx               = _mm256_mul_pd(fscal,dx11);
1312             ty               = _mm256_mul_pd(fscal,dy11);
1313             tz               = _mm256_mul_pd(fscal,dz11);
1314
1315             /* Update vectorial force */
1316             fix1             = _mm256_add_pd(fix1,tx);
1317             fiy1             = _mm256_add_pd(fiy1,ty);
1318             fiz1             = _mm256_add_pd(fiz1,tz);
1319
1320             fjx1             = _mm256_add_pd(fjx1,tx);
1321             fjy1             = _mm256_add_pd(fjy1,ty);
1322             fjz1             = _mm256_add_pd(fjz1,tz);
1323
1324             /**************************
1325              * CALCULATE INTERACTIONS *
1326              **************************/
1327
1328             /* COULOMB ELECTROSTATICS */
1329             velec            = _mm256_mul_pd(qq12,rinv12);
1330             felec            = _mm256_mul_pd(velec,rinvsq12);
1331
1332             fscal            = felec;
1333
1334             /* Calculate temporary vectorial force */
1335             tx               = _mm256_mul_pd(fscal,dx12);
1336             ty               = _mm256_mul_pd(fscal,dy12);
1337             tz               = _mm256_mul_pd(fscal,dz12);
1338
1339             /* Update vectorial force */
1340             fix1             = _mm256_add_pd(fix1,tx);
1341             fiy1             = _mm256_add_pd(fiy1,ty);
1342             fiz1             = _mm256_add_pd(fiz1,tz);
1343
1344             fjx2             = _mm256_add_pd(fjx2,tx);
1345             fjy2             = _mm256_add_pd(fjy2,ty);
1346             fjz2             = _mm256_add_pd(fjz2,tz);
1347
1348             /**************************
1349              * CALCULATE INTERACTIONS *
1350              **************************/
1351
1352             /* COULOMB ELECTROSTATICS */
1353             velec            = _mm256_mul_pd(qq20,rinv20);
1354             felec            = _mm256_mul_pd(velec,rinvsq20);
1355
1356             fscal            = felec;
1357
1358             /* Calculate temporary vectorial force */
1359             tx               = _mm256_mul_pd(fscal,dx20);
1360             ty               = _mm256_mul_pd(fscal,dy20);
1361             tz               = _mm256_mul_pd(fscal,dz20);
1362
1363             /* Update vectorial force */
1364             fix2             = _mm256_add_pd(fix2,tx);
1365             fiy2             = _mm256_add_pd(fiy2,ty);
1366             fiz2             = _mm256_add_pd(fiz2,tz);
1367
1368             fjx0             = _mm256_add_pd(fjx0,tx);
1369             fjy0             = _mm256_add_pd(fjy0,ty);
1370             fjz0             = _mm256_add_pd(fjz0,tz);
1371
1372             /**************************
1373              * CALCULATE INTERACTIONS *
1374              **************************/
1375
1376             /* COULOMB ELECTROSTATICS */
1377             velec            = _mm256_mul_pd(qq21,rinv21);
1378             felec            = _mm256_mul_pd(velec,rinvsq21);
1379
1380             fscal            = felec;
1381
1382             /* Calculate temporary vectorial force */
1383             tx               = _mm256_mul_pd(fscal,dx21);
1384             ty               = _mm256_mul_pd(fscal,dy21);
1385             tz               = _mm256_mul_pd(fscal,dz21);
1386
1387             /* Update vectorial force */
1388             fix2             = _mm256_add_pd(fix2,tx);
1389             fiy2             = _mm256_add_pd(fiy2,ty);
1390             fiz2             = _mm256_add_pd(fiz2,tz);
1391
1392             fjx1             = _mm256_add_pd(fjx1,tx);
1393             fjy1             = _mm256_add_pd(fjy1,ty);
1394             fjz1             = _mm256_add_pd(fjz1,tz);
1395
1396             /**************************
1397              * CALCULATE INTERACTIONS *
1398              **************************/
1399
1400             /* COULOMB ELECTROSTATICS */
1401             velec            = _mm256_mul_pd(qq22,rinv22);
1402             felec            = _mm256_mul_pd(velec,rinvsq22);
1403
1404             fscal            = felec;
1405
1406             /* Calculate temporary vectorial force */
1407             tx               = _mm256_mul_pd(fscal,dx22);
1408             ty               = _mm256_mul_pd(fscal,dy22);
1409             tz               = _mm256_mul_pd(fscal,dz22);
1410
1411             /* Update vectorial force */
1412             fix2             = _mm256_add_pd(fix2,tx);
1413             fiy2             = _mm256_add_pd(fiy2,ty);
1414             fiz2             = _mm256_add_pd(fiz2,tz);
1415
1416             fjx2             = _mm256_add_pd(fjx2,tx);
1417             fjy2             = _mm256_add_pd(fjy2,ty);
1418             fjz2             = _mm256_add_pd(fjz2,tz);
1419
1420             fjptrA             = f+j_coord_offsetA;
1421             fjptrB             = f+j_coord_offsetB;
1422             fjptrC             = f+j_coord_offsetC;
1423             fjptrD             = f+j_coord_offsetD;
1424
1425             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1426                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1427
1428             /* Inner loop uses 241 flops */
1429         }
1430
1431         if(jidx<j_index_end)
1432         {
1433
1434             /* Get j neighbor index, and coordinate index */
1435             jnrlistA         = jjnr[jidx];
1436             jnrlistB         = jjnr[jidx+1];
1437             jnrlistC         = jjnr[jidx+2];
1438             jnrlistD         = jjnr[jidx+3];
1439             /* Sign of each element will be negative for non-real atoms.
1440              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1441              * so use it as val = _mm256_andnot_pd(dummy_mask,val) to clear dummy entries.
1442              */
1443             tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1444
1445             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1446             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1447             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1448
1449             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1450             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1451             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1452             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1453             j_coord_offsetA  = DIM*jnrA;
1454             j_coord_offsetB  = DIM*jnrB;
1455             j_coord_offsetC  = DIM*jnrC;
1456             j_coord_offsetD  = DIM*jnrD;
1457
1458             /* load j atom coordinates */
1459             gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1460                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1461                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1462
1463             /* Calculate displacement vector */
1464             dx00             = _mm256_sub_pd(ix0,jx0);
1465             dy00             = _mm256_sub_pd(iy0,jy0);
1466             dz00             = _mm256_sub_pd(iz0,jz0);
1467             dx01             = _mm256_sub_pd(ix0,jx1);
1468             dy01             = _mm256_sub_pd(iy0,jy1);
1469             dz01             = _mm256_sub_pd(iz0,jz1);
1470             dx02             = _mm256_sub_pd(ix0,jx2);
1471             dy02             = _mm256_sub_pd(iy0,jy2);
1472             dz02             = _mm256_sub_pd(iz0,jz2);
1473             dx10             = _mm256_sub_pd(ix1,jx0);
1474             dy10             = _mm256_sub_pd(iy1,jy0);
1475             dz10             = _mm256_sub_pd(iz1,jz0);
1476             dx11             = _mm256_sub_pd(ix1,jx1);
1477             dy11             = _mm256_sub_pd(iy1,jy1);
1478             dz11             = _mm256_sub_pd(iz1,jz1);
1479             dx12             = _mm256_sub_pd(ix1,jx2);
1480             dy12             = _mm256_sub_pd(iy1,jy2);
1481             dz12             = _mm256_sub_pd(iz1,jz2);
1482             dx20             = _mm256_sub_pd(ix2,jx0);
1483             dy20             = _mm256_sub_pd(iy2,jy0);
1484             dz20             = _mm256_sub_pd(iz2,jz0);
1485             dx21             = _mm256_sub_pd(ix2,jx1);
1486             dy21             = _mm256_sub_pd(iy2,jy1);
1487             dz21             = _mm256_sub_pd(iz2,jz1);
1488             dx22             = _mm256_sub_pd(ix2,jx2);
1489             dy22             = _mm256_sub_pd(iy2,jy2);
1490             dz22             = _mm256_sub_pd(iz2,jz2);
1491
1492             /* Calculate squared distance and things based on it */
1493             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1494             rsq01            = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1495             rsq02            = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1496             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1497             rsq11            = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1498             rsq12            = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1499             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1500             rsq21            = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1501             rsq22            = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1502
1503             rinv00           = gmx_mm256_invsqrt_pd(rsq00);
1504             rinv01           = gmx_mm256_invsqrt_pd(rsq01);
1505             rinv02           = gmx_mm256_invsqrt_pd(rsq02);
1506             rinv10           = gmx_mm256_invsqrt_pd(rsq10);
1507             rinv11           = gmx_mm256_invsqrt_pd(rsq11);
1508             rinv12           = gmx_mm256_invsqrt_pd(rsq12);
1509             rinv20           = gmx_mm256_invsqrt_pd(rsq20);
1510             rinv21           = gmx_mm256_invsqrt_pd(rsq21);
1511             rinv22           = gmx_mm256_invsqrt_pd(rsq22);
1512
1513             rinvsq00         = _mm256_mul_pd(rinv00,rinv00);
1514             rinvsq01         = _mm256_mul_pd(rinv01,rinv01);
1515             rinvsq02         = _mm256_mul_pd(rinv02,rinv02);
1516             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1517             rinvsq11         = _mm256_mul_pd(rinv11,rinv11);
1518             rinvsq12         = _mm256_mul_pd(rinv12,rinv12);
1519             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1520             rinvsq21         = _mm256_mul_pd(rinv21,rinv21);
1521             rinvsq22         = _mm256_mul_pd(rinv22,rinv22);
1522
1523             fjx0             = _mm256_setzero_pd();
1524             fjy0             = _mm256_setzero_pd();
1525             fjz0             = _mm256_setzero_pd();
1526             fjx1             = _mm256_setzero_pd();
1527             fjy1             = _mm256_setzero_pd();
1528             fjz1             = _mm256_setzero_pd();
1529             fjx2             = _mm256_setzero_pd();
1530             fjy2             = _mm256_setzero_pd();
1531             fjz2             = _mm256_setzero_pd();
1532
1533             /**************************
1534              * CALCULATE INTERACTIONS *
1535              **************************/
1536
1537             /* COULOMB ELECTROSTATICS */
1538             velec            = _mm256_mul_pd(qq00,rinv00);
1539             felec            = _mm256_mul_pd(velec,rinvsq00);
1540
1541             /* LENNARD-JONES DISPERSION/REPULSION */
1542
1543             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1544             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1545
1546             fscal            = _mm256_add_pd(felec,fvdw);
1547
1548             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1549
1550             /* Calculate temporary vectorial force */
1551             tx               = _mm256_mul_pd(fscal,dx00);
1552             ty               = _mm256_mul_pd(fscal,dy00);
1553             tz               = _mm256_mul_pd(fscal,dz00);
1554
1555             /* Update vectorial force */
1556             fix0             = _mm256_add_pd(fix0,tx);
1557             fiy0             = _mm256_add_pd(fiy0,ty);
1558             fiz0             = _mm256_add_pd(fiz0,tz);
1559
1560             fjx0             = _mm256_add_pd(fjx0,tx);
1561             fjy0             = _mm256_add_pd(fjy0,ty);
1562             fjz0             = _mm256_add_pd(fjz0,tz);
1563
1564             /**************************
1565              * CALCULATE INTERACTIONS *
1566              **************************/
1567
1568             /* COULOMB ELECTROSTATICS */
1569             velec            = _mm256_mul_pd(qq01,rinv01);
1570             felec            = _mm256_mul_pd(velec,rinvsq01);
1571
1572             fscal            = felec;
1573
1574             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1575
1576             /* Calculate temporary vectorial force */
1577             tx               = _mm256_mul_pd(fscal,dx01);
1578             ty               = _mm256_mul_pd(fscal,dy01);
1579             tz               = _mm256_mul_pd(fscal,dz01);
1580
1581             /* Update vectorial force */
1582             fix0             = _mm256_add_pd(fix0,tx);
1583             fiy0             = _mm256_add_pd(fiy0,ty);
1584             fiz0             = _mm256_add_pd(fiz0,tz);
1585
1586             fjx1             = _mm256_add_pd(fjx1,tx);
1587             fjy1             = _mm256_add_pd(fjy1,ty);
1588             fjz1             = _mm256_add_pd(fjz1,tz);
1589
1590             /**************************
1591              * CALCULATE INTERACTIONS *
1592              **************************/
1593
1594             /* COULOMB ELECTROSTATICS */
1595             velec            = _mm256_mul_pd(qq02,rinv02);
1596             felec            = _mm256_mul_pd(velec,rinvsq02);
1597
1598             fscal            = felec;
1599
1600             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1601
1602             /* Calculate temporary vectorial force */
1603             tx               = _mm256_mul_pd(fscal,dx02);
1604             ty               = _mm256_mul_pd(fscal,dy02);
1605             tz               = _mm256_mul_pd(fscal,dz02);
1606
1607             /* Update vectorial force */
1608             fix0             = _mm256_add_pd(fix0,tx);
1609             fiy0             = _mm256_add_pd(fiy0,ty);
1610             fiz0             = _mm256_add_pd(fiz0,tz);
1611
1612             fjx2             = _mm256_add_pd(fjx2,tx);
1613             fjy2             = _mm256_add_pd(fjy2,ty);
1614             fjz2             = _mm256_add_pd(fjz2,tz);
1615
1616             /**************************
1617              * CALCULATE INTERACTIONS *
1618              **************************/
1619
1620             /* COULOMB ELECTROSTATICS */
1621             velec            = _mm256_mul_pd(qq10,rinv10);
1622             felec            = _mm256_mul_pd(velec,rinvsq10);
1623
1624             fscal            = felec;
1625
1626             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1627
1628             /* Calculate temporary vectorial force */
1629             tx               = _mm256_mul_pd(fscal,dx10);
1630             ty               = _mm256_mul_pd(fscal,dy10);
1631             tz               = _mm256_mul_pd(fscal,dz10);
1632
1633             /* Update vectorial force */
1634             fix1             = _mm256_add_pd(fix1,tx);
1635             fiy1             = _mm256_add_pd(fiy1,ty);
1636             fiz1             = _mm256_add_pd(fiz1,tz);
1637
1638             fjx0             = _mm256_add_pd(fjx0,tx);
1639             fjy0             = _mm256_add_pd(fjy0,ty);
1640             fjz0             = _mm256_add_pd(fjz0,tz);
1641
1642             /**************************
1643              * CALCULATE INTERACTIONS *
1644              **************************/
1645
1646             /* COULOMB ELECTROSTATICS */
1647             velec            = _mm256_mul_pd(qq11,rinv11);
1648             felec            = _mm256_mul_pd(velec,rinvsq11);
1649
1650             fscal            = felec;
1651
1652             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1653
1654             /* Calculate temporary vectorial force */
1655             tx               = _mm256_mul_pd(fscal,dx11);
1656             ty               = _mm256_mul_pd(fscal,dy11);
1657             tz               = _mm256_mul_pd(fscal,dz11);
1658
1659             /* Update vectorial force */
1660             fix1             = _mm256_add_pd(fix1,tx);
1661             fiy1             = _mm256_add_pd(fiy1,ty);
1662             fiz1             = _mm256_add_pd(fiz1,tz);
1663
1664             fjx1             = _mm256_add_pd(fjx1,tx);
1665             fjy1             = _mm256_add_pd(fjy1,ty);
1666             fjz1             = _mm256_add_pd(fjz1,tz);
1667
1668             /**************************
1669              * CALCULATE INTERACTIONS *
1670              **************************/
1671
1672             /* COULOMB ELECTROSTATICS */
1673             velec            = _mm256_mul_pd(qq12,rinv12);
1674             felec            = _mm256_mul_pd(velec,rinvsq12);
1675
1676             fscal            = felec;
1677
1678             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1679
1680             /* Calculate temporary vectorial force */
1681             tx               = _mm256_mul_pd(fscal,dx12);
1682             ty               = _mm256_mul_pd(fscal,dy12);
1683             tz               = _mm256_mul_pd(fscal,dz12);
1684
1685             /* Update vectorial force */
1686             fix1             = _mm256_add_pd(fix1,tx);
1687             fiy1             = _mm256_add_pd(fiy1,ty);
1688             fiz1             = _mm256_add_pd(fiz1,tz);
1689
1690             fjx2             = _mm256_add_pd(fjx2,tx);
1691             fjy2             = _mm256_add_pd(fjy2,ty);
1692             fjz2             = _mm256_add_pd(fjz2,tz);
1693
1694             /**************************
1695              * CALCULATE INTERACTIONS *
1696              **************************/
1697
1698             /* COULOMB ELECTROSTATICS */
1699             velec            = _mm256_mul_pd(qq20,rinv20);
1700             felec            = _mm256_mul_pd(velec,rinvsq20);
1701
1702             fscal            = felec;
1703
1704             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1705
1706             /* Calculate temporary vectorial force */
1707             tx               = _mm256_mul_pd(fscal,dx20);
1708             ty               = _mm256_mul_pd(fscal,dy20);
1709             tz               = _mm256_mul_pd(fscal,dz20);
1710
1711             /* Update vectorial force */
1712             fix2             = _mm256_add_pd(fix2,tx);
1713             fiy2             = _mm256_add_pd(fiy2,ty);
1714             fiz2             = _mm256_add_pd(fiz2,tz);
1715
1716             fjx0             = _mm256_add_pd(fjx0,tx);
1717             fjy0             = _mm256_add_pd(fjy0,ty);
1718             fjz0             = _mm256_add_pd(fjz0,tz);
1719
1720             /**************************
1721              * CALCULATE INTERACTIONS *
1722              **************************/
1723
1724             /* COULOMB ELECTROSTATICS */
1725             velec            = _mm256_mul_pd(qq21,rinv21);
1726             felec            = _mm256_mul_pd(velec,rinvsq21);
1727
1728             fscal            = felec;
1729
1730             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1731
1732             /* Calculate temporary vectorial force */
1733             tx               = _mm256_mul_pd(fscal,dx21);
1734             ty               = _mm256_mul_pd(fscal,dy21);
1735             tz               = _mm256_mul_pd(fscal,dz21);
1736
1737             /* Update vectorial force */
1738             fix2             = _mm256_add_pd(fix2,tx);
1739             fiy2             = _mm256_add_pd(fiy2,ty);
1740             fiz2             = _mm256_add_pd(fiz2,tz);
1741
1742             fjx1             = _mm256_add_pd(fjx1,tx);
1743             fjy1             = _mm256_add_pd(fjy1,ty);
1744             fjz1             = _mm256_add_pd(fjz1,tz);
1745
1746             /**************************
1747              * CALCULATE INTERACTIONS *
1748              **************************/
1749
1750             /* COULOMB ELECTROSTATICS */
1751             velec            = _mm256_mul_pd(qq22,rinv22);
1752             felec            = _mm256_mul_pd(velec,rinvsq22);
1753
1754             fscal            = felec;
1755
1756             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1757
1758             /* Calculate temporary vectorial force */
1759             tx               = _mm256_mul_pd(fscal,dx22);
1760             ty               = _mm256_mul_pd(fscal,dy22);
1761             tz               = _mm256_mul_pd(fscal,dz22);
1762
1763             /* Update vectorial force */
1764             fix2             = _mm256_add_pd(fix2,tx);
1765             fiy2             = _mm256_add_pd(fiy2,ty);
1766             fiz2             = _mm256_add_pd(fiz2,tz);
1767
1768             fjx2             = _mm256_add_pd(fjx2,tx);
1769             fjy2             = _mm256_add_pd(fjy2,ty);
1770             fjz2             = _mm256_add_pd(fjz2,tz);
1771
1772             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1773             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1774             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1775             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1776
1777             gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1778                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1779
1780             /* Inner loop uses 241 flops */
1781         }
1782
1783         /* End of innermost loop */
1784
1785         gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1786                                                  f+i_coord_offset,fshift+i_shift_offset);
1787
1788         /* Increment number of inner iterations */
1789         inneriter                  += j_index_end - j_index_start;
1790
1791         /* Outer loop uses 18 flops */
1792     }
1793
1794     /* Increment number of outer iterations */
1795     outeriter        += nri;
1796
1797     /* Update outer/inner flops */
1798
1799     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*241);
1800 }