Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_double / nb_kernel_ElecCoul_VdwLJ_GeomW4P1_avx_256_double.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_256_double
51  * Electrostatics interaction: Coulomb
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water4-Particle
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_avx_256_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     real *           vdwioffsetptr0;
84     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     real *           vdwioffsetptr1;
86     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     real *           vdwioffsetptr2;
88     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     real *           vdwioffsetptr3;
90     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
96     __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
97     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
98     real             *charge;
99     int              nvdwtype;
100     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101     int              *vdwtype;
102     real             *vdwparam;
103     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
104     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
105     __m256d          dummy_mask,cutoff_mask;
106     __m128           tmpmask0,tmpmask1;
107     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
108     __m256d          one     = _mm256_set1_pd(1.0);
109     __m256d          two     = _mm256_set1_pd(2.0);
110     x                = xx[0];
111     f                = ff[0];
112
113     nri              = nlist->nri;
114     iinr             = nlist->iinr;
115     jindex           = nlist->jindex;
116     jjnr             = nlist->jjnr;
117     shiftidx         = nlist->shift;
118     gid              = nlist->gid;
119     shiftvec         = fr->shift_vec[0];
120     fshift           = fr->fshift[0];
121     facel            = _mm256_set1_pd(fr->ic->epsfac);
122     charge           = mdatoms->chargeA;
123     nvdwtype         = fr->ntype;
124     vdwparam         = fr->nbfp;
125     vdwtype          = mdatoms->typeA;
126
127     /* Setup water-specific parameters */
128     inr              = nlist->iinr[0];
129     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
130     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
131     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
132     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
133
134     /* Avoid stupid compiler warnings */
135     jnrA = jnrB = jnrC = jnrD = 0;
136     j_coord_offsetA = 0;
137     j_coord_offsetB = 0;
138     j_coord_offsetC = 0;
139     j_coord_offsetD = 0;
140
141     outeriter        = 0;
142     inneriter        = 0;
143
144     for(iidx=0;iidx<4*DIM;iidx++)
145     {
146         scratch[iidx] = 0.0;
147     }
148
149     /* Start outer loop over neighborlists */
150     for(iidx=0; iidx<nri; iidx++)
151     {
152         /* Load shift vector for this list */
153         i_shift_offset   = DIM*shiftidx[iidx];
154
155         /* Load limits for loop over neighbors */
156         j_index_start    = jindex[iidx];
157         j_index_end      = jindex[iidx+1];
158
159         /* Get outer coordinate index */
160         inr              = iinr[iidx];
161         i_coord_offset   = DIM*inr;
162
163         /* Load i particle coords and add shift vector */
164         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
165                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
166
167         fix0             = _mm256_setzero_pd();
168         fiy0             = _mm256_setzero_pd();
169         fiz0             = _mm256_setzero_pd();
170         fix1             = _mm256_setzero_pd();
171         fiy1             = _mm256_setzero_pd();
172         fiz1             = _mm256_setzero_pd();
173         fix2             = _mm256_setzero_pd();
174         fiy2             = _mm256_setzero_pd();
175         fiz2             = _mm256_setzero_pd();
176         fix3             = _mm256_setzero_pd();
177         fiy3             = _mm256_setzero_pd();
178         fiz3             = _mm256_setzero_pd();
179
180         /* Reset potential sums */
181         velecsum         = _mm256_setzero_pd();
182         vvdwsum          = _mm256_setzero_pd();
183
184         /* Start inner kernel loop */
185         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
186         {
187
188             /* Get j neighbor index, and coordinate index */
189             jnrA             = jjnr[jidx];
190             jnrB             = jjnr[jidx+1];
191             jnrC             = jjnr[jidx+2];
192             jnrD             = jjnr[jidx+3];
193             j_coord_offsetA  = DIM*jnrA;
194             j_coord_offsetB  = DIM*jnrB;
195             j_coord_offsetC  = DIM*jnrC;
196             j_coord_offsetD  = DIM*jnrD;
197
198             /* load j atom coordinates */
199             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
200                                                  x+j_coord_offsetC,x+j_coord_offsetD,
201                                                  &jx0,&jy0,&jz0);
202
203             /* Calculate displacement vector */
204             dx00             = _mm256_sub_pd(ix0,jx0);
205             dy00             = _mm256_sub_pd(iy0,jy0);
206             dz00             = _mm256_sub_pd(iz0,jz0);
207             dx10             = _mm256_sub_pd(ix1,jx0);
208             dy10             = _mm256_sub_pd(iy1,jy0);
209             dz10             = _mm256_sub_pd(iz1,jz0);
210             dx20             = _mm256_sub_pd(ix2,jx0);
211             dy20             = _mm256_sub_pd(iy2,jy0);
212             dz20             = _mm256_sub_pd(iz2,jz0);
213             dx30             = _mm256_sub_pd(ix3,jx0);
214             dy30             = _mm256_sub_pd(iy3,jy0);
215             dz30             = _mm256_sub_pd(iz3,jz0);
216
217             /* Calculate squared distance and things based on it */
218             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
219             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
220             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
221             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
222
223             rinv10           = avx256_invsqrt_d(rsq10);
224             rinv20           = avx256_invsqrt_d(rsq20);
225             rinv30           = avx256_invsqrt_d(rsq30);
226
227             rinvsq00         = avx256_inv_d(rsq00);
228             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
229             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
230             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
231
232             /* Load parameters for j particles */
233             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
234                                                                  charge+jnrC+0,charge+jnrD+0);
235             vdwjidx0A        = 2*vdwtype[jnrA+0];
236             vdwjidx0B        = 2*vdwtype[jnrB+0];
237             vdwjidx0C        = 2*vdwtype[jnrC+0];
238             vdwjidx0D        = 2*vdwtype[jnrD+0];
239
240             fjx0             = _mm256_setzero_pd();
241             fjy0             = _mm256_setzero_pd();
242             fjz0             = _mm256_setzero_pd();
243
244             /**************************
245              * CALCULATE INTERACTIONS *
246              **************************/
247
248             /* Compute parameters for interactions between i and j atoms */
249             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
250                                             vdwioffsetptr0+vdwjidx0B,
251                                             vdwioffsetptr0+vdwjidx0C,
252                                             vdwioffsetptr0+vdwjidx0D,
253                                             &c6_00,&c12_00);
254
255             /* LENNARD-JONES DISPERSION/REPULSION */
256
257             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
258             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
259             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
260             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
261             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
262
263             /* Update potential sum for this i atom from the interaction with this j atom. */
264             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
265
266             fscal            = fvdw;
267
268             /* Calculate temporary vectorial force */
269             tx               = _mm256_mul_pd(fscal,dx00);
270             ty               = _mm256_mul_pd(fscal,dy00);
271             tz               = _mm256_mul_pd(fscal,dz00);
272
273             /* Update vectorial force */
274             fix0             = _mm256_add_pd(fix0,tx);
275             fiy0             = _mm256_add_pd(fiy0,ty);
276             fiz0             = _mm256_add_pd(fiz0,tz);
277
278             fjx0             = _mm256_add_pd(fjx0,tx);
279             fjy0             = _mm256_add_pd(fjy0,ty);
280             fjz0             = _mm256_add_pd(fjz0,tz);
281
282             /**************************
283              * CALCULATE INTERACTIONS *
284              **************************/
285
286             /* Compute parameters for interactions between i and j atoms */
287             qq10             = _mm256_mul_pd(iq1,jq0);
288
289             /* COULOMB ELECTROSTATICS */
290             velec            = _mm256_mul_pd(qq10,rinv10);
291             felec            = _mm256_mul_pd(velec,rinvsq10);
292
293             /* Update potential sum for this i atom from the interaction with this j atom. */
294             velecsum         = _mm256_add_pd(velecsum,velec);
295
296             fscal            = felec;
297
298             /* Calculate temporary vectorial force */
299             tx               = _mm256_mul_pd(fscal,dx10);
300             ty               = _mm256_mul_pd(fscal,dy10);
301             tz               = _mm256_mul_pd(fscal,dz10);
302
303             /* Update vectorial force */
304             fix1             = _mm256_add_pd(fix1,tx);
305             fiy1             = _mm256_add_pd(fiy1,ty);
306             fiz1             = _mm256_add_pd(fiz1,tz);
307
308             fjx0             = _mm256_add_pd(fjx0,tx);
309             fjy0             = _mm256_add_pd(fjy0,ty);
310             fjz0             = _mm256_add_pd(fjz0,tz);
311
312             /**************************
313              * CALCULATE INTERACTIONS *
314              **************************/
315
316             /* Compute parameters for interactions between i and j atoms */
317             qq20             = _mm256_mul_pd(iq2,jq0);
318
319             /* COULOMB ELECTROSTATICS */
320             velec            = _mm256_mul_pd(qq20,rinv20);
321             felec            = _mm256_mul_pd(velec,rinvsq20);
322
323             /* Update potential sum for this i atom from the interaction with this j atom. */
324             velecsum         = _mm256_add_pd(velecsum,velec);
325
326             fscal            = felec;
327
328             /* Calculate temporary vectorial force */
329             tx               = _mm256_mul_pd(fscal,dx20);
330             ty               = _mm256_mul_pd(fscal,dy20);
331             tz               = _mm256_mul_pd(fscal,dz20);
332
333             /* Update vectorial force */
334             fix2             = _mm256_add_pd(fix2,tx);
335             fiy2             = _mm256_add_pd(fiy2,ty);
336             fiz2             = _mm256_add_pd(fiz2,tz);
337
338             fjx0             = _mm256_add_pd(fjx0,tx);
339             fjy0             = _mm256_add_pd(fjy0,ty);
340             fjz0             = _mm256_add_pd(fjz0,tz);
341
342             /**************************
343              * CALCULATE INTERACTIONS *
344              **************************/
345
346             /* Compute parameters for interactions between i and j atoms */
347             qq30             = _mm256_mul_pd(iq3,jq0);
348
349             /* COULOMB ELECTROSTATICS */
350             velec            = _mm256_mul_pd(qq30,rinv30);
351             felec            = _mm256_mul_pd(velec,rinvsq30);
352
353             /* Update potential sum for this i atom from the interaction with this j atom. */
354             velecsum         = _mm256_add_pd(velecsum,velec);
355
356             fscal            = felec;
357
358             /* Calculate temporary vectorial force */
359             tx               = _mm256_mul_pd(fscal,dx30);
360             ty               = _mm256_mul_pd(fscal,dy30);
361             tz               = _mm256_mul_pd(fscal,dz30);
362
363             /* Update vectorial force */
364             fix3             = _mm256_add_pd(fix3,tx);
365             fiy3             = _mm256_add_pd(fiy3,ty);
366             fiz3             = _mm256_add_pd(fiz3,tz);
367
368             fjx0             = _mm256_add_pd(fjx0,tx);
369             fjy0             = _mm256_add_pd(fjy0,ty);
370             fjz0             = _mm256_add_pd(fjz0,tz);
371
372             fjptrA             = f+j_coord_offsetA;
373             fjptrB             = f+j_coord_offsetB;
374             fjptrC             = f+j_coord_offsetC;
375             fjptrD             = f+j_coord_offsetD;
376
377             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
378
379             /* Inner loop uses 116 flops */
380         }
381
382         if(jidx<j_index_end)
383         {
384
385             /* Get j neighbor index, and coordinate index */
386             jnrlistA         = jjnr[jidx];
387             jnrlistB         = jjnr[jidx+1];
388             jnrlistC         = jjnr[jidx+2];
389             jnrlistD         = jjnr[jidx+3];
390             /* Sign of each element will be negative for non-real atoms.
391              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
392              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
393              */
394             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
395
396             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
397             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
398             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
399
400             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
401             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
402             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
403             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
404             j_coord_offsetA  = DIM*jnrA;
405             j_coord_offsetB  = DIM*jnrB;
406             j_coord_offsetC  = DIM*jnrC;
407             j_coord_offsetD  = DIM*jnrD;
408
409             /* load j atom coordinates */
410             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
411                                                  x+j_coord_offsetC,x+j_coord_offsetD,
412                                                  &jx0,&jy0,&jz0);
413
414             /* Calculate displacement vector */
415             dx00             = _mm256_sub_pd(ix0,jx0);
416             dy00             = _mm256_sub_pd(iy0,jy0);
417             dz00             = _mm256_sub_pd(iz0,jz0);
418             dx10             = _mm256_sub_pd(ix1,jx0);
419             dy10             = _mm256_sub_pd(iy1,jy0);
420             dz10             = _mm256_sub_pd(iz1,jz0);
421             dx20             = _mm256_sub_pd(ix2,jx0);
422             dy20             = _mm256_sub_pd(iy2,jy0);
423             dz20             = _mm256_sub_pd(iz2,jz0);
424             dx30             = _mm256_sub_pd(ix3,jx0);
425             dy30             = _mm256_sub_pd(iy3,jy0);
426             dz30             = _mm256_sub_pd(iz3,jz0);
427
428             /* Calculate squared distance and things based on it */
429             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
430             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
431             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
432             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
433
434             rinv10           = avx256_invsqrt_d(rsq10);
435             rinv20           = avx256_invsqrt_d(rsq20);
436             rinv30           = avx256_invsqrt_d(rsq30);
437
438             rinvsq00         = avx256_inv_d(rsq00);
439             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
440             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
441             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
442
443             /* Load parameters for j particles */
444             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
445                                                                  charge+jnrC+0,charge+jnrD+0);
446             vdwjidx0A        = 2*vdwtype[jnrA+0];
447             vdwjidx0B        = 2*vdwtype[jnrB+0];
448             vdwjidx0C        = 2*vdwtype[jnrC+0];
449             vdwjidx0D        = 2*vdwtype[jnrD+0];
450
451             fjx0             = _mm256_setzero_pd();
452             fjy0             = _mm256_setzero_pd();
453             fjz0             = _mm256_setzero_pd();
454
455             /**************************
456              * CALCULATE INTERACTIONS *
457              **************************/
458
459             /* Compute parameters for interactions between i and j atoms */
460             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
461                                             vdwioffsetptr0+vdwjidx0B,
462                                             vdwioffsetptr0+vdwjidx0C,
463                                             vdwioffsetptr0+vdwjidx0D,
464                                             &c6_00,&c12_00);
465
466             /* LENNARD-JONES DISPERSION/REPULSION */
467
468             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
469             vvdw6            = _mm256_mul_pd(c6_00,rinvsix);
470             vvdw12           = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
471             vvdw             = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
472             fvdw             = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
473
474             /* Update potential sum for this i atom from the interaction with this j atom. */
475             vvdw             = _mm256_andnot_pd(dummy_mask,vvdw);
476             vvdwsum          = _mm256_add_pd(vvdwsum,vvdw);
477
478             fscal            = fvdw;
479
480             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
481
482             /* Calculate temporary vectorial force */
483             tx               = _mm256_mul_pd(fscal,dx00);
484             ty               = _mm256_mul_pd(fscal,dy00);
485             tz               = _mm256_mul_pd(fscal,dz00);
486
487             /* Update vectorial force */
488             fix0             = _mm256_add_pd(fix0,tx);
489             fiy0             = _mm256_add_pd(fiy0,ty);
490             fiz0             = _mm256_add_pd(fiz0,tz);
491
492             fjx0             = _mm256_add_pd(fjx0,tx);
493             fjy0             = _mm256_add_pd(fjy0,ty);
494             fjz0             = _mm256_add_pd(fjz0,tz);
495
496             /**************************
497              * CALCULATE INTERACTIONS *
498              **************************/
499
500             /* Compute parameters for interactions between i and j atoms */
501             qq10             = _mm256_mul_pd(iq1,jq0);
502
503             /* COULOMB ELECTROSTATICS */
504             velec            = _mm256_mul_pd(qq10,rinv10);
505             felec            = _mm256_mul_pd(velec,rinvsq10);
506
507             /* Update potential sum for this i atom from the interaction with this j atom. */
508             velec            = _mm256_andnot_pd(dummy_mask,velec);
509             velecsum         = _mm256_add_pd(velecsum,velec);
510
511             fscal            = felec;
512
513             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
514
515             /* Calculate temporary vectorial force */
516             tx               = _mm256_mul_pd(fscal,dx10);
517             ty               = _mm256_mul_pd(fscal,dy10);
518             tz               = _mm256_mul_pd(fscal,dz10);
519
520             /* Update vectorial force */
521             fix1             = _mm256_add_pd(fix1,tx);
522             fiy1             = _mm256_add_pd(fiy1,ty);
523             fiz1             = _mm256_add_pd(fiz1,tz);
524
525             fjx0             = _mm256_add_pd(fjx0,tx);
526             fjy0             = _mm256_add_pd(fjy0,ty);
527             fjz0             = _mm256_add_pd(fjz0,tz);
528
529             /**************************
530              * CALCULATE INTERACTIONS *
531              **************************/
532
533             /* Compute parameters for interactions between i and j atoms */
534             qq20             = _mm256_mul_pd(iq2,jq0);
535
536             /* COULOMB ELECTROSTATICS */
537             velec            = _mm256_mul_pd(qq20,rinv20);
538             felec            = _mm256_mul_pd(velec,rinvsq20);
539
540             /* Update potential sum for this i atom from the interaction with this j atom. */
541             velec            = _mm256_andnot_pd(dummy_mask,velec);
542             velecsum         = _mm256_add_pd(velecsum,velec);
543
544             fscal            = felec;
545
546             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
547
548             /* Calculate temporary vectorial force */
549             tx               = _mm256_mul_pd(fscal,dx20);
550             ty               = _mm256_mul_pd(fscal,dy20);
551             tz               = _mm256_mul_pd(fscal,dz20);
552
553             /* Update vectorial force */
554             fix2             = _mm256_add_pd(fix2,tx);
555             fiy2             = _mm256_add_pd(fiy2,ty);
556             fiz2             = _mm256_add_pd(fiz2,tz);
557
558             fjx0             = _mm256_add_pd(fjx0,tx);
559             fjy0             = _mm256_add_pd(fjy0,ty);
560             fjz0             = _mm256_add_pd(fjz0,tz);
561
562             /**************************
563              * CALCULATE INTERACTIONS *
564              **************************/
565
566             /* Compute parameters for interactions between i and j atoms */
567             qq30             = _mm256_mul_pd(iq3,jq0);
568
569             /* COULOMB ELECTROSTATICS */
570             velec            = _mm256_mul_pd(qq30,rinv30);
571             felec            = _mm256_mul_pd(velec,rinvsq30);
572
573             /* Update potential sum for this i atom from the interaction with this j atom. */
574             velec            = _mm256_andnot_pd(dummy_mask,velec);
575             velecsum         = _mm256_add_pd(velecsum,velec);
576
577             fscal            = felec;
578
579             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
580
581             /* Calculate temporary vectorial force */
582             tx               = _mm256_mul_pd(fscal,dx30);
583             ty               = _mm256_mul_pd(fscal,dy30);
584             tz               = _mm256_mul_pd(fscal,dz30);
585
586             /* Update vectorial force */
587             fix3             = _mm256_add_pd(fix3,tx);
588             fiy3             = _mm256_add_pd(fiy3,ty);
589             fiz3             = _mm256_add_pd(fiz3,tz);
590
591             fjx0             = _mm256_add_pd(fjx0,tx);
592             fjy0             = _mm256_add_pd(fjy0,ty);
593             fjz0             = _mm256_add_pd(fjz0,tz);
594
595             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
596             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
597             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
598             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
599
600             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
601
602             /* Inner loop uses 116 flops */
603         }
604
605         /* End of innermost loop */
606
607         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
608                                                  f+i_coord_offset,fshift+i_shift_offset);
609
610         ggid                        = gid[iidx];
611         /* Update potential energies */
612         gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
613         gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
614
615         /* Increment number of inner iterations */
616         inneriter                  += j_index_end - j_index_start;
617
618         /* Outer loop uses 26 flops */
619     }
620
621     /* Increment number of outer iterations */
622     outeriter        += nri;
623
624     /* Update outer/inner flops */
625
626     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*116);
627 }
628 /*
629  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_256_double
630  * Electrostatics interaction: Coulomb
631  * VdW interaction:            LennardJones
632  * Geometry:                   Water4-Particle
633  * Calculate force/pot:        Force
634  */
635 void
636 nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_avx_256_double
637                     (t_nblist                    * gmx_restrict       nlist,
638                      rvec                        * gmx_restrict          xx,
639                      rvec                        * gmx_restrict          ff,
640                      struct t_forcerec           * gmx_restrict          fr,
641                      t_mdatoms                   * gmx_restrict     mdatoms,
642                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
643                      t_nrnb                      * gmx_restrict        nrnb)
644 {
645     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
646      * just 0 for non-waters.
647      * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
648      * jnr indices corresponding to data put in the four positions in the SIMD register.
649      */
650     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
651     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
652     int              jnrA,jnrB,jnrC,jnrD;
653     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
654     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
655     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
656     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
657     real             rcutoff_scalar;
658     real             *shiftvec,*fshift,*x,*f;
659     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
660     real             scratch[4*DIM];
661     __m256d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
662     real *           vdwioffsetptr0;
663     __m256d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
664     real *           vdwioffsetptr1;
665     __m256d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
666     real *           vdwioffsetptr2;
667     __m256d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
668     real *           vdwioffsetptr3;
669     __m256d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
670     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
671     __m256d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
672     __m256d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
673     __m256d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
674     __m256d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
675     __m256d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
676     __m256d          velec,felec,velecsum,facel,crf,krf,krf2;
677     real             *charge;
678     int              nvdwtype;
679     __m256d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
680     int              *vdwtype;
681     real             *vdwparam;
682     __m256d          one_sixth   = _mm256_set1_pd(1.0/6.0);
683     __m256d          one_twelfth = _mm256_set1_pd(1.0/12.0);
684     __m256d          dummy_mask,cutoff_mask;
685     __m128           tmpmask0,tmpmask1;
686     __m256d          signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
687     __m256d          one     = _mm256_set1_pd(1.0);
688     __m256d          two     = _mm256_set1_pd(2.0);
689     x                = xx[0];
690     f                = ff[0];
691
692     nri              = nlist->nri;
693     iinr             = nlist->iinr;
694     jindex           = nlist->jindex;
695     jjnr             = nlist->jjnr;
696     shiftidx         = nlist->shift;
697     gid              = nlist->gid;
698     shiftvec         = fr->shift_vec[0];
699     fshift           = fr->fshift[0];
700     facel            = _mm256_set1_pd(fr->ic->epsfac);
701     charge           = mdatoms->chargeA;
702     nvdwtype         = fr->ntype;
703     vdwparam         = fr->nbfp;
704     vdwtype          = mdatoms->typeA;
705
706     /* Setup water-specific parameters */
707     inr              = nlist->iinr[0];
708     iq1              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
709     iq2              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
710     iq3              = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
711     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
712
713     /* Avoid stupid compiler warnings */
714     jnrA = jnrB = jnrC = jnrD = 0;
715     j_coord_offsetA = 0;
716     j_coord_offsetB = 0;
717     j_coord_offsetC = 0;
718     j_coord_offsetD = 0;
719
720     outeriter        = 0;
721     inneriter        = 0;
722
723     for(iidx=0;iidx<4*DIM;iidx++)
724     {
725         scratch[iidx] = 0.0;
726     }
727
728     /* Start outer loop over neighborlists */
729     for(iidx=0; iidx<nri; iidx++)
730     {
731         /* Load shift vector for this list */
732         i_shift_offset   = DIM*shiftidx[iidx];
733
734         /* Load limits for loop over neighbors */
735         j_index_start    = jindex[iidx];
736         j_index_end      = jindex[iidx+1];
737
738         /* Get outer coordinate index */
739         inr              = iinr[iidx];
740         i_coord_offset   = DIM*inr;
741
742         /* Load i particle coords and add shift vector */
743         gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
744                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
745
746         fix0             = _mm256_setzero_pd();
747         fiy0             = _mm256_setzero_pd();
748         fiz0             = _mm256_setzero_pd();
749         fix1             = _mm256_setzero_pd();
750         fiy1             = _mm256_setzero_pd();
751         fiz1             = _mm256_setzero_pd();
752         fix2             = _mm256_setzero_pd();
753         fiy2             = _mm256_setzero_pd();
754         fiz2             = _mm256_setzero_pd();
755         fix3             = _mm256_setzero_pd();
756         fiy3             = _mm256_setzero_pd();
757         fiz3             = _mm256_setzero_pd();
758
759         /* Start inner kernel loop */
760         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
761         {
762
763             /* Get j neighbor index, and coordinate index */
764             jnrA             = jjnr[jidx];
765             jnrB             = jjnr[jidx+1];
766             jnrC             = jjnr[jidx+2];
767             jnrD             = jjnr[jidx+3];
768             j_coord_offsetA  = DIM*jnrA;
769             j_coord_offsetB  = DIM*jnrB;
770             j_coord_offsetC  = DIM*jnrC;
771             j_coord_offsetD  = DIM*jnrD;
772
773             /* load j atom coordinates */
774             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
775                                                  x+j_coord_offsetC,x+j_coord_offsetD,
776                                                  &jx0,&jy0,&jz0);
777
778             /* Calculate displacement vector */
779             dx00             = _mm256_sub_pd(ix0,jx0);
780             dy00             = _mm256_sub_pd(iy0,jy0);
781             dz00             = _mm256_sub_pd(iz0,jz0);
782             dx10             = _mm256_sub_pd(ix1,jx0);
783             dy10             = _mm256_sub_pd(iy1,jy0);
784             dz10             = _mm256_sub_pd(iz1,jz0);
785             dx20             = _mm256_sub_pd(ix2,jx0);
786             dy20             = _mm256_sub_pd(iy2,jy0);
787             dz20             = _mm256_sub_pd(iz2,jz0);
788             dx30             = _mm256_sub_pd(ix3,jx0);
789             dy30             = _mm256_sub_pd(iy3,jy0);
790             dz30             = _mm256_sub_pd(iz3,jz0);
791
792             /* Calculate squared distance and things based on it */
793             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
794             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
795             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
796             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
797
798             rinv10           = avx256_invsqrt_d(rsq10);
799             rinv20           = avx256_invsqrt_d(rsq20);
800             rinv30           = avx256_invsqrt_d(rsq30);
801
802             rinvsq00         = avx256_inv_d(rsq00);
803             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
804             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
805             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
806
807             /* Load parameters for j particles */
808             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
809                                                                  charge+jnrC+0,charge+jnrD+0);
810             vdwjidx0A        = 2*vdwtype[jnrA+0];
811             vdwjidx0B        = 2*vdwtype[jnrB+0];
812             vdwjidx0C        = 2*vdwtype[jnrC+0];
813             vdwjidx0D        = 2*vdwtype[jnrD+0];
814
815             fjx0             = _mm256_setzero_pd();
816             fjy0             = _mm256_setzero_pd();
817             fjz0             = _mm256_setzero_pd();
818
819             /**************************
820              * CALCULATE INTERACTIONS *
821              **************************/
822
823             /* Compute parameters for interactions between i and j atoms */
824             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
825                                             vdwioffsetptr0+vdwjidx0B,
826                                             vdwioffsetptr0+vdwjidx0C,
827                                             vdwioffsetptr0+vdwjidx0D,
828                                             &c6_00,&c12_00);
829
830             /* LENNARD-JONES DISPERSION/REPULSION */
831
832             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
833             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
834
835             fscal            = fvdw;
836
837             /* Calculate temporary vectorial force */
838             tx               = _mm256_mul_pd(fscal,dx00);
839             ty               = _mm256_mul_pd(fscal,dy00);
840             tz               = _mm256_mul_pd(fscal,dz00);
841
842             /* Update vectorial force */
843             fix0             = _mm256_add_pd(fix0,tx);
844             fiy0             = _mm256_add_pd(fiy0,ty);
845             fiz0             = _mm256_add_pd(fiz0,tz);
846
847             fjx0             = _mm256_add_pd(fjx0,tx);
848             fjy0             = _mm256_add_pd(fjy0,ty);
849             fjz0             = _mm256_add_pd(fjz0,tz);
850
851             /**************************
852              * CALCULATE INTERACTIONS *
853              **************************/
854
855             /* Compute parameters for interactions between i and j atoms */
856             qq10             = _mm256_mul_pd(iq1,jq0);
857
858             /* COULOMB ELECTROSTATICS */
859             velec            = _mm256_mul_pd(qq10,rinv10);
860             felec            = _mm256_mul_pd(velec,rinvsq10);
861
862             fscal            = felec;
863
864             /* Calculate temporary vectorial force */
865             tx               = _mm256_mul_pd(fscal,dx10);
866             ty               = _mm256_mul_pd(fscal,dy10);
867             tz               = _mm256_mul_pd(fscal,dz10);
868
869             /* Update vectorial force */
870             fix1             = _mm256_add_pd(fix1,tx);
871             fiy1             = _mm256_add_pd(fiy1,ty);
872             fiz1             = _mm256_add_pd(fiz1,tz);
873
874             fjx0             = _mm256_add_pd(fjx0,tx);
875             fjy0             = _mm256_add_pd(fjy0,ty);
876             fjz0             = _mm256_add_pd(fjz0,tz);
877
878             /**************************
879              * CALCULATE INTERACTIONS *
880              **************************/
881
882             /* Compute parameters for interactions between i and j atoms */
883             qq20             = _mm256_mul_pd(iq2,jq0);
884
885             /* COULOMB ELECTROSTATICS */
886             velec            = _mm256_mul_pd(qq20,rinv20);
887             felec            = _mm256_mul_pd(velec,rinvsq20);
888
889             fscal            = felec;
890
891             /* Calculate temporary vectorial force */
892             tx               = _mm256_mul_pd(fscal,dx20);
893             ty               = _mm256_mul_pd(fscal,dy20);
894             tz               = _mm256_mul_pd(fscal,dz20);
895
896             /* Update vectorial force */
897             fix2             = _mm256_add_pd(fix2,tx);
898             fiy2             = _mm256_add_pd(fiy2,ty);
899             fiz2             = _mm256_add_pd(fiz2,tz);
900
901             fjx0             = _mm256_add_pd(fjx0,tx);
902             fjy0             = _mm256_add_pd(fjy0,ty);
903             fjz0             = _mm256_add_pd(fjz0,tz);
904
905             /**************************
906              * CALCULATE INTERACTIONS *
907              **************************/
908
909             /* Compute parameters for interactions between i and j atoms */
910             qq30             = _mm256_mul_pd(iq3,jq0);
911
912             /* COULOMB ELECTROSTATICS */
913             velec            = _mm256_mul_pd(qq30,rinv30);
914             felec            = _mm256_mul_pd(velec,rinvsq30);
915
916             fscal            = felec;
917
918             /* Calculate temporary vectorial force */
919             tx               = _mm256_mul_pd(fscal,dx30);
920             ty               = _mm256_mul_pd(fscal,dy30);
921             tz               = _mm256_mul_pd(fscal,dz30);
922
923             /* Update vectorial force */
924             fix3             = _mm256_add_pd(fix3,tx);
925             fiy3             = _mm256_add_pd(fiy3,ty);
926             fiz3             = _mm256_add_pd(fiz3,tz);
927
928             fjx0             = _mm256_add_pd(fjx0,tx);
929             fjy0             = _mm256_add_pd(fjy0,ty);
930             fjz0             = _mm256_add_pd(fjz0,tz);
931
932             fjptrA             = f+j_coord_offsetA;
933             fjptrB             = f+j_coord_offsetB;
934             fjptrC             = f+j_coord_offsetC;
935             fjptrD             = f+j_coord_offsetD;
936
937             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
938
939             /* Inner loop uses 108 flops */
940         }
941
942         if(jidx<j_index_end)
943         {
944
945             /* Get j neighbor index, and coordinate index */
946             jnrlistA         = jjnr[jidx];
947             jnrlistB         = jjnr[jidx+1];
948             jnrlistC         = jjnr[jidx+2];
949             jnrlistD         = jjnr[jidx+3];
950             /* Sign of each element will be negative for non-real atoms.
951              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
952              * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
953              */
954             tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
955
956             tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
957             tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
958             dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
959
960             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
961             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
962             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
963             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
964             j_coord_offsetA  = DIM*jnrA;
965             j_coord_offsetB  = DIM*jnrB;
966             j_coord_offsetC  = DIM*jnrC;
967             j_coord_offsetD  = DIM*jnrD;
968
969             /* load j atom coordinates */
970             gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
971                                                  x+j_coord_offsetC,x+j_coord_offsetD,
972                                                  &jx0,&jy0,&jz0);
973
974             /* Calculate displacement vector */
975             dx00             = _mm256_sub_pd(ix0,jx0);
976             dy00             = _mm256_sub_pd(iy0,jy0);
977             dz00             = _mm256_sub_pd(iz0,jz0);
978             dx10             = _mm256_sub_pd(ix1,jx0);
979             dy10             = _mm256_sub_pd(iy1,jy0);
980             dz10             = _mm256_sub_pd(iz1,jz0);
981             dx20             = _mm256_sub_pd(ix2,jx0);
982             dy20             = _mm256_sub_pd(iy2,jy0);
983             dz20             = _mm256_sub_pd(iz2,jz0);
984             dx30             = _mm256_sub_pd(ix3,jx0);
985             dy30             = _mm256_sub_pd(iy3,jy0);
986             dz30             = _mm256_sub_pd(iz3,jz0);
987
988             /* Calculate squared distance and things based on it */
989             rsq00            = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
990             rsq10            = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
991             rsq20            = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
992             rsq30            = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
993
994             rinv10           = avx256_invsqrt_d(rsq10);
995             rinv20           = avx256_invsqrt_d(rsq20);
996             rinv30           = avx256_invsqrt_d(rsq30);
997
998             rinvsq00         = avx256_inv_d(rsq00);
999             rinvsq10         = _mm256_mul_pd(rinv10,rinv10);
1000             rinvsq20         = _mm256_mul_pd(rinv20,rinv20);
1001             rinvsq30         = _mm256_mul_pd(rinv30,rinv30);
1002
1003             /* Load parameters for j particles */
1004             jq0              = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1005                                                                  charge+jnrC+0,charge+jnrD+0);
1006             vdwjidx0A        = 2*vdwtype[jnrA+0];
1007             vdwjidx0B        = 2*vdwtype[jnrB+0];
1008             vdwjidx0C        = 2*vdwtype[jnrC+0];
1009             vdwjidx0D        = 2*vdwtype[jnrD+0];
1010
1011             fjx0             = _mm256_setzero_pd();
1012             fjy0             = _mm256_setzero_pd();
1013             fjz0             = _mm256_setzero_pd();
1014
1015             /**************************
1016              * CALCULATE INTERACTIONS *
1017              **************************/
1018
1019             /* Compute parameters for interactions between i and j atoms */
1020             gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1021                                             vdwioffsetptr0+vdwjidx0B,
1022                                             vdwioffsetptr0+vdwjidx0C,
1023                                             vdwioffsetptr0+vdwjidx0D,
1024                                             &c6_00,&c12_00);
1025
1026             /* LENNARD-JONES DISPERSION/REPULSION */
1027
1028             rinvsix          = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1029             fvdw             = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1030
1031             fscal            = fvdw;
1032
1033             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1034
1035             /* Calculate temporary vectorial force */
1036             tx               = _mm256_mul_pd(fscal,dx00);
1037             ty               = _mm256_mul_pd(fscal,dy00);
1038             tz               = _mm256_mul_pd(fscal,dz00);
1039
1040             /* Update vectorial force */
1041             fix0             = _mm256_add_pd(fix0,tx);
1042             fiy0             = _mm256_add_pd(fiy0,ty);
1043             fiz0             = _mm256_add_pd(fiz0,tz);
1044
1045             fjx0             = _mm256_add_pd(fjx0,tx);
1046             fjy0             = _mm256_add_pd(fjy0,ty);
1047             fjz0             = _mm256_add_pd(fjz0,tz);
1048
1049             /**************************
1050              * CALCULATE INTERACTIONS *
1051              **************************/
1052
1053             /* Compute parameters for interactions between i and j atoms */
1054             qq10             = _mm256_mul_pd(iq1,jq0);
1055
1056             /* COULOMB ELECTROSTATICS */
1057             velec            = _mm256_mul_pd(qq10,rinv10);
1058             felec            = _mm256_mul_pd(velec,rinvsq10);
1059
1060             fscal            = felec;
1061
1062             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1063
1064             /* Calculate temporary vectorial force */
1065             tx               = _mm256_mul_pd(fscal,dx10);
1066             ty               = _mm256_mul_pd(fscal,dy10);
1067             tz               = _mm256_mul_pd(fscal,dz10);
1068
1069             /* Update vectorial force */
1070             fix1             = _mm256_add_pd(fix1,tx);
1071             fiy1             = _mm256_add_pd(fiy1,ty);
1072             fiz1             = _mm256_add_pd(fiz1,tz);
1073
1074             fjx0             = _mm256_add_pd(fjx0,tx);
1075             fjy0             = _mm256_add_pd(fjy0,ty);
1076             fjz0             = _mm256_add_pd(fjz0,tz);
1077
1078             /**************************
1079              * CALCULATE INTERACTIONS *
1080              **************************/
1081
1082             /* Compute parameters for interactions between i and j atoms */
1083             qq20             = _mm256_mul_pd(iq2,jq0);
1084
1085             /* COULOMB ELECTROSTATICS */
1086             velec            = _mm256_mul_pd(qq20,rinv20);
1087             felec            = _mm256_mul_pd(velec,rinvsq20);
1088
1089             fscal            = felec;
1090
1091             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1092
1093             /* Calculate temporary vectorial force */
1094             tx               = _mm256_mul_pd(fscal,dx20);
1095             ty               = _mm256_mul_pd(fscal,dy20);
1096             tz               = _mm256_mul_pd(fscal,dz20);
1097
1098             /* Update vectorial force */
1099             fix2             = _mm256_add_pd(fix2,tx);
1100             fiy2             = _mm256_add_pd(fiy2,ty);
1101             fiz2             = _mm256_add_pd(fiz2,tz);
1102
1103             fjx0             = _mm256_add_pd(fjx0,tx);
1104             fjy0             = _mm256_add_pd(fjy0,ty);
1105             fjz0             = _mm256_add_pd(fjz0,tz);
1106
1107             /**************************
1108              * CALCULATE INTERACTIONS *
1109              **************************/
1110
1111             /* Compute parameters for interactions between i and j atoms */
1112             qq30             = _mm256_mul_pd(iq3,jq0);
1113
1114             /* COULOMB ELECTROSTATICS */
1115             velec            = _mm256_mul_pd(qq30,rinv30);
1116             felec            = _mm256_mul_pd(velec,rinvsq30);
1117
1118             fscal            = felec;
1119
1120             fscal            = _mm256_andnot_pd(dummy_mask,fscal);
1121
1122             /* Calculate temporary vectorial force */
1123             tx               = _mm256_mul_pd(fscal,dx30);
1124             ty               = _mm256_mul_pd(fscal,dy30);
1125             tz               = _mm256_mul_pd(fscal,dz30);
1126
1127             /* Update vectorial force */
1128             fix3             = _mm256_add_pd(fix3,tx);
1129             fiy3             = _mm256_add_pd(fiy3,ty);
1130             fiz3             = _mm256_add_pd(fiz3,tz);
1131
1132             fjx0             = _mm256_add_pd(fjx0,tx);
1133             fjy0             = _mm256_add_pd(fjy0,ty);
1134             fjz0             = _mm256_add_pd(fjz0,tz);
1135
1136             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1137             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1138             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1139             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1140
1141             gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1142
1143             /* Inner loop uses 108 flops */
1144         }
1145
1146         /* End of innermost loop */
1147
1148         gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1149                                                  f+i_coord_offset,fshift+i_shift_offset);
1150
1151         /* Increment number of inner iterations */
1152         inneriter                  += j_index_end - j_index_start;
1153
1154         /* Outer loop uses 24 flops */
1155     }
1156
1157     /* Increment number of outer iterations */
1158     outeriter        += nri;
1159
1160     /* Update outer/inner flops */
1161
1162     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*108);
1163 }