src/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecRF_VdwCSTab_GeomW4W4_avx_256_single.c

   1 /*
   2  * Note: this file was generated by the Gromacs avx_256_single kernel generator.
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  * Copyright (c) 2001-2012, The GROMACS Development Team
   9  *
  10  * Gromacs is a library for molecular simulation and trajectory analysis,
  11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  12  * a full list of developers and information, check out http://www.gromacs.org
  13  *
  14  * This program is free software; you can redistribute it and/or modify it under
  15  * the terms of the GNU Lesser General Public License as published by the Free
  16  * Software Foundation; either version 2 of the License, or (at your option) any
  17  * later version.
  18  *
  19  * To help fund GROMACS development, we humbly ask that you cite
  20  * the papers people have written on it - you can find them on the website.
  21  */
  22 #ifdef HAVE_CONFIG_H
  23 #include <config.h>
  24 #endif
  25
  26 #include <math.h>
  27
  28 #include "../nb_kernel.h"
  29 #include "types/simple.h"
  30 #include "vec.h"
  31 #include "nrnb.h"
  32
  33 #include "gmx_math_x86_avx_256_single.h"
  34 #include "kernelutil_x86_avx_256_single.h"
  35
  36 /*
  37  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_single
  38  * Electrostatics interaction: ReactionField
  39  * VdW interaction:            CubicSplineTable
  40  * Geometry:                   Water4-Water4
  41  * Calculate force/pot:        PotentialAndForce
  42  */
  43 void
  44 nb_kernel_ElecRF_VdwCSTab_GeomW4W4_VF_avx_256_single
  45                     (t_nblist * gmx_restrict                nlist,
  46                      rvec * gmx_restrict                    xx,
  47                      rvec * gmx_restrict                    ff,
  48                      t_forcerec * gmx_restrict              fr,
  49                      t_mdatoms * gmx_restrict               mdatoms,
  50                      nb_kernel_data_t * gmx_restrict        kernel_data,
  51                      t_nrnb * gmx_restrict                  nrnb)
  52 {
  53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
  54      * just 0 for non-waters.
  55      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
  56      * jnr indices corresponding to data put in the eight positions in the SIMD register.
  57      */
  58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
  59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
  60     int              jnrA,jnrB,jnrC,jnrD;
  61     int              jnrE,jnrF,jnrG,jnrH;
  62     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
  63     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
  64     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
  65     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
  66     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
  67     real             rcutoff_scalar;
  68     real             *shiftvec,*fshift,*x,*f;
  69     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
  70     real             scratch[4*DIM];
  71     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
  72     real *           vdwioffsetptr0;
  73     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
  74     real *           vdwioffsetptr1;
  75     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
  76     real *           vdwioffsetptr2;
  77     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
  78     real *           vdwioffsetptr3;
  79     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
  80     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
  81     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
  82     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
  83     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
  84     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
  85     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
  86     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
  87     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
  88     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
  89     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
  90     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
  91     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
  92     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
  93     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
  94     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
  95     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
  96     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
  97     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
  98     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
  99     real             *charge;
 100     int              nvdwtype;
 101     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 102     int              *vdwtype;
 103     real             *vdwparam;
 104     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
 105     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
 106     __m256i          vfitab;
 107     __m128i          vfitab_lo,vfitab_hi;
 108     __m128i          ifour       = _mm_set1_epi32(4);
 109     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 110     real             *vftab;
 111     __m256           dummy_mask,cutoff_mask;
 112     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 113     __m256           one     = _mm256_set1_ps(1.0);
 114     __m256           two     = _mm256_set1_ps(2.0);
 115     x                = xx[0];
 116     f                = ff[0];
 117
 118     nri              = nlist->nri;
 119     iinr             = nlist->iinr;
 120     jindex           = nlist->jindex;
 121     jjnr             = nlist->jjnr;
 122     shiftidx         = nlist->shift;
 123     gid              = nlist->gid;
 124     shiftvec         = fr->shift_vec[0];
 125     fshift           = fr->fshift[0];
 126     facel            = _mm256_set1_ps(fr->epsfac);
 127     charge           = mdatoms->chargeA;
 128     krf              = _mm256_set1_ps(fr->ic->k_rf);
 129     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
 130     crf              = _mm256_set1_ps(fr->ic->c_rf);
 131     nvdwtype         = fr->ntype;
 132     vdwparam         = fr->nbfp;
 133     vdwtype          = mdatoms->typeA;
 134
 135     vftab            = kernel_data->table_vdw->data;
 136     vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
 137
 138     /* Setup water-specific parameters */
 139     inr              = nlist->iinr[0];
 140     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
 141     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
 142     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
 143     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 144
 145     jq1              = _mm256_set1_ps(charge[inr+1]);
 146     jq2              = _mm256_set1_ps(charge[inr+2]);
 147     jq3              = _mm256_set1_ps(charge[inr+3]);
 148     vdwjidx0A        = 2*vdwtype[inr+0];
 149     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
 150     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
 151     qq11             = _mm256_mul_ps(iq1,jq1);
 152     qq12             = _mm256_mul_ps(iq1,jq2);
 153     qq13             = _mm256_mul_ps(iq1,jq3);
 154     qq21             = _mm256_mul_ps(iq2,jq1);
 155     qq22             = _mm256_mul_ps(iq2,jq2);
 156     qq23             = _mm256_mul_ps(iq2,jq3);
 157     qq31             = _mm256_mul_ps(iq3,jq1);
 158     qq32             = _mm256_mul_ps(iq3,jq2);
 159     qq33             = _mm256_mul_ps(iq3,jq3);
 160
 161     /* Avoid stupid compiler warnings */
 162     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
 163     j_coord_offsetA = 0;
 164     j_coord_offsetB = 0;
 165     j_coord_offsetC = 0;
 166     j_coord_offsetD = 0;
 167     j_coord_offsetE = 0;
 168     j_coord_offsetF = 0;
 169     j_coord_offsetG = 0;
 170     j_coord_offsetH = 0;
 171
 172     outeriter        = 0;
 173     inneriter        = 0;
 174
 175     for(iidx=0;iidx<4*DIM;iidx++)
 176     {
 177         scratch[iidx] = 0.0;
 178     }
 179
 180     /* Start outer loop over neighborlists */
 181     for(iidx=0; iidx<nri; iidx++)
 182     {
 183         /* Load shift vector for this list */
 184         i_shift_offset   = DIM*shiftidx[iidx];
 185
 186         /* Load limits for loop over neighbors */
 187         j_index_start    = jindex[iidx];
 188         j_index_end      = jindex[iidx+1];
 189
 190         /* Get outer coordinate index */
 191         inr              = iinr[iidx];
 192         i_coord_offset   = DIM*inr;
 193
 194         /* Load i particle coords and add shift vector */
 195         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 196                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
 197
 198         fix0             = _mm256_setzero_ps();
 199         fiy0             = _mm256_setzero_ps();
 200         fiz0             = _mm256_setzero_ps();
 201         fix1             = _mm256_setzero_ps();
 202         fiy1             = _mm256_setzero_ps();
 203         fiz1             = _mm256_setzero_ps();
 204         fix2             = _mm256_setzero_ps();
 205         fiy2             = _mm256_setzero_ps();
 206         fiz2             = _mm256_setzero_ps();
 207         fix3             = _mm256_setzero_ps();
 208         fiy3             = _mm256_setzero_ps();
 209         fiz3             = _mm256_setzero_ps();
 210
 211         /* Reset potential sums */
 212         velecsum         = _mm256_setzero_ps();
 213         vvdwsum          = _mm256_setzero_ps();
 214
 215         /* Start inner kernel loop */
 216         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
 217         {
 218
 219             /* Get j neighbor index, and coordinate index */
 220             jnrA             = jjnr[jidx];
 221             jnrB             = jjnr[jidx+1];
 222             jnrC             = jjnr[jidx+2];
 223             jnrD             = jjnr[jidx+3];
 224             jnrE             = jjnr[jidx+4];
 225             jnrF             = jjnr[jidx+5];
 226             jnrG             = jjnr[jidx+6];
 227             jnrH             = jjnr[jidx+7];
 228             j_coord_offsetA  = DIM*jnrA;
 229             j_coord_offsetB  = DIM*jnrB;
 230             j_coord_offsetC  = DIM*jnrC;
 231             j_coord_offsetD  = DIM*jnrD;
 232             j_coord_offsetE  = DIM*jnrE;
 233             j_coord_offsetF  = DIM*jnrF;
 234             j_coord_offsetG  = DIM*jnrG;
 235             j_coord_offsetH  = DIM*jnrH;
 236
 237             /* load j atom coordinates */
 238             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 239                                                  x+j_coord_offsetC,x+j_coord_offsetD,
 240                                                  x+j_coord_offsetE,x+j_coord_offsetF,
 241                                                  x+j_coord_offsetG,x+j_coord_offsetH,
 242                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 243                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
 244
 245             /* Calculate displacement vector */
 246             dx00             = _mm256_sub_ps(ix0,jx0);
 247             dy00             = _mm256_sub_ps(iy0,jy0);
 248             dz00             = _mm256_sub_ps(iz0,jz0);
 249             dx11             = _mm256_sub_ps(ix1,jx1);
 250             dy11             = _mm256_sub_ps(iy1,jy1);
 251             dz11             = _mm256_sub_ps(iz1,jz1);
 252             dx12             = _mm256_sub_ps(ix1,jx2);
 253             dy12             = _mm256_sub_ps(iy1,jy2);
 254             dz12             = _mm256_sub_ps(iz1,jz2);
 255             dx13             = _mm256_sub_ps(ix1,jx3);
 256             dy13             = _mm256_sub_ps(iy1,jy3);
 257             dz13             = _mm256_sub_ps(iz1,jz3);
 258             dx21             = _mm256_sub_ps(ix2,jx1);
 259             dy21             = _mm256_sub_ps(iy2,jy1);
 260             dz21             = _mm256_sub_ps(iz2,jz1);
 261             dx22             = _mm256_sub_ps(ix2,jx2);
 262             dy22             = _mm256_sub_ps(iy2,jy2);
 263             dz22             = _mm256_sub_ps(iz2,jz2);
 264             dx23             = _mm256_sub_ps(ix2,jx3);
 265             dy23             = _mm256_sub_ps(iy2,jy3);
 266             dz23             = _mm256_sub_ps(iz2,jz3);
 267             dx31             = _mm256_sub_ps(ix3,jx1);
 268             dy31             = _mm256_sub_ps(iy3,jy1);
 269             dz31             = _mm256_sub_ps(iz3,jz1);
 270             dx32             = _mm256_sub_ps(ix3,jx2);
 271             dy32             = _mm256_sub_ps(iy3,jy2);
 272             dz32             = _mm256_sub_ps(iz3,jz2);
 273             dx33             = _mm256_sub_ps(ix3,jx3);
 274             dy33             = _mm256_sub_ps(iy3,jy3);
 275             dz33             = _mm256_sub_ps(iz3,jz3);
 276
 277             /* Calculate squared distance and things based on it */
 278             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 279             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
 280             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
 281             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
 282             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
 283             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
 284             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
 285             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
 286             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
 287             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
 288
 289             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 290             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
 291             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
 292             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
 293             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
 294             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
 295             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
 296             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
 297             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
 298             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
 299
 300             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
 301             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
 302             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
 303             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
 304             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
 305             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
 306             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
 307             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
 308             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
 309
 310             fjx0             = _mm256_setzero_ps();
 311             fjy0             = _mm256_setzero_ps();
 312             fjz0             = _mm256_setzero_ps();
 313             fjx1             = _mm256_setzero_ps();
 314             fjy1             = _mm256_setzero_ps();
 315             fjz1             = _mm256_setzero_ps();
 316             fjx2             = _mm256_setzero_ps();
 317             fjy2             = _mm256_setzero_ps();
 318             fjz2             = _mm256_setzero_ps();
 319             fjx3             = _mm256_setzero_ps();
 320             fjy3             = _mm256_setzero_ps();
 321             fjz3             = _mm256_setzero_ps();
 322
 323             /**************************
 324              * CALCULATE INTERACTIONS *
 325              **************************/
 326
 327             r00              = _mm256_mul_ps(rsq00,rinv00);
 328
 329             /* Calculate table index by multiplying r with table scale and truncate to integer */
 330             rt               = _mm256_mul_ps(r00,vftabscale);
 331             vfitab           = _mm256_cvttps_epi32(rt);
 332             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 333             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 334             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
 335             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
 336             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
 337             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
 338
 339             /* CUBIC SPLINE TABLE DISPERSION */
 340             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 341                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 342             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 343                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 344             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 345                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 346             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 347                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 348             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 349             Heps             = _mm256_mul_ps(vfeps,H);
 350             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 351             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
 352             vvdw6            = _mm256_mul_ps(c6_00,VV);
 353             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 354             fvdw6            = _mm256_mul_ps(c6_00,FF);
 355
 356             /* CUBIC SPLINE TABLE REPULSION */
 357             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
 358             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
 359             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 360                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 361             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 362                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 363             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 364                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 365             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 366                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 367             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 368             Heps             = _mm256_mul_ps(vfeps,H);
 369             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 370             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
 371             vvdw12           = _mm256_mul_ps(c12_00,VV);
 372             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 373             fvdw12           = _mm256_mul_ps(c12_00,FF);
 374             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
 375             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
 376
 377             /* Update potential sum for this i atom from the interaction with this j atom. */
 378             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
 379
 380             fscal            = fvdw;
 381
 382             /* Calculate temporary vectorial force */
 383             tx               = _mm256_mul_ps(fscal,dx00);
 384             ty               = _mm256_mul_ps(fscal,dy00);
 385             tz               = _mm256_mul_ps(fscal,dz00);
 386
 387             /* Update vectorial force */
 388             fix0             = _mm256_add_ps(fix0,tx);
 389             fiy0             = _mm256_add_ps(fiy0,ty);
 390             fiz0             = _mm256_add_ps(fiz0,tz);
 391
 392             fjx0             = _mm256_add_ps(fjx0,tx);
 393             fjy0             = _mm256_add_ps(fjy0,ty);
 394             fjz0             = _mm256_add_ps(fjz0,tz);
 395
 396             /**************************
 397              * CALCULATE INTERACTIONS *
 398              **************************/
 399
 400             /* REACTION-FIELD ELECTROSTATICS */
 401             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
 402             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
 403
 404             /* Update potential sum for this i atom from the interaction with this j atom. */
 405             velecsum         = _mm256_add_ps(velecsum,velec);
 406
 407             fscal            = felec;
 408
 409             /* Calculate temporary vectorial force */
 410             tx               = _mm256_mul_ps(fscal,dx11);
 411             ty               = _mm256_mul_ps(fscal,dy11);
 412             tz               = _mm256_mul_ps(fscal,dz11);
 413
 414             /* Update vectorial force */
 415             fix1             = _mm256_add_ps(fix1,tx);
 416             fiy1             = _mm256_add_ps(fiy1,ty);
 417             fiz1             = _mm256_add_ps(fiz1,tz);
 418
 419             fjx1             = _mm256_add_ps(fjx1,tx);
 420             fjy1             = _mm256_add_ps(fjy1,ty);
 421             fjz1             = _mm256_add_ps(fjz1,tz);
 422
 423             /**************************
 424              * CALCULATE INTERACTIONS *
 425              **************************/
 426
 427             /* REACTION-FIELD ELECTROSTATICS */
 428             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
 429             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
 430
 431             /* Update potential sum for this i atom from the interaction with this j atom. */
 432             velecsum         = _mm256_add_ps(velecsum,velec);
 433
 434             fscal            = felec;
 435
 436             /* Calculate temporary vectorial force */
 437             tx               = _mm256_mul_ps(fscal,dx12);
 438             ty               = _mm256_mul_ps(fscal,dy12);
 439             tz               = _mm256_mul_ps(fscal,dz12);
 440
 441             /* Update vectorial force */
 442             fix1             = _mm256_add_ps(fix1,tx);
 443             fiy1             = _mm256_add_ps(fiy1,ty);
 444             fiz1             = _mm256_add_ps(fiz1,tz);
 445
 446             fjx2             = _mm256_add_ps(fjx2,tx);
 447             fjy2             = _mm256_add_ps(fjy2,ty);
 448             fjz2             = _mm256_add_ps(fjz2,tz);
 449
 450             /**************************
 451              * CALCULATE INTERACTIONS *
 452              **************************/
 453
 454             /* REACTION-FIELD ELECTROSTATICS */
 455             velec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
 456             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
 457
 458             /* Update potential sum for this i atom from the interaction with this j atom. */
 459             velecsum         = _mm256_add_ps(velecsum,velec);
 460
 461             fscal            = felec;
 462
 463             /* Calculate temporary vectorial force */
 464             tx               = _mm256_mul_ps(fscal,dx13);
 465             ty               = _mm256_mul_ps(fscal,dy13);
 466             tz               = _mm256_mul_ps(fscal,dz13);
 467
 468             /* Update vectorial force */
 469             fix1             = _mm256_add_ps(fix1,tx);
 470             fiy1             = _mm256_add_ps(fiy1,ty);
 471             fiz1             = _mm256_add_ps(fiz1,tz);
 472
 473             fjx3             = _mm256_add_ps(fjx3,tx);
 474             fjy3             = _mm256_add_ps(fjy3,ty);
 475             fjz3             = _mm256_add_ps(fjz3,tz);
 476
 477             /**************************
 478              * CALCULATE INTERACTIONS *
 479              **************************/
 480
 481             /* REACTION-FIELD ELECTROSTATICS */
 482             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
 483             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
 484
 485             /* Update potential sum for this i atom from the interaction with this j atom. */
 486             velecsum         = _mm256_add_ps(velecsum,velec);
 487
 488             fscal            = felec;
 489
 490             /* Calculate temporary vectorial force */
 491             tx               = _mm256_mul_ps(fscal,dx21);
 492             ty               = _mm256_mul_ps(fscal,dy21);
 493             tz               = _mm256_mul_ps(fscal,dz21);
 494
 495             /* Update vectorial force */
 496             fix2             = _mm256_add_ps(fix2,tx);
 497             fiy2             = _mm256_add_ps(fiy2,ty);
 498             fiz2             = _mm256_add_ps(fiz2,tz);
 499
 500             fjx1             = _mm256_add_ps(fjx1,tx);
 501             fjy1             = _mm256_add_ps(fjy1,ty);
 502             fjz1             = _mm256_add_ps(fjz1,tz);
 503
 504             /**************************
 505              * CALCULATE INTERACTIONS *
 506              **************************/
 507
 508             /* REACTION-FIELD ELECTROSTATICS */
 509             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
 510             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
 511
 512             /* Update potential sum for this i atom from the interaction with this j atom. */
 513             velecsum         = _mm256_add_ps(velecsum,velec);
 514
 515             fscal            = felec;
 516
 517             /* Calculate temporary vectorial force */
 518             tx               = _mm256_mul_ps(fscal,dx22);
 519             ty               = _mm256_mul_ps(fscal,dy22);
 520             tz               = _mm256_mul_ps(fscal,dz22);
 521
 522             /* Update vectorial force */
 523             fix2             = _mm256_add_ps(fix2,tx);
 524             fiy2             = _mm256_add_ps(fiy2,ty);
 525             fiz2             = _mm256_add_ps(fiz2,tz);
 526
 527             fjx2             = _mm256_add_ps(fjx2,tx);
 528             fjy2             = _mm256_add_ps(fjy2,ty);
 529             fjz2             = _mm256_add_ps(fjz2,tz);
 530
 531             /**************************
 532              * CALCULATE INTERACTIONS *
 533              **************************/
 534
 535             /* REACTION-FIELD ELECTROSTATICS */
 536             velec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
 537             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
 538
 539             /* Update potential sum for this i atom from the interaction with this j atom. */
 540             velecsum         = _mm256_add_ps(velecsum,velec);
 541
 542             fscal            = felec;
 543
 544             /* Calculate temporary vectorial force */
 545             tx               = _mm256_mul_ps(fscal,dx23);
 546             ty               = _mm256_mul_ps(fscal,dy23);
 547             tz               = _mm256_mul_ps(fscal,dz23);
 548
 549             /* Update vectorial force */
 550             fix2             = _mm256_add_ps(fix2,tx);
 551             fiy2             = _mm256_add_ps(fiy2,ty);
 552             fiz2             = _mm256_add_ps(fiz2,tz);
 553
 554             fjx3             = _mm256_add_ps(fjx3,tx);
 555             fjy3             = _mm256_add_ps(fjy3,ty);
 556             fjz3             = _mm256_add_ps(fjz3,tz);
 557
 558             /**************************
 559              * CALCULATE INTERACTIONS *
 560              **************************/
 561
 562             /* REACTION-FIELD ELECTROSTATICS */
 563             velec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
 564             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
 565
 566             /* Update potential sum for this i atom from the interaction with this j atom. */
 567             velecsum         = _mm256_add_ps(velecsum,velec);
 568
 569             fscal            = felec;
 570
 571             /* Calculate temporary vectorial force */
 572             tx               = _mm256_mul_ps(fscal,dx31);
 573             ty               = _mm256_mul_ps(fscal,dy31);
 574             tz               = _mm256_mul_ps(fscal,dz31);
 575
 576             /* Update vectorial force */
 577             fix3             = _mm256_add_ps(fix3,tx);
 578             fiy3             = _mm256_add_ps(fiy3,ty);
 579             fiz3             = _mm256_add_ps(fiz3,tz);
 580
 581             fjx1             = _mm256_add_ps(fjx1,tx);
 582             fjy1             = _mm256_add_ps(fjy1,ty);
 583             fjz1             = _mm256_add_ps(fjz1,tz);
 584
 585             /**************************
 586              * CALCULATE INTERACTIONS *
 587              **************************/
 588
 589             /* REACTION-FIELD ELECTROSTATICS */
 590             velec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
 591             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
 592
 593             /* Update potential sum for this i atom from the interaction with this j atom. */
 594             velecsum         = _mm256_add_ps(velecsum,velec);
 595
 596             fscal            = felec;
 597
 598             /* Calculate temporary vectorial force */
 599             tx               = _mm256_mul_ps(fscal,dx32);
 600             ty               = _mm256_mul_ps(fscal,dy32);
 601             tz               = _mm256_mul_ps(fscal,dz32);
 602
 603             /* Update vectorial force */
 604             fix3             = _mm256_add_ps(fix3,tx);
 605             fiy3             = _mm256_add_ps(fiy3,ty);
 606             fiz3             = _mm256_add_ps(fiz3,tz);
 607
 608             fjx2             = _mm256_add_ps(fjx2,tx);
 609             fjy2             = _mm256_add_ps(fjy2,ty);
 610             fjz2             = _mm256_add_ps(fjz2,tz);
 611
 612             /**************************
 613              * CALCULATE INTERACTIONS *
 614              **************************/
 615
 616             /* REACTION-FIELD ELECTROSTATICS */
 617             velec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
 618             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
 619
 620             /* Update potential sum for this i atom from the interaction with this j atom. */
 621             velecsum         = _mm256_add_ps(velecsum,velec);
 622
 623             fscal            = felec;
 624
 625             /* Calculate temporary vectorial force */
 626             tx               = _mm256_mul_ps(fscal,dx33);
 627             ty               = _mm256_mul_ps(fscal,dy33);
 628             tz               = _mm256_mul_ps(fscal,dz33);
 629
 630             /* Update vectorial force */
 631             fix3             = _mm256_add_ps(fix3,tx);
 632             fiy3             = _mm256_add_ps(fiy3,ty);
 633             fiz3             = _mm256_add_ps(fiz3,tz);
 634
 635             fjx3             = _mm256_add_ps(fjx3,tx);
 636             fjy3             = _mm256_add_ps(fjy3,ty);
 637             fjz3             = _mm256_add_ps(fjz3,tz);
 638
 639             fjptrA             = f+j_coord_offsetA;
 640             fjptrB             = f+j_coord_offsetB;
 641             fjptrC             = f+j_coord_offsetC;
 642             fjptrD             = f+j_coord_offsetD;
 643             fjptrE             = f+j_coord_offsetE;
 644             fjptrF             = f+j_coord_offsetF;
 645             fjptrG             = f+j_coord_offsetG;
 646             fjptrH             = f+j_coord_offsetH;
 647
 648             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 649                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
 650                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 651
 652             /* Inner loop uses 347 flops */
 653         }
 654
 655         if(jidx<j_index_end)
 656         {
 657
 658             /* Get j neighbor index, and coordinate index */
 659             jnrlistA         = jjnr[jidx];
 660             jnrlistB         = jjnr[jidx+1];
 661             jnrlistC         = jjnr[jidx+2];
 662             jnrlistD         = jjnr[jidx+3];
 663             jnrlistE         = jjnr[jidx+4];
 664             jnrlistF         = jjnr[jidx+5];
 665             jnrlistG         = jjnr[jidx+6];
 666             jnrlistH         = jjnr[jidx+7];
 667             /* Sign of each element will be negative for non-real atoms.
 668              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 669              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
 670              */
 671             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
 672                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
 673
 674             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 675             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 676             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 677             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 678             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
 679             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
 680             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
 681             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
 682             j_coord_offsetA  = DIM*jnrA;
 683             j_coord_offsetB  = DIM*jnrB;
 684             j_coord_offsetC  = DIM*jnrC;
 685             j_coord_offsetD  = DIM*jnrD;
 686             j_coord_offsetE  = DIM*jnrE;
 687             j_coord_offsetF  = DIM*jnrF;
 688             j_coord_offsetG  = DIM*jnrG;
 689             j_coord_offsetH  = DIM*jnrH;
 690
 691             /* load j atom coordinates */
 692             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 693                                                  x+j_coord_offsetC,x+j_coord_offsetD,
 694                                                  x+j_coord_offsetE,x+j_coord_offsetF,
 695                                                  x+j_coord_offsetG,x+j_coord_offsetH,
 696                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
 697                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
 698
 699             /* Calculate displacement vector */
 700             dx00             = _mm256_sub_ps(ix0,jx0);
 701             dy00             = _mm256_sub_ps(iy0,jy0);
 702             dz00             = _mm256_sub_ps(iz0,jz0);
 703             dx11             = _mm256_sub_ps(ix1,jx1);
 704             dy11             = _mm256_sub_ps(iy1,jy1);
 705             dz11             = _mm256_sub_ps(iz1,jz1);
 706             dx12             = _mm256_sub_ps(ix1,jx2);
 707             dy12             = _mm256_sub_ps(iy1,jy2);
 708             dz12             = _mm256_sub_ps(iz1,jz2);
 709             dx13             = _mm256_sub_ps(ix1,jx3);
 710             dy13             = _mm256_sub_ps(iy1,jy3);
 711             dz13             = _mm256_sub_ps(iz1,jz3);
 712             dx21             = _mm256_sub_ps(ix2,jx1);
 713             dy21             = _mm256_sub_ps(iy2,jy1);
 714             dz21             = _mm256_sub_ps(iz2,jz1);
 715             dx22             = _mm256_sub_ps(ix2,jx2);
 716             dy22             = _mm256_sub_ps(iy2,jy2);
 717             dz22             = _mm256_sub_ps(iz2,jz2);
 718             dx23             = _mm256_sub_ps(ix2,jx3);
 719             dy23             = _mm256_sub_ps(iy2,jy3);
 720             dz23             = _mm256_sub_ps(iz2,jz3);
 721             dx31             = _mm256_sub_ps(ix3,jx1);
 722             dy31             = _mm256_sub_ps(iy3,jy1);
 723             dz31             = _mm256_sub_ps(iz3,jz1);
 724             dx32             = _mm256_sub_ps(ix3,jx2);
 725             dy32             = _mm256_sub_ps(iy3,jy2);
 726             dz32             = _mm256_sub_ps(iz3,jz2);
 727             dx33             = _mm256_sub_ps(ix3,jx3);
 728             dy33             = _mm256_sub_ps(iy3,jy3);
 729             dz33             = _mm256_sub_ps(iz3,jz3);
 730
 731             /* Calculate squared distance and things based on it */
 732             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 733             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
 734             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
 735             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
 736             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
 737             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
 738             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
 739             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
 740             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
 741             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
 742
 743             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
 744             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
 745             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
 746             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
 747             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
 748             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
 749             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
 750             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
 751             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
 752             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
 753
 754             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
 755             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
 756             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
 757             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
 758             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
 759             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
 760             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
 761             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
 762             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
 763
 764             fjx0             = _mm256_setzero_ps();
 765             fjy0             = _mm256_setzero_ps();
 766             fjz0             = _mm256_setzero_ps();
 767             fjx1             = _mm256_setzero_ps();
 768             fjy1             = _mm256_setzero_ps();
 769             fjz1             = _mm256_setzero_ps();
 770             fjx2             = _mm256_setzero_ps();
 771             fjy2             = _mm256_setzero_ps();
 772             fjz2             = _mm256_setzero_ps();
 773             fjx3             = _mm256_setzero_ps();
 774             fjy3             = _mm256_setzero_ps();
 775             fjz3             = _mm256_setzero_ps();
 776
 777             /**************************
 778              * CALCULATE INTERACTIONS *
 779              **************************/
 780
 781             r00              = _mm256_mul_ps(rsq00,rinv00);
 782             r00              = _mm256_andnot_ps(dummy_mask,r00);
 783
 784             /* Calculate table index by multiplying r with table scale and truncate to integer */
 785             rt               = _mm256_mul_ps(r00,vftabscale);
 786             vfitab           = _mm256_cvttps_epi32(rt);
 787             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
 788             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
 789             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
 790             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
 791             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
 792             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
 793
 794             /* CUBIC SPLINE TABLE DISPERSION */
 795             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 796                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 797             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 798                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 799             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 800                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 801             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 802                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 803             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 804             Heps             = _mm256_mul_ps(vfeps,H);
 805             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 806             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
 807             vvdw6            = _mm256_mul_ps(c6_00,VV);
 808             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 809             fvdw6            = _mm256_mul_ps(c6_00,FF);
 810
 811             /* CUBIC SPLINE TABLE REPULSION */
 812             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
 813             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
 814             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
 815                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
 816             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
 817                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
 818             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
 819                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
 820             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
 821                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
 822             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
 823             Heps             = _mm256_mul_ps(vfeps,H);
 824             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
 825             VV               = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
 826             vvdw12           = _mm256_mul_ps(c12_00,VV);
 827             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
 828             fvdw12           = _mm256_mul_ps(c12_00,FF);
 829             vvdw             = _mm256_add_ps(vvdw12,vvdw6);
 830             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
 831
 832             /* Update potential sum for this i atom from the interaction with this j atom. */
 833             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
 834             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
 835
 836             fscal            = fvdw;
 837
 838             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 839
 840             /* Calculate temporary vectorial force */
 841             tx               = _mm256_mul_ps(fscal,dx00);
 842             ty               = _mm256_mul_ps(fscal,dy00);
 843             tz               = _mm256_mul_ps(fscal,dz00);
 844
 845             /* Update vectorial force */
 846             fix0             = _mm256_add_ps(fix0,tx);
 847             fiy0             = _mm256_add_ps(fiy0,ty);
 848             fiz0             = _mm256_add_ps(fiz0,tz);
 849
 850             fjx0             = _mm256_add_ps(fjx0,tx);
 851             fjy0             = _mm256_add_ps(fjy0,ty);
 852             fjz0             = _mm256_add_ps(fjz0,tz);
 853
 854             /**************************
 855              * CALCULATE INTERACTIONS *
 856              **************************/
 857
 858             /* REACTION-FIELD ELECTROSTATICS */
 859             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
 860             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
 861
 862             /* Update potential sum for this i atom from the interaction with this j atom. */
 863             velec            = _mm256_andnot_ps(dummy_mask,velec);
 864             velecsum         = _mm256_add_ps(velecsum,velec);
 865
 866             fscal            = felec;
 867
 868             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 869
 870             /* Calculate temporary vectorial force */
 871             tx               = _mm256_mul_ps(fscal,dx11);
 872             ty               = _mm256_mul_ps(fscal,dy11);
 873             tz               = _mm256_mul_ps(fscal,dz11);
 874
 875             /* Update vectorial force */
 876             fix1             = _mm256_add_ps(fix1,tx);
 877             fiy1             = _mm256_add_ps(fiy1,ty);
 878             fiz1             = _mm256_add_ps(fiz1,tz);
 879
 880             fjx1             = _mm256_add_ps(fjx1,tx);
 881             fjy1             = _mm256_add_ps(fjy1,ty);
 882             fjz1             = _mm256_add_ps(fjz1,tz);
 883
 884             /**************************
 885              * CALCULATE INTERACTIONS *
 886              **************************/
 887
 888             /* REACTION-FIELD ELECTROSTATICS */
 889             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
 890             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
 891
 892             /* Update potential sum for this i atom from the interaction with this j atom. */
 893             velec            = _mm256_andnot_ps(dummy_mask,velec);
 894             velecsum         = _mm256_add_ps(velecsum,velec);
 895
 896             fscal            = felec;
 897
 898             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 899
 900             /* Calculate temporary vectorial force */
 901             tx               = _mm256_mul_ps(fscal,dx12);
 902             ty               = _mm256_mul_ps(fscal,dy12);
 903             tz               = _mm256_mul_ps(fscal,dz12);
 904
 905             /* Update vectorial force */
 906             fix1             = _mm256_add_ps(fix1,tx);
 907             fiy1             = _mm256_add_ps(fiy1,ty);
 908             fiz1             = _mm256_add_ps(fiz1,tz);
 909
 910             fjx2             = _mm256_add_ps(fjx2,tx);
 911             fjy2             = _mm256_add_ps(fjy2,ty);
 912             fjz2             = _mm256_add_ps(fjz2,tz);
 913
 914             /**************************
 915              * CALCULATE INTERACTIONS *
 916              **************************/
 917
 918             /* REACTION-FIELD ELECTROSTATICS */
 919             velec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_add_ps(rinv13,_mm256_mul_ps(krf,rsq13)),crf));
 920             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
 921
 922             /* Update potential sum for this i atom from the interaction with this j atom. */
 923             velec            = _mm256_andnot_ps(dummy_mask,velec);
 924             velecsum         = _mm256_add_ps(velecsum,velec);
 925
 926             fscal            = felec;
 927
 928             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 929
 930             /* Calculate temporary vectorial force */
 931             tx               = _mm256_mul_ps(fscal,dx13);
 932             ty               = _mm256_mul_ps(fscal,dy13);
 933             tz               = _mm256_mul_ps(fscal,dz13);
 934
 935             /* Update vectorial force */
 936             fix1             = _mm256_add_ps(fix1,tx);
 937             fiy1             = _mm256_add_ps(fiy1,ty);
 938             fiz1             = _mm256_add_ps(fiz1,tz);
 939
 940             fjx3             = _mm256_add_ps(fjx3,tx);
 941             fjy3             = _mm256_add_ps(fjy3,ty);
 942             fjz3             = _mm256_add_ps(fjz3,tz);
 943
 944             /**************************
 945              * CALCULATE INTERACTIONS *
 946              **************************/
 947
 948             /* REACTION-FIELD ELECTROSTATICS */
 949             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
 950             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
 951
 952             /* Update potential sum for this i atom from the interaction with this j atom. */
 953             velec            = _mm256_andnot_ps(dummy_mask,velec);
 954             velecsum         = _mm256_add_ps(velecsum,velec);
 955
 956             fscal            = felec;
 957
 958             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 959
 960             /* Calculate temporary vectorial force */
 961             tx               = _mm256_mul_ps(fscal,dx21);
 962             ty               = _mm256_mul_ps(fscal,dy21);
 963             tz               = _mm256_mul_ps(fscal,dz21);
 964
 965             /* Update vectorial force */
 966             fix2             = _mm256_add_ps(fix2,tx);
 967             fiy2             = _mm256_add_ps(fiy2,ty);
 968             fiz2             = _mm256_add_ps(fiz2,tz);
 969
 970             fjx1             = _mm256_add_ps(fjx1,tx);
 971             fjy1             = _mm256_add_ps(fjy1,ty);
 972             fjz1             = _mm256_add_ps(fjz1,tz);
 973
 974             /**************************
 975              * CALCULATE INTERACTIONS *
 976              **************************/
 977
 978             /* REACTION-FIELD ELECTROSTATICS */
 979             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
 980             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
 981
 982             /* Update potential sum for this i atom from the interaction with this j atom. */
 983             velec            = _mm256_andnot_ps(dummy_mask,velec);
 984             velecsum         = _mm256_add_ps(velecsum,velec);
 985
 986             fscal            = felec;
 987
 988             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 989
 990             /* Calculate temporary vectorial force */
 991             tx               = _mm256_mul_ps(fscal,dx22);
 992             ty               = _mm256_mul_ps(fscal,dy22);
 993             tz               = _mm256_mul_ps(fscal,dz22);
 994
 995             /* Update vectorial force */
 996             fix2             = _mm256_add_ps(fix2,tx);
 997             fiy2             = _mm256_add_ps(fiy2,ty);
 998             fiz2             = _mm256_add_ps(fiz2,tz);
 999
1000             fjx2             = _mm256_add_ps(fjx2,tx);
1001             fjy2             = _mm256_add_ps(fjy2,ty);
1002             fjz2             = _mm256_add_ps(fjz2,tz);
1003
1004             /**************************
1005              * CALCULATE INTERACTIONS *
1006              **************************/
1007
1008             /* REACTION-FIELD ELECTROSTATICS */
1009             velec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_add_ps(rinv23,_mm256_mul_ps(krf,rsq23)),crf));
1010             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1011
1012             /* Update potential sum for this i atom from the interaction with this j atom. */
1013             velec            = _mm256_andnot_ps(dummy_mask,velec);
1014             velecsum         = _mm256_add_ps(velecsum,velec);
1015
1016             fscal            = felec;
1017
1018             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1019
1020             /* Calculate temporary vectorial force */
1021             tx               = _mm256_mul_ps(fscal,dx23);
1022             ty               = _mm256_mul_ps(fscal,dy23);
1023             tz               = _mm256_mul_ps(fscal,dz23);
1024
1025             /* Update vectorial force */
1026             fix2             = _mm256_add_ps(fix2,tx);
1027             fiy2             = _mm256_add_ps(fiy2,ty);
1028             fiz2             = _mm256_add_ps(fiz2,tz);
1029
1030             fjx3             = _mm256_add_ps(fjx3,tx);
1031             fjy3             = _mm256_add_ps(fjy3,ty);
1032             fjz3             = _mm256_add_ps(fjz3,tz);
1033
1034             /**************************
1035              * CALCULATE INTERACTIONS *
1036              **************************/
1037
1038             /* REACTION-FIELD ELECTROSTATICS */
1039             velec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_add_ps(rinv31,_mm256_mul_ps(krf,rsq31)),crf));
1040             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1041
1042             /* Update potential sum for this i atom from the interaction with this j atom. */
1043             velec            = _mm256_andnot_ps(dummy_mask,velec);
1044             velecsum         = _mm256_add_ps(velecsum,velec);
1045
1046             fscal            = felec;
1047
1048             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1049
1050             /* Calculate temporary vectorial force */
1051             tx               = _mm256_mul_ps(fscal,dx31);
1052             ty               = _mm256_mul_ps(fscal,dy31);
1053             tz               = _mm256_mul_ps(fscal,dz31);
1054
1055             /* Update vectorial force */
1056             fix3             = _mm256_add_ps(fix3,tx);
1057             fiy3             = _mm256_add_ps(fiy3,ty);
1058             fiz3             = _mm256_add_ps(fiz3,tz);
1059
1060             fjx1             = _mm256_add_ps(fjx1,tx);
1061             fjy1             = _mm256_add_ps(fjy1,ty);
1062             fjz1             = _mm256_add_ps(fjz1,tz);
1063
1064             /**************************
1065              * CALCULATE INTERACTIONS *
1066              **************************/
1067
1068             /* REACTION-FIELD ELECTROSTATICS */
1069             velec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_add_ps(rinv32,_mm256_mul_ps(krf,rsq32)),crf));
1070             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1071
1072             /* Update potential sum for this i atom from the interaction with this j atom. */
1073             velec            = _mm256_andnot_ps(dummy_mask,velec);
1074             velecsum         = _mm256_add_ps(velecsum,velec);
1075
1076             fscal            = felec;
1077
1078             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1079
1080             /* Calculate temporary vectorial force */
1081             tx               = _mm256_mul_ps(fscal,dx32);
1082             ty               = _mm256_mul_ps(fscal,dy32);
1083             tz               = _mm256_mul_ps(fscal,dz32);
1084
1085             /* Update vectorial force */
1086             fix3             = _mm256_add_ps(fix3,tx);
1087             fiy3             = _mm256_add_ps(fiy3,ty);
1088             fiz3             = _mm256_add_ps(fiz3,tz);
1089
1090             fjx2             = _mm256_add_ps(fjx2,tx);
1091             fjy2             = _mm256_add_ps(fjy2,ty);
1092             fjz2             = _mm256_add_ps(fjz2,tz);
1093
1094             /**************************
1095              * CALCULATE INTERACTIONS *
1096              **************************/
1097
1098             /* REACTION-FIELD ELECTROSTATICS */
1099             velec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_add_ps(rinv33,_mm256_mul_ps(krf,rsq33)),crf));
1100             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1101
1102             /* Update potential sum for this i atom from the interaction with this j atom. */
1103             velec            = _mm256_andnot_ps(dummy_mask,velec);
1104             velecsum         = _mm256_add_ps(velecsum,velec);
1105
1106             fscal            = felec;
1107
1108             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1109
1110             /* Calculate temporary vectorial force */
1111             tx               = _mm256_mul_ps(fscal,dx33);
1112             ty               = _mm256_mul_ps(fscal,dy33);
1113             tz               = _mm256_mul_ps(fscal,dz33);
1114
1115             /* Update vectorial force */
1116             fix3             = _mm256_add_ps(fix3,tx);
1117             fiy3             = _mm256_add_ps(fiy3,ty);
1118             fiz3             = _mm256_add_ps(fiz3,tz);
1119
1120             fjx3             = _mm256_add_ps(fjx3,tx);
1121             fjy3             = _mm256_add_ps(fjy3,ty);
1122             fjz3             = _mm256_add_ps(fjz3,tz);
1123
1124             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1125             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1126             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1127             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1128             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1129             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1130             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1131             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1132
1133             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1134                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1135                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1136
1137             /* Inner loop uses 348 flops */
1138         }
1139
1140         /* End of innermost loop */
1141
1142         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1143                                                  f+i_coord_offset,fshift+i_shift_offset);
1144
1145         ggid                        = gid[iidx];
1146         /* Update potential energies */
1147         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1148         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1149
1150         /* Increment number of inner iterations */
1151         inneriter                  += j_index_end - j_index_start;
1152
1153         /* Outer loop uses 26 flops */
1154     }
1155
1156     /* Increment number of outer iterations */
1157     outeriter        += nri;
1158
1159     /* Update outer/inner flops */
1160
1161     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*348);
1162 }
1163 /*
1164  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_single
1165  * Electrostatics interaction: ReactionField
1166  * VdW interaction:            CubicSplineTable
1167  * Geometry:                   Water4-Water4
1168  * Calculate force/pot:        Force
1169  */
1170 void
1171 nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_avx_256_single
1172                     (t_nblist * gmx_restrict                nlist,
1173                      rvec * gmx_restrict                    xx,
1174                      rvec * gmx_restrict                    ff,
1175                      t_forcerec * gmx_restrict              fr,
1176                      t_mdatoms * gmx_restrict               mdatoms,
1177                      nb_kernel_data_t * gmx_restrict        kernel_data,
1178                      t_nrnb * gmx_restrict                  nrnb)
1179 {
1180     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1181      * just 0 for non-waters.
1182      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1183      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1184      */
1185     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1186     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1187     int              jnrA,jnrB,jnrC,jnrD;
1188     int              jnrE,jnrF,jnrG,jnrH;
1189     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1190     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1191     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1192     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1193     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1194     real             rcutoff_scalar;
1195     real             *shiftvec,*fshift,*x,*f;
1196     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1197     real             scratch[4*DIM];
1198     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1199     real *           vdwioffsetptr0;
1200     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1201     real *           vdwioffsetptr1;
1202     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1203     real *           vdwioffsetptr2;
1204     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1205     real *           vdwioffsetptr3;
1206     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1207     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1208     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1209     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1210     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1211     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1212     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1213     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1214     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1215     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1216     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1217     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1218     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1219     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1220     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1221     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1222     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1223     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1224     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1225     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1226     real             *charge;
1227     int              nvdwtype;
1228     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1229     int              *vdwtype;
1230     real             *vdwparam;
1231     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1232     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1233     __m256i          vfitab;
1234     __m128i          vfitab_lo,vfitab_hi;
1235     __m128i          ifour       = _mm_set1_epi32(4);
1236     __m256           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1237     real             *vftab;
1238     __m256           dummy_mask,cutoff_mask;
1239     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1240     __m256           one     = _mm256_set1_ps(1.0);
1241     __m256           two     = _mm256_set1_ps(2.0);
1242     x                = xx[0];
1243     f                = ff[0];
1244
1245     nri              = nlist->nri;
1246     iinr             = nlist->iinr;
1247     jindex           = nlist->jindex;
1248     jjnr             = nlist->jjnr;
1249     shiftidx         = nlist->shift;
1250     gid              = nlist->gid;
1251     shiftvec         = fr->shift_vec[0];
1252     fshift           = fr->fshift[0];
1253     facel            = _mm256_set1_ps(fr->epsfac);
1254     charge           = mdatoms->chargeA;
1255     krf              = _mm256_set1_ps(fr->ic->k_rf);
1256     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1257     crf              = _mm256_set1_ps(fr->ic->c_rf);
1258     nvdwtype         = fr->ntype;
1259     vdwparam         = fr->nbfp;
1260     vdwtype          = mdatoms->typeA;
1261
1262     vftab            = kernel_data->table_vdw->data;
1263     vftabscale       = _mm256_set1_ps(kernel_data->table_vdw->scale);
1264
1265     /* Setup water-specific parameters */
1266     inr              = nlist->iinr[0];
1267     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1268     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1269     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1270     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1271
1272     jq1              = _mm256_set1_ps(charge[inr+1]);
1273     jq2              = _mm256_set1_ps(charge[inr+2]);
1274     jq3              = _mm256_set1_ps(charge[inr+3]);
1275     vdwjidx0A        = 2*vdwtype[inr+0];
1276     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1277     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1278     qq11             = _mm256_mul_ps(iq1,jq1);
1279     qq12             = _mm256_mul_ps(iq1,jq2);
1280     qq13             = _mm256_mul_ps(iq1,jq3);
1281     qq21             = _mm256_mul_ps(iq2,jq1);
1282     qq22             = _mm256_mul_ps(iq2,jq2);
1283     qq23             = _mm256_mul_ps(iq2,jq3);
1284     qq31             = _mm256_mul_ps(iq3,jq1);
1285     qq32             = _mm256_mul_ps(iq3,jq2);
1286     qq33             = _mm256_mul_ps(iq3,jq3);
1287
1288     /* Avoid stupid compiler warnings */
1289     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1290     j_coord_offsetA = 0;
1291     j_coord_offsetB = 0;
1292     j_coord_offsetC = 0;
1293     j_coord_offsetD = 0;
1294     j_coord_offsetE = 0;
1295     j_coord_offsetF = 0;
1296     j_coord_offsetG = 0;
1297     j_coord_offsetH = 0;
1298
1299     outeriter        = 0;
1300     inneriter        = 0;
1301
1302     for(iidx=0;iidx<4*DIM;iidx++)
1303     {
1304         scratch[iidx] = 0.0;
1305     }
1306
1307     /* Start outer loop over neighborlists */
1308     for(iidx=0; iidx<nri; iidx++)
1309     {
1310         /* Load shift vector for this list */
1311         i_shift_offset   = DIM*shiftidx[iidx];
1312
1313         /* Load limits for loop over neighbors */
1314         j_index_start    = jindex[iidx];
1315         j_index_end      = jindex[iidx+1];
1316
1317         /* Get outer coordinate index */
1318         inr              = iinr[iidx];
1319         i_coord_offset   = DIM*inr;
1320
1321         /* Load i particle coords and add shift vector */
1322         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1323                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1324
1325         fix0             = _mm256_setzero_ps();
1326         fiy0             = _mm256_setzero_ps();
1327         fiz0             = _mm256_setzero_ps();
1328         fix1             = _mm256_setzero_ps();
1329         fiy1             = _mm256_setzero_ps();
1330         fiz1             = _mm256_setzero_ps();
1331         fix2             = _mm256_setzero_ps();
1332         fiy2             = _mm256_setzero_ps();
1333         fiz2             = _mm256_setzero_ps();
1334         fix3             = _mm256_setzero_ps();
1335         fiy3             = _mm256_setzero_ps();
1336         fiz3             = _mm256_setzero_ps();
1337
1338         /* Start inner kernel loop */
1339         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1340         {
1341
1342             /* Get j neighbor index, and coordinate index */
1343             jnrA             = jjnr[jidx];
1344             jnrB             = jjnr[jidx+1];
1345             jnrC             = jjnr[jidx+2];
1346             jnrD             = jjnr[jidx+3];
1347             jnrE             = jjnr[jidx+4];
1348             jnrF             = jjnr[jidx+5];
1349             jnrG             = jjnr[jidx+6];
1350             jnrH             = jjnr[jidx+7];
1351             j_coord_offsetA  = DIM*jnrA;
1352             j_coord_offsetB  = DIM*jnrB;
1353             j_coord_offsetC  = DIM*jnrC;
1354             j_coord_offsetD  = DIM*jnrD;
1355             j_coord_offsetE  = DIM*jnrE;
1356             j_coord_offsetF  = DIM*jnrF;
1357             j_coord_offsetG  = DIM*jnrG;
1358             j_coord_offsetH  = DIM*jnrH;
1359
1360             /* load j atom coordinates */
1361             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1362                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1363                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1364                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1365                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1366                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1367
1368             /* Calculate displacement vector */
1369             dx00             = _mm256_sub_ps(ix0,jx0);
1370             dy00             = _mm256_sub_ps(iy0,jy0);
1371             dz00             = _mm256_sub_ps(iz0,jz0);
1372             dx11             = _mm256_sub_ps(ix1,jx1);
1373             dy11             = _mm256_sub_ps(iy1,jy1);
1374             dz11             = _mm256_sub_ps(iz1,jz1);
1375             dx12             = _mm256_sub_ps(ix1,jx2);
1376             dy12             = _mm256_sub_ps(iy1,jy2);
1377             dz12             = _mm256_sub_ps(iz1,jz2);
1378             dx13             = _mm256_sub_ps(ix1,jx3);
1379             dy13             = _mm256_sub_ps(iy1,jy3);
1380             dz13             = _mm256_sub_ps(iz1,jz3);
1381             dx21             = _mm256_sub_ps(ix2,jx1);
1382             dy21             = _mm256_sub_ps(iy2,jy1);
1383             dz21             = _mm256_sub_ps(iz2,jz1);
1384             dx22             = _mm256_sub_ps(ix2,jx2);
1385             dy22             = _mm256_sub_ps(iy2,jy2);
1386             dz22             = _mm256_sub_ps(iz2,jz2);
1387             dx23             = _mm256_sub_ps(ix2,jx3);
1388             dy23             = _mm256_sub_ps(iy2,jy3);
1389             dz23             = _mm256_sub_ps(iz2,jz3);
1390             dx31             = _mm256_sub_ps(ix3,jx1);
1391             dy31             = _mm256_sub_ps(iy3,jy1);
1392             dz31             = _mm256_sub_ps(iz3,jz1);
1393             dx32             = _mm256_sub_ps(ix3,jx2);
1394             dy32             = _mm256_sub_ps(iy3,jy2);
1395             dz32             = _mm256_sub_ps(iz3,jz2);
1396             dx33             = _mm256_sub_ps(ix3,jx3);
1397             dy33             = _mm256_sub_ps(iy3,jy3);
1398             dz33             = _mm256_sub_ps(iz3,jz3);
1399
1400             /* Calculate squared distance and things based on it */
1401             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1402             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1403             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1404             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1405             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1406             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1407             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1408             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1409             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1410             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1411
1412             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1413             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1414             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1415             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1416             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1417             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1418             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1419             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1420             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1421             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1422
1423             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1424             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1425             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1426             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1427             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1428             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1429             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1430             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1431             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1432
1433             fjx0             = _mm256_setzero_ps();
1434             fjy0             = _mm256_setzero_ps();
1435             fjz0             = _mm256_setzero_ps();
1436             fjx1             = _mm256_setzero_ps();
1437             fjy1             = _mm256_setzero_ps();
1438             fjz1             = _mm256_setzero_ps();
1439             fjx2             = _mm256_setzero_ps();
1440             fjy2             = _mm256_setzero_ps();
1441             fjz2             = _mm256_setzero_ps();
1442             fjx3             = _mm256_setzero_ps();
1443             fjy3             = _mm256_setzero_ps();
1444             fjz3             = _mm256_setzero_ps();
1445
1446             /**************************
1447              * CALCULATE INTERACTIONS *
1448              **************************/
1449
1450             r00              = _mm256_mul_ps(rsq00,rinv00);
1451
1452             /* Calculate table index by multiplying r with table scale and truncate to integer */
1453             rt               = _mm256_mul_ps(r00,vftabscale);
1454             vfitab           = _mm256_cvttps_epi32(rt);
1455             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1456             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1457             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1458             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1459             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
1460             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
1461
1462             /* CUBIC SPLINE TABLE DISPERSION */
1463             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1464                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1465             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1466                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1467             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1468                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1469             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1470                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1471             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1472             Heps             = _mm256_mul_ps(vfeps,H);
1473             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1474             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1475             fvdw6            = _mm256_mul_ps(c6_00,FF);
1476
1477             /* CUBIC SPLINE TABLE REPULSION */
1478             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1479             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1480             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1481                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1482             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1483                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1484             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1485                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1486             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1487                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1488             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1489             Heps             = _mm256_mul_ps(vfeps,H);
1490             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1491             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1492             fvdw12           = _mm256_mul_ps(c12_00,FF);
1493             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1494
1495             fscal            = fvdw;
1496
1497             /* Calculate temporary vectorial force */
1498             tx               = _mm256_mul_ps(fscal,dx00);
1499             ty               = _mm256_mul_ps(fscal,dy00);
1500             tz               = _mm256_mul_ps(fscal,dz00);
1501
1502             /* Update vectorial force */
1503             fix0             = _mm256_add_ps(fix0,tx);
1504             fiy0             = _mm256_add_ps(fiy0,ty);
1505             fiz0             = _mm256_add_ps(fiz0,tz);
1506
1507             fjx0             = _mm256_add_ps(fjx0,tx);
1508             fjy0             = _mm256_add_ps(fjy0,ty);
1509             fjz0             = _mm256_add_ps(fjz0,tz);
1510
1511             /**************************
1512              * CALCULATE INTERACTIONS *
1513              **************************/
1514
1515             /* REACTION-FIELD ELECTROSTATICS */
1516             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1517
1518             fscal            = felec;
1519
1520             /* Calculate temporary vectorial force */
1521             tx               = _mm256_mul_ps(fscal,dx11);
1522             ty               = _mm256_mul_ps(fscal,dy11);
1523             tz               = _mm256_mul_ps(fscal,dz11);
1524
1525             /* Update vectorial force */
1526             fix1             = _mm256_add_ps(fix1,tx);
1527             fiy1             = _mm256_add_ps(fiy1,ty);
1528             fiz1             = _mm256_add_ps(fiz1,tz);
1529
1530             fjx1             = _mm256_add_ps(fjx1,tx);
1531             fjy1             = _mm256_add_ps(fjy1,ty);
1532             fjz1             = _mm256_add_ps(fjz1,tz);
1533
1534             /**************************
1535              * CALCULATE INTERACTIONS *
1536              **************************/
1537
1538             /* REACTION-FIELD ELECTROSTATICS */
1539             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1540
1541             fscal            = felec;
1542
1543             /* Calculate temporary vectorial force */
1544             tx               = _mm256_mul_ps(fscal,dx12);
1545             ty               = _mm256_mul_ps(fscal,dy12);
1546             tz               = _mm256_mul_ps(fscal,dz12);
1547
1548             /* Update vectorial force */
1549             fix1             = _mm256_add_ps(fix1,tx);
1550             fiy1             = _mm256_add_ps(fiy1,ty);
1551             fiz1             = _mm256_add_ps(fiz1,tz);
1552
1553             fjx2             = _mm256_add_ps(fjx2,tx);
1554             fjy2             = _mm256_add_ps(fjy2,ty);
1555             fjz2             = _mm256_add_ps(fjz2,tz);
1556
1557             /**************************
1558              * CALCULATE INTERACTIONS *
1559              **************************/
1560
1561             /* REACTION-FIELD ELECTROSTATICS */
1562             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1563
1564             fscal            = felec;
1565
1566             /* Calculate temporary vectorial force */
1567             tx               = _mm256_mul_ps(fscal,dx13);
1568             ty               = _mm256_mul_ps(fscal,dy13);
1569             tz               = _mm256_mul_ps(fscal,dz13);
1570
1571             /* Update vectorial force */
1572             fix1             = _mm256_add_ps(fix1,tx);
1573             fiy1             = _mm256_add_ps(fiy1,ty);
1574             fiz1             = _mm256_add_ps(fiz1,tz);
1575
1576             fjx3             = _mm256_add_ps(fjx3,tx);
1577             fjy3             = _mm256_add_ps(fjy3,ty);
1578             fjz3             = _mm256_add_ps(fjz3,tz);
1579
1580             /**************************
1581              * CALCULATE INTERACTIONS *
1582              **************************/
1583
1584             /* REACTION-FIELD ELECTROSTATICS */
1585             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1586
1587             fscal            = felec;
1588
1589             /* Calculate temporary vectorial force */
1590             tx               = _mm256_mul_ps(fscal,dx21);
1591             ty               = _mm256_mul_ps(fscal,dy21);
1592             tz               = _mm256_mul_ps(fscal,dz21);
1593
1594             /* Update vectorial force */
1595             fix2             = _mm256_add_ps(fix2,tx);
1596             fiy2             = _mm256_add_ps(fiy2,ty);
1597             fiz2             = _mm256_add_ps(fiz2,tz);
1598
1599             fjx1             = _mm256_add_ps(fjx1,tx);
1600             fjy1             = _mm256_add_ps(fjy1,ty);
1601             fjz1             = _mm256_add_ps(fjz1,tz);
1602
1603             /**************************
1604              * CALCULATE INTERACTIONS *
1605              **************************/
1606
1607             /* REACTION-FIELD ELECTROSTATICS */
1608             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1609
1610             fscal            = felec;
1611
1612             /* Calculate temporary vectorial force */
1613             tx               = _mm256_mul_ps(fscal,dx22);
1614             ty               = _mm256_mul_ps(fscal,dy22);
1615             tz               = _mm256_mul_ps(fscal,dz22);
1616
1617             /* Update vectorial force */
1618             fix2             = _mm256_add_ps(fix2,tx);
1619             fiy2             = _mm256_add_ps(fiy2,ty);
1620             fiz2             = _mm256_add_ps(fiz2,tz);
1621
1622             fjx2             = _mm256_add_ps(fjx2,tx);
1623             fjy2             = _mm256_add_ps(fjy2,ty);
1624             fjz2             = _mm256_add_ps(fjz2,tz);
1625
1626             /**************************
1627              * CALCULATE INTERACTIONS *
1628              **************************/
1629
1630             /* REACTION-FIELD ELECTROSTATICS */
1631             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
1632
1633             fscal            = felec;
1634
1635             /* Calculate temporary vectorial force */
1636             tx               = _mm256_mul_ps(fscal,dx23);
1637             ty               = _mm256_mul_ps(fscal,dy23);
1638             tz               = _mm256_mul_ps(fscal,dz23);
1639
1640             /* Update vectorial force */
1641             fix2             = _mm256_add_ps(fix2,tx);
1642             fiy2             = _mm256_add_ps(fiy2,ty);
1643             fiz2             = _mm256_add_ps(fiz2,tz);
1644
1645             fjx3             = _mm256_add_ps(fjx3,tx);
1646             fjy3             = _mm256_add_ps(fjy3,ty);
1647             fjz3             = _mm256_add_ps(fjz3,tz);
1648
1649             /**************************
1650              * CALCULATE INTERACTIONS *
1651              **************************/
1652
1653             /* REACTION-FIELD ELECTROSTATICS */
1654             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
1655
1656             fscal            = felec;
1657
1658             /* Calculate temporary vectorial force */
1659             tx               = _mm256_mul_ps(fscal,dx31);
1660             ty               = _mm256_mul_ps(fscal,dy31);
1661             tz               = _mm256_mul_ps(fscal,dz31);
1662
1663             /* Update vectorial force */
1664             fix3             = _mm256_add_ps(fix3,tx);
1665             fiy3             = _mm256_add_ps(fiy3,ty);
1666             fiz3             = _mm256_add_ps(fiz3,tz);
1667
1668             fjx1             = _mm256_add_ps(fjx1,tx);
1669             fjy1             = _mm256_add_ps(fjy1,ty);
1670             fjz1             = _mm256_add_ps(fjz1,tz);
1671
1672             /**************************
1673              * CALCULATE INTERACTIONS *
1674              **************************/
1675
1676             /* REACTION-FIELD ELECTROSTATICS */
1677             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
1678
1679             fscal            = felec;
1680
1681             /* Calculate temporary vectorial force */
1682             tx               = _mm256_mul_ps(fscal,dx32);
1683             ty               = _mm256_mul_ps(fscal,dy32);
1684             tz               = _mm256_mul_ps(fscal,dz32);
1685
1686             /* Update vectorial force */
1687             fix3             = _mm256_add_ps(fix3,tx);
1688             fiy3             = _mm256_add_ps(fiy3,ty);
1689             fiz3             = _mm256_add_ps(fiz3,tz);
1690
1691             fjx2             = _mm256_add_ps(fjx2,tx);
1692             fjy2             = _mm256_add_ps(fjy2,ty);
1693             fjz2             = _mm256_add_ps(fjz2,tz);
1694
1695             /**************************
1696              * CALCULATE INTERACTIONS *
1697              **************************/
1698
1699             /* REACTION-FIELD ELECTROSTATICS */
1700             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
1701
1702             fscal            = felec;
1703
1704             /* Calculate temporary vectorial force */
1705             tx               = _mm256_mul_ps(fscal,dx33);
1706             ty               = _mm256_mul_ps(fscal,dy33);
1707             tz               = _mm256_mul_ps(fscal,dz33);
1708
1709             /* Update vectorial force */
1710             fix3             = _mm256_add_ps(fix3,tx);
1711             fiy3             = _mm256_add_ps(fiy3,ty);
1712             fiz3             = _mm256_add_ps(fiz3,tz);
1713
1714             fjx3             = _mm256_add_ps(fjx3,tx);
1715             fjy3             = _mm256_add_ps(fjy3,ty);
1716             fjz3             = _mm256_add_ps(fjz3,tz);
1717
1718             fjptrA             = f+j_coord_offsetA;
1719             fjptrB             = f+j_coord_offsetB;
1720             fjptrC             = f+j_coord_offsetC;
1721             fjptrD             = f+j_coord_offsetD;
1722             fjptrE             = f+j_coord_offsetE;
1723             fjptrF             = f+j_coord_offsetF;
1724             fjptrG             = f+j_coord_offsetG;
1725             fjptrH             = f+j_coord_offsetH;
1726
1727             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1728                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1729                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1730
1731             /* Inner loop uses 294 flops */
1732         }
1733
1734         if(jidx<j_index_end)
1735         {
1736
1737             /* Get j neighbor index, and coordinate index */
1738             jnrlistA         = jjnr[jidx];
1739             jnrlistB         = jjnr[jidx+1];
1740             jnrlistC         = jjnr[jidx+2];
1741             jnrlistD         = jjnr[jidx+3];
1742             jnrlistE         = jjnr[jidx+4];
1743             jnrlistF         = jjnr[jidx+5];
1744             jnrlistG         = jjnr[jidx+6];
1745             jnrlistH         = jjnr[jidx+7];
1746             /* Sign of each element will be negative for non-real atoms.
1747              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1748              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1749              */
1750             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1751                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1752
1753             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1754             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1755             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1756             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1757             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1758             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1759             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1760             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1761             j_coord_offsetA  = DIM*jnrA;
1762             j_coord_offsetB  = DIM*jnrB;
1763             j_coord_offsetC  = DIM*jnrC;
1764             j_coord_offsetD  = DIM*jnrD;
1765             j_coord_offsetE  = DIM*jnrE;
1766             j_coord_offsetF  = DIM*jnrF;
1767             j_coord_offsetG  = DIM*jnrG;
1768             j_coord_offsetH  = DIM*jnrH;
1769
1770             /* load j atom coordinates */
1771             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1772                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1773                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1774                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1775                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1776                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1777
1778             /* Calculate displacement vector */
1779             dx00             = _mm256_sub_ps(ix0,jx0);
1780             dy00             = _mm256_sub_ps(iy0,jy0);
1781             dz00             = _mm256_sub_ps(iz0,jz0);
1782             dx11             = _mm256_sub_ps(ix1,jx1);
1783             dy11             = _mm256_sub_ps(iy1,jy1);
1784             dz11             = _mm256_sub_ps(iz1,jz1);
1785             dx12             = _mm256_sub_ps(ix1,jx2);
1786             dy12             = _mm256_sub_ps(iy1,jy2);
1787             dz12             = _mm256_sub_ps(iz1,jz2);
1788             dx13             = _mm256_sub_ps(ix1,jx3);
1789             dy13             = _mm256_sub_ps(iy1,jy3);
1790             dz13             = _mm256_sub_ps(iz1,jz3);
1791             dx21             = _mm256_sub_ps(ix2,jx1);
1792             dy21             = _mm256_sub_ps(iy2,jy1);
1793             dz21             = _mm256_sub_ps(iz2,jz1);
1794             dx22             = _mm256_sub_ps(ix2,jx2);
1795             dy22             = _mm256_sub_ps(iy2,jy2);
1796             dz22             = _mm256_sub_ps(iz2,jz2);
1797             dx23             = _mm256_sub_ps(ix2,jx3);
1798             dy23             = _mm256_sub_ps(iy2,jy3);
1799             dz23             = _mm256_sub_ps(iz2,jz3);
1800             dx31             = _mm256_sub_ps(ix3,jx1);
1801             dy31             = _mm256_sub_ps(iy3,jy1);
1802             dz31             = _mm256_sub_ps(iz3,jz1);
1803             dx32             = _mm256_sub_ps(ix3,jx2);
1804             dy32             = _mm256_sub_ps(iy3,jy2);
1805             dz32             = _mm256_sub_ps(iz3,jz2);
1806             dx33             = _mm256_sub_ps(ix3,jx3);
1807             dy33             = _mm256_sub_ps(iy3,jy3);
1808             dz33             = _mm256_sub_ps(iz3,jz3);
1809
1810             /* Calculate squared distance and things based on it */
1811             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1812             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1813             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1814             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1815             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1816             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1817             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1818             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1819             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1820             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1821
1822             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1823             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1824             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1825             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1826             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1827             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1828             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1829             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1830             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1831             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1832
1833             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1834             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1835             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1836             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1837             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1838             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1839             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1840             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1841             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1842
1843             fjx0             = _mm256_setzero_ps();
1844             fjy0             = _mm256_setzero_ps();
1845             fjz0             = _mm256_setzero_ps();
1846             fjx1             = _mm256_setzero_ps();
1847             fjy1             = _mm256_setzero_ps();
1848             fjz1             = _mm256_setzero_ps();
1849             fjx2             = _mm256_setzero_ps();
1850             fjy2             = _mm256_setzero_ps();
1851             fjz2             = _mm256_setzero_ps();
1852             fjx3             = _mm256_setzero_ps();
1853             fjy3             = _mm256_setzero_ps();
1854             fjz3             = _mm256_setzero_ps();
1855
1856             /**************************
1857              * CALCULATE INTERACTIONS *
1858              **************************/
1859
1860             r00              = _mm256_mul_ps(rsq00,rinv00);
1861             r00              = _mm256_andnot_ps(dummy_mask,r00);
1862
1863             /* Calculate table index by multiplying r with table scale and truncate to integer */
1864             rt               = _mm256_mul_ps(r00,vftabscale);
1865             vfitab           = _mm256_cvttps_epi32(rt);
1866             vfeps            = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1867             /*         AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1868             vfitab_lo        = _mm256_extractf128_si256(vfitab,0x0);
1869             vfitab_hi        = _mm256_extractf128_si256(vfitab,0x1);
1870             vfitab_lo        = _mm_slli_epi32(vfitab_lo,3);
1871             vfitab_hi        = _mm_slli_epi32(vfitab_hi,3);
1872
1873             /* CUBIC SPLINE TABLE DISPERSION */
1874             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1875                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1876             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1877                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1878             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1879                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1880             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1881                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1882             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1883             Heps             = _mm256_mul_ps(vfeps,H);
1884             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1885             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1886             fvdw6            = _mm256_mul_ps(c6_00,FF);
1887
1888             /* CUBIC SPLINE TABLE REPULSION */
1889             vfitab_lo        = _mm_add_epi32(vfitab_lo,ifour);
1890             vfitab_hi        = _mm_add_epi32(vfitab_hi,ifour);
1891             Y                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1892                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1893             F                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1894                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1895             G                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1896                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1897             H                = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1898                                                   _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1899             GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1900             Heps             = _mm256_mul_ps(vfeps,H);
1901             Fp               = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1902             FF               = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1903             fvdw12           = _mm256_mul_ps(c12_00,FF);
1904             fvdw             = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1905
1906             fscal            = fvdw;
1907
1908             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1909
1910             /* Calculate temporary vectorial force */
1911             tx               = _mm256_mul_ps(fscal,dx00);
1912             ty               = _mm256_mul_ps(fscal,dy00);
1913             tz               = _mm256_mul_ps(fscal,dz00);
1914
1915             /* Update vectorial force */
1916             fix0             = _mm256_add_ps(fix0,tx);
1917             fiy0             = _mm256_add_ps(fiy0,ty);
1918             fiz0             = _mm256_add_ps(fiz0,tz);
1919
1920             fjx0             = _mm256_add_ps(fjx0,tx);
1921             fjy0             = _mm256_add_ps(fjy0,ty);
1922             fjz0             = _mm256_add_ps(fjz0,tz);
1923
1924             /**************************
1925              * CALCULATE INTERACTIONS *
1926              **************************/
1927
1928             /* REACTION-FIELD ELECTROSTATICS */
1929             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1930
1931             fscal            = felec;
1932
1933             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1934
1935             /* Calculate temporary vectorial force */
1936             tx               = _mm256_mul_ps(fscal,dx11);
1937             ty               = _mm256_mul_ps(fscal,dy11);
1938             tz               = _mm256_mul_ps(fscal,dz11);
1939
1940             /* Update vectorial force */
1941             fix1             = _mm256_add_ps(fix1,tx);
1942             fiy1             = _mm256_add_ps(fiy1,ty);
1943             fiz1             = _mm256_add_ps(fiz1,tz);
1944
1945             fjx1             = _mm256_add_ps(fjx1,tx);
1946             fjy1             = _mm256_add_ps(fjy1,ty);
1947             fjz1             = _mm256_add_ps(fjz1,tz);
1948
1949             /**************************
1950              * CALCULATE INTERACTIONS *
1951              **************************/
1952
1953             /* REACTION-FIELD ELECTROSTATICS */
1954             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1955
1956             fscal            = felec;
1957
1958             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1959
1960             /* Calculate temporary vectorial force */
1961             tx               = _mm256_mul_ps(fscal,dx12);
1962             ty               = _mm256_mul_ps(fscal,dy12);
1963             tz               = _mm256_mul_ps(fscal,dz12);
1964
1965             /* Update vectorial force */
1966             fix1             = _mm256_add_ps(fix1,tx);
1967             fiy1             = _mm256_add_ps(fiy1,ty);
1968             fiz1             = _mm256_add_ps(fiz1,tz);
1969
1970             fjx2             = _mm256_add_ps(fjx2,tx);
1971             fjy2             = _mm256_add_ps(fjy2,ty);
1972             fjz2             = _mm256_add_ps(fjz2,tz);
1973
1974             /**************************
1975              * CALCULATE INTERACTIONS *
1976              **************************/
1977
1978             /* REACTION-FIELD ELECTROSTATICS */
1979             felec            = _mm256_mul_ps(qq13,_mm256_sub_ps(_mm256_mul_ps(rinv13,rinvsq13),krf2));
1980
1981             fscal            = felec;
1982
1983             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1984
1985             /* Calculate temporary vectorial force */
1986             tx               = _mm256_mul_ps(fscal,dx13);
1987             ty               = _mm256_mul_ps(fscal,dy13);
1988             tz               = _mm256_mul_ps(fscal,dz13);
1989
1990             /* Update vectorial force */
1991             fix1             = _mm256_add_ps(fix1,tx);
1992             fiy1             = _mm256_add_ps(fiy1,ty);
1993             fiz1             = _mm256_add_ps(fiz1,tz);
1994
1995             fjx3             = _mm256_add_ps(fjx3,tx);
1996             fjy3             = _mm256_add_ps(fjy3,ty);
1997             fjz3             = _mm256_add_ps(fjz3,tz);
1998
1999             /**************************
2000              * CALCULATE INTERACTIONS *
2001              **************************/
2002
2003             /* REACTION-FIELD ELECTROSTATICS */
2004             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2005
2006             fscal            = felec;
2007
2008             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2009
2010             /* Calculate temporary vectorial force */
2011             tx               = _mm256_mul_ps(fscal,dx21);
2012             ty               = _mm256_mul_ps(fscal,dy21);
2013             tz               = _mm256_mul_ps(fscal,dz21);
2014
2015             /* Update vectorial force */
2016             fix2             = _mm256_add_ps(fix2,tx);
2017             fiy2             = _mm256_add_ps(fiy2,ty);
2018             fiz2             = _mm256_add_ps(fiz2,tz);
2019
2020             fjx1             = _mm256_add_ps(fjx1,tx);
2021             fjy1             = _mm256_add_ps(fjy1,ty);
2022             fjz1             = _mm256_add_ps(fjz1,tz);
2023
2024             /**************************
2025              * CALCULATE INTERACTIONS *
2026              **************************/
2027
2028             /* REACTION-FIELD ELECTROSTATICS */
2029             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2030
2031             fscal            = felec;
2032
2033             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2034
2035             /* Calculate temporary vectorial force */
2036             tx               = _mm256_mul_ps(fscal,dx22);
2037             ty               = _mm256_mul_ps(fscal,dy22);
2038             tz               = _mm256_mul_ps(fscal,dz22);
2039
2040             /* Update vectorial force */
2041             fix2             = _mm256_add_ps(fix2,tx);
2042             fiy2             = _mm256_add_ps(fiy2,ty);
2043             fiz2             = _mm256_add_ps(fiz2,tz);
2044
2045             fjx2             = _mm256_add_ps(fjx2,tx);
2046             fjy2             = _mm256_add_ps(fjy2,ty);
2047             fjz2             = _mm256_add_ps(fjz2,tz);
2048
2049             /**************************
2050              * CALCULATE INTERACTIONS *
2051              **************************/
2052
2053             /* REACTION-FIELD ELECTROSTATICS */
2054             felec            = _mm256_mul_ps(qq23,_mm256_sub_ps(_mm256_mul_ps(rinv23,rinvsq23),krf2));
2055
2056             fscal            = felec;
2057
2058             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2059
2060             /* Calculate temporary vectorial force */
2061             tx               = _mm256_mul_ps(fscal,dx23);
2062             ty               = _mm256_mul_ps(fscal,dy23);
2063             tz               = _mm256_mul_ps(fscal,dz23);
2064
2065             /* Update vectorial force */
2066             fix2             = _mm256_add_ps(fix2,tx);
2067             fiy2             = _mm256_add_ps(fiy2,ty);
2068             fiz2             = _mm256_add_ps(fiz2,tz);
2069
2070             fjx3             = _mm256_add_ps(fjx3,tx);
2071             fjy3             = _mm256_add_ps(fjy3,ty);
2072             fjz3             = _mm256_add_ps(fjz3,tz);
2073
2074             /**************************
2075              * CALCULATE INTERACTIONS *
2076              **************************/
2077
2078             /* REACTION-FIELD ELECTROSTATICS */
2079             felec            = _mm256_mul_ps(qq31,_mm256_sub_ps(_mm256_mul_ps(rinv31,rinvsq31),krf2));
2080
2081             fscal            = felec;
2082
2083             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2084
2085             /* Calculate temporary vectorial force */
2086             tx               = _mm256_mul_ps(fscal,dx31);
2087             ty               = _mm256_mul_ps(fscal,dy31);
2088             tz               = _mm256_mul_ps(fscal,dz31);
2089
2090             /* Update vectorial force */
2091             fix3             = _mm256_add_ps(fix3,tx);
2092             fiy3             = _mm256_add_ps(fiy3,ty);
2093             fiz3             = _mm256_add_ps(fiz3,tz);
2094
2095             fjx1             = _mm256_add_ps(fjx1,tx);
2096             fjy1             = _mm256_add_ps(fjy1,ty);
2097             fjz1             = _mm256_add_ps(fjz1,tz);
2098
2099             /**************************
2100              * CALCULATE INTERACTIONS *
2101              **************************/
2102
2103             /* REACTION-FIELD ELECTROSTATICS */
2104             felec            = _mm256_mul_ps(qq32,_mm256_sub_ps(_mm256_mul_ps(rinv32,rinvsq32),krf2));
2105
2106             fscal            = felec;
2107
2108             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2109
2110             /* Calculate temporary vectorial force */
2111             tx               = _mm256_mul_ps(fscal,dx32);
2112             ty               = _mm256_mul_ps(fscal,dy32);
2113             tz               = _mm256_mul_ps(fscal,dz32);
2114
2115             /* Update vectorial force */
2116             fix3             = _mm256_add_ps(fix3,tx);
2117             fiy3             = _mm256_add_ps(fiy3,ty);
2118             fiz3             = _mm256_add_ps(fiz3,tz);
2119
2120             fjx2             = _mm256_add_ps(fjx2,tx);
2121             fjy2             = _mm256_add_ps(fjy2,ty);
2122             fjz2             = _mm256_add_ps(fjz2,tz);
2123
2124             /**************************
2125              * CALCULATE INTERACTIONS *
2126              **************************/
2127
2128             /* REACTION-FIELD ELECTROSTATICS */
2129             felec            = _mm256_mul_ps(qq33,_mm256_sub_ps(_mm256_mul_ps(rinv33,rinvsq33),krf2));
2130
2131             fscal            = felec;
2132
2133             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2134
2135             /* Calculate temporary vectorial force */
2136             tx               = _mm256_mul_ps(fscal,dx33);
2137             ty               = _mm256_mul_ps(fscal,dy33);
2138             tz               = _mm256_mul_ps(fscal,dz33);
2139
2140             /* Update vectorial force */
2141             fix3             = _mm256_add_ps(fix3,tx);
2142             fiy3             = _mm256_add_ps(fiy3,ty);
2143             fiz3             = _mm256_add_ps(fiz3,tz);
2144
2145             fjx3             = _mm256_add_ps(fjx3,tx);
2146             fjy3             = _mm256_add_ps(fjy3,ty);
2147             fjz3             = _mm256_add_ps(fjz3,tz);
2148
2149             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2150             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2151             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2152             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2153             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2154             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2155             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2156             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2157
2158             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2159                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2160                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2161
2162             /* Inner loop uses 295 flops */
2163         }
2164
2165         /* End of innermost loop */
2166
2167         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2168                                                  f+i_coord_offset,fshift+i_shift_offset);
2169
2170         /* Increment number of inner iterations */
2171         inneriter                  += j_index_end - j_index_start;
2172
2173         /* Outer loop uses 24 flops */
2174     }
2175
2176     /* Increment number of outer iterations */
2177     outeriter        += nri;
2178
2179     /* Update outer/inner flops */
2180
2181     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*295);
2182 }