src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_avx_256_single.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*
  36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "config.h"
  41
  42 #include <math.h>
  43
  44 #include "../nb_kernel.h"
  45 #include "gromacs/gmxlib/nrnb.h"
  46
  47 #include "kernelutil_x86_avx_256_single.h"
  48
  49 /*
  50  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_single
  51  * Electrostatics interaction: ReactionField
  52  * VdW interaction:            LennardJones
  53  * Geometry:                   Water3-Water3
  54  * Calculate force/pot:        PotentialAndForce
  55  */
  56 void
  57 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_single
  58                     (t_nblist                    * gmx_restrict       nlist,
  59                      rvec                        * gmx_restrict          xx,
  60                      rvec                        * gmx_restrict          ff,
  61                      struct t_forcerec           * gmx_restrict          fr,
  62                      t_mdatoms                   * gmx_restrict     mdatoms,
  63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
  64                      t_nrnb                      * gmx_restrict        nrnb)
  65 {
  66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
  67      * just 0 for non-waters.
  68      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
  69      * jnr indices corresponding to data put in the four positions in the SIMD register.
  70      */
  71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
  72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
  73     int              jnrA,jnrB,jnrC,jnrD;
  74     int              jnrE,jnrF,jnrG,jnrH;
  75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
  76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
  77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
  78     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
  79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
  80     real             rcutoff_scalar;
  81     real             *shiftvec,*fshift,*x,*f;
  82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
  83     real             scratch[4*DIM];
  84     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
  85     real *           vdwioffsetptr0;
  86     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
  87     real *           vdwioffsetptr1;
  88     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
  89     real *           vdwioffsetptr2;
  90     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
  91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
  92     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
  93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
  94     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
  95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
  96     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
  97     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
  98     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
  99     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
 100     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
 101     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
 102     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
 103     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
 104     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
 105     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
 106     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
 107     real             *charge;
 108     int              nvdwtype;
 109     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 110     int              *vdwtype;
 111     real             *vdwparam;
 112     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
 113     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
 114     __m256           dummy_mask,cutoff_mask;
 115     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
 116     __m256           one     = _mm256_set1_ps(1.0);
 117     __m256           two     = _mm256_set1_ps(2.0);
 118     x                = xx[0];
 119     f                = ff[0];
 120
 121     nri              = nlist->nri;
 122     iinr             = nlist->iinr;
 123     jindex           = nlist->jindex;
 124     jjnr             = nlist->jjnr;
 125     shiftidx         = nlist->shift;
 126     gid              = nlist->gid;
 127     shiftvec         = fr->shift_vec[0];
 128     fshift           = fr->fshift[0];
 129     facel            = _mm256_set1_ps(fr->ic->epsfac);
 130     charge           = mdatoms->chargeA;
 131     krf              = _mm256_set1_ps(fr->ic->k_rf);
 132     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
 133     crf              = _mm256_set1_ps(fr->ic->c_rf);
 134     nvdwtype         = fr->ntype;
 135     vdwparam         = fr->nbfp;
 136     vdwtype          = mdatoms->typeA;
 137
 138     /* Setup water-specific parameters */
 139     inr              = nlist->iinr[0];
 140     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
 141     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
 142     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
 143     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
 144
 145     jq0              = _mm256_set1_ps(charge[inr+0]);
 146     jq1              = _mm256_set1_ps(charge[inr+1]);
 147     jq2              = _mm256_set1_ps(charge[inr+2]);
 148     vdwjidx0A        = 2*vdwtype[inr+0];
 149     qq00             = _mm256_mul_ps(iq0,jq0);
 150     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
 151     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
 152     qq01             = _mm256_mul_ps(iq0,jq1);
 153     qq02             = _mm256_mul_ps(iq0,jq2);
 154     qq10             = _mm256_mul_ps(iq1,jq0);
 155     qq11             = _mm256_mul_ps(iq1,jq1);
 156     qq12             = _mm256_mul_ps(iq1,jq2);
 157     qq20             = _mm256_mul_ps(iq2,jq0);
 158     qq21             = _mm256_mul_ps(iq2,jq1);
 159     qq22             = _mm256_mul_ps(iq2,jq2);
 160
 161     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
 162     rcutoff_scalar   = fr->ic->rcoulomb;
 163     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
 164     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
 165
 166     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
 167     rvdw             = _mm256_set1_ps(fr->ic->rvdw);
 168
 169     /* Avoid stupid compiler warnings */
 170     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
 171     j_coord_offsetA = 0;
 172     j_coord_offsetB = 0;
 173     j_coord_offsetC = 0;
 174     j_coord_offsetD = 0;
 175     j_coord_offsetE = 0;
 176     j_coord_offsetF = 0;
 177     j_coord_offsetG = 0;
 178     j_coord_offsetH = 0;
 179
 180     outeriter        = 0;
 181     inneriter        = 0;
 182
 183     for(iidx=0;iidx<4*DIM;iidx++)
 184     {
 185         scratch[iidx] = 0.0;
 186     }
 187
 188     /* Start outer loop over neighborlists */
 189     for(iidx=0; iidx<nri; iidx++)
 190     {
 191         /* Load shift vector for this list */
 192         i_shift_offset   = DIM*shiftidx[iidx];
 193
 194         /* Load limits for loop over neighbors */
 195         j_index_start    = jindex[iidx];
 196         j_index_end      = jindex[iidx+1];
 197
 198         /* Get outer coordinate index */
 199         inr              = iinr[iidx];
 200         i_coord_offset   = DIM*inr;
 201
 202         /* Load i particle coords and add shift vector */
 203         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 204                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 205
 206         fix0             = _mm256_setzero_ps();
 207         fiy0             = _mm256_setzero_ps();
 208         fiz0             = _mm256_setzero_ps();
 209         fix1             = _mm256_setzero_ps();
 210         fiy1             = _mm256_setzero_ps();
 211         fiz1             = _mm256_setzero_ps();
 212         fix2             = _mm256_setzero_ps();
 213         fiy2             = _mm256_setzero_ps();
 214         fiz2             = _mm256_setzero_ps();
 215
 216         /* Reset potential sums */
 217         velecsum         = _mm256_setzero_ps();
 218         vvdwsum          = _mm256_setzero_ps();
 219
 220         /* Start inner kernel loop */
 221         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
 222         {
 223
 224             /* Get j neighbor index, and coordinate index */
 225             jnrA             = jjnr[jidx];
 226             jnrB             = jjnr[jidx+1];
 227             jnrC             = jjnr[jidx+2];
 228             jnrD             = jjnr[jidx+3];
 229             jnrE             = jjnr[jidx+4];
 230             jnrF             = jjnr[jidx+5];
 231             jnrG             = jjnr[jidx+6];
 232             jnrH             = jjnr[jidx+7];
 233             j_coord_offsetA  = DIM*jnrA;
 234             j_coord_offsetB  = DIM*jnrB;
 235             j_coord_offsetC  = DIM*jnrC;
 236             j_coord_offsetD  = DIM*jnrD;
 237             j_coord_offsetE  = DIM*jnrE;
 238             j_coord_offsetF  = DIM*jnrF;
 239             j_coord_offsetG  = DIM*jnrG;
 240             j_coord_offsetH  = DIM*jnrH;
 241
 242             /* load j atom coordinates */
 243             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 244                                                  x+j_coord_offsetC,x+j_coord_offsetD,
 245                                                  x+j_coord_offsetE,x+j_coord_offsetF,
 246                                                  x+j_coord_offsetG,x+j_coord_offsetH,
 247                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 248
 249             /* Calculate displacement vector */
 250             dx00             = _mm256_sub_ps(ix0,jx0);
 251             dy00             = _mm256_sub_ps(iy0,jy0);
 252             dz00             = _mm256_sub_ps(iz0,jz0);
 253             dx01             = _mm256_sub_ps(ix0,jx1);
 254             dy01             = _mm256_sub_ps(iy0,jy1);
 255             dz01             = _mm256_sub_ps(iz0,jz1);
 256             dx02             = _mm256_sub_ps(ix0,jx2);
 257             dy02             = _mm256_sub_ps(iy0,jy2);
 258             dz02             = _mm256_sub_ps(iz0,jz2);
 259             dx10             = _mm256_sub_ps(ix1,jx0);
 260             dy10             = _mm256_sub_ps(iy1,jy0);
 261             dz10             = _mm256_sub_ps(iz1,jz0);
 262             dx11             = _mm256_sub_ps(ix1,jx1);
 263             dy11             = _mm256_sub_ps(iy1,jy1);
 264             dz11             = _mm256_sub_ps(iz1,jz1);
 265             dx12             = _mm256_sub_ps(ix1,jx2);
 266             dy12             = _mm256_sub_ps(iy1,jy2);
 267             dz12             = _mm256_sub_ps(iz1,jz2);
 268             dx20             = _mm256_sub_ps(ix2,jx0);
 269             dy20             = _mm256_sub_ps(iy2,jy0);
 270             dz20             = _mm256_sub_ps(iz2,jz0);
 271             dx21             = _mm256_sub_ps(ix2,jx1);
 272             dy21             = _mm256_sub_ps(iy2,jy1);
 273             dz21             = _mm256_sub_ps(iz2,jz1);
 274             dx22             = _mm256_sub_ps(ix2,jx2);
 275             dy22             = _mm256_sub_ps(iy2,jy2);
 276             dz22             = _mm256_sub_ps(iz2,jz2);
 277
 278             /* Calculate squared distance and things based on it */
 279             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 280             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
 281             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
 282             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
 283             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
 284             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
 285             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
 286             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
 287             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
 288
 289             rinv00           = avx256_invsqrt_f(rsq00);
 290             rinv01           = avx256_invsqrt_f(rsq01);
 291             rinv02           = avx256_invsqrt_f(rsq02);
 292             rinv10           = avx256_invsqrt_f(rsq10);
 293             rinv11           = avx256_invsqrt_f(rsq11);
 294             rinv12           = avx256_invsqrt_f(rsq12);
 295             rinv20           = avx256_invsqrt_f(rsq20);
 296             rinv21           = avx256_invsqrt_f(rsq21);
 297             rinv22           = avx256_invsqrt_f(rsq22);
 298
 299             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
 300             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
 301             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
 302             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
 303             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
 304             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
 305             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
 306             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
 307             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
 308
 309             fjx0             = _mm256_setzero_ps();
 310             fjy0             = _mm256_setzero_ps();
 311             fjz0             = _mm256_setzero_ps();
 312             fjx1             = _mm256_setzero_ps();
 313             fjy1             = _mm256_setzero_ps();
 314             fjz1             = _mm256_setzero_ps();
 315             fjx2             = _mm256_setzero_ps();
 316             fjy2             = _mm256_setzero_ps();
 317             fjz2             = _mm256_setzero_ps();
 318
 319             /**************************
 320              * CALCULATE INTERACTIONS *
 321              **************************/
 322
 323             if (gmx_mm256_any_lt(rsq00,rcutoff2))
 324             {
 325
 326             /* REACTION-FIELD ELECTROSTATICS */
 327             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
 328             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
 329
 330             /* LENNARD-JONES DISPERSION/REPULSION */
 331
 332             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 333             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
 334             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
 335             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
 336                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
 337             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
 338
 339             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
 340
 341             /* Update potential sum for this i atom from the interaction with this j atom. */
 342             velec            = _mm256_and_ps(velec,cutoff_mask);
 343             velecsum         = _mm256_add_ps(velecsum,velec);
 344             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
 345             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
 346
 347             fscal            = _mm256_add_ps(felec,fvdw);
 348
 349             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 350
 351             /* Calculate temporary vectorial force */
 352             tx               = _mm256_mul_ps(fscal,dx00);
 353             ty               = _mm256_mul_ps(fscal,dy00);
 354             tz               = _mm256_mul_ps(fscal,dz00);
 355
 356             /* Update vectorial force */
 357             fix0             = _mm256_add_ps(fix0,tx);
 358             fiy0             = _mm256_add_ps(fiy0,ty);
 359             fiz0             = _mm256_add_ps(fiz0,tz);
 360
 361             fjx0             = _mm256_add_ps(fjx0,tx);
 362             fjy0             = _mm256_add_ps(fjy0,ty);
 363             fjz0             = _mm256_add_ps(fjz0,tz);
 364
 365             }
 366
 367             /**************************
 368              * CALCULATE INTERACTIONS *
 369              **************************/
 370
 371             if (gmx_mm256_any_lt(rsq01,rcutoff2))
 372             {
 373
 374             /* REACTION-FIELD ELECTROSTATICS */
 375             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
 376             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
 377
 378             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
 379
 380             /* Update potential sum for this i atom from the interaction with this j atom. */
 381             velec            = _mm256_and_ps(velec,cutoff_mask);
 382             velecsum         = _mm256_add_ps(velecsum,velec);
 383
 384             fscal            = felec;
 385
 386             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 387
 388             /* Calculate temporary vectorial force */
 389             tx               = _mm256_mul_ps(fscal,dx01);
 390             ty               = _mm256_mul_ps(fscal,dy01);
 391             tz               = _mm256_mul_ps(fscal,dz01);
 392
 393             /* Update vectorial force */
 394             fix0             = _mm256_add_ps(fix0,tx);
 395             fiy0             = _mm256_add_ps(fiy0,ty);
 396             fiz0             = _mm256_add_ps(fiz0,tz);
 397
 398             fjx1             = _mm256_add_ps(fjx1,tx);
 399             fjy1             = _mm256_add_ps(fjy1,ty);
 400             fjz1             = _mm256_add_ps(fjz1,tz);
 401
 402             }
 403
 404             /**************************
 405              * CALCULATE INTERACTIONS *
 406              **************************/
 407
 408             if (gmx_mm256_any_lt(rsq02,rcutoff2))
 409             {
 410
 411             /* REACTION-FIELD ELECTROSTATICS */
 412             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
 413             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
 414
 415             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
 416
 417             /* Update potential sum for this i atom from the interaction with this j atom. */
 418             velec            = _mm256_and_ps(velec,cutoff_mask);
 419             velecsum         = _mm256_add_ps(velecsum,velec);
 420
 421             fscal            = felec;
 422
 423             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 424
 425             /* Calculate temporary vectorial force */
 426             tx               = _mm256_mul_ps(fscal,dx02);
 427             ty               = _mm256_mul_ps(fscal,dy02);
 428             tz               = _mm256_mul_ps(fscal,dz02);
 429
 430             /* Update vectorial force */
 431             fix0             = _mm256_add_ps(fix0,tx);
 432             fiy0             = _mm256_add_ps(fiy0,ty);
 433             fiz0             = _mm256_add_ps(fiz0,tz);
 434
 435             fjx2             = _mm256_add_ps(fjx2,tx);
 436             fjy2             = _mm256_add_ps(fjy2,ty);
 437             fjz2             = _mm256_add_ps(fjz2,tz);
 438
 439             }
 440
 441             /**************************
 442              * CALCULATE INTERACTIONS *
 443              **************************/
 444
 445             if (gmx_mm256_any_lt(rsq10,rcutoff2))
 446             {
 447
 448             /* REACTION-FIELD ELECTROSTATICS */
 449             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
 450             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
 451
 452             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
 453
 454             /* Update potential sum for this i atom from the interaction with this j atom. */
 455             velec            = _mm256_and_ps(velec,cutoff_mask);
 456             velecsum         = _mm256_add_ps(velecsum,velec);
 457
 458             fscal            = felec;
 459
 460             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 461
 462             /* Calculate temporary vectorial force */
 463             tx               = _mm256_mul_ps(fscal,dx10);
 464             ty               = _mm256_mul_ps(fscal,dy10);
 465             tz               = _mm256_mul_ps(fscal,dz10);
 466
 467             /* Update vectorial force */
 468             fix1             = _mm256_add_ps(fix1,tx);
 469             fiy1             = _mm256_add_ps(fiy1,ty);
 470             fiz1             = _mm256_add_ps(fiz1,tz);
 471
 472             fjx0             = _mm256_add_ps(fjx0,tx);
 473             fjy0             = _mm256_add_ps(fjy0,ty);
 474             fjz0             = _mm256_add_ps(fjz0,tz);
 475
 476             }
 477
 478             /**************************
 479              * CALCULATE INTERACTIONS *
 480              **************************/
 481
 482             if (gmx_mm256_any_lt(rsq11,rcutoff2))
 483             {
 484
 485             /* REACTION-FIELD ELECTROSTATICS */
 486             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
 487             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
 488
 489             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
 490
 491             /* Update potential sum for this i atom from the interaction with this j atom. */
 492             velec            = _mm256_and_ps(velec,cutoff_mask);
 493             velecsum         = _mm256_add_ps(velecsum,velec);
 494
 495             fscal            = felec;
 496
 497             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 498
 499             /* Calculate temporary vectorial force */
 500             tx               = _mm256_mul_ps(fscal,dx11);
 501             ty               = _mm256_mul_ps(fscal,dy11);
 502             tz               = _mm256_mul_ps(fscal,dz11);
 503
 504             /* Update vectorial force */
 505             fix1             = _mm256_add_ps(fix1,tx);
 506             fiy1             = _mm256_add_ps(fiy1,ty);
 507             fiz1             = _mm256_add_ps(fiz1,tz);
 508
 509             fjx1             = _mm256_add_ps(fjx1,tx);
 510             fjy1             = _mm256_add_ps(fjy1,ty);
 511             fjz1             = _mm256_add_ps(fjz1,tz);
 512
 513             }
 514
 515             /**************************
 516              * CALCULATE INTERACTIONS *
 517              **************************/
 518
 519             if (gmx_mm256_any_lt(rsq12,rcutoff2))
 520             {
 521
 522             /* REACTION-FIELD ELECTROSTATICS */
 523             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
 524             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
 525
 526             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
 527
 528             /* Update potential sum for this i atom from the interaction with this j atom. */
 529             velec            = _mm256_and_ps(velec,cutoff_mask);
 530             velecsum         = _mm256_add_ps(velecsum,velec);
 531
 532             fscal            = felec;
 533
 534             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 535
 536             /* Calculate temporary vectorial force */
 537             tx               = _mm256_mul_ps(fscal,dx12);
 538             ty               = _mm256_mul_ps(fscal,dy12);
 539             tz               = _mm256_mul_ps(fscal,dz12);
 540
 541             /* Update vectorial force */
 542             fix1             = _mm256_add_ps(fix1,tx);
 543             fiy1             = _mm256_add_ps(fiy1,ty);
 544             fiz1             = _mm256_add_ps(fiz1,tz);
 545
 546             fjx2             = _mm256_add_ps(fjx2,tx);
 547             fjy2             = _mm256_add_ps(fjy2,ty);
 548             fjz2             = _mm256_add_ps(fjz2,tz);
 549
 550             }
 551
 552             /**************************
 553              * CALCULATE INTERACTIONS *
 554              **************************/
 555
 556             if (gmx_mm256_any_lt(rsq20,rcutoff2))
 557             {
 558
 559             /* REACTION-FIELD ELECTROSTATICS */
 560             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
 561             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
 562
 563             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
 564
 565             /* Update potential sum for this i atom from the interaction with this j atom. */
 566             velec            = _mm256_and_ps(velec,cutoff_mask);
 567             velecsum         = _mm256_add_ps(velecsum,velec);
 568
 569             fscal            = felec;
 570
 571             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 572
 573             /* Calculate temporary vectorial force */
 574             tx               = _mm256_mul_ps(fscal,dx20);
 575             ty               = _mm256_mul_ps(fscal,dy20);
 576             tz               = _mm256_mul_ps(fscal,dz20);
 577
 578             /* Update vectorial force */
 579             fix2             = _mm256_add_ps(fix2,tx);
 580             fiy2             = _mm256_add_ps(fiy2,ty);
 581             fiz2             = _mm256_add_ps(fiz2,tz);
 582
 583             fjx0             = _mm256_add_ps(fjx0,tx);
 584             fjy0             = _mm256_add_ps(fjy0,ty);
 585             fjz0             = _mm256_add_ps(fjz0,tz);
 586
 587             }
 588
 589             /**************************
 590              * CALCULATE INTERACTIONS *
 591              **************************/
 592
 593             if (gmx_mm256_any_lt(rsq21,rcutoff2))
 594             {
 595
 596             /* REACTION-FIELD ELECTROSTATICS */
 597             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
 598             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
 599
 600             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
 601
 602             /* Update potential sum for this i atom from the interaction with this j atom. */
 603             velec            = _mm256_and_ps(velec,cutoff_mask);
 604             velecsum         = _mm256_add_ps(velecsum,velec);
 605
 606             fscal            = felec;
 607
 608             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 609
 610             /* Calculate temporary vectorial force */
 611             tx               = _mm256_mul_ps(fscal,dx21);
 612             ty               = _mm256_mul_ps(fscal,dy21);
 613             tz               = _mm256_mul_ps(fscal,dz21);
 614
 615             /* Update vectorial force */
 616             fix2             = _mm256_add_ps(fix2,tx);
 617             fiy2             = _mm256_add_ps(fiy2,ty);
 618             fiz2             = _mm256_add_ps(fiz2,tz);
 619
 620             fjx1             = _mm256_add_ps(fjx1,tx);
 621             fjy1             = _mm256_add_ps(fjy1,ty);
 622             fjz1             = _mm256_add_ps(fjz1,tz);
 623
 624             }
 625
 626             /**************************
 627              * CALCULATE INTERACTIONS *
 628              **************************/
 629
 630             if (gmx_mm256_any_lt(rsq22,rcutoff2))
 631             {
 632
 633             /* REACTION-FIELD ELECTROSTATICS */
 634             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
 635             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
 636
 637             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
 638
 639             /* Update potential sum for this i atom from the interaction with this j atom. */
 640             velec            = _mm256_and_ps(velec,cutoff_mask);
 641             velecsum         = _mm256_add_ps(velecsum,velec);
 642
 643             fscal            = felec;
 644
 645             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 646
 647             /* Calculate temporary vectorial force */
 648             tx               = _mm256_mul_ps(fscal,dx22);
 649             ty               = _mm256_mul_ps(fscal,dy22);
 650             tz               = _mm256_mul_ps(fscal,dz22);
 651
 652             /* Update vectorial force */
 653             fix2             = _mm256_add_ps(fix2,tx);
 654             fiy2             = _mm256_add_ps(fiy2,ty);
 655             fiz2             = _mm256_add_ps(fiz2,tz);
 656
 657             fjx2             = _mm256_add_ps(fjx2,tx);
 658             fjy2             = _mm256_add_ps(fjy2,ty);
 659             fjz2             = _mm256_add_ps(fjz2,tz);
 660
 661             }
 662
 663             fjptrA             = f+j_coord_offsetA;
 664             fjptrB             = f+j_coord_offsetB;
 665             fjptrC             = f+j_coord_offsetC;
 666             fjptrD             = f+j_coord_offsetD;
 667             fjptrE             = f+j_coord_offsetE;
 668             fjptrF             = f+j_coord_offsetF;
 669             fjptrG             = f+j_coord_offsetG;
 670             fjptrH             = f+j_coord_offsetH;
 671
 672             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
 673                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 674
 675             /* Inner loop uses 342 flops */
 676         }
 677
 678         if(jidx<j_index_end)
 679         {
 680
 681             /* Get j neighbor index, and coordinate index */
 682             jnrlistA         = jjnr[jidx];
 683             jnrlistB         = jjnr[jidx+1];
 684             jnrlistC         = jjnr[jidx+2];
 685             jnrlistD         = jjnr[jidx+3];
 686             jnrlistE         = jjnr[jidx+4];
 687             jnrlistF         = jjnr[jidx+5];
 688             jnrlistG         = jjnr[jidx+6];
 689             jnrlistH         = jjnr[jidx+7];
 690             /* Sign of each element will be negative for non-real atoms.
 691              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 692              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 693              */
 694             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
 695                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
 696
 697             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 698             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 699             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 700             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 701             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
 702             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
 703             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
 704             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
 705             j_coord_offsetA  = DIM*jnrA;
 706             j_coord_offsetB  = DIM*jnrB;
 707             j_coord_offsetC  = DIM*jnrC;
 708             j_coord_offsetD  = DIM*jnrD;
 709             j_coord_offsetE  = DIM*jnrE;
 710             j_coord_offsetF  = DIM*jnrF;
 711             j_coord_offsetG  = DIM*jnrG;
 712             j_coord_offsetH  = DIM*jnrH;
 713
 714             /* load j atom coordinates */
 715             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 716                                                  x+j_coord_offsetC,x+j_coord_offsetD,
 717                                                  x+j_coord_offsetE,x+j_coord_offsetF,
 718                                                  x+j_coord_offsetG,x+j_coord_offsetH,
 719                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 720
 721             /* Calculate displacement vector */
 722             dx00             = _mm256_sub_ps(ix0,jx0);
 723             dy00             = _mm256_sub_ps(iy0,jy0);
 724             dz00             = _mm256_sub_ps(iz0,jz0);
 725             dx01             = _mm256_sub_ps(ix0,jx1);
 726             dy01             = _mm256_sub_ps(iy0,jy1);
 727             dz01             = _mm256_sub_ps(iz0,jz1);
 728             dx02             = _mm256_sub_ps(ix0,jx2);
 729             dy02             = _mm256_sub_ps(iy0,jy2);
 730             dz02             = _mm256_sub_ps(iz0,jz2);
 731             dx10             = _mm256_sub_ps(ix1,jx0);
 732             dy10             = _mm256_sub_ps(iy1,jy0);
 733             dz10             = _mm256_sub_ps(iz1,jz0);
 734             dx11             = _mm256_sub_ps(ix1,jx1);
 735             dy11             = _mm256_sub_ps(iy1,jy1);
 736             dz11             = _mm256_sub_ps(iz1,jz1);
 737             dx12             = _mm256_sub_ps(ix1,jx2);
 738             dy12             = _mm256_sub_ps(iy1,jy2);
 739             dz12             = _mm256_sub_ps(iz1,jz2);
 740             dx20             = _mm256_sub_ps(ix2,jx0);
 741             dy20             = _mm256_sub_ps(iy2,jy0);
 742             dz20             = _mm256_sub_ps(iz2,jz0);
 743             dx21             = _mm256_sub_ps(ix2,jx1);
 744             dy21             = _mm256_sub_ps(iy2,jy1);
 745             dz21             = _mm256_sub_ps(iz2,jz1);
 746             dx22             = _mm256_sub_ps(ix2,jx2);
 747             dy22             = _mm256_sub_ps(iy2,jy2);
 748             dz22             = _mm256_sub_ps(iz2,jz2);
 749
 750             /* Calculate squared distance and things based on it */
 751             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
 752             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
 753             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
 754             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
 755             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
 756             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
 757             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
 758             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
 759             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
 760
 761             rinv00           = avx256_invsqrt_f(rsq00);
 762             rinv01           = avx256_invsqrt_f(rsq01);
 763             rinv02           = avx256_invsqrt_f(rsq02);
 764             rinv10           = avx256_invsqrt_f(rsq10);
 765             rinv11           = avx256_invsqrt_f(rsq11);
 766             rinv12           = avx256_invsqrt_f(rsq12);
 767             rinv20           = avx256_invsqrt_f(rsq20);
 768             rinv21           = avx256_invsqrt_f(rsq21);
 769             rinv22           = avx256_invsqrt_f(rsq22);
 770
 771             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
 772             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
 773             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
 774             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
 775             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
 776             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
 777             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
 778             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
 779             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
 780
 781             fjx0             = _mm256_setzero_ps();
 782             fjy0             = _mm256_setzero_ps();
 783             fjz0             = _mm256_setzero_ps();
 784             fjx1             = _mm256_setzero_ps();
 785             fjy1             = _mm256_setzero_ps();
 786             fjz1             = _mm256_setzero_ps();
 787             fjx2             = _mm256_setzero_ps();
 788             fjy2             = _mm256_setzero_ps();
 789             fjz2             = _mm256_setzero_ps();
 790
 791             /**************************
 792              * CALCULATE INTERACTIONS *
 793              **************************/
 794
 795             if (gmx_mm256_any_lt(rsq00,rcutoff2))
 796             {
 797
 798             /* REACTION-FIELD ELECTROSTATICS */
 799             velec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
 800             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
 801
 802             /* LENNARD-JONES DISPERSION/REPULSION */
 803
 804             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 805             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
 806             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
 807             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
 808                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
 809             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
 810
 811             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
 812
 813             /* Update potential sum for this i atom from the interaction with this j atom. */
 814             velec            = _mm256_and_ps(velec,cutoff_mask);
 815             velec            = _mm256_andnot_ps(dummy_mask,velec);
 816             velecsum         = _mm256_add_ps(velecsum,velec);
 817             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
 818             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
 819             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
 820
 821             fscal            = _mm256_add_ps(felec,fvdw);
 822
 823             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 824
 825             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 826
 827             /* Calculate temporary vectorial force */
 828             tx               = _mm256_mul_ps(fscal,dx00);
 829             ty               = _mm256_mul_ps(fscal,dy00);
 830             tz               = _mm256_mul_ps(fscal,dz00);
 831
 832             /* Update vectorial force */
 833             fix0             = _mm256_add_ps(fix0,tx);
 834             fiy0             = _mm256_add_ps(fiy0,ty);
 835             fiz0             = _mm256_add_ps(fiz0,tz);
 836
 837             fjx0             = _mm256_add_ps(fjx0,tx);
 838             fjy0             = _mm256_add_ps(fjy0,ty);
 839             fjz0             = _mm256_add_ps(fjz0,tz);
 840
 841             }
 842
 843             /**************************
 844              * CALCULATE INTERACTIONS *
 845              **************************/
 846
 847             if (gmx_mm256_any_lt(rsq01,rcutoff2))
 848             {
 849
 850             /* REACTION-FIELD ELECTROSTATICS */
 851             velec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
 852             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
 853
 854             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
 855
 856             /* Update potential sum for this i atom from the interaction with this j atom. */
 857             velec            = _mm256_and_ps(velec,cutoff_mask);
 858             velec            = _mm256_andnot_ps(dummy_mask,velec);
 859             velecsum         = _mm256_add_ps(velecsum,velec);
 860
 861             fscal            = felec;
 862
 863             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 864
 865             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 866
 867             /* Calculate temporary vectorial force */
 868             tx               = _mm256_mul_ps(fscal,dx01);
 869             ty               = _mm256_mul_ps(fscal,dy01);
 870             tz               = _mm256_mul_ps(fscal,dz01);
 871
 872             /* Update vectorial force */
 873             fix0             = _mm256_add_ps(fix0,tx);
 874             fiy0             = _mm256_add_ps(fiy0,ty);
 875             fiz0             = _mm256_add_ps(fiz0,tz);
 876
 877             fjx1             = _mm256_add_ps(fjx1,tx);
 878             fjy1             = _mm256_add_ps(fjy1,ty);
 879             fjz1             = _mm256_add_ps(fjz1,tz);
 880
 881             }
 882
 883             /**************************
 884              * CALCULATE INTERACTIONS *
 885              **************************/
 886
 887             if (gmx_mm256_any_lt(rsq02,rcutoff2))
 888             {
 889
 890             /* REACTION-FIELD ELECTROSTATICS */
 891             velec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
 892             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
 893
 894             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
 895
 896             /* Update potential sum for this i atom from the interaction with this j atom. */
 897             velec            = _mm256_and_ps(velec,cutoff_mask);
 898             velec            = _mm256_andnot_ps(dummy_mask,velec);
 899             velecsum         = _mm256_add_ps(velecsum,velec);
 900
 901             fscal            = felec;
 902
 903             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 904
 905             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 906
 907             /* Calculate temporary vectorial force */
 908             tx               = _mm256_mul_ps(fscal,dx02);
 909             ty               = _mm256_mul_ps(fscal,dy02);
 910             tz               = _mm256_mul_ps(fscal,dz02);
 911
 912             /* Update vectorial force */
 913             fix0             = _mm256_add_ps(fix0,tx);
 914             fiy0             = _mm256_add_ps(fiy0,ty);
 915             fiz0             = _mm256_add_ps(fiz0,tz);
 916
 917             fjx2             = _mm256_add_ps(fjx2,tx);
 918             fjy2             = _mm256_add_ps(fjy2,ty);
 919             fjz2             = _mm256_add_ps(fjz2,tz);
 920
 921             }
 922
 923             /**************************
 924              * CALCULATE INTERACTIONS *
 925              **************************/
 926
 927             if (gmx_mm256_any_lt(rsq10,rcutoff2))
 928             {
 929
 930             /* REACTION-FIELD ELECTROSTATICS */
 931             velec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
 932             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
 933
 934             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
 935
 936             /* Update potential sum for this i atom from the interaction with this j atom. */
 937             velec            = _mm256_and_ps(velec,cutoff_mask);
 938             velec            = _mm256_andnot_ps(dummy_mask,velec);
 939             velecsum         = _mm256_add_ps(velecsum,velec);
 940
 941             fscal            = felec;
 942
 943             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 944
 945             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 946
 947             /* Calculate temporary vectorial force */
 948             tx               = _mm256_mul_ps(fscal,dx10);
 949             ty               = _mm256_mul_ps(fscal,dy10);
 950             tz               = _mm256_mul_ps(fscal,dz10);
 951
 952             /* Update vectorial force */
 953             fix1             = _mm256_add_ps(fix1,tx);
 954             fiy1             = _mm256_add_ps(fiy1,ty);
 955             fiz1             = _mm256_add_ps(fiz1,tz);
 956
 957             fjx0             = _mm256_add_ps(fjx0,tx);
 958             fjy0             = _mm256_add_ps(fjy0,ty);
 959             fjz0             = _mm256_add_ps(fjz0,tz);
 960
 961             }
 962
 963             /**************************
 964              * CALCULATE INTERACTIONS *
 965              **************************/
 966
 967             if (gmx_mm256_any_lt(rsq11,rcutoff2))
 968             {
 969
 970             /* REACTION-FIELD ELECTROSTATICS */
 971             velec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
 972             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
 973
 974             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
 975
 976             /* Update potential sum for this i atom from the interaction with this j atom. */
 977             velec            = _mm256_and_ps(velec,cutoff_mask);
 978             velec            = _mm256_andnot_ps(dummy_mask,velec);
 979             velecsum         = _mm256_add_ps(velecsum,velec);
 980
 981             fscal            = felec;
 982
 983             fscal            = _mm256_and_ps(fscal,cutoff_mask);
 984
 985             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
 986
 987             /* Calculate temporary vectorial force */
 988             tx               = _mm256_mul_ps(fscal,dx11);
 989             ty               = _mm256_mul_ps(fscal,dy11);
 990             tz               = _mm256_mul_ps(fscal,dz11);
 991
 992             /* Update vectorial force */
 993             fix1             = _mm256_add_ps(fix1,tx);
 994             fiy1             = _mm256_add_ps(fiy1,ty);
 995             fiz1             = _mm256_add_ps(fiz1,tz);
 996
 997             fjx1             = _mm256_add_ps(fjx1,tx);
 998             fjy1             = _mm256_add_ps(fjy1,ty);
 999             fjz1             = _mm256_add_ps(fjz1,tz);
1000
1001             }
1002
1003             /**************************
1004              * CALCULATE INTERACTIONS *
1005              **************************/
1006
1007             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1008             {
1009
1010             /* REACTION-FIELD ELECTROSTATICS */
1011             velec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
1012             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1013
1014             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1015
1016             /* Update potential sum for this i atom from the interaction with this j atom. */
1017             velec            = _mm256_and_ps(velec,cutoff_mask);
1018             velec            = _mm256_andnot_ps(dummy_mask,velec);
1019             velecsum         = _mm256_add_ps(velecsum,velec);
1020
1021             fscal            = felec;
1022
1023             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1024
1025             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1026
1027             /* Calculate temporary vectorial force */
1028             tx               = _mm256_mul_ps(fscal,dx12);
1029             ty               = _mm256_mul_ps(fscal,dy12);
1030             tz               = _mm256_mul_ps(fscal,dz12);
1031
1032             /* Update vectorial force */
1033             fix1             = _mm256_add_ps(fix1,tx);
1034             fiy1             = _mm256_add_ps(fiy1,ty);
1035             fiz1             = _mm256_add_ps(fiz1,tz);
1036
1037             fjx2             = _mm256_add_ps(fjx2,tx);
1038             fjy2             = _mm256_add_ps(fjy2,ty);
1039             fjz2             = _mm256_add_ps(fjz2,tz);
1040
1041             }
1042
1043             /**************************
1044              * CALCULATE INTERACTIONS *
1045              **************************/
1046
1047             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1048             {
1049
1050             /* REACTION-FIELD ELECTROSTATICS */
1051             velec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
1052             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1053
1054             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1055
1056             /* Update potential sum for this i atom from the interaction with this j atom. */
1057             velec            = _mm256_and_ps(velec,cutoff_mask);
1058             velec            = _mm256_andnot_ps(dummy_mask,velec);
1059             velecsum         = _mm256_add_ps(velecsum,velec);
1060
1061             fscal            = felec;
1062
1063             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1064
1065             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1066
1067             /* Calculate temporary vectorial force */
1068             tx               = _mm256_mul_ps(fscal,dx20);
1069             ty               = _mm256_mul_ps(fscal,dy20);
1070             tz               = _mm256_mul_ps(fscal,dz20);
1071
1072             /* Update vectorial force */
1073             fix2             = _mm256_add_ps(fix2,tx);
1074             fiy2             = _mm256_add_ps(fiy2,ty);
1075             fiz2             = _mm256_add_ps(fiz2,tz);
1076
1077             fjx0             = _mm256_add_ps(fjx0,tx);
1078             fjy0             = _mm256_add_ps(fjy0,ty);
1079             fjz0             = _mm256_add_ps(fjz0,tz);
1080
1081             }
1082
1083             /**************************
1084              * CALCULATE INTERACTIONS *
1085              **************************/
1086
1087             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1088             {
1089
1090             /* REACTION-FIELD ELECTROSTATICS */
1091             velec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
1092             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1093
1094             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1095
1096             /* Update potential sum for this i atom from the interaction with this j atom. */
1097             velec            = _mm256_and_ps(velec,cutoff_mask);
1098             velec            = _mm256_andnot_ps(dummy_mask,velec);
1099             velecsum         = _mm256_add_ps(velecsum,velec);
1100
1101             fscal            = felec;
1102
1103             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1104
1105             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1106
1107             /* Calculate temporary vectorial force */
1108             tx               = _mm256_mul_ps(fscal,dx21);
1109             ty               = _mm256_mul_ps(fscal,dy21);
1110             tz               = _mm256_mul_ps(fscal,dz21);
1111
1112             /* Update vectorial force */
1113             fix2             = _mm256_add_ps(fix2,tx);
1114             fiy2             = _mm256_add_ps(fiy2,ty);
1115             fiz2             = _mm256_add_ps(fiz2,tz);
1116
1117             fjx1             = _mm256_add_ps(fjx1,tx);
1118             fjy1             = _mm256_add_ps(fjy1,ty);
1119             fjz1             = _mm256_add_ps(fjz1,tz);
1120
1121             }
1122
1123             /**************************
1124              * CALCULATE INTERACTIONS *
1125              **************************/
1126
1127             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1128             {
1129
1130             /* REACTION-FIELD ELECTROSTATICS */
1131             velec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
1132             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1133
1134             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1135
1136             /* Update potential sum for this i atom from the interaction with this j atom. */
1137             velec            = _mm256_and_ps(velec,cutoff_mask);
1138             velec            = _mm256_andnot_ps(dummy_mask,velec);
1139             velecsum         = _mm256_add_ps(velecsum,velec);
1140
1141             fscal            = felec;
1142
1143             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1144
1145             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1146
1147             /* Calculate temporary vectorial force */
1148             tx               = _mm256_mul_ps(fscal,dx22);
1149             ty               = _mm256_mul_ps(fscal,dy22);
1150             tz               = _mm256_mul_ps(fscal,dz22);
1151
1152             /* Update vectorial force */
1153             fix2             = _mm256_add_ps(fix2,tx);
1154             fiy2             = _mm256_add_ps(fiy2,ty);
1155             fiz2             = _mm256_add_ps(fiz2,tz);
1156
1157             fjx2             = _mm256_add_ps(fjx2,tx);
1158             fjy2             = _mm256_add_ps(fjy2,ty);
1159             fjz2             = _mm256_add_ps(fjz2,tz);
1160
1161             }
1162
1163             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1164             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1165             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1166             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1167             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1168             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1169             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1170             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1171
1172             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1173                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1174
1175             /* Inner loop uses 342 flops */
1176         }
1177
1178         /* End of innermost loop */
1179
1180         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1181                                                  f+i_coord_offset,fshift+i_shift_offset);
1182
1183         ggid                        = gid[iidx];
1184         /* Update potential energies */
1185         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1186         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1187
1188         /* Increment number of inner iterations */
1189         inneriter                  += j_index_end - j_index_start;
1190
1191         /* Outer loop uses 20 flops */
1192     }
1193
1194     /* Increment number of outer iterations */
1195     outeriter        += nri;
1196
1197     /* Update outer/inner flops */
1198
1199     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*342);
1200 }
1201 /*
1202  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_single
1203  * Electrostatics interaction: ReactionField
1204  * VdW interaction:            LennardJones
1205  * Geometry:                   Water3-Water3
1206  * Calculate force/pot:        Force
1207  */
1208 void
1209 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_single
1210                     (t_nblist                    * gmx_restrict       nlist,
1211                      rvec                        * gmx_restrict          xx,
1212                      rvec                        * gmx_restrict          ff,
1213                      struct t_forcerec           * gmx_restrict          fr,
1214                      t_mdatoms                   * gmx_restrict     mdatoms,
1215                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1216                      t_nrnb                      * gmx_restrict        nrnb)
1217 {
1218     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1219      * just 0 for non-waters.
1220      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1221      * jnr indices corresponding to data put in the four positions in the SIMD register.
1222      */
1223     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1224     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1225     int              jnrA,jnrB,jnrC,jnrD;
1226     int              jnrE,jnrF,jnrG,jnrH;
1227     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1228     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1229     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1230     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1231     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1232     real             rcutoff_scalar;
1233     real             *shiftvec,*fshift,*x,*f;
1234     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1235     real             scratch[4*DIM];
1236     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1237     real *           vdwioffsetptr0;
1238     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1239     real *           vdwioffsetptr1;
1240     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1241     real *           vdwioffsetptr2;
1242     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1243     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1244     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1245     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1246     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1247     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1248     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1249     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1250     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1251     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1252     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1253     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1254     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1255     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1256     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1257     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1258     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1259     real             *charge;
1260     int              nvdwtype;
1261     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1262     int              *vdwtype;
1263     real             *vdwparam;
1264     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1265     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1266     __m256           dummy_mask,cutoff_mask;
1267     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1268     __m256           one     = _mm256_set1_ps(1.0);
1269     __m256           two     = _mm256_set1_ps(2.0);
1270     x                = xx[0];
1271     f                = ff[0];
1272
1273     nri              = nlist->nri;
1274     iinr             = nlist->iinr;
1275     jindex           = nlist->jindex;
1276     jjnr             = nlist->jjnr;
1277     shiftidx         = nlist->shift;
1278     gid              = nlist->gid;
1279     shiftvec         = fr->shift_vec[0];
1280     fshift           = fr->fshift[0];
1281     facel            = _mm256_set1_ps(fr->ic->epsfac);
1282     charge           = mdatoms->chargeA;
1283     krf              = _mm256_set1_ps(fr->ic->k_rf);
1284     krf2             = _mm256_set1_ps(fr->ic->k_rf*2.0);
1285     crf              = _mm256_set1_ps(fr->ic->c_rf);
1286     nvdwtype         = fr->ntype;
1287     vdwparam         = fr->nbfp;
1288     vdwtype          = mdatoms->typeA;
1289
1290     /* Setup water-specific parameters */
1291     inr              = nlist->iinr[0];
1292     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1293     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1294     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1295     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1296
1297     jq0              = _mm256_set1_ps(charge[inr+0]);
1298     jq1              = _mm256_set1_ps(charge[inr+1]);
1299     jq2              = _mm256_set1_ps(charge[inr+2]);
1300     vdwjidx0A        = 2*vdwtype[inr+0];
1301     qq00             = _mm256_mul_ps(iq0,jq0);
1302     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1303     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1304     qq01             = _mm256_mul_ps(iq0,jq1);
1305     qq02             = _mm256_mul_ps(iq0,jq2);
1306     qq10             = _mm256_mul_ps(iq1,jq0);
1307     qq11             = _mm256_mul_ps(iq1,jq1);
1308     qq12             = _mm256_mul_ps(iq1,jq2);
1309     qq20             = _mm256_mul_ps(iq2,jq0);
1310     qq21             = _mm256_mul_ps(iq2,jq1);
1311     qq22             = _mm256_mul_ps(iq2,jq2);
1312
1313     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1314     rcutoff_scalar   = fr->ic->rcoulomb;
1315     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1316     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1317
1318     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
1319     rvdw             = _mm256_set1_ps(fr->ic->rvdw);
1320
1321     /* Avoid stupid compiler warnings */
1322     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1323     j_coord_offsetA = 0;
1324     j_coord_offsetB = 0;
1325     j_coord_offsetC = 0;
1326     j_coord_offsetD = 0;
1327     j_coord_offsetE = 0;
1328     j_coord_offsetF = 0;
1329     j_coord_offsetG = 0;
1330     j_coord_offsetH = 0;
1331
1332     outeriter        = 0;
1333     inneriter        = 0;
1334
1335     for(iidx=0;iidx<4*DIM;iidx++)
1336     {
1337         scratch[iidx] = 0.0;
1338     }
1339
1340     /* Start outer loop over neighborlists */
1341     for(iidx=0; iidx<nri; iidx++)
1342     {
1343         /* Load shift vector for this list */
1344         i_shift_offset   = DIM*shiftidx[iidx];
1345
1346         /* Load limits for loop over neighbors */
1347         j_index_start    = jindex[iidx];
1348         j_index_end      = jindex[iidx+1];
1349
1350         /* Get outer coordinate index */
1351         inr              = iinr[iidx];
1352         i_coord_offset   = DIM*inr;
1353
1354         /* Load i particle coords and add shift vector */
1355         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1356                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1357
1358         fix0             = _mm256_setzero_ps();
1359         fiy0             = _mm256_setzero_ps();
1360         fiz0             = _mm256_setzero_ps();
1361         fix1             = _mm256_setzero_ps();
1362         fiy1             = _mm256_setzero_ps();
1363         fiz1             = _mm256_setzero_ps();
1364         fix2             = _mm256_setzero_ps();
1365         fiy2             = _mm256_setzero_ps();
1366         fiz2             = _mm256_setzero_ps();
1367
1368         /* Start inner kernel loop */
1369         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1370         {
1371
1372             /* Get j neighbor index, and coordinate index */
1373             jnrA             = jjnr[jidx];
1374             jnrB             = jjnr[jidx+1];
1375             jnrC             = jjnr[jidx+2];
1376             jnrD             = jjnr[jidx+3];
1377             jnrE             = jjnr[jidx+4];
1378             jnrF             = jjnr[jidx+5];
1379             jnrG             = jjnr[jidx+6];
1380             jnrH             = jjnr[jidx+7];
1381             j_coord_offsetA  = DIM*jnrA;
1382             j_coord_offsetB  = DIM*jnrB;
1383             j_coord_offsetC  = DIM*jnrC;
1384             j_coord_offsetD  = DIM*jnrD;
1385             j_coord_offsetE  = DIM*jnrE;
1386             j_coord_offsetF  = DIM*jnrF;
1387             j_coord_offsetG  = DIM*jnrG;
1388             j_coord_offsetH  = DIM*jnrH;
1389
1390             /* load j atom coordinates */
1391             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1392                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1393                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1394                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1395                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1396
1397             /* Calculate displacement vector */
1398             dx00             = _mm256_sub_ps(ix0,jx0);
1399             dy00             = _mm256_sub_ps(iy0,jy0);
1400             dz00             = _mm256_sub_ps(iz0,jz0);
1401             dx01             = _mm256_sub_ps(ix0,jx1);
1402             dy01             = _mm256_sub_ps(iy0,jy1);
1403             dz01             = _mm256_sub_ps(iz0,jz1);
1404             dx02             = _mm256_sub_ps(ix0,jx2);
1405             dy02             = _mm256_sub_ps(iy0,jy2);
1406             dz02             = _mm256_sub_ps(iz0,jz2);
1407             dx10             = _mm256_sub_ps(ix1,jx0);
1408             dy10             = _mm256_sub_ps(iy1,jy0);
1409             dz10             = _mm256_sub_ps(iz1,jz0);
1410             dx11             = _mm256_sub_ps(ix1,jx1);
1411             dy11             = _mm256_sub_ps(iy1,jy1);
1412             dz11             = _mm256_sub_ps(iz1,jz1);
1413             dx12             = _mm256_sub_ps(ix1,jx2);
1414             dy12             = _mm256_sub_ps(iy1,jy2);
1415             dz12             = _mm256_sub_ps(iz1,jz2);
1416             dx20             = _mm256_sub_ps(ix2,jx0);
1417             dy20             = _mm256_sub_ps(iy2,jy0);
1418             dz20             = _mm256_sub_ps(iz2,jz0);
1419             dx21             = _mm256_sub_ps(ix2,jx1);
1420             dy21             = _mm256_sub_ps(iy2,jy1);
1421             dz21             = _mm256_sub_ps(iz2,jz1);
1422             dx22             = _mm256_sub_ps(ix2,jx2);
1423             dy22             = _mm256_sub_ps(iy2,jy2);
1424             dz22             = _mm256_sub_ps(iz2,jz2);
1425
1426             /* Calculate squared distance and things based on it */
1427             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1428             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1429             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1430             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1431             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1432             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1433             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1434             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1435             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1436
1437             rinv00           = avx256_invsqrt_f(rsq00);
1438             rinv01           = avx256_invsqrt_f(rsq01);
1439             rinv02           = avx256_invsqrt_f(rsq02);
1440             rinv10           = avx256_invsqrt_f(rsq10);
1441             rinv11           = avx256_invsqrt_f(rsq11);
1442             rinv12           = avx256_invsqrt_f(rsq12);
1443             rinv20           = avx256_invsqrt_f(rsq20);
1444             rinv21           = avx256_invsqrt_f(rsq21);
1445             rinv22           = avx256_invsqrt_f(rsq22);
1446
1447             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1448             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1449             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1450             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1451             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1452             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1453             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1454             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1455             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1456
1457             fjx0             = _mm256_setzero_ps();
1458             fjy0             = _mm256_setzero_ps();
1459             fjz0             = _mm256_setzero_ps();
1460             fjx1             = _mm256_setzero_ps();
1461             fjy1             = _mm256_setzero_ps();
1462             fjz1             = _mm256_setzero_ps();
1463             fjx2             = _mm256_setzero_ps();
1464             fjy2             = _mm256_setzero_ps();
1465             fjz2             = _mm256_setzero_ps();
1466
1467             /**************************
1468              * CALCULATE INTERACTIONS *
1469              **************************/
1470
1471             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1472             {
1473
1474             /* REACTION-FIELD ELECTROSTATICS */
1475             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1476
1477             /* LENNARD-JONES DISPERSION/REPULSION */
1478
1479             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1480             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1481
1482             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1483
1484             fscal            = _mm256_add_ps(felec,fvdw);
1485
1486             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1487
1488             /* Calculate temporary vectorial force */
1489             tx               = _mm256_mul_ps(fscal,dx00);
1490             ty               = _mm256_mul_ps(fscal,dy00);
1491             tz               = _mm256_mul_ps(fscal,dz00);
1492
1493             /* Update vectorial force */
1494             fix0             = _mm256_add_ps(fix0,tx);
1495             fiy0             = _mm256_add_ps(fiy0,ty);
1496             fiz0             = _mm256_add_ps(fiz0,tz);
1497
1498             fjx0             = _mm256_add_ps(fjx0,tx);
1499             fjy0             = _mm256_add_ps(fjy0,ty);
1500             fjz0             = _mm256_add_ps(fjz0,tz);
1501
1502             }
1503
1504             /**************************
1505              * CALCULATE INTERACTIONS *
1506              **************************/
1507
1508             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1509             {
1510
1511             /* REACTION-FIELD ELECTROSTATICS */
1512             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1513
1514             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1515
1516             fscal            = felec;
1517
1518             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1519
1520             /* Calculate temporary vectorial force */
1521             tx               = _mm256_mul_ps(fscal,dx01);
1522             ty               = _mm256_mul_ps(fscal,dy01);
1523             tz               = _mm256_mul_ps(fscal,dz01);
1524
1525             /* Update vectorial force */
1526             fix0             = _mm256_add_ps(fix0,tx);
1527             fiy0             = _mm256_add_ps(fiy0,ty);
1528             fiz0             = _mm256_add_ps(fiz0,tz);
1529
1530             fjx1             = _mm256_add_ps(fjx1,tx);
1531             fjy1             = _mm256_add_ps(fjy1,ty);
1532             fjz1             = _mm256_add_ps(fjz1,tz);
1533
1534             }
1535
1536             /**************************
1537              * CALCULATE INTERACTIONS *
1538              **************************/
1539
1540             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1541             {
1542
1543             /* REACTION-FIELD ELECTROSTATICS */
1544             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1545
1546             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1547
1548             fscal            = felec;
1549
1550             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1551
1552             /* Calculate temporary vectorial force */
1553             tx               = _mm256_mul_ps(fscal,dx02);
1554             ty               = _mm256_mul_ps(fscal,dy02);
1555             tz               = _mm256_mul_ps(fscal,dz02);
1556
1557             /* Update vectorial force */
1558             fix0             = _mm256_add_ps(fix0,tx);
1559             fiy0             = _mm256_add_ps(fiy0,ty);
1560             fiz0             = _mm256_add_ps(fiz0,tz);
1561
1562             fjx2             = _mm256_add_ps(fjx2,tx);
1563             fjy2             = _mm256_add_ps(fjy2,ty);
1564             fjz2             = _mm256_add_ps(fjz2,tz);
1565
1566             }
1567
1568             /**************************
1569              * CALCULATE INTERACTIONS *
1570              **************************/
1571
1572             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1573             {
1574
1575             /* REACTION-FIELD ELECTROSTATICS */
1576             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1577
1578             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1579
1580             fscal            = felec;
1581
1582             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1583
1584             /* Calculate temporary vectorial force */
1585             tx               = _mm256_mul_ps(fscal,dx10);
1586             ty               = _mm256_mul_ps(fscal,dy10);
1587             tz               = _mm256_mul_ps(fscal,dz10);
1588
1589             /* Update vectorial force */
1590             fix1             = _mm256_add_ps(fix1,tx);
1591             fiy1             = _mm256_add_ps(fiy1,ty);
1592             fiz1             = _mm256_add_ps(fiz1,tz);
1593
1594             fjx0             = _mm256_add_ps(fjx0,tx);
1595             fjy0             = _mm256_add_ps(fjy0,ty);
1596             fjz0             = _mm256_add_ps(fjz0,tz);
1597
1598             }
1599
1600             /**************************
1601              * CALCULATE INTERACTIONS *
1602              **************************/
1603
1604             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1605             {
1606
1607             /* REACTION-FIELD ELECTROSTATICS */
1608             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1609
1610             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1611
1612             fscal            = felec;
1613
1614             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1615
1616             /* Calculate temporary vectorial force */
1617             tx               = _mm256_mul_ps(fscal,dx11);
1618             ty               = _mm256_mul_ps(fscal,dy11);
1619             tz               = _mm256_mul_ps(fscal,dz11);
1620
1621             /* Update vectorial force */
1622             fix1             = _mm256_add_ps(fix1,tx);
1623             fiy1             = _mm256_add_ps(fiy1,ty);
1624             fiz1             = _mm256_add_ps(fiz1,tz);
1625
1626             fjx1             = _mm256_add_ps(fjx1,tx);
1627             fjy1             = _mm256_add_ps(fjy1,ty);
1628             fjz1             = _mm256_add_ps(fjz1,tz);
1629
1630             }
1631
1632             /**************************
1633              * CALCULATE INTERACTIONS *
1634              **************************/
1635
1636             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1637             {
1638
1639             /* REACTION-FIELD ELECTROSTATICS */
1640             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1641
1642             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1643
1644             fscal            = felec;
1645
1646             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1647
1648             /* Calculate temporary vectorial force */
1649             tx               = _mm256_mul_ps(fscal,dx12);
1650             ty               = _mm256_mul_ps(fscal,dy12);
1651             tz               = _mm256_mul_ps(fscal,dz12);
1652
1653             /* Update vectorial force */
1654             fix1             = _mm256_add_ps(fix1,tx);
1655             fiy1             = _mm256_add_ps(fiy1,ty);
1656             fiz1             = _mm256_add_ps(fiz1,tz);
1657
1658             fjx2             = _mm256_add_ps(fjx2,tx);
1659             fjy2             = _mm256_add_ps(fjy2,ty);
1660             fjz2             = _mm256_add_ps(fjz2,tz);
1661
1662             }
1663
1664             /**************************
1665              * CALCULATE INTERACTIONS *
1666              **************************/
1667
1668             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1669             {
1670
1671             /* REACTION-FIELD ELECTROSTATICS */
1672             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1673
1674             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1675
1676             fscal            = felec;
1677
1678             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1679
1680             /* Calculate temporary vectorial force */
1681             tx               = _mm256_mul_ps(fscal,dx20);
1682             ty               = _mm256_mul_ps(fscal,dy20);
1683             tz               = _mm256_mul_ps(fscal,dz20);
1684
1685             /* Update vectorial force */
1686             fix2             = _mm256_add_ps(fix2,tx);
1687             fiy2             = _mm256_add_ps(fiy2,ty);
1688             fiz2             = _mm256_add_ps(fiz2,tz);
1689
1690             fjx0             = _mm256_add_ps(fjx0,tx);
1691             fjy0             = _mm256_add_ps(fjy0,ty);
1692             fjz0             = _mm256_add_ps(fjz0,tz);
1693
1694             }
1695
1696             /**************************
1697              * CALCULATE INTERACTIONS *
1698              **************************/
1699
1700             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1701             {
1702
1703             /* REACTION-FIELD ELECTROSTATICS */
1704             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1705
1706             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1707
1708             fscal            = felec;
1709
1710             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1711
1712             /* Calculate temporary vectorial force */
1713             tx               = _mm256_mul_ps(fscal,dx21);
1714             ty               = _mm256_mul_ps(fscal,dy21);
1715             tz               = _mm256_mul_ps(fscal,dz21);
1716
1717             /* Update vectorial force */
1718             fix2             = _mm256_add_ps(fix2,tx);
1719             fiy2             = _mm256_add_ps(fiy2,ty);
1720             fiz2             = _mm256_add_ps(fiz2,tz);
1721
1722             fjx1             = _mm256_add_ps(fjx1,tx);
1723             fjy1             = _mm256_add_ps(fjy1,ty);
1724             fjz1             = _mm256_add_ps(fjz1,tz);
1725
1726             }
1727
1728             /**************************
1729              * CALCULATE INTERACTIONS *
1730              **************************/
1731
1732             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1733             {
1734
1735             /* REACTION-FIELD ELECTROSTATICS */
1736             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1737
1738             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1739
1740             fscal            = felec;
1741
1742             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1743
1744             /* Calculate temporary vectorial force */
1745             tx               = _mm256_mul_ps(fscal,dx22);
1746             ty               = _mm256_mul_ps(fscal,dy22);
1747             tz               = _mm256_mul_ps(fscal,dz22);
1748
1749             /* Update vectorial force */
1750             fix2             = _mm256_add_ps(fix2,tx);
1751             fiy2             = _mm256_add_ps(fiy2,ty);
1752             fiz2             = _mm256_add_ps(fiz2,tz);
1753
1754             fjx2             = _mm256_add_ps(fjx2,tx);
1755             fjy2             = _mm256_add_ps(fjy2,ty);
1756             fjz2             = _mm256_add_ps(fjz2,tz);
1757
1758             }
1759
1760             fjptrA             = f+j_coord_offsetA;
1761             fjptrB             = f+j_coord_offsetB;
1762             fjptrC             = f+j_coord_offsetC;
1763             fjptrD             = f+j_coord_offsetD;
1764             fjptrE             = f+j_coord_offsetE;
1765             fjptrF             = f+j_coord_offsetF;
1766             fjptrG             = f+j_coord_offsetG;
1767             fjptrH             = f+j_coord_offsetH;
1768
1769             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1770                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1771
1772             /* Inner loop uses 277 flops */
1773         }
1774
1775         if(jidx<j_index_end)
1776         {
1777
1778             /* Get j neighbor index, and coordinate index */
1779             jnrlistA         = jjnr[jidx];
1780             jnrlistB         = jjnr[jidx+1];
1781             jnrlistC         = jjnr[jidx+2];
1782             jnrlistD         = jjnr[jidx+3];
1783             jnrlistE         = jjnr[jidx+4];
1784             jnrlistF         = jjnr[jidx+5];
1785             jnrlistG         = jjnr[jidx+6];
1786             jnrlistH         = jjnr[jidx+7];
1787             /* Sign of each element will be negative for non-real atoms.
1788              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1789              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1790              */
1791             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1792                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1793
1794             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1795             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1796             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1797             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1798             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1799             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1800             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1801             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1802             j_coord_offsetA  = DIM*jnrA;
1803             j_coord_offsetB  = DIM*jnrB;
1804             j_coord_offsetC  = DIM*jnrC;
1805             j_coord_offsetD  = DIM*jnrD;
1806             j_coord_offsetE  = DIM*jnrE;
1807             j_coord_offsetF  = DIM*jnrF;
1808             j_coord_offsetG  = DIM*jnrG;
1809             j_coord_offsetH  = DIM*jnrH;
1810
1811             /* load j atom coordinates */
1812             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1813                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1814                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1815                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1816                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1817
1818             /* Calculate displacement vector */
1819             dx00             = _mm256_sub_ps(ix0,jx0);
1820             dy00             = _mm256_sub_ps(iy0,jy0);
1821             dz00             = _mm256_sub_ps(iz0,jz0);
1822             dx01             = _mm256_sub_ps(ix0,jx1);
1823             dy01             = _mm256_sub_ps(iy0,jy1);
1824             dz01             = _mm256_sub_ps(iz0,jz1);
1825             dx02             = _mm256_sub_ps(ix0,jx2);
1826             dy02             = _mm256_sub_ps(iy0,jy2);
1827             dz02             = _mm256_sub_ps(iz0,jz2);
1828             dx10             = _mm256_sub_ps(ix1,jx0);
1829             dy10             = _mm256_sub_ps(iy1,jy0);
1830             dz10             = _mm256_sub_ps(iz1,jz0);
1831             dx11             = _mm256_sub_ps(ix1,jx1);
1832             dy11             = _mm256_sub_ps(iy1,jy1);
1833             dz11             = _mm256_sub_ps(iz1,jz1);
1834             dx12             = _mm256_sub_ps(ix1,jx2);
1835             dy12             = _mm256_sub_ps(iy1,jy2);
1836             dz12             = _mm256_sub_ps(iz1,jz2);
1837             dx20             = _mm256_sub_ps(ix2,jx0);
1838             dy20             = _mm256_sub_ps(iy2,jy0);
1839             dz20             = _mm256_sub_ps(iz2,jz0);
1840             dx21             = _mm256_sub_ps(ix2,jx1);
1841             dy21             = _mm256_sub_ps(iy2,jy1);
1842             dz21             = _mm256_sub_ps(iz2,jz1);
1843             dx22             = _mm256_sub_ps(ix2,jx2);
1844             dy22             = _mm256_sub_ps(iy2,jy2);
1845             dz22             = _mm256_sub_ps(iz2,jz2);
1846
1847             /* Calculate squared distance and things based on it */
1848             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1849             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1850             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1851             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1852             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1853             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1854             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1855             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1856             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1857
1858             rinv00           = avx256_invsqrt_f(rsq00);
1859             rinv01           = avx256_invsqrt_f(rsq01);
1860             rinv02           = avx256_invsqrt_f(rsq02);
1861             rinv10           = avx256_invsqrt_f(rsq10);
1862             rinv11           = avx256_invsqrt_f(rsq11);
1863             rinv12           = avx256_invsqrt_f(rsq12);
1864             rinv20           = avx256_invsqrt_f(rsq20);
1865             rinv21           = avx256_invsqrt_f(rsq21);
1866             rinv22           = avx256_invsqrt_f(rsq22);
1867
1868             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1869             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1870             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1871             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1872             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1873             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1874             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1875             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1876             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1877
1878             fjx0             = _mm256_setzero_ps();
1879             fjy0             = _mm256_setzero_ps();
1880             fjz0             = _mm256_setzero_ps();
1881             fjx1             = _mm256_setzero_ps();
1882             fjy1             = _mm256_setzero_ps();
1883             fjz1             = _mm256_setzero_ps();
1884             fjx2             = _mm256_setzero_ps();
1885             fjy2             = _mm256_setzero_ps();
1886             fjz2             = _mm256_setzero_ps();
1887
1888             /**************************
1889              * CALCULATE INTERACTIONS *
1890              **************************/
1891
1892             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1893             {
1894
1895             /* REACTION-FIELD ELECTROSTATICS */
1896             felec            = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1897
1898             /* LENNARD-JONES DISPERSION/REPULSION */
1899
1900             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1901             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1902
1903             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1904
1905             fscal            = _mm256_add_ps(felec,fvdw);
1906
1907             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1908
1909             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1910
1911             /* Calculate temporary vectorial force */
1912             tx               = _mm256_mul_ps(fscal,dx00);
1913             ty               = _mm256_mul_ps(fscal,dy00);
1914             tz               = _mm256_mul_ps(fscal,dz00);
1915
1916             /* Update vectorial force */
1917             fix0             = _mm256_add_ps(fix0,tx);
1918             fiy0             = _mm256_add_ps(fiy0,ty);
1919             fiz0             = _mm256_add_ps(fiz0,tz);
1920
1921             fjx0             = _mm256_add_ps(fjx0,tx);
1922             fjy0             = _mm256_add_ps(fjy0,ty);
1923             fjz0             = _mm256_add_ps(fjz0,tz);
1924
1925             }
1926
1927             /**************************
1928              * CALCULATE INTERACTIONS *
1929              **************************/
1930
1931             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1932             {
1933
1934             /* REACTION-FIELD ELECTROSTATICS */
1935             felec            = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1936
1937             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1938
1939             fscal            = felec;
1940
1941             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1942
1943             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1944
1945             /* Calculate temporary vectorial force */
1946             tx               = _mm256_mul_ps(fscal,dx01);
1947             ty               = _mm256_mul_ps(fscal,dy01);
1948             tz               = _mm256_mul_ps(fscal,dz01);
1949
1950             /* Update vectorial force */
1951             fix0             = _mm256_add_ps(fix0,tx);
1952             fiy0             = _mm256_add_ps(fiy0,ty);
1953             fiz0             = _mm256_add_ps(fiz0,tz);
1954
1955             fjx1             = _mm256_add_ps(fjx1,tx);
1956             fjy1             = _mm256_add_ps(fjy1,ty);
1957             fjz1             = _mm256_add_ps(fjz1,tz);
1958
1959             }
1960
1961             /**************************
1962              * CALCULATE INTERACTIONS *
1963              **************************/
1964
1965             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1966             {
1967
1968             /* REACTION-FIELD ELECTROSTATICS */
1969             felec            = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1970
1971             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1972
1973             fscal            = felec;
1974
1975             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1976
1977             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1978
1979             /* Calculate temporary vectorial force */
1980             tx               = _mm256_mul_ps(fscal,dx02);
1981             ty               = _mm256_mul_ps(fscal,dy02);
1982             tz               = _mm256_mul_ps(fscal,dz02);
1983
1984             /* Update vectorial force */
1985             fix0             = _mm256_add_ps(fix0,tx);
1986             fiy0             = _mm256_add_ps(fiy0,ty);
1987             fiz0             = _mm256_add_ps(fiz0,tz);
1988
1989             fjx2             = _mm256_add_ps(fjx2,tx);
1990             fjy2             = _mm256_add_ps(fjy2,ty);
1991             fjz2             = _mm256_add_ps(fjz2,tz);
1992
1993             }
1994
1995             /**************************
1996              * CALCULATE INTERACTIONS *
1997              **************************/
1998
1999             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2000             {
2001
2002             /* REACTION-FIELD ELECTROSTATICS */
2003             felec            = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
2004
2005             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2006
2007             fscal            = felec;
2008
2009             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2010
2011             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2012
2013             /* Calculate temporary vectorial force */
2014             tx               = _mm256_mul_ps(fscal,dx10);
2015             ty               = _mm256_mul_ps(fscal,dy10);
2016             tz               = _mm256_mul_ps(fscal,dz10);
2017
2018             /* Update vectorial force */
2019             fix1             = _mm256_add_ps(fix1,tx);
2020             fiy1             = _mm256_add_ps(fiy1,ty);
2021             fiz1             = _mm256_add_ps(fiz1,tz);
2022
2023             fjx0             = _mm256_add_ps(fjx0,tx);
2024             fjy0             = _mm256_add_ps(fjy0,ty);
2025             fjz0             = _mm256_add_ps(fjz0,tz);
2026
2027             }
2028
2029             /**************************
2030              * CALCULATE INTERACTIONS *
2031              **************************/
2032
2033             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2034             {
2035
2036             /* REACTION-FIELD ELECTROSTATICS */
2037             felec            = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
2038
2039             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2040
2041             fscal            = felec;
2042
2043             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2044
2045             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2046
2047             /* Calculate temporary vectorial force */
2048             tx               = _mm256_mul_ps(fscal,dx11);
2049             ty               = _mm256_mul_ps(fscal,dy11);
2050             tz               = _mm256_mul_ps(fscal,dz11);
2051
2052             /* Update vectorial force */
2053             fix1             = _mm256_add_ps(fix1,tx);
2054             fiy1             = _mm256_add_ps(fiy1,ty);
2055             fiz1             = _mm256_add_ps(fiz1,tz);
2056
2057             fjx1             = _mm256_add_ps(fjx1,tx);
2058             fjy1             = _mm256_add_ps(fjy1,ty);
2059             fjz1             = _mm256_add_ps(fjz1,tz);
2060
2061             }
2062
2063             /**************************
2064              * CALCULATE INTERACTIONS *
2065              **************************/
2066
2067             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2068             {
2069
2070             /* REACTION-FIELD ELECTROSTATICS */
2071             felec            = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
2072
2073             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2074
2075             fscal            = felec;
2076
2077             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2078
2079             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2080
2081             /* Calculate temporary vectorial force */
2082             tx               = _mm256_mul_ps(fscal,dx12);
2083             ty               = _mm256_mul_ps(fscal,dy12);
2084             tz               = _mm256_mul_ps(fscal,dz12);
2085
2086             /* Update vectorial force */
2087             fix1             = _mm256_add_ps(fix1,tx);
2088             fiy1             = _mm256_add_ps(fiy1,ty);
2089             fiz1             = _mm256_add_ps(fiz1,tz);
2090
2091             fjx2             = _mm256_add_ps(fjx2,tx);
2092             fjy2             = _mm256_add_ps(fjy2,ty);
2093             fjz2             = _mm256_add_ps(fjz2,tz);
2094
2095             }
2096
2097             /**************************
2098              * CALCULATE INTERACTIONS *
2099              **************************/
2100
2101             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2102             {
2103
2104             /* REACTION-FIELD ELECTROSTATICS */
2105             felec            = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
2106
2107             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2108
2109             fscal            = felec;
2110
2111             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2112
2113             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2114
2115             /* Calculate temporary vectorial force */
2116             tx               = _mm256_mul_ps(fscal,dx20);
2117             ty               = _mm256_mul_ps(fscal,dy20);
2118             tz               = _mm256_mul_ps(fscal,dz20);
2119
2120             /* Update vectorial force */
2121             fix2             = _mm256_add_ps(fix2,tx);
2122             fiy2             = _mm256_add_ps(fiy2,ty);
2123             fiz2             = _mm256_add_ps(fiz2,tz);
2124
2125             fjx0             = _mm256_add_ps(fjx0,tx);
2126             fjy0             = _mm256_add_ps(fjy0,ty);
2127             fjz0             = _mm256_add_ps(fjz0,tz);
2128
2129             }
2130
2131             /**************************
2132              * CALCULATE INTERACTIONS *
2133              **************************/
2134
2135             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2136             {
2137
2138             /* REACTION-FIELD ELECTROSTATICS */
2139             felec            = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2140
2141             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2142
2143             fscal            = felec;
2144
2145             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2146
2147             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2148
2149             /* Calculate temporary vectorial force */
2150             tx               = _mm256_mul_ps(fscal,dx21);
2151             ty               = _mm256_mul_ps(fscal,dy21);
2152             tz               = _mm256_mul_ps(fscal,dz21);
2153
2154             /* Update vectorial force */
2155             fix2             = _mm256_add_ps(fix2,tx);
2156             fiy2             = _mm256_add_ps(fiy2,ty);
2157             fiz2             = _mm256_add_ps(fiz2,tz);
2158
2159             fjx1             = _mm256_add_ps(fjx1,tx);
2160             fjy1             = _mm256_add_ps(fjy1,ty);
2161             fjz1             = _mm256_add_ps(fjz1,tz);
2162
2163             }
2164
2165             /**************************
2166              * CALCULATE INTERACTIONS *
2167              **************************/
2168
2169             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2170             {
2171
2172             /* REACTION-FIELD ELECTROSTATICS */
2173             felec            = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2174
2175             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2176
2177             fscal            = felec;
2178
2179             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2180
2181             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2182
2183             /* Calculate temporary vectorial force */
2184             tx               = _mm256_mul_ps(fscal,dx22);
2185             ty               = _mm256_mul_ps(fscal,dy22);
2186             tz               = _mm256_mul_ps(fscal,dz22);
2187
2188             /* Update vectorial force */
2189             fix2             = _mm256_add_ps(fix2,tx);
2190             fiy2             = _mm256_add_ps(fiy2,ty);
2191             fiz2             = _mm256_add_ps(fiz2,tz);
2192
2193             fjx2             = _mm256_add_ps(fjx2,tx);
2194             fjy2             = _mm256_add_ps(fjy2,ty);
2195             fjz2             = _mm256_add_ps(fjz2,tz);
2196
2197             }
2198
2199             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2200             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2201             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2202             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2203             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2204             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2205             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2206             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2207
2208             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2209                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2210
2211             /* Inner loop uses 277 flops */
2212         }
2213
2214         /* End of innermost loop */
2215
2216         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2217                                                  f+i_coord_offset,fshift+i_shift_offset);
2218
2219         /* Increment number of inner iterations */
2220         inneriter                  += j_index_end - j_index_start;
2221
2222         /* Outer loop uses 18 flops */
2223     }
2224
2225     /* Increment number of outer iterations */
2226     outeriter        += nri;
2227
2228     /* Update outer/inner flops */
2229
2230     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
2231 }