src/gromacs/nbnxm/pairlist.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "pairlist.h"
  39
  40 #include "config.h"
  41
  42 #include <cassert>
  43 #include <cmath>
  44 #include <cstring>
  45
  46 #include <algorithm>
  47
  48 #include "gromacs/domdec/domdec_struct.h"
  49 #include "gromacs/gmxlib/nrnb.h"
  50 #include "gromacs/math/functions.h"
  51 #include "gromacs/math/utilities.h"
  52 #include "gromacs/math/vec.h"
  53 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  54 #include "gromacs/mdlib/ns.h"
  55 #include "gromacs/mdtypes/group.h"
  56 #include "gromacs/mdtypes/md_enums.h"
  57 #include "gromacs/nbnxm/atomdata.h"
  58 #include "gromacs/nbnxm/gpu_data_mgmt.h"
  59 #include "gromacs/nbnxm/nbnxm.h"
  60 #include "gromacs/nbnxm/nbnxm_geometry.h"
  61 #include "gromacs/nbnxm/nbnxm_simd.h"
  62 #include "gromacs/nbnxm/pairlistset.h"
  63 #include "gromacs/pbcutil/ishift.h"
  64 #include "gromacs/pbcutil/pbc.h"
  65 #include "gromacs/simd/simd.h"
  66 #include "gromacs/simd/vector_operations.h"
  67 #include "gromacs/topology/block.h"
  68 #include "gromacs/utility/exceptions.h"
  69 #include "gromacs/utility/fatalerror.h"
  70 #include "gromacs/utility/gmxomp.h"
  71 #include "gromacs/utility/smalloc.h"
  72
  73 #include "grid.h"
  74 #include "internal.h"
  75 #include "pairlistwork.h"
  76
  77 using namespace gmx;                        // TODO: Remove when this file is moved into gmx namespace
  78
  79 using BoundingBox   = Nbnxm::BoundingBox;   // TODO: Remove when refactoring this file
  80 using BoundingBox1D = Nbnxm::BoundingBox1D; // TODO: Remove when refactoring this file
  81
  82 using Grid          = Nbnxm::Grid;          // TODO: Remove when refactoring this file
  83
  84 // Convience alias for partial Nbnxn namespace usage
  85 using InteractionLocality = Nbnxm::InteractionLocality;
  86
  87 /* We shift the i-particles backward for PBC.
  88  * This leads to more conditionals than shifting forward.
  89  * We do this to get more balanced pair lists.
  90  */
  91 constexpr bool c_pbcShiftBackward = true;
  92
  93
  94 static void nbs_cycle_clear(nbnxn_cycle_t *cc)
  95 {
  96     for (int i = 0; i < enbsCCnr; i++)
  97     {
  98         cc[i].count = 0;
  99         cc[i].c     = 0;
 100     }
 101 }
 102
 103 static double Mcyc_av(const nbnxn_cycle_t *cc)
 104 {
 105     return static_cast<double>(cc->c)*1e-6/cc->count;
 106 }
 107
 108 static void nbs_cycle_print(FILE *fp, const nbnxn_search *nbs)
 109 {
 110     fprintf(fp, "\n");
 111     fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
 112             nbs->cc[enbsCCgrid].count,
 113             Mcyc_av(&nbs->cc[enbsCCgrid]),
 114             Mcyc_av(&nbs->cc[enbsCCsearch]),
 115             Mcyc_av(&nbs->cc[enbsCCreducef]));
 116
 117     if (nbs->work.size() > 1)
 118     {
 119         if (nbs->cc[enbsCCcombine].count > 0)
 120         {
 121             fprintf(fp, " comb %5.2f",
 122                     Mcyc_av(&nbs->cc[enbsCCcombine]));
 123         }
 124         fprintf(fp, " s. th");
 125         for (const nbnxn_search_work_t &work : nbs->work)
 126         {
 127             fprintf(fp, " %4.1f",
 128                     Mcyc_av(&work.cc[enbsCCsearch]));
 129         }
 130     }
 131     fprintf(fp, "\n");
 132 }
 133
 134 /* Layout for the nonbonded NxN pair lists */
 135 enum class NbnxnLayout
 136 {
 137     NoSimd4x4, // i-cluster size 4, j-cluster size 4
 138     Simd4xN,   // i-cluster size 4, j-cluster size SIMD width
 139     Simd2xNN,  // i-cluster size 4, j-cluster size half SIMD width
 140     Gpu8x8x8   // i-cluster size 8, j-cluster size 8 + super-clustering
 141 };
 142
 143 #if GMX_SIMD
 144 /* Returns the j-cluster size */
 145 template <NbnxnLayout layout>
 146 static constexpr int jClusterSize()
 147 {
 148     static_assert(layout == NbnxnLayout::NoSimd4x4 || layout == NbnxnLayout::Simd4xN || layout == NbnxnLayout::Simd2xNN, "Currently jClusterSize only supports CPU layouts");
 149
 150     return layout == NbnxnLayout::Simd4xN ? GMX_SIMD_REAL_WIDTH : (layout == NbnxnLayout::Simd2xNN ? GMX_SIMD_REAL_WIDTH/2 : c_nbnxnCpuIClusterSize);
 151 }
 152
 153 /*! \brief Returns the j-cluster index given the i-cluster index.
 154  *
 155  * \tparam    jClusterSize      The number of atoms in a j-cluster
 156  * \tparam    jSubClusterIndex  The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
 157  * \param[in] ci                The i-cluster index
 158  */
 159 template <int jClusterSize, int jSubClusterIndex>
 160 static inline int cjFromCi(int ci)
 161 {
 162     static_assert(jClusterSize == c_nbnxnCpuIClusterSize/2 || jClusterSize == c_nbnxnCpuIClusterSize || jClusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
 163
 164     static_assert(jSubClusterIndex == 0 || jSubClusterIndex == 1,
 165                   "Only sub-cluster indices 0 and 1 are supported");
 166
 167     if (jClusterSize == c_nbnxnCpuIClusterSize/2)
 168     {
 169         if (jSubClusterIndex == 0)
 170         {
 171             return ci << 1;
 172         }
 173         else
 174         {
 175             return ((ci + 1) << 1) - 1;
 176         }
 177     }
 178     else if (jClusterSize == c_nbnxnCpuIClusterSize)
 179     {
 180         return ci;
 181     }
 182     else
 183     {
 184         return ci >> 1;
 185     }
 186 }
 187
 188 /*! \brief Returns the j-cluster index given the i-cluster index.
 189  *
 190  * \tparam    layout            The pair-list layout
 191  * \tparam    jSubClusterIndex  The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
 192  * \param[in] ci                The i-cluster index
 193  */
 194 template <NbnxnLayout layout, int jSubClusterIndex>
 195 static inline int cjFromCi(int ci)
 196 {
 197     constexpr int clusterSize = jClusterSize<layout>();
 198
 199     return cjFromCi<clusterSize, jSubClusterIndex>(ci);
 200 }
 201
 202 /* Returns the nbnxn coordinate data index given the i-cluster index */
 203 template <NbnxnLayout layout>
 204 static inline int xIndexFromCi(int ci)
 205 {
 206     constexpr int clusterSize = jClusterSize<layout>();
 207
 208     static_assert(clusterSize == c_nbnxnCpuIClusterSize/2 || clusterSize == c_nbnxnCpuIClusterSize || clusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
 209
 210     if (clusterSize <= c_nbnxnCpuIClusterSize)
 211     {
 212         /* Coordinates are stored packed in groups of 4 */
 213         return ci*STRIDE_P4;
 214     }
 215     else
 216     {
 217         /* Coordinates packed in 8, i-cluster size is half the packing width */
 218         return (ci >> 1)*STRIDE_P8 + (ci & 1)*(c_packX8 >> 1);
 219     }
 220 }
 221
 222 /* Returns the nbnxn coordinate data index given the j-cluster index */
 223 template <NbnxnLayout layout>
 224 static inline int xIndexFromCj(int cj)
 225 {
 226     constexpr int clusterSize = jClusterSize<layout>();
 227
 228     static_assert(clusterSize == c_nbnxnCpuIClusterSize/2 || clusterSize == c_nbnxnCpuIClusterSize || clusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
 229
 230     if (clusterSize == c_nbnxnCpuIClusterSize/2)
 231     {
 232         /* Coordinates are stored packed in groups of 4 */
 233         return (cj >> 1)*STRIDE_P4 + (cj & 1)*(c_packX4 >> 1);
 234     }
 235     else if (clusterSize == c_nbnxnCpuIClusterSize)
 236     {
 237         /* Coordinates are stored packed in groups of 4 */
 238         return cj*STRIDE_P4;
 239     }
 240     else
 241     {
 242         /* Coordinates are stored packed in groups of 8 */
 243         return cj*STRIDE_P8;
 244     }
 245 }
 246 #endif //GMX_SIMD
 247
 248 /* Initializes a single nbnxn_pairlist_t data structure */
 249 static void nbnxn_init_pairlist_fep(t_nblist *nl)
 250 {
 251     nl->type        = GMX_NBLIST_INTERACTION_FREE_ENERGY;
 252     nl->igeometry   = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
 253     /* The interaction functions are set in the free energy kernel fuction */
 254     nl->ivdw        = -1;
 255     nl->ivdwmod     = -1;
 256     nl->ielec       = -1;
 257     nl->ielecmod    = -1;
 258
 259     nl->maxnri      = 0;
 260     nl->maxnrj      = 0;
 261     nl->nri         = 0;
 262     nl->nrj         = 0;
 263     nl->iinr        = nullptr;
 264     nl->gid         = nullptr;
 265     nl->shift       = nullptr;
 266     nl->jindex      = nullptr;
 267     nl->jjnr        = nullptr;
 268     nl->excl_fep    = nullptr;
 269
 270 }
 271
 272 static void free_nblist(t_nblist *nl)
 273 {
 274     sfree(nl->iinr);
 275     sfree(nl->gid);
 276     sfree(nl->shift);
 277     sfree(nl->jindex);
 278     sfree(nl->jjnr);
 279     sfree(nl->excl_fep);
 280 }
 281
 282 nbnxn_search_work_t::nbnxn_search_work_t() :
 283     cp0({{0}}
 284         ),
 285     buffer_flags({0, nullptr, 0}),
 286     ndistc(0),
 287     nbl_fep(new t_nblist),
 288     cp1({{0}})
 289 {
 290     nbnxn_init_pairlist_fep(nbl_fep.get());
 291
 292     nbs_cycle_clear(cc);
 293 }
 294
 295 nbnxn_search_work_t::~nbnxn_search_work_t()
 296 {
 297     sfree(buffer_flags.flag);
 298
 299     free_nblist(nbl_fep.get());
 300 }
 301
 302 nbnxn_search::DomainSetup::DomainSetup(const int                 ePBC,
 303                                        const ivec               *numDDCells,
 304                                        const gmx_domdec_zones_t *ddZones) :
 305     ePBC(ePBC),
 306     haveDomDec(numDDCells != nullptr),
 307     zones(ddZones)
 308 {
 309     for (int d = 0; d < DIM; d++)
 310     {
 311         haveDomDecPerDim[d] = (numDDCells != nullptr && (*numDDCells)[d] > 1);
 312     }
 313 }
 314
 315 nbnxn_search::nbnxn_search(const int                 ePBC,
 316                            const ivec               *numDDCells,
 317                            const gmx_domdec_zones_t *ddZones,
 318                            const PairlistType        pairlistType,
 319                            const bool                haveFep,
 320                            const int                 maxNumThreads) :
 321     domainSetup_(ePBC, numDDCells, ddZones),
 322     gridSet_(domainSetup_.haveDomDecPerDim, pairlistType, haveFep, maxNumThreads),
 323     search_count(0),
 324     work(maxNumThreads)
 325 {
 326     print_cycles = (getenv("GMX_NBNXN_CYCLE") != nullptr);
 327     nbs_cycle_clear(cc);
 328 }
 329
 330 static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
 331                               int                   natoms)
 332 {
 333     flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
 334     if (flags->nflag > flags->flag_nalloc)
 335     {
 336         flags->flag_nalloc = over_alloc_large(flags->nflag);
 337         srenew(flags->flag, flags->flag_nalloc);
 338     }
 339     for (int b = 0; b < flags->nflag; b++)
 340     {
 341         bitmask_clear(&(flags->flag[b]));
 342     }
 343 }
 344
 345 /* Returns the pair-list cutoff between a bounding box and a grid cell given an atom-to-atom pair-list cutoff
 346  *
 347  * Given a cutoff distance between atoms, this functions returns the cutoff
 348  * distance2 between a bounding box of a group of atoms and a grid cell.
 349  * Since atoms can be geometrically outside of the cell they have been
 350  * assigned to (when atom groups instead of individual atoms are assigned
 351  * to cells), this distance returned can be larger than the input.
 352  */
 353 static real
 354 listRangeForBoundingBoxToGridCell(real                    rlist,
 355                                   const Grid::Dimensions &gridDims)
 356 {
 357     return rlist + gridDims.maxAtomGroupRadius;
 358
 359 }
 360 /* Returns the pair-list cutoff between a grid cells given an atom-to-atom pair-list cutoff
 361  *
 362  * Given a cutoff distance between atoms, this functions returns the cutoff
 363  * distance2 between two grid cells.
 364  * Since atoms can be geometrically outside of the cell they have been
 365  * assigned to (when atom groups instead of individual atoms are assigned
 366  * to cells), this distance returned can be larger than the input.
 367  */
 368 static real
 369 listRangeForGridCellToGridCell(real                    rlist,
 370                                const Grid::Dimensions &iGridDims,
 371                                const Grid::Dimensions &jGridDims)
 372 {
 373     return rlist + iGridDims.maxAtomGroupRadius + jGridDims.maxAtomGroupRadius;
 374 }
 375
 376 /* Determines the cell range along one dimension that
 377  * the bounding box b0 - b1 sees.
 378  */
 379 template<int dim>
 380 static void get_cell_range(real b0, real b1,
 381                            const Grid::Dimensions &jGridDims,
 382                            real d2, real rlist, int *cf, int *cl)
 383 {
 384     real listRangeBBToCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGridDims));
 385     real distanceInCells    = (b0 - jGridDims.lowerCorner[dim])*jGridDims.invCellSize[dim];
 386     *cf                     = std::max(static_cast<int>(distanceInCells), 0);
 387
 388     while (*cf > 0 &&
 389            d2 + gmx::square((b0 - jGridDims.lowerCorner[dim]) - (*cf - 1 + 1)*jGridDims.cellSize[dim]) < listRangeBBToCell2)
 390     {
 391         (*cf)--;
 392     }
 393
 394     *cl = std::min(static_cast<int>((b1 - jGridDims.lowerCorner[dim])*jGridDims.invCellSize[dim]), jGridDims.numCells[dim] - 1);
 395     while (*cl < jGridDims.numCells[dim] - 1 &&
 396            d2 + gmx::square((*cl + 1)*jGridDims.cellSize[dim] - (b1 - jGridDims.lowerCorner[dim])) < listRangeBBToCell2)
 397     {
 398         (*cl)++;
 399     }
 400 }
 401
 402 /* Reference code calculating the distance^2 between two bounding boxes */
 403 /*
 404    static float box_dist2(float bx0, float bx1, float by0,
 405                        float by1, float bz0, float bz1,
 406                        const BoundingBox *bb)
 407    {
 408     float d2;
 409     float dl, dh, dm, dm0;
 410
 411     d2 = 0;
 412
 413     dl  = bx0 - bb->upper.x;
 414     dh  = bb->lower.x - bx1;
 415     dm  = std::max(dl, dh);
 416     dm0 = std::max(dm, 0.0f);
 417     d2 += dm0*dm0;
 418
 419     dl  = by0 - bb->upper.y;
 420     dh  = bb->lower.y - by1;
 421     dm  = std::max(dl, dh);
 422     dm0 = std::max(dm, 0.0f);
 423     d2 += dm0*dm0;
 424
 425     dl  = bz0 - bb->upper.z;
 426     dh  = bb->lower.z - bz1;
 427     dm  = std::max(dl, dh);
 428     dm0 = std::max(dm, 0.0f);
 429     d2 += dm0*dm0;
 430
 431     return d2;
 432    }
 433  */
 434
 435 #if !NBNXN_SEARCH_BB_SIMD4
 436
 437 /*! \brief Plain C code calculating the distance^2 between two bounding boxes in xyz0 format
 438  *
 439  * \param[in] bb_i  First bounding box
 440  * \param[in] bb_j  Second bounding box
 441  */
 442 static float clusterBoundingBoxDistance2(const BoundingBox &bb_i,
 443                                          const BoundingBox &bb_j)
 444 {
 445     float dl   = bb_i.lower.x - bb_j.upper.x;
 446     float dh   = bb_j.lower.x - bb_i.upper.x;
 447     float dm   = std::max(dl, dh);
 448     float dm0  = std::max(dm, 0.0f);
 449     float d2   = dm0*dm0;
 450
 451     dl         = bb_i.lower.y - bb_j.upper.y;
 452     dh         = bb_j.lower.y - bb_i.upper.y;
 453     dm         = std::max(dl, dh);
 454     dm0        = std::max(dm, 0.0f);
 455     d2        += dm0*dm0;
 456
 457     dl         = bb_i.lower.z - bb_j.upper.z;
 458     dh         = bb_j.lower.z - bb_i.upper.z;
 459     dm         = std::max(dl, dh);
 460     dm0        = std::max(dm, 0.0f);
 461     d2        += dm0*dm0;
 462
 463     return d2;
 464 }
 465
 466 #else /* NBNXN_SEARCH_BB_SIMD4 */
 467
 468 /*! \brief 4-wide SIMD code calculating the distance^2 between two bounding boxes in xyz0 format
 469  *
 470  * \param[in] bb_i  First bounding box, should be aligned for 4-wide SIMD
 471  * \param[in] bb_j  Second bounding box, should be aligned for 4-wide SIMD
 472  */
 473 static float clusterBoundingBoxDistance2(const BoundingBox &bb_i,
 474                                          const BoundingBox &bb_j)
 475 {
 476     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 477     using namespace gmx;
 478
 479     const Simd4Float bb_i_S0 = load4(bb_i.lower.ptr());
 480     const Simd4Float bb_i_S1 = load4(bb_i.upper.ptr());
 481     const Simd4Float bb_j_S0 = load4(bb_j.lower.ptr());
 482     const Simd4Float bb_j_S1 = load4(bb_j.upper.ptr());
 483
 484     const Simd4Float dl_S    = bb_i_S0 - bb_j_S1;
 485     const Simd4Float dh_S    = bb_j_S0 - bb_i_S1;
 486
 487     const Simd4Float dm_S    = max(dl_S, dh_S);
 488     const Simd4Float dm0_S   = max(dm_S, simd4SetZeroF());
 489
 490     return dotProduct(dm0_S, dm0_S);
 491 }
 492
 493 /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
 494 template <int boundingBoxStart>
 495 static inline void gmx_simdcall
 496 clusterBoundingBoxDistance2_xxxx_simd4_inner(const float      *bb_i,
 497                                              float            *d2,
 498                                              const Simd4Float  xj_l,
 499                                              const Simd4Float  yj_l,
 500                                              const Simd4Float  zj_l,
 501                                              const Simd4Float  xj_h,
 502                                              const Simd4Float  yj_h,
 503                                              const Simd4Float  zj_h)
 504 {
 505     constexpr int    stride = c_packedBoundingBoxesDimSize;
 506
 507     const int        shi  = boundingBoxStart*Nbnxm::c_numBoundingBoxBounds1D*DIM;
 508
 509     const Simd4Float zero = setZero();
 510
 511     const Simd4Float xi_l = load4(bb_i + shi + 0*stride);
 512     const Simd4Float yi_l = load4(bb_i + shi + 1*stride);
 513     const Simd4Float zi_l = load4(bb_i + shi + 2*stride);
 514     const Simd4Float xi_h = load4(bb_i + shi + 3*stride);
 515     const Simd4Float yi_h = load4(bb_i + shi + 4*stride);
 516     const Simd4Float zi_h = load4(bb_i + shi + 5*stride);
 517
 518     const Simd4Float dx_0 = xi_l - xj_h;
 519     const Simd4Float dy_0 = yi_l - yj_h;
 520     const Simd4Float dz_0 = zi_l - zj_h;
 521
 522     const Simd4Float dx_1 = xj_l - xi_h;
 523     const Simd4Float dy_1 = yj_l - yi_h;
 524     const Simd4Float dz_1 = zj_l - zi_h;
 525
 526     const Simd4Float mx   = max(dx_0, dx_1);
 527     const Simd4Float my   = max(dy_0, dy_1);
 528     const Simd4Float mz   = max(dz_0, dz_1);
 529
 530     const Simd4Float m0x  = max(mx, zero);
 531     const Simd4Float m0y  = max(my, zero);
 532     const Simd4Float m0z  = max(mz, zero);
 533
 534     const Simd4Float d2x  = m0x * m0x;
 535     const Simd4Float d2y  = m0y * m0y;
 536     const Simd4Float d2z  = m0z * m0z;
 537
 538     const Simd4Float d2s  = d2x + d2y;
 539     const Simd4Float d2t  = d2s + d2z;
 540
 541     store4(d2 + boundingBoxStart, d2t);
 542 }
 543
 544 /* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
 545 static void
 546 clusterBoundingBoxDistance2_xxxx_simd4(const float *bb_j,
 547                                        const int    nsi,
 548                                        const float *bb_i,
 549                                        float       *d2)
 550 {
 551     constexpr int    stride = c_packedBoundingBoxesDimSize;
 552
 553     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 554     using namespace gmx;
 555
 556     const Simd4Float xj_l = Simd4Float(bb_j[0*stride]);
 557     const Simd4Float yj_l = Simd4Float(bb_j[1*stride]);
 558     const Simd4Float zj_l = Simd4Float(bb_j[2*stride]);
 559     const Simd4Float xj_h = Simd4Float(bb_j[3*stride]);
 560     const Simd4Float yj_h = Simd4Float(bb_j[4*stride]);
 561     const Simd4Float zj_h = Simd4Float(bb_j[5*stride]);
 562
 563     /* Here we "loop" over si (0,stride) from 0 to nsi with step stride.
 564      * But as we know the number of iterations is 1 or 2, we unroll manually.
 565      */
 566     clusterBoundingBoxDistance2_xxxx_simd4_inner<0>(bb_i, d2,
 567                                                     xj_l, yj_l, zj_l,
 568                                                     xj_h, yj_h, zj_h);
 569     if (stride < nsi)
 570     {
 571         clusterBoundingBoxDistance2_xxxx_simd4_inner<stride>(bb_i, d2,
 572                                                              xj_l, yj_l, zj_l,
 573                                                              xj_h, yj_h, zj_h);
 574     }
 575 }
 576
 577 #endif /* NBNXN_SEARCH_BB_SIMD4 */
 578
 579
 580 /* Returns if any atom pair from two clusters is within distance sqrt(rlist2) */
 581 static inline gmx_bool
 582 clusterpair_in_range(const NbnxnPairlistGpuWork &work,
 583                      int si,
 584                      int csj, int stride, const real *x_j,
 585                      real rlist2)
 586 {
 587 #if !GMX_SIMD4_HAVE_REAL
 588
 589     /* Plain C version.
 590      * All coordinates are stored as xyzxyz...
 591      */
 592
 593     const real *x_i = work.iSuperClusterData.x.data();
 594
 595     for (int i = 0; i < c_nbnxnGpuClusterSize; i++)
 596     {
 597         int i0 = (si*c_nbnxnGpuClusterSize + i)*DIM;
 598         for (int j = 0; j < c_nbnxnGpuClusterSize; j++)
 599         {
 600             int  j0 = (csj*c_nbnxnGpuClusterSize + j)*stride;
 601
 602             real d2 = gmx::square(x_i[i0  ] - x_j[j0  ]) + gmx::square(x_i[i0+1] - x_j[j0+1]) + gmx::square(x_i[i0+2] - x_j[j0+2]);
 603
 604             if (d2 < rlist2)
 605             {
 606                 return TRUE;
 607             }
 608         }
 609     }
 610
 611     return FALSE;
 612
 613 #else /* !GMX_SIMD4_HAVE_REAL */
 614
 615     /* 4-wide SIMD version.
 616      * The coordinates x_i are stored as xxxxyyyy..., x_j is stored xyzxyz...
 617      * Using 8-wide AVX(2) is not faster on Intel Sandy Bridge and Haswell.
 618      */
 619     static_assert(c_nbnxnGpuClusterSize == 8 || c_nbnxnGpuClusterSize == 4,
 620                   "A cluster is hard-coded to 4/8 atoms.");
 621
 622     Simd4Real   rc2_S      = Simd4Real(rlist2);
 623
 624     const real *x_i        = work.iSuperClusterData.xSimd.data();
 625
 626     int         dim_stride = c_nbnxnGpuClusterSize*DIM;
 627     Simd4Real   ix_S0      = load4(x_i + si*dim_stride + 0*GMX_SIMD4_WIDTH);
 628     Simd4Real   iy_S0      = load4(x_i + si*dim_stride + 1*GMX_SIMD4_WIDTH);
 629     Simd4Real   iz_S0      = load4(x_i + si*dim_stride + 2*GMX_SIMD4_WIDTH);
 630
 631     Simd4Real   ix_S1, iy_S1, iz_S1;
 632     if (c_nbnxnGpuClusterSize == 8)
 633     {
 634         ix_S1      = load4(x_i + si*dim_stride + 3*GMX_SIMD4_WIDTH);
 635         iy_S1      = load4(x_i + si*dim_stride + 4*GMX_SIMD4_WIDTH);
 636         iz_S1      = load4(x_i + si*dim_stride + 5*GMX_SIMD4_WIDTH);
 637     }
 638     /* We loop from the outer to the inner particles to maximize
 639      * the chance that we find a pair in range quickly and return.
 640      */
 641     int j0 = csj*c_nbnxnGpuClusterSize;
 642     int j1 = j0 + c_nbnxnGpuClusterSize - 1;
 643     while (j0 < j1)
 644     {
 645         Simd4Real jx0_S, jy0_S, jz0_S;
 646         Simd4Real jx1_S, jy1_S, jz1_S;
 647
 648         Simd4Real dx_S0, dy_S0, dz_S0;
 649         Simd4Real dx_S1, dy_S1, dz_S1;
 650         Simd4Real dx_S2, dy_S2, dz_S2;
 651         Simd4Real dx_S3, dy_S3, dz_S3;
 652
 653         Simd4Real rsq_S0;
 654         Simd4Real rsq_S1;
 655         Simd4Real rsq_S2;
 656         Simd4Real rsq_S3;
 657
 658         Simd4Bool wco_S0;
 659         Simd4Bool wco_S1;
 660         Simd4Bool wco_S2;
 661         Simd4Bool wco_S3;
 662         Simd4Bool wco_any_S01, wco_any_S23, wco_any_S;
 663
 664         jx0_S = Simd4Real(x_j[j0*stride+0]);
 665         jy0_S = Simd4Real(x_j[j0*stride+1]);
 666         jz0_S = Simd4Real(x_j[j0*stride+2]);
 667
 668         jx1_S = Simd4Real(x_j[j1*stride+0]);
 669         jy1_S = Simd4Real(x_j[j1*stride+1]);
 670         jz1_S = Simd4Real(x_j[j1*stride+2]);
 671
 672         /* Calculate distance */
 673         dx_S0            = ix_S0 - jx0_S;
 674         dy_S0            = iy_S0 - jy0_S;
 675         dz_S0            = iz_S0 - jz0_S;
 676         dx_S2            = ix_S0 - jx1_S;
 677         dy_S2            = iy_S0 - jy1_S;
 678         dz_S2            = iz_S0 - jz1_S;
 679         if (c_nbnxnGpuClusterSize == 8)
 680         {
 681             dx_S1            = ix_S1 - jx0_S;
 682             dy_S1            = iy_S1 - jy0_S;
 683             dz_S1            = iz_S1 - jz0_S;
 684             dx_S3            = ix_S1 - jx1_S;
 685             dy_S3            = iy_S1 - jy1_S;
 686             dz_S3            = iz_S1 - jz1_S;
 687         }
 688
 689         /* rsq = dx*dx+dy*dy+dz*dz */
 690         rsq_S0           = norm2(dx_S0, dy_S0, dz_S0);
 691         rsq_S2           = norm2(dx_S2, dy_S2, dz_S2);
 692         if (c_nbnxnGpuClusterSize == 8)
 693         {
 694             rsq_S1           = norm2(dx_S1, dy_S1, dz_S1);
 695             rsq_S3           = norm2(dx_S3, dy_S3, dz_S3);
 696         }
 697
 698         wco_S0           = (rsq_S0 < rc2_S);
 699         wco_S2           = (rsq_S2 < rc2_S);
 700         if (c_nbnxnGpuClusterSize == 8)
 701         {
 702             wco_S1           = (rsq_S1 < rc2_S);
 703             wco_S3           = (rsq_S3 < rc2_S);
 704         }
 705         if (c_nbnxnGpuClusterSize == 8)
 706         {
 707             wco_any_S01      = wco_S0 || wco_S1;
 708             wco_any_S23      = wco_S2 || wco_S3;
 709             wco_any_S        = wco_any_S01 || wco_any_S23;
 710         }
 711         else
 712         {
 713             wco_any_S = wco_S0 || wco_S2;
 714         }
 715
 716         if (anyTrue(wco_any_S))
 717         {
 718             return TRUE;
 719         }
 720
 721         j0++;
 722         j1--;
 723     }
 724
 725     return FALSE;
 726
 727 #endif /* !GMX_SIMD4_HAVE_REAL */
 728 }
 729
 730 /* Returns the j-cluster index for index cjIndex in a cj list */
 731 static inline int nblCj(gmx::ArrayRef<const nbnxn_cj_t> cjList,
 732                         int                             cjIndex)
 733 {
 734     return cjList[cjIndex].cj;
 735 }
 736
 737 /* Returns the j-cluster index for index cjIndex in a cj4 list */
 738 static inline int nblCj(gmx::ArrayRef<const nbnxn_cj4_t> cj4List,
 739                         int                              cjIndex)
 740 {
 741     return cj4List[cjIndex/c_nbnxnGpuJgroupSize].cj[cjIndex & (c_nbnxnGpuJgroupSize - 1)];
 742 }
 743
 744 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
 745 static unsigned int nbl_imask0(const NbnxnPairlistGpu *nbl, int cj_ind)
 746 {
 747     return nbl->cj4[cj_ind/c_nbnxnGpuJgroupSize].imei[0].imask;
 748 }
 749
 750 /* Initializes a single NbnxnPairlistCpu data structure */
 751 static void nbnxn_init_pairlist(NbnxnPairlistCpu *nbl)
 752 {
 753     nbl->na_ci       = c_nbnxnCpuIClusterSize;
 754     nbl->na_cj       = 0;
 755     nbl->ci.clear();
 756     nbl->ciOuter.clear();
 757     nbl->ncjInUse    = 0;
 758     nbl->cj.clear();
 759     nbl->cjOuter.clear();
 760     nbl->nci_tot     = 0;
 761
 762     nbl->work        = new NbnxnPairlistCpuWork();
 763 }
 764
 765 NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
 766     na_ci(c_nbnxnGpuClusterSize),
 767     na_cj(c_nbnxnGpuClusterSize),
 768     na_sc(c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize),
 769     rlist(0),
 770     sci({}, {pinningPolicy}),
 771     cj4({}, {pinningPolicy}),
 772     excl({}, {pinningPolicy}),
 773     nci_tot(0)
 774 {
 775     static_assert(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell,
 776                   "The search code assumes that the a super-cluster matches a search grid cell");
 777
 778     static_assert(sizeof(cj4[0].imei[0].imask)*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell,
 779                   "The i super-cluster cluster interaction mask does not contain a sufficient number of bits");
 780
 781     static_assert(sizeof(excl[0])*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell, "The GPU exclusion mask does not contain a sufficient number of bits");
 782
 783     // We always want a first entry without any exclusions
 784     excl.resize(1);
 785
 786     work = new NbnxnPairlistGpuWork();
 787 }
 788
 789 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list)
 790 {
 791     nbl_list->bSimple   =
 792         (nbl_list->params.pairlistType == PairlistType::Simple4x2 ||
 793          nbl_list->params.pairlistType == PairlistType::Simple4x4 ||
 794          nbl_list->params.pairlistType == PairlistType::Simple4x8);
 795     // Currently GPU lists are always combined
 796     nbl_list->bCombined = !nbl_list->bSimple;
 797
 798     nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
 799
 800     if (!nbl_list->bCombined &&
 801         nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
 802     {
 803         gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
 804                   nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
 805     }
 806
 807     if (nbl_list->bSimple)
 808     {
 809         snew(nbl_list->nbl, nbl_list->nnbl);
 810         if (nbl_list->nnbl > 1)
 811         {
 812             snew(nbl_list->nbl_work, nbl_list->nnbl);
 813         }
 814     }
 815     else
 816     {
 817         snew(nbl_list->nblGpu, nbl_list->nnbl);
 818     }
 819     nbl_list->nbl_fep.resize(nbl_list->nnbl);
 820     /* Execute in order to avoid memory interleaving between threads */
 821 #pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
 822     for (int i = 0; i < nbl_list->nnbl; i++)
 823     {
 824         try
 825         {
 826             /* Allocate the nblist data structure locally on each thread
 827              * to optimize memory access for NUMA architectures.
 828              */
 829             if (nbl_list->bSimple)
 830             {
 831                 nbl_list->nbl[i] = new NbnxnPairlistCpu();
 832
 833                 nbnxn_init_pairlist(nbl_list->nbl[i]);
 834                 if (nbl_list->nnbl > 1)
 835                 {
 836                     nbl_list->nbl_work[i] = new NbnxnPairlistCpu();
 837                     nbnxn_init_pairlist(nbl_list->nbl_work[i]);
 838                 }
 839             }
 840             else
 841             {
 842                 /* Only list 0 is used on the GPU, use normal allocation for i>0 */
 843                 auto pinningPolicy = (i == 0 ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);
 844
 845                 nbl_list->nblGpu[i] = new NbnxnPairlistGpu(pinningPolicy);
 846             }
 847
 848             snew(nbl_list->nbl_fep[i], 1);
 849             nbnxn_init_pairlist_fep(nbl_list->nbl_fep[i]);
 850         }
 851         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 852     }
 853 }
 854
 855 /* Print statistics of a pair list, used for debug output */
 856 static void print_nblist_statistics(FILE *fp, const NbnxnPairlistCpu *nbl,
 857                                     const nbnxn_search *nbs, real rl)
 858 {
 859     const Grid             &grid = nbs->gridSet().grids()[0];
 860     const Grid::Dimensions &dims = grid.dimensions();
 861
 862     fprintf(fp, "nbl nci %zu ncj %d\n",
 863             nbl->ci.size(), nbl->ncjInUse);
 864     const int    numAtomsJCluster = grid.geometry().numAtomsJCluster;
 865     const double numAtomsPerCell  = nbl->ncjInUse/static_cast<double>(grid.numCells())*numAtomsJCluster;
 866     fprintf(fp, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 867             nbl->na_cj, rl, nbl->ncjInUse, nbl->ncjInUse/static_cast<double>(grid.numCells()),
 868             numAtomsPerCell,
 869             numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numCells()*numAtomsJCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
 870
 871     fprintf(fp, "nbl average j cell list length %.1f\n",
 872             0.25*nbl->ncjInUse/std::max(static_cast<double>(nbl->ci.size()), 1.0));
 873
 874     int cs[SHIFTS] = { 0 };
 875     int npexcl     = 0;
 876     for (const nbnxn_ci_t &ciEntry : nbl->ci)
 877     {
 878         cs[ciEntry.shift & NBNXN_CI_SHIFT] +=
 879             ciEntry.cj_ind_end - ciEntry.cj_ind_start;
 880
 881         int j = ciEntry.cj_ind_start;
 882         while (j < ciEntry.cj_ind_end &&
 883                nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
 884         {
 885             npexcl++;
 886             j++;
 887         }
 888     }
 889     fprintf(fp, "nbl cell pairs, total: %zu excl: %d %.1f%%\n",
 890             nbl->cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl->cj.size()), 1.0));
 891     for (int s = 0; s < SHIFTS; s++)
 892     {
 893         if (cs[s] > 0)
 894         {
 895             fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
 896         }
 897     }
 898 }
 899
 900 /* Print statistics of a pair lists, used for debug output */
 901 static void print_nblist_statistics(FILE *fp, const NbnxnPairlistGpu *nbl,
 902                                     const nbnxn_search *nbs, real rl)
 903 {
 904     const Grid             &grid = nbs->gridSet().grids()[0];
 905     const Grid::Dimensions &dims = grid.dimensions();
 906
 907     fprintf(fp, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n",
 908             nbl->sci.size(), nbl->cj4.size(), nbl->nci_tot, nbl->excl.size());
 909     const int    numAtomsCluster = grid.geometry().numAtomsICluster;
 910     const double numAtomsPerCell = nbl->nci_tot/static_cast<double>(grid.numClusters())*numAtomsCluster;
 911     fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 912             nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/static_cast<double>(grid.numClusters()),
 913             numAtomsPerCell,
 914             numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numClusters()*numAtomsCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
 915
 916     double sum_nsp  = 0;
 917     double sum_nsp2 = 0;
 918     int    nsp_max  = 0;
 919     int    c[c_gpuNumClusterPerCell + 1] = { 0 };
 920     for (const nbnxn_sci_t &sci : nbl->sci)
 921     {
 922         int nsp = 0;
 923         for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
 924         {
 925             for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
 926             {
 927                 int b = 0;
 928                 for (int si = 0; si < c_gpuNumClusterPerCell; si++)
 929                 {
 930                     if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
 931                     {
 932                         b++;
 933                     }
 934                 }
 935                 nsp += b;
 936                 c[b]++;
 937             }
 938         }
 939         sum_nsp  += nsp;
 940         sum_nsp2 += nsp*nsp;
 941         nsp_max   = std::max(nsp_max, nsp);
 942     }
 943     if (!nbl->sci.empty())
 944     {
 945         sum_nsp  /= nbl->sci.size();
 946         sum_nsp2 /= nbl->sci.size();
 947     }
 948     fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
 949             sum_nsp, std::sqrt(sum_nsp2 - sum_nsp*sum_nsp), nsp_max);
 950
 951     if (!nbl->cj4.empty())
 952     {
 953         for (int b = 0; b <= c_gpuNumClusterPerCell; b++)
 954         {
 955             fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
 956                     b, c[b], 100.0*c[b]/size_t {nbl->cj4.size()*c_nbnxnGpuJgroupSize});
 957         }
 958     }
 959 }
 960
 961 /* Returns a pointer to the exclusion mask for j-cluster-group \p cj4 and warp \p warp
 962  * Generates a new exclusion entry when the j-cluster-group uses
 963  * the default all-interaction mask at call time, so the returned mask
 964  * can be modified when needed.
 965  */
 966 static nbnxn_excl_t *get_exclusion_mask(NbnxnPairlistGpu *nbl,
 967                                         int               cj4,
 968                                         int               warp)
 969 {
 970     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 971     {
 972         /* No exclusions set, make a new list entry */
 973         const size_t oldSize = nbl->excl.size();
 974         GMX_ASSERT(oldSize >= 1, "We should always have entry [0]");
 975         /* Add entry with default values: no exclusions */
 976         nbl->excl.resize(oldSize + 1);
 977         nbl->cj4[cj4].imei[warp].excl_ind = oldSize;
 978     }
 979
 980     return &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 981 }
 982
 983 static void set_self_and_newton_excls_supersub(NbnxnPairlistGpu *nbl,
 984                                                int cj4_ind, int sj_offset,
 985                                                int i_cluster_in_cell)
 986 {
 987     nbnxn_excl_t *excl[c_nbnxnGpuClusterpairSplit];
 988
 989     /* Here we only set the set self and double pair exclusions */
 990
 991     /* Reserve extra elements, so the resize() in get_exclusion_mask()
 992      * will not invalidate excl entries in the loop below
 993      */
 994     nbl->excl.reserve(nbl->excl.size() + c_nbnxnGpuClusterpairSplit);
 995     for (int w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
 996     {
 997         excl[w] = get_exclusion_mask(nbl, cj4_ind, w);
 998     }
 999
1000     /* Only minor < major bits set */
1001     for (int ej = 0; ej < nbl->na_ci; ej++)
1002     {
1003         int w = (ej>>2);
1004         for (int ei = ej; ei < nbl->na_ci; ei++)
1005         {
1006             excl[w]->pair[(ej & (c_nbnxnGpuJgroupSize-1))*nbl->na_ci + ei] &=
1007                 ~(1U << (sj_offset*c_gpuNumClusterPerCell + i_cluster_in_cell));
1008         }
1009     }
1010 }
1011
1012 /* Returns a diagonal or off-diagonal interaction mask for plain C lists */
1013 static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
1014 {
1015     return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
1016 }
1017
1018 /* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
1019 gmx_unused static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
1020 {
1021     return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
1022             (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
1023              NBNXN_INTERACTION_MASK_ALL));
1024 }
1025
1026 /* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
1027 gmx_unused static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
1028 {
1029     return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
1030 }
1031
1032 /* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
1033 gmx_unused static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
1034 {
1035     return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
1036             (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
1037              NBNXN_INTERACTION_MASK_ALL));
1038 }
1039
1040 #if GMX_SIMD
1041 #if GMX_SIMD_REAL_WIDTH == 2
1042 #define get_imask_simd_4xn  get_imask_simd_j2
1043 #endif
1044 #if GMX_SIMD_REAL_WIDTH == 4
1045 #define get_imask_simd_4xn  get_imask_simd_j4
1046 #endif
1047 #if GMX_SIMD_REAL_WIDTH == 8
1048 #define get_imask_simd_4xn  get_imask_simd_j8
1049 #define get_imask_simd_2xnn get_imask_simd_j4
1050 #endif
1051 #if GMX_SIMD_REAL_WIDTH == 16
1052 #define get_imask_simd_2xnn get_imask_simd_j8
1053 #endif
1054 #endif
1055
1056 /* Plain C code for checking and adding cluster-pairs to the list.
1057  *
1058  * \param[in]     gridj               The j-grid
1059  * \param[in,out] nbl                 The pair-list to store the cluster pairs in
1060  * \param[in]     icluster            The index of the i-cluster
1061  * \param[in]     jclusterFirst       The first cluster in the j-range
1062  * \param[in]     jclusterLast        The last cluster in the j-range
1063  * \param[in]     excludeSubDiagonal  Exclude atom pairs with i-index > j-index
1064  * \param[in]     x_j                 Coordinates for the j-atom, in xyz format
1065  * \param[in]     rlist2              The squared list cut-off
1066  * \param[in]     rbb2                The squared cut-off for putting cluster-pairs in the list based on bounding box distance only
1067  * \param[in,out] numDistanceChecks   The number of distance checks performed
1068  */
1069 static void
1070 makeClusterListSimple(const Grid               &jGrid,
1071                       NbnxnPairlistCpu *        nbl,
1072                       int                       icluster,
1073                       int                       jclusterFirst,
1074                       int                       jclusterLast,
1075                       bool                      excludeSubDiagonal,
1076                       const real * gmx_restrict x_j,
1077                       real                      rlist2,
1078                       float                     rbb2,
1079                       int * gmx_restrict        numDistanceChecks)
1080 {
1081     const BoundingBox * gmx_restrict bb_ci = nbl->work->iClusterData.bb.data();
1082     const real * gmx_restrict        x_ci  = nbl->work->iClusterData.x.data();
1083
1084     gmx_bool                         InRange;
1085
1086     InRange = FALSE;
1087     while (!InRange && jclusterFirst <= jclusterLast)
1088     {
1089         real d2  = clusterBoundingBoxDistance2(bb_ci[0], jGrid.jBoundingBoxes()[jclusterFirst]);
1090         *numDistanceChecks += 2;
1091
1092         /* Check if the distance is within the distance where
1093          * we use only the bounding box distance rbb,
1094          * or within the cut-off and there is at least one atom pair
1095          * within the cut-off.
1096          */
1097         if (d2 < rbb2)
1098         {
1099             InRange = TRUE;
1100         }
1101         else if (d2 < rlist2)
1102         {
1103             int cjf_gl = jGrid.cellOffset() + jclusterFirst;
1104             for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
1105             {
1106                 for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
1107                 {
1108                     InRange = InRange ||
1109                         (gmx::square(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+XX]) +
1110                          gmx::square(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+YY]) +
1111                          gmx::square(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+ZZ]) < rlist2);
1112                 }
1113             }
1114             *numDistanceChecks += c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize;
1115         }
1116         if (!InRange)
1117         {
1118             jclusterFirst++;
1119         }
1120     }
1121     if (!InRange)
1122     {
1123         return;
1124     }
1125
1126     InRange = FALSE;
1127     while (!InRange && jclusterLast > jclusterFirst)
1128     {
1129         real d2  = clusterBoundingBoxDistance2(bb_ci[0], jGrid.jBoundingBoxes()[jclusterLast]);
1130         *numDistanceChecks += 2;
1131
1132         /* Check if the distance is within the distance where
1133          * we use only the bounding box distance rbb,
1134          * or within the cut-off and there is at least one atom pair
1135          * within the cut-off.
1136          */
1137         if (d2 < rbb2)
1138         {
1139             InRange = TRUE;
1140         }
1141         else if (d2 < rlist2)
1142         {
1143             int cjl_gl = jGrid.cellOffset() + jclusterLast;
1144             for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
1145             {
1146                 for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
1147                 {
1148                     InRange = InRange ||
1149                         (gmx::square(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+XX]) +
1150                          gmx::square(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+YY]) +
1151                          gmx::square(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+ZZ]) < rlist2);
1152                 }
1153             }
1154             *numDistanceChecks += c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize;
1155         }
1156         if (!InRange)
1157         {
1158             jclusterLast--;
1159         }
1160     }
1161
1162     if (jclusterFirst <= jclusterLast)
1163     {
1164         for (int jcluster = jclusterFirst; jcluster <= jclusterLast; jcluster++)
1165         {
1166             /* Store cj and the interaction mask */
1167             nbnxn_cj_t cjEntry;
1168             cjEntry.cj   = jGrid.cellOffset() + jcluster;
1169             cjEntry.excl = get_imask(excludeSubDiagonal, icluster, jcluster);
1170             nbl->cj.push_back(cjEntry);
1171         }
1172         /* Increase the closing index in the i list */
1173         nbl->ci.back().cj_ind_end = nbl->cj.size();
1174     }
1175 }
1176
1177 #ifdef GMX_NBNXN_SIMD_4XN
1178 #include "gromacs/nbnxm/pairlist_simd_4xm.h"
1179 #endif
1180 #ifdef GMX_NBNXN_SIMD_2XNN
1181 #include "gromacs/nbnxm/pairlist_simd_2xmm.h"
1182 #endif
1183
1184 /* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
1185  * Checks bounding box distances and possibly atom pair distances.
1186  */
1187 static void make_cluster_list_supersub(const Grid         &iGrid,
1188                                        const Grid         &jGrid,
1189                                        NbnxnPairlistGpu   *nbl,
1190                                        const int           sci,
1191                                        const int           scj,
1192                                        const bool          excludeSubDiagonal,
1193                                        const int           stride,
1194                                        const real         *x,
1195                                        const real          rlist2,
1196                                        const float         rbb2,
1197                                        int                *numDistanceChecks)
1198 {
1199     NbnxnPairlistGpuWork &work   = *nbl->work;
1200
1201 #if NBNXN_BBXXXX
1202     const float          *pbb_ci = work.iSuperClusterData.bbPacked.data();
1203 #else
1204     const BoundingBox    *bb_ci  = work.iSuperClusterData.bb.data();
1205 #endif
1206
1207     assert(c_nbnxnGpuClusterSize == iGrid.geometry().numAtomsICluster);
1208     assert(c_nbnxnGpuClusterSize == jGrid.geometry().numAtomsICluster);
1209
1210     /* We generate the pairlist mainly based on bounding-box distances
1211      * and do atom pair distance based pruning on the GPU.
1212      * Only if a j-group contains a single cluster-pair, we try to prune
1213      * that pair based on atom distances on the CPU to avoid empty j-groups.
1214      */
1215 #define PRUNE_LIST_CPU_ONE 1
1216 #define PRUNE_LIST_CPU_ALL 0
1217
1218 #if PRUNE_LIST_CPU_ONE
1219     int  ci_last = -1;
1220 #endif
1221
1222     float *d2l = work.distanceBuffer.data();
1223
1224     for (int subc = 0; subc < jGrid.numClustersPerCell()[scj]; subc++)
1225     {
1226         const int    cj4_ind   = work.cj_ind/c_nbnxnGpuJgroupSize;
1227         const int    cj_offset = work.cj_ind - cj4_ind*c_nbnxnGpuJgroupSize;
1228         const int    cj        = scj*c_gpuNumClusterPerCell + subc;
1229
1230         const int    cj_gl     = jGrid.cellOffset()*c_gpuNumClusterPerCell + cj;
1231
1232         int          ci1;
1233         if (excludeSubDiagonal && sci == scj)
1234         {
1235             ci1 = subc + 1;
1236         }
1237         else
1238         {
1239             ci1 = iGrid.numClustersPerCell()[sci];
1240         }
1241
1242 #if NBNXN_BBXXXX
1243         /* Determine all ci1 bb distances in one call with SIMD4 */
1244         const int offset = packedBoundingBoxesIndex(cj) + (cj & (c_packedBoundingBoxesDimSize - 1));
1245         clusterBoundingBoxDistance2_xxxx_simd4(jGrid.packedBoundingBoxes().data() + offset,
1246                                                ci1, pbb_ci, d2l);
1247         *numDistanceChecks += c_nbnxnGpuClusterSize*2;
1248 #endif
1249
1250         int          npair = 0;
1251         unsigned int imask = 0;
1252         /* We use a fixed upper-bound instead of ci1 to help optimization */
1253         for (int ci = 0; ci < c_gpuNumClusterPerCell; ci++)
1254         {
1255             if (ci == ci1)
1256             {
1257                 break;
1258             }
1259
1260 #if !NBNXN_BBXXXX
1261             /* Determine the bb distance between ci and cj */
1262             d2l[ci]             = clusterBoundingBoxDistance2(bb_ci[ci], jGrid.jBoundingBoxes()[cj]);
1263             *numDistanceChecks += 2;
1264 #endif
1265             float d2 = d2l[ci];
1266
1267 #if PRUNE_LIST_CPU_ALL
1268             /* Check if the distance is within the distance where
1269              * we use only the bounding box distance rbb,
1270              * or within the cut-off and there is at least one atom pair
1271              * within the cut-off. This check is very costly.
1272              */
1273             *numDistanceChecks += c_nbnxnGpuClusterSize*c_nbnxnGpuClusterSize;
1274             if (d2 < rbb2 ||
1275                 (d2 < rlist2 &&
1276                  clusterpair_in_range(work, ci, cj_gl, stride, x, rlist2)))
1277 #else
1278             /* Check if the distance between the two bounding boxes
1279              * in within the pair-list cut-off.
1280              */
1281             if (d2 < rlist2)
1282 #endif
1283             {
1284                 /* Flag this i-subcell to be taken into account */
1285                 imask |= (1U << (cj_offset*c_gpuNumClusterPerCell + ci));
1286
1287 #if PRUNE_LIST_CPU_ONE
1288                 ci_last = ci;
1289 #endif
1290
1291                 npair++;
1292             }
1293         }
1294
1295 #if PRUNE_LIST_CPU_ONE
1296         /* If we only found 1 pair, check if any atoms are actually
1297          * within the cut-off, so we could get rid of it.
1298          */
1299         if (npair == 1 && d2l[ci_last] >= rbb2 &&
1300             !clusterpair_in_range(work, ci_last, cj_gl, stride, x, rlist2))
1301         {
1302             imask &= ~(1U << (cj_offset*c_gpuNumClusterPerCell + ci_last));
1303             npair--;
1304         }
1305 #endif
1306
1307         if (npair > 0)
1308         {
1309             /* We have at least one cluster pair: add a j-entry */
1310             if (static_cast<size_t>(cj4_ind) == nbl->cj4.size())
1311             {
1312                 nbl->cj4.resize(nbl->cj4.size() + 1);
1313             }
1314             nbnxn_cj4_t *cj4   = &nbl->cj4[cj4_ind];
1315
1316             cj4->cj[cj_offset] = cj_gl;
1317
1318             /* Set the exclusions for the ci==sj entry.
1319              * Here we don't bother to check if this entry is actually flagged,
1320              * as it will nearly always be in the list.
1321              */
1322             if (excludeSubDiagonal && sci == scj)
1323             {
1324                 set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, subc);
1325             }
1326
1327             /* Copy the cluster interaction mask to the list */
1328             for (int w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
1329             {
1330                 cj4->imei[w].imask |= imask;
1331             }
1332
1333             nbl->work->cj_ind++;
1334
1335             /* Keep the count */
1336             nbl->nci_tot += npair;
1337
1338             /* Increase the closing index in i super-cell list */
1339             nbl->sci.back().cj4_ind_end =
1340                 (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
1341         }
1342     }
1343 }
1344
1345 /* Returns how many contiguous j-clusters we have starting in the i-list */
1346 template <typename CjListType>
1347 static int numContiguousJClusters(const int                       cjIndexStart,
1348                                   const int                       cjIndexEnd,
1349                                   gmx::ArrayRef<const CjListType> cjList)
1350 {
1351     const int firstJCluster = nblCj(cjList, cjIndexStart);
1352
1353     int       numContiguous = 0;
1354
1355     while (cjIndexStart + numContiguous < cjIndexEnd &&
1356            nblCj(cjList, cjIndexStart + numContiguous) == firstJCluster + numContiguous)
1357     {
1358         numContiguous++;
1359     }
1360
1361     return numContiguous;
1362 }
1363
1364 /*! \internal
1365  * \brief Helper struct for efficient searching for excluded atoms in a j-list
1366  */
1367 struct JListRanges
1368 {
1369     /*! \brief Constructs a j-list range from \p cjList with the given index range */
1370     template <typename CjListType>
1371     JListRanges(int                             cjIndexStart,
1372                 int                             cjIndexEnd,
1373                 gmx::ArrayRef<const CjListType> cjList);
1374
1375     int cjIndexStart; //!< The start index in the j-list
1376     int cjIndexEnd;   //!< The end index in the j-list
1377     int cjFirst;      //!< The j-cluster with index cjIndexStart
1378     int cjLast;       //!< The j-cluster with index cjIndexEnd-1
1379     int numDirect;    //!< Up to cjIndexStart+numDirect the j-clusters are cjFirst + the index offset
1380 };
1381
1382 #ifndef DOXYGEN
1383 template <typename CjListType>
1384 JListRanges::JListRanges(int                             cjIndexStart,
1385                          int                             cjIndexEnd,
1386                          gmx::ArrayRef<const CjListType> cjList) :
1387     cjIndexStart(cjIndexStart),
1388     cjIndexEnd(cjIndexEnd)
1389 {
1390     GMX_ASSERT(cjIndexEnd > cjIndexStart, "JListRanges should only be called with non-empty lists");
1391
1392     cjFirst   = nblCj(cjList, cjIndexStart);
1393     cjLast    = nblCj(cjList, cjIndexEnd - 1);
1394
1395     /* Determine how many contiguous j-cells we have starting
1396      * from the first i-cell. This number can be used to directly
1397      * calculate j-cell indices for excluded atoms.
1398      */
1399     numDirect = numContiguousJClusters(cjIndexStart, cjIndexEnd, cjList);
1400 }
1401 #endif // !DOXYGEN
1402
1403 /* Return the index of \p jCluster in the given range or -1 when not present
1404  *
1405  * Note: This code is executed very often and therefore performance is
1406  *       important. It should be inlined and fully optimized.
1407  */
1408 template <typename CjListType>
1409 static inline int
1410 findJClusterInJList(int                              jCluster,
1411                     const JListRanges               &ranges,
1412                     gmx::ArrayRef<const CjListType>  cjList)
1413 {
1414     int index;
1415
1416     if (jCluster < ranges.cjFirst + ranges.numDirect)
1417     {
1418         /* We can calculate the index directly using the offset */
1419         index = ranges.cjIndexStart + jCluster - ranges.cjFirst;
1420     }
1421     else
1422     {
1423         /* Search for jCluster using bisection */
1424         index           = -1;
1425         int rangeStart  = ranges.cjIndexStart + ranges.numDirect;
1426         int rangeEnd    = ranges.cjIndexEnd;
1427         int rangeMiddle;
1428         while (index == -1 && rangeStart < rangeEnd)
1429         {
1430             rangeMiddle = (rangeStart + rangeEnd) >> 1;
1431
1432             const int clusterMiddle = nblCj(cjList, rangeMiddle);
1433
1434             if (jCluster == clusterMiddle)
1435             {
1436                 index      = rangeMiddle;
1437             }
1438             else if (jCluster < clusterMiddle)
1439             {
1440                 rangeEnd   = rangeMiddle;
1441             }
1442             else
1443             {
1444                 rangeStart = rangeMiddle + 1;
1445             }
1446         }
1447     }
1448
1449     return index;
1450 }
1451
1452 // TODO: Get rid of the two functions below by renaming sci to ci (or something better)
1453
1454 /* Return the i-entry in the list we are currently operating on */
1455 static nbnxn_ci_t *getOpenIEntry(NbnxnPairlistCpu *nbl)
1456 {
1457     return &nbl->ci.back();
1458 }
1459
1460 /* Return the i-entry in the list we are currently operating on */
1461 static nbnxn_sci_t *getOpenIEntry(NbnxnPairlistGpu *nbl)
1462 {
1463     return &nbl->sci.back();
1464 }
1465
1466 /* Set all atom-pair exclusions for a simple type list i-entry
1467  *
1468  * Set all atom-pair exclusions from the topology stored in exclusions
1469  * as masks in the pair-list for simple list entry iEntry.
1470  */
1471 static void
1472 setExclusionsForIEntry(const Nbnxm::GridSet &gridSet,
1473                        NbnxnPairlistCpu     *nbl,
1474                        gmx_bool              diagRemoved,
1475                        int                   na_cj_2log,
1476                        const nbnxn_ci_t     &iEntry,
1477                        const t_blocka       &exclusions)
1478 {
1479     if (iEntry.cj_ind_end == iEntry.cj_ind_start)
1480     {
1481         /* Empty list: no exclusions */
1482         return;
1483     }
1484
1485     const JListRanges        ranges(iEntry.cj_ind_start, iEntry.cj_ind_end, gmx::makeConstArrayRef(nbl->cj));
1486
1487     const int                iCluster = iEntry.ci;
1488
1489     gmx::ArrayRef<const int> cell        = gridSet.cells();
1490     gmx::ArrayRef<const int> atomIndices = gridSet.atomIndices();
1491
1492     /* Loop over the atoms in the i-cluster */
1493     for (int i = 0; i < nbl->na_ci; i++)
1494     {
1495         const int iIndex = iCluster*nbl->na_ci + i;
1496         const int iAtom  = atomIndices[iIndex];
1497         if (iAtom >= 0)
1498         {
1499             /* Loop over the topology-based exclusions for this i-atom */
1500             for (int exclIndex = exclusions.index[iAtom]; exclIndex < exclusions.index[iAtom + 1]; exclIndex++)
1501             {
1502                 const int jAtom = exclusions.a[exclIndex];
1503
1504                 if (jAtom == iAtom)
1505                 {
1506                     /* The self exclusion are already set, save some time */
1507                     continue;
1508                 }
1509
1510                 /* Get the index of the j-atom in the nbnxn atom data */
1511                 const int jIndex = cell[jAtom];
1512
1513                 /* Without shifts we only calculate interactions j>i
1514                  * for one-way pair-lists.
1515                  */
1516                 if (diagRemoved && jIndex <= iIndex)
1517                 {
1518                     continue;
1519                 }
1520
1521                 const int jCluster = (jIndex >> na_cj_2log);
1522
1523                 /* Could the cluster se be in our list? */
1524                 if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
1525                 {
1526                     const int index =
1527                         findJClusterInJList(jCluster, ranges,
1528                                             gmx::makeConstArrayRef(nbl->cj));
1529
1530                     if (index >= 0)
1531                     {
1532                         /* We found an exclusion, clear the corresponding
1533                          * interaction bit.
1534                          */
1535                         const int innerJ     = jIndex - (jCluster << na_cj_2log);
1536
1537                         nbl->cj[index].excl &= ~(1U << ((i << na_cj_2log) + innerJ));
1538                     }
1539                 }
1540             }
1541         }
1542     }
1543 }
1544
1545 /* Add a new i-entry to the FEP list and copy the i-properties */
1546 static inline void fep_list_new_nri_copy(t_nblist *nlist)
1547 {
1548     /* Add a new i-entry */
1549     nlist->nri++;
1550
1551     assert(nlist->nri < nlist->maxnri);
1552
1553     /* Duplicate the last i-entry, except for jindex, which continues */
1554     nlist->iinr[nlist->nri]   = nlist->iinr[nlist->nri-1];
1555     nlist->shift[nlist->nri]  = nlist->shift[nlist->nri-1];
1556     nlist->gid[nlist->nri]    = nlist->gid[nlist->nri-1];
1557     nlist->jindex[nlist->nri] = nlist->nrj;
1558 }
1559
1560 /* For load balancing of the free-energy lists over threads, we set
1561  * the maximum nrj size of an i-entry to 40. This leads to good
1562  * load balancing in the worst case scenario of a single perturbed
1563  * particle on 16 threads, while not introducing significant overhead.
1564  * Note that half of the perturbed pairs will anyhow end up in very small lists,
1565  * since non perturbed i-particles will see few perturbed j-particles).
1566  */
1567 const int max_nrj_fep = 40;
1568
1569 /* Exclude the perturbed pairs from the Verlet list. This is only done to avoid
1570  * singularities for overlapping particles (0/0), since the charges and
1571  * LJ parameters have been zeroed in the nbnxn data structure.
1572  * Simultaneously make a group pair list for the perturbed pairs.
1573  */
1574 static void make_fep_list(gmx::ArrayRef<const int>  atomIndices,
1575                           const nbnxn_atomdata_t   *nbat,
1576                           NbnxnPairlistCpu         *nbl,
1577                           gmx_bool                  bDiagRemoved,
1578                           nbnxn_ci_t               *nbl_ci,
1579                           real gmx_unused           shx,
1580                           real gmx_unused           shy,
1581                           real gmx_unused           shz,
1582                           real gmx_unused           rlist_fep2,
1583                           const Grid               &iGrid,
1584                           const Grid               &jGrid,
1585                           t_nblist                 *nlist)
1586 {
1587     int      ci, cj_ind_start, cj_ind_end, cja, cjr;
1588     int      nri_max;
1589     int      gid_i = 0, gid_j, gid;
1590     int      egp_shift, egp_mask;
1591     int      gid_cj = 0;
1592     int      ind_i, ind_j, ai, aj;
1593     int      nri;
1594     gmx_bool bFEP_i, bFEP_i_all;
1595
1596     if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
1597     {
1598         /* Empty list */
1599         return;
1600     }
1601
1602     ci = nbl_ci->ci;
1603
1604     cj_ind_start = nbl_ci->cj_ind_start;
1605     cj_ind_end   = nbl_ci->cj_ind_end;
1606
1607     /* In worst case we have alternating energy groups
1608      * and create #atom-pair lists, which means we need the size
1609      * of a cluster pair (na_ci*na_cj) times the number of cj's.
1610      */
1611     nri_max = nbl->na_ci*nbl->na_cj*(cj_ind_end - cj_ind_start);
1612     if (nlist->nri + nri_max > nlist->maxnri)
1613     {
1614         nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
1615         reallocate_nblist(nlist);
1616     }
1617
1618     const int numAtomsJCluster = jGrid.geometry().numAtomsJCluster;
1619
1620     const nbnxn_atomdata_t::Params &nbatParams = nbat->params();
1621
1622     const int ngid = nbatParams.nenergrp;
1623
1624     /* TODO: Consider adding a check in grompp and changing this to an assert */
1625     const int numBitsInEnergyGroupIdsForAtomsInJCluster = sizeof(gid_cj)*8;
1626     if (ngid*numAtomsJCluster > numBitsInEnergyGroupIdsForAtomsInJCluster)
1627     {
1628         gmx_fatal(FARGS, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %zu energy groups",
1629                   iGrid.geometry().numAtomsICluster, numAtomsJCluster,
1630                   (sizeof(gid_cj)*8)/numAtomsJCluster);
1631     }
1632
1633     egp_shift = nbatParams.neg_2log;
1634     egp_mask  = (1 << egp_shift) - 1;
1635
1636     /* Loop over the atoms in the i sub-cell */
1637     bFEP_i_all = TRUE;
1638     for (int i = 0; i < nbl->na_ci; i++)
1639     {
1640         ind_i = ci*nbl->na_ci + i;
1641         ai    = atomIndices[ind_i];
1642         if (ai >= 0)
1643         {
1644             nri                  = nlist->nri;
1645             nlist->jindex[nri+1] = nlist->jindex[nri];
1646             nlist->iinr[nri]     = ai;
1647             /* The actual energy group pair index is set later */
1648             nlist->gid[nri]      = 0;
1649             nlist->shift[nri]    = nbl_ci->shift & NBNXN_CI_SHIFT;
1650
1651             bFEP_i = iGrid.atomIsPerturbed(ci - iGrid.cellOffset(), i);
1652
1653             bFEP_i_all = bFEP_i_all && bFEP_i;
1654
1655             if (nlist->nrj + (cj_ind_end - cj_ind_start)*nbl->na_cj > nlist->maxnrj)
1656             {
1657                 nlist->maxnrj = over_alloc_small(nlist->nrj + (cj_ind_end - cj_ind_start)*nbl->na_cj);
1658                 srenew(nlist->jjnr,     nlist->maxnrj);
1659                 srenew(nlist->excl_fep, nlist->maxnrj);
1660             }
1661
1662             if (ngid > 1)
1663             {
1664                 gid_i = (nbatParams.energrp[ci] >> (egp_shift*i)) & egp_mask;
1665             }
1666
1667             for (int cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++)
1668             {
1669                 unsigned int fep_cj;
1670
1671                 cja = nbl->cj[cj_ind].cj;
1672
1673                 if (numAtomsJCluster == jGrid.geometry().numAtomsICluster)
1674                 {
1675                     cjr    = cja - jGrid.cellOffset();
1676                     fep_cj = jGrid.fepBits(cjr);
1677                     if (ngid > 1)
1678                     {
1679                         gid_cj = nbatParams.energrp[cja];
1680                     }
1681                 }
1682                 else if (2*numAtomsJCluster == jGrid.geometry().numAtomsICluster)
1683                 {
1684                     cjr    = cja - jGrid.cellOffset()*2;
1685                     /* Extract half of the ci fep/energrp mask */
1686                     fep_cj = (jGrid.fepBits(cjr >> 1) >> ((cjr & 1)*numAtomsJCluster)) & ((1 << numAtomsJCluster) - 1);
1687                     if (ngid > 1)
1688                     {
1689                         gid_cj = nbatParams.energrp[cja >> 1] >> ((cja & 1)*numAtomsJCluster*egp_shift) & ((1 << (numAtomsJCluster*egp_shift)) - 1);
1690                     }
1691                 }
1692                 else
1693                 {
1694                     cjr    = cja - (jGrid.cellOffset() >> 1);
1695                     /* Combine two ci fep masks/energrp */
1696                     fep_cj = jGrid.fepBits(cjr*2) + (jGrid.fepBits(cjr*2 + 1) << jGrid.geometry().numAtomsICluster);
1697                     if (ngid > 1)
1698                     {
1699                         gid_cj = nbatParams.energrp[cja*2] + (nbatParams.energrp[cja*2+1] << (jGrid.geometry().numAtomsICluster*egp_shift));
1700                     }
1701                 }
1702
1703                 if (bFEP_i || fep_cj != 0)
1704                 {
1705                     for (int j = 0; j < nbl->na_cj; j++)
1706                     {
1707                         /* Is this interaction perturbed and not excluded? */
1708                         ind_j = cja*nbl->na_cj + j;
1709                         aj    = atomIndices[ind_j];
1710                         if (aj >= 0 &&
1711                             (bFEP_i || (fep_cj & (1 << j))) &&
1712                             (!bDiagRemoved || ind_j >= ind_i))
1713                         {
1714                             if (ngid > 1)
1715                             {
1716                                 gid_j = (gid_cj >> (j*egp_shift)) & egp_mask;
1717                                 gid   = GID(gid_i, gid_j, ngid);
1718
1719                                 if (nlist->nrj > nlist->jindex[nri] &&
1720                                     nlist->gid[nri] != gid)
1721                                 {
1722                                     /* Energy group pair changed: new list */
1723                                     fep_list_new_nri_copy(nlist);
1724                                     nri = nlist->nri;
1725                                 }
1726                                 nlist->gid[nri] = gid;
1727                             }
1728
1729                             if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
1730                             {
1731                                 fep_list_new_nri_copy(nlist);
1732                                 nri = nlist->nri;
1733                             }
1734
1735                             /* Add it to the FEP list */
1736                             nlist->jjnr[nlist->nrj]     = aj;
1737                             nlist->excl_fep[nlist->nrj] = (nbl->cj[cj_ind].excl >> (i*nbl->na_cj + j)) & 1;
1738                             nlist->nrj++;
1739
1740                             /* Exclude it from the normal list.
1741                              * Note that the charge has been set to zero,
1742                              * but we need to avoid 0/0, as perturbed atoms
1743                              * can be on top of each other.
1744                              */
1745                             nbl->cj[cj_ind].excl &= ~(1U << (i*nbl->na_cj + j));
1746                         }
1747                     }
1748                 }
1749             }
1750
1751             if (nlist->nrj > nlist->jindex[nri])
1752             {
1753                 /* Actually add this new, non-empty, list */
1754                 nlist->nri++;
1755                 nlist->jindex[nlist->nri] = nlist->nrj;
1756             }
1757         }
1758     }
1759
1760     if (bFEP_i_all)
1761     {
1762         /* All interactions are perturbed, we can skip this entry */
1763         nbl_ci->cj_ind_end = cj_ind_start;
1764         nbl->ncjInUse     -= cj_ind_end - cj_ind_start;
1765     }
1766 }
1767
1768 /* Return the index of atom a within a cluster */
1769 static inline int cj_mod_cj4(int cj)
1770 {
1771     return cj & (c_nbnxnGpuJgroupSize - 1);
1772 }
1773
1774 /* Convert a j-cluster to a cj4 group */
1775 static inline int cj_to_cj4(int cj)
1776 {
1777     return cj/c_nbnxnGpuJgroupSize;
1778 }
1779
1780 /* Return the index of an j-atom within a warp */
1781 static inline int a_mod_wj(int a)
1782 {
1783     return a & (c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit - 1);
1784 }
1785
1786 /* As make_fep_list above, but for super/sub lists. */
1787 static void make_fep_list(gmx::ArrayRef<const int>  atomIndices,
1788                           const nbnxn_atomdata_t   *nbat,
1789                           NbnxnPairlistGpu         *nbl,
1790                           gmx_bool                  bDiagRemoved,
1791                           const nbnxn_sci_t        *nbl_sci,
1792                           real                      shx,
1793                           real                      shy,
1794                           real                      shz,
1795                           real                      rlist_fep2,
1796                           const Grid               &iGrid,
1797                           const Grid               &jGrid,
1798                           t_nblist                 *nlist)
1799 {
1800     int                nri_max;
1801     int                c_abs;
1802     int                ind_i, ind_j, ai, aj;
1803     int                nri;
1804     gmx_bool           bFEP_i;
1805     real               xi, yi, zi;
1806     const nbnxn_cj4_t *cj4;
1807
1808     const int          numJClusterGroups = nbl_sci->numJClusterGroups();
1809     if (numJClusterGroups == 0)
1810     {
1811         /* Empty list */
1812         return;
1813     }
1814
1815     const int sci           = nbl_sci->sci;
1816
1817     const int cj4_ind_start = nbl_sci->cj4_ind_start;
1818     const int cj4_ind_end   = nbl_sci->cj4_ind_end;
1819
1820     /* Here we process one super-cell, max #atoms na_sc, versus a list
1821      * cj4 entries, each with max c_nbnxnGpuJgroupSize cj's, each
1822      * of size na_cj atoms.
1823      * On the GPU we don't support energy groups (yet).
1824      * So for each of the na_sc i-atoms, we need max one FEP list
1825      * for each max_nrj_fep j-atoms.
1826      */
1827     nri_max = nbl->na_sc*nbl->na_cj*(1 + (numJClusterGroups*c_nbnxnGpuJgroupSize)/max_nrj_fep);
1828     if (nlist->nri + nri_max > nlist->maxnri)
1829     {
1830         nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
1831         reallocate_nblist(nlist);
1832     }
1833
1834     /* Loop over the atoms in the i super-cluster */
1835     for (int c = 0; c < c_gpuNumClusterPerCell; c++)
1836     {
1837         c_abs = sci*c_gpuNumClusterPerCell + c;
1838
1839         for (int i = 0; i < nbl->na_ci; i++)
1840         {
1841             ind_i = c_abs*nbl->na_ci + i;
1842             ai    = atomIndices[ind_i];
1843             if (ai >= 0)
1844             {
1845                 nri                  = nlist->nri;
1846                 nlist->jindex[nri+1] = nlist->jindex[nri];
1847                 nlist->iinr[nri]     = ai;
1848                 /* With GPUs, energy groups are not supported */
1849                 nlist->gid[nri]      = 0;
1850                 nlist->shift[nri]    = nbl_sci->shift & NBNXN_CI_SHIFT;
1851
1852                 bFEP_i = iGrid.atomIsPerturbed(c_abs - iGrid.cellOffset()*c_gpuNumClusterPerCell, i);
1853
1854                 xi = nbat->x()[ind_i*nbat->xstride+XX] + shx;
1855                 yi = nbat->x()[ind_i*nbat->xstride+YY] + shy;
1856                 zi = nbat->x()[ind_i*nbat->xstride+ZZ] + shz;
1857
1858                 const int nrjMax = nlist->nrj + numJClusterGroups*c_nbnxnGpuJgroupSize*nbl->na_cj;
1859                 if (nrjMax > nlist->maxnrj)
1860                 {
1861                     nlist->maxnrj = over_alloc_small(nrjMax);
1862                     srenew(nlist->jjnr,     nlist->maxnrj);
1863                     srenew(nlist->excl_fep, nlist->maxnrj);
1864                 }
1865
1866                 for (int cj4_ind = cj4_ind_start; cj4_ind < cj4_ind_end; cj4_ind++)
1867                 {
1868                     cj4 = &nbl->cj4[cj4_ind];
1869
1870                     for (int gcj = 0; gcj < c_nbnxnGpuJgroupSize; gcj++)
1871                     {
1872                         if ((cj4->imei[0].imask & (1U << (gcj*c_gpuNumClusterPerCell + c))) == 0)
1873                         {
1874                             /* Skip this ci for this cj */
1875                             continue;
1876                         }
1877
1878                         const int cjr =
1879                             cj4->cj[gcj] - jGrid.cellOffset()*c_gpuNumClusterPerCell;
1880
1881                         if (bFEP_i || jGrid.clusterIsPerturbed(cjr))
1882                         {
1883                             for (int j = 0; j < nbl->na_cj; j++)
1884                             {
1885                                 /* Is this interaction perturbed and not excluded? */
1886                                 ind_j = (jGrid.cellOffset()*c_gpuNumClusterPerCell + cjr)*nbl->na_cj + j;
1887                                 aj    = atomIndices[ind_j];
1888                                 if (aj >= 0 &&
1889                                     (bFEP_i || jGrid.atomIsPerturbed(cjr, j)) &&
1890                                     (!bDiagRemoved || ind_j >= ind_i))
1891                                 {
1892                                     int           excl_pair;
1893                                     unsigned int  excl_bit;
1894                                     real          dx, dy, dz;
1895
1896                                     const int     jHalf = j/(c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit);
1897                                     nbnxn_excl_t *excl  =
1898                                         get_exclusion_mask(nbl, cj4_ind, jHalf);
1899
1900                                     excl_pair = a_mod_wj(j)*nbl->na_ci + i;
1901                                     excl_bit  = (1U << (gcj*c_gpuNumClusterPerCell + c));
1902
1903                                     dx = nbat->x()[ind_j*nbat->xstride+XX] - xi;
1904                                     dy = nbat->x()[ind_j*nbat->xstride+YY] - yi;
1905                                     dz = nbat->x()[ind_j*nbat->xstride+ZZ] - zi;
1906
1907                                     /* The unpruned GPU list has more than 2/3
1908                                      * of the atom pairs beyond rlist. Using
1909                                      * this list will cause a lot of overhead
1910                                      * in the CPU FEP kernels, especially
1911                                      * relative to the fast GPU kernels.
1912                                      * So we prune the FEP list here.
1913                                      */
1914                                     if (dx*dx + dy*dy + dz*dz < rlist_fep2)
1915                                     {
1916                                         if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
1917                                         {
1918                                             fep_list_new_nri_copy(nlist);
1919                                             nri = nlist->nri;
1920                                         }
1921
1922                                         /* Add it to the FEP list */
1923                                         nlist->jjnr[nlist->nrj]     = aj;
1924                                         nlist->excl_fep[nlist->nrj] = (excl->pair[excl_pair] & excl_bit) ? 1 : 0;
1925                                         nlist->nrj++;
1926                                     }
1927
1928                                     /* Exclude it from the normal list.
1929                                      * Note that the charge and LJ parameters have
1930                                      * been set to zero, but we need to avoid 0/0,
1931                                      * as perturbed atoms can be on top of each other.
1932                                      */
1933                                     excl->pair[excl_pair] &= ~excl_bit;
1934                                 }
1935                             }
1936
1937                             /* Note that we could mask out this pair in imask
1938                              * if all i- and/or all j-particles are perturbed.
1939                              * But since the perturbed pairs on the CPU will
1940                              * take an order of magnitude more time, the GPU
1941                              * will finish before the CPU and there is no gain.
1942                              */
1943                         }
1944                     }
1945                 }
1946
1947                 if (nlist->nrj > nlist->jindex[nri])
1948                 {
1949                     /* Actually add this new, non-empty, list */
1950                     nlist->nri++;
1951                     nlist->jindex[nlist->nri] = nlist->nrj;
1952                 }
1953             }
1954         }
1955     }
1956 }
1957
1958 /* Set all atom-pair exclusions for a GPU type list i-entry
1959  *
1960  * Sets all atom-pair exclusions from the topology stored in exclusions
1961  * as masks in the pair-list for i-super-cluster list entry iEntry.
1962  */
1963 static void
1964 setExclusionsForIEntry(const Nbnxm::GridSet &gridSet,
1965                        NbnxnPairlistGpu     *nbl,
1966                        gmx_bool              diagRemoved,
1967                        int gmx_unused        na_cj_2log,
1968                        const nbnxn_sci_t    &iEntry,
1969                        const t_blocka       &exclusions)
1970 {
1971     if (iEntry.numJClusterGroups() == 0)
1972     {
1973         /* Empty list */
1974         return;
1975     }
1976
1977     /* Set the search ranges using start and end j-cluster indices.
1978      * Note that here we can not use cj4_ind_end, since the last cj4
1979      * can be only partially filled, so we use cj_ind.
1980      */
1981     const JListRanges ranges(iEntry.cj4_ind_start*c_nbnxnGpuJgroupSize,
1982                              nbl->work->cj_ind,
1983                              gmx::makeConstArrayRef(nbl->cj4));
1984
1985     GMX_ASSERT(nbl->na_ci == c_nbnxnGpuClusterSize, "na_ci should match the GPU cluster size");
1986     constexpr int            c_clusterSize      = c_nbnxnGpuClusterSize;
1987     constexpr int            c_superClusterSize = c_nbnxnGpuNumClusterPerSupercluster*c_nbnxnGpuClusterSize;
1988
1989     const int                iSuperCluster = iEntry.sci;
1990
1991     gmx::ArrayRef<const int> atomIndices   = gridSet.atomIndices();
1992     gmx::ArrayRef<const int> cell          = gridSet.cells();
1993
1994     /* Loop over the atoms in the i super-cluster */
1995     for (int i = 0; i < c_superClusterSize; i++)
1996     {
1997         const int iIndex = iSuperCluster*c_superClusterSize + i;
1998         const int iAtom  = atomIndices[iIndex];
1999         if (iAtom >= 0)
2000         {
2001             const int iCluster = i/c_clusterSize;
2002
2003             /* Loop over the topology-based exclusions for this i-atom */
2004             for (int exclIndex = exclusions.index[iAtom]; exclIndex < exclusions.index[iAtom + 1]; exclIndex++)
2005             {
2006                 const int jAtom = exclusions.a[exclIndex];
2007
2008                 if (jAtom == iAtom)
2009                 {
2010                     /* The self exclusions are already set, save some time */
2011                     continue;
2012                 }
2013
2014                 /* Get the index of the j-atom in the nbnxn atom data */
2015                 const int jIndex = cell[jAtom];
2016
2017                 /* Without shifts we only calculate interactions j>i
2018                  * for one-way pair-lists.
2019                  */
2020                 /* NOTE: We would like to use iIndex on the right hand side,
2021                  * but that makes this routine 25% slower with gcc6/7.
2022                  * Even using c_superClusterSize makes it slower.
2023                  * Either of these changes triggers peeling of the exclIndex
2024                  * loop, which apparently leads to far less efficient code.
2025                  */
2026                 if (diagRemoved && jIndex <= iSuperCluster*nbl->na_sc + i)
2027                 {
2028                     continue;
2029                 }
2030
2031                 const int jCluster = jIndex/c_clusterSize;
2032
2033                 /* Check whether the cluster is in our list? */
2034                 if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
2035                 {
2036                     const int index =
2037                         findJClusterInJList(jCluster, ranges,
2038                                             gmx::makeConstArrayRef(nbl->cj4));
2039
2040                     if (index >= 0)
2041                     {
2042                         /* We found an exclusion, clear the corresponding
2043                          * interaction bit.
2044                          */
2045                         const unsigned int pairMask = (1U << (cj_mod_cj4(index)*c_gpuNumClusterPerCell + iCluster));
2046                         /* Check if the i-cluster interacts with the j-cluster */
2047                         if (nbl_imask0(nbl, index) & pairMask)
2048                         {
2049                             const int innerI = (i      & (c_clusterSize - 1));
2050                             const int innerJ = (jIndex & (c_clusterSize - 1));
2051
2052                             /* Determine which j-half (CUDA warp) we are in */
2053                             const int     jHalf = innerJ/(c_clusterSize/c_nbnxnGpuClusterpairSplit);
2054
2055                             nbnxn_excl_t *interactionMask =
2056                                 get_exclusion_mask(nbl, cj_to_cj4(index), jHalf);
2057
2058                             interactionMask->pair[a_mod_wj(innerJ)*c_clusterSize + innerI] &= ~pairMask;
2059                         }
2060                     }
2061                 }
2062             }
2063         }
2064     }
2065 }
2066
2067 /* Make a new ci entry at the back of nbl->ci */
2068 static void addNewIEntry(NbnxnPairlistCpu *nbl, int ci, int shift, int flags)
2069 {
2070     nbnxn_ci_t ciEntry;
2071     ciEntry.ci            = ci;
2072     ciEntry.shift         = shift;
2073     /* Store the interaction flags along with the shift */
2074     ciEntry.shift        |= flags;
2075     ciEntry.cj_ind_start  = nbl->cj.size();
2076     ciEntry.cj_ind_end    = nbl->cj.size();
2077     nbl->ci.push_back(ciEntry);
2078 }
2079
2080 /* Make a new sci entry at index nbl->nsci */
2081 static void addNewIEntry(NbnxnPairlistGpu *nbl, int sci, int shift, int gmx_unused flags)
2082 {
2083     nbnxn_sci_t sciEntry;
2084     sciEntry.sci           = sci;
2085     sciEntry.shift         = shift;
2086     sciEntry.cj4_ind_start = nbl->cj4.size();
2087     sciEntry.cj4_ind_end   = nbl->cj4.size();
2088
2089     nbl->sci.push_back(sciEntry);
2090 }
2091
2092 /* Sort the simple j-list cj on exclusions.
2093  * Entries with exclusions will all be sorted to the beginning of the list.
2094  */
2095 static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
2096                          NbnxnPairlistCpuWork *work)
2097 {
2098     work->cj.resize(ncj);
2099
2100     /* Make a list of the j-cells involving exclusions */
2101     int jnew = 0;
2102     for (int j = 0; j < ncj; j++)
2103     {
2104         if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
2105         {
2106             work->cj[jnew++] = cj[j];
2107         }
2108     }
2109     /* Check if there are exclusions at all or not just the first entry */
2110     if (!((jnew == 0) ||
2111           (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
2112     {
2113         for (int j = 0; j < ncj; j++)
2114         {
2115             if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
2116             {
2117                 work->cj[jnew++] = cj[j];
2118             }
2119         }
2120         for (int j = 0; j < ncj; j++)
2121         {
2122             cj[j] = work->cj[j];
2123         }
2124     }
2125 }
2126
2127 /* Close this simple list i entry */
2128 static void closeIEntry(NbnxnPairlistCpu    *nbl,
2129                         int gmx_unused       sp_max_av,
2130                         gmx_bool gmx_unused  progBal,
2131                         float gmx_unused     nsp_tot_est,
2132                         int gmx_unused       thread,
2133                         int gmx_unused       nthread)
2134 {
2135     nbnxn_ci_t &ciEntry = nbl->ci.back();
2136
2137     /* All content of the new ci entry have already been filled correctly,
2138      * we only need to sort and increase counts or remove the entry when empty.
2139      */
2140     const int jlen = ciEntry.cj_ind_end - ciEntry.cj_ind_start;
2141     if (jlen > 0)
2142     {
2143         sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work);
2144
2145         /* The counts below are used for non-bonded pair/flop counts
2146          * and should therefore match the available kernel setups.
2147          */
2148         if (!(ciEntry.shift & NBNXN_CI_DO_COUL(0)))
2149         {
2150             nbl->work->ncj_noq += jlen;
2151         }
2152         else if ((ciEntry.shift & NBNXN_CI_HALF_LJ(0)) ||
2153                  !(ciEntry.shift & NBNXN_CI_DO_LJ(0)))
2154         {
2155             nbl->work->ncj_hlj += jlen;
2156         }
2157     }
2158     else
2159     {
2160         /* Entry is empty: remove it  */
2161         nbl->ci.pop_back();
2162     }
2163 }
2164
2165 /* Split sci entry for load balancing on the GPU.
2166  * Splitting ensures we have enough lists to fully utilize the whole GPU.
2167  * With progBal we generate progressively smaller lists, which improves
2168  * load balancing. As we only know the current count on our own thread,
2169  * we will need to estimate the current total amount of i-entries.
2170  * As the lists get concatenated later, this estimate depends
2171  * both on nthread and our own thread index.
2172  */
2173 static void split_sci_entry(NbnxnPairlistGpu *nbl,
2174                             int nsp_target_av,
2175                             gmx_bool progBal, float nsp_tot_est,
2176                             int thread, int nthread)
2177 {
2178     int nsp_max;
2179
2180     if (progBal)
2181     {
2182         float nsp_est;
2183
2184         /* Estimate the total numbers of ci's of the nblist combined
2185          * over all threads using the target number of ci's.
2186          */
2187         nsp_est = (nsp_tot_est*thread)/nthread + nbl->nci_tot;
2188
2189         /* The first ci blocks should be larger, to avoid overhead.
2190          * The last ci blocks should be smaller, to improve load balancing.
2191          * The factor 3/2 makes the first block 3/2 times the target average
2192          * and ensures that the total number of blocks end up equal to
2193          * that of equally sized blocks of size nsp_target_av.
2194          */
2195         nsp_max = static_cast<int>(nsp_target_av*(nsp_tot_est*1.5/(nsp_est + nsp_tot_est)));
2196     }
2197     else
2198     {
2199         nsp_max = nsp_target_av;
2200     }
2201
2202     const int cj4_start = nbl->sci.back().cj4_ind_start;
2203     const int cj4_end   = nbl->sci.back().cj4_ind_end;
2204     const int j4len     = cj4_end - cj4_start;
2205
2206     if (j4len > 1 && j4len*c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize > nsp_max)
2207     {
2208         /* Modify the last ci entry and process the cj4's again */
2209
2210         int nsp        = 0;
2211         int nsp_sci    = 0;
2212         int nsp_cj4_e  = 0;
2213         int nsp_cj4    = 0;
2214         for (int cj4 = cj4_start; cj4 < cj4_end; cj4++)
2215         {
2216             int nsp_cj4_p = nsp_cj4;
2217             /* Count the number of cluster pairs in this cj4 group */
2218             nsp_cj4   = 0;
2219             for (int p = 0; p < c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize; p++)
2220             {
2221                 nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
2222             }
2223
2224             /* If adding the current cj4 with nsp_cj4 pairs get us further
2225              * away from our target nsp_max, split the list before this cj4.
2226              */
2227             if (nsp > 0 && nsp_max - nsp < nsp + nsp_cj4 - nsp_max)
2228             {
2229                 /* Split the list at cj4 */
2230                 nbl->sci.back().cj4_ind_end = cj4;
2231                 /* Create a new sci entry */
2232                 nbnxn_sci_t sciNew;
2233                 sciNew.sci           = nbl->sci.back().sci;
2234                 sciNew.shift         = nbl->sci.back().shift;
2235                 sciNew.cj4_ind_start = cj4;
2236                 nbl->sci.push_back(sciNew);
2237
2238                 nsp_sci              = nsp;
2239                 nsp_cj4_e            = nsp_cj4_p;
2240                 nsp                  = 0;
2241             }
2242             nsp += nsp_cj4;
2243         }
2244
2245         /* Put the remaining cj4's in the last sci entry */
2246         nbl->sci.back().cj4_ind_end = cj4_end;
2247
2248         /* Possibly balance out the last two sci's
2249          * by moving the last cj4 of the second last sci.
2250          */
2251         if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
2252         {
2253             GMX_ASSERT(nbl->sci.size() >= 2, "We expect at least two elements");
2254             nbl->sci[nbl->sci.size() - 2].cj4_ind_end--;
2255             nbl->sci[nbl->sci.size() - 1].cj4_ind_start--;
2256         }
2257     }
2258 }
2259
2260 /* Clost this super/sub list i entry */
2261 static void closeIEntry(NbnxnPairlistGpu *nbl,
2262                         int nsp_max_av,
2263                         gmx_bool progBal, float nsp_tot_est,
2264                         int thread, int nthread)
2265 {
2266     nbnxn_sci_t &sciEntry = *getOpenIEntry(nbl);
2267
2268     /* All content of the new ci entry have already been filled correctly,
2269      * we only need to, potentially, split or remove the entry when empty.
2270      */
2271     int j4len = sciEntry.numJClusterGroups();
2272     if (j4len > 0)
2273     {
2274         /* We can only have complete blocks of 4 j-entries in a list,
2275          * so round the count up before closing.
2276          */
2277         int ncj4          = (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
2278         nbl->work->cj_ind = ncj4*c_nbnxnGpuJgroupSize;
2279
2280         if (nsp_max_av > 0)
2281         {
2282             /* Measure the size of the new entry and potentially split it */
2283             split_sci_entry(nbl, nsp_max_av, progBal, nsp_tot_est,
2284                             thread, nthread);
2285         }
2286     }
2287     else
2288     {
2289         /* Entry is empty: remove it  */
2290         nbl->sci.pop_back();
2291     }
2292 }
2293
2294 /* Syncs the working array before adding another grid pair to the GPU list */
2295 static void sync_work(NbnxnPairlistCpu gmx_unused *nbl)
2296 {
2297 }
2298
2299 /* Syncs the working array before adding another grid pair to the GPU list */
2300 static void sync_work(NbnxnPairlistGpu *nbl)
2301 {
2302     nbl->work->cj_ind   = nbl->cj4.size()*c_nbnxnGpuJgroupSize;
2303 }
2304
2305 /* Clears an NbnxnPairlistCpu data structure */
2306 static void clear_pairlist(NbnxnPairlistCpu *nbl)
2307 {
2308     nbl->ci.clear();
2309     nbl->cj.clear();
2310     nbl->ncjInUse      = 0;
2311     nbl->nci_tot       = 0;
2312     nbl->ciOuter.clear();
2313     nbl->cjOuter.clear();
2314
2315     nbl->work->ncj_noq = 0;
2316     nbl->work->ncj_hlj = 0;
2317 }
2318
2319 /* Clears an NbnxnPairlistGpu data structure */
2320 static void clear_pairlist(NbnxnPairlistGpu *nbl)
2321 {
2322     nbl->sci.clear();
2323     nbl->cj4.clear();
2324     nbl->excl.resize(1);
2325     nbl->nci_tot = 0;
2326 }
2327
2328 /* Clears a group scheme pair list */
2329 static void clear_pairlist_fep(t_nblist *nl)
2330 {
2331     nl->nri = 0;
2332     nl->nrj = 0;
2333     if (nl->jindex == nullptr)
2334     {
2335         snew(nl->jindex, 1);
2336     }
2337     nl->jindex[0] = 0;
2338 }
2339
2340 /* Sets a simple list i-cell bounding box, including PBC shift */
2341 static inline void set_icell_bb_simple(gmx::ArrayRef<const BoundingBox> bb,
2342                                        int ci,
2343                                        real shx, real shy, real shz,
2344                                        BoundingBox *bb_ci)
2345 {
2346     bb_ci->lower.x = bb[ci].lower.x + shx;
2347     bb_ci->lower.y = bb[ci].lower.y + shy;
2348     bb_ci->lower.z = bb[ci].lower.z + shz;
2349     bb_ci->upper.x = bb[ci].upper.x + shx;
2350     bb_ci->upper.y = bb[ci].upper.y + shy;
2351     bb_ci->upper.z = bb[ci].upper.z + shz;
2352 }
2353
2354 /* Sets a simple list i-cell bounding box, including PBC shift */
2355 static inline void set_icell_bb(const Grid &iGrid,
2356                                 int ci,
2357                                 real shx, real shy, real shz,
2358                                 NbnxnPairlistCpuWork *work)
2359 {
2360     set_icell_bb_simple(iGrid.iBoundingBoxes(), ci, shx, shy, shz,
2361                         &work->iClusterData.bb[0]);
2362 }
2363
2364 #if NBNXN_BBXXXX
2365 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2366 static void set_icell_bbxxxx_supersub(gmx::ArrayRef<const float> bb,
2367                                       int ci,
2368                                       real shx, real shy, real shz,
2369                                       float *bb_ci)
2370 {
2371     constexpr int cellBBStride = packedBoundingBoxesIndex(c_gpuNumClusterPerCell);
2372     constexpr int pbbStride    = c_packedBoundingBoxesDimSize;
2373     const int     ia           = ci*cellBBStride;
2374     for (int m = 0; m < cellBBStride; m += c_packedBoundingBoxesSize)
2375     {
2376         for (int i = 0; i < pbbStride; i++)
2377         {
2378             bb_ci[m + 0*pbbStride + i] = bb[ia + m + 0*pbbStride + i] + shx;
2379             bb_ci[m + 1*pbbStride + i] = bb[ia + m + 1*pbbStride + i] + shy;
2380             bb_ci[m + 2*pbbStride + i] = bb[ia + m + 2*pbbStride + i] + shz;
2381             bb_ci[m + 3*pbbStride + i] = bb[ia + m + 3*pbbStride + i] + shx;
2382             bb_ci[m + 4*pbbStride + i] = bb[ia + m + 4*pbbStride + i] + shy;
2383             bb_ci[m + 5*pbbStride + i] = bb[ia + m + 5*pbbStride + i] + shz;
2384         }
2385     }
2386 }
2387 #endif
2388
2389 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2390 gmx_unused static void set_icell_bb_supersub(gmx::ArrayRef<const BoundingBox> bb,
2391                                              int ci,
2392                                              real shx, real shy, real shz,
2393                                              BoundingBox *bb_ci)
2394 {
2395     for (int i = 0; i < c_gpuNumClusterPerCell; i++)
2396     {
2397         set_icell_bb_simple(bb, ci*c_gpuNumClusterPerCell+i,
2398                             shx, shy, shz,
2399                             &bb_ci[i]);
2400     }
2401 }
2402
2403 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2404 gmx_unused static void set_icell_bb(const Grid &iGrid,
2405                                     int ci,
2406                                     real shx, real shy, real shz,
2407                                     NbnxnPairlistGpuWork *work)
2408 {
2409 #if NBNXN_BBXXXX
2410     set_icell_bbxxxx_supersub(iGrid.packedBoundingBoxes(), ci, shx, shy, shz,
2411                               work->iSuperClusterData.bbPacked.data());
2412 #else
2413     set_icell_bb_supersub(iGrid.iBoundingBoxes(), ci, shx, shy, shz,
2414                           work->iSuperClusterData.bb.data());
2415 #endif
2416 }
2417
2418 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
2419 static void icell_set_x_simple(int ci,
2420                                real shx, real shy, real shz,
2421                                int stride, const real *x,
2422                                NbnxnPairlistCpuWork::IClusterData *iClusterData)
2423 {
2424     const int ia = ci*c_nbnxnCpuIClusterSize;
2425
2426     for (int i = 0; i < c_nbnxnCpuIClusterSize; i++)
2427     {
2428         iClusterData->x[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
2429         iClusterData->x[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
2430         iClusterData->x[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
2431     }
2432 }
2433
2434 static void icell_set_x(int ci,
2435                         real shx, real shy, real shz,
2436                         int stride, const real *x,
2437                         const Nbnxm::KernelType kernelType,
2438                         NbnxnPairlistCpuWork *work)
2439 {
2440     switch (kernelType)
2441     {
2442 #if GMX_SIMD
2443 #ifdef GMX_NBNXN_SIMD_4XN
2444         case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
2445             icell_set_x_simd_4xn(ci, shx, shy, shz, stride, x, work);
2446             break;
2447 #endif
2448 #ifdef GMX_NBNXN_SIMD_2XNN
2449         case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
2450             icell_set_x_simd_2xnn(ci, shx, shy, shz, stride, x, work);
2451             break;
2452 #endif
2453 #endif
2454         case Nbnxm::KernelType::Cpu4x4_PlainC:
2455             icell_set_x_simple(ci, shx, shy, shz, stride, x, &work->iClusterData);
2456             break;
2457         default:
2458             GMX_ASSERT(false, "Unhandled case");
2459             break;
2460     }
2461 }
2462
2463 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
2464 static void icell_set_x(int ci,
2465                         real shx, real shy, real shz,
2466                         int stride, const real *x,
2467                         Nbnxm::KernelType gmx_unused kernelType,
2468                         NbnxnPairlistGpuWork *work)
2469 {
2470 #if !GMX_SIMD4_HAVE_REAL
2471
2472     real * x_ci = work->iSuperClusterData.x.data();
2473
2474     int    ia = ci*c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize;
2475     for (int i = 0; i < c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize; i++)
2476     {
2477         x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
2478         x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
2479         x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
2480     }
2481
2482 #else /* !GMX_SIMD4_HAVE_REAL */
2483
2484     real * x_ci = work->iSuperClusterData.xSimd.data();
2485
2486     for (int si = 0; si < c_gpuNumClusterPerCell; si++)
2487     {
2488         for (int i = 0; i < c_nbnxnGpuClusterSize; i += GMX_SIMD4_WIDTH)
2489         {
2490             int io = si*c_nbnxnGpuClusterSize + i;
2491             int ia = ci*c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize + io;
2492             for (int j = 0; j < GMX_SIMD4_WIDTH; j++)
2493             {
2494                 x_ci[io*DIM + j + XX*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + XX] + shx;
2495                 x_ci[io*DIM + j + YY*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + YY] + shy;
2496                 x_ci[io*DIM + j + ZZ*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + ZZ] + shz;
2497             }
2498         }
2499     }
2500
2501 #endif /* !GMX_SIMD4_HAVE_REAL */
2502 }
2503
2504 static real minimum_subgrid_size_xy(const Grid &grid)
2505 {
2506     const Grid::Dimensions &dims = grid.dimensions();
2507
2508     if (grid.geometry().isSimple)
2509     {
2510         return std::min(dims.cellSize[XX], dims.cellSize[YY]);
2511     }
2512     else
2513     {
2514         return std::min(dims.cellSize[XX]/c_gpuNumClusterPerCellX,
2515                         dims.cellSize[YY]/c_gpuNumClusterPerCellY);
2516     }
2517 }
2518
2519 static real effective_buffer_1x1_vs_MxN(const Grid &iGrid,
2520                                         const Grid &jGrid)
2521 {
2522     const real eff_1x1_buffer_fac_overest = 0.1;
2523
2524     /* Determine an atom-pair list cut-off buffer size for atom pairs,
2525      * to be added to rlist (including buffer) used for MxN.
2526      * This is for converting an MxN list to a 1x1 list. This means we can't
2527      * use the normal buffer estimate, as we have an MxN list in which
2528      * some atom pairs beyond rlist are missing. We want to capture
2529      * the beneficial effect of buffering by extra pairs just outside rlist,
2530      * while removing the useless pairs that are further away from rlist.
2531      * (Also the buffer could have been set manually not using the estimate.)
2532      * This buffer size is an overestimate.
2533      * We add 10% of the smallest grid sub-cell dimensions.
2534      * Note that the z-size differs per cell and we don't use this,
2535      * so we overestimate.
2536      * With PME, the 10% value gives a buffer that is somewhat larger
2537      * than the effective buffer with a tolerance of 0.005 kJ/mol/ps.
2538      * Smaller tolerances or using RF lead to a smaller effective buffer,
2539      * so 10% gives a safe overestimate.
2540      */
2541     return eff_1x1_buffer_fac_overest*(minimum_subgrid_size_xy(iGrid) +
2542                                        minimum_subgrid_size_xy(jGrid));
2543 }
2544
2545 /* Estimates the interaction volume^2 for non-local interactions */
2546 static real nonlocal_vol2(const struct gmx_domdec_zones_t *zones, const rvec ls, real r)
2547 {
2548     real cl, ca, za;
2549     real vold_est;
2550     real vol2_est_tot;
2551
2552     vol2_est_tot = 0;
2553
2554     /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
2555      * not home interaction volume^2. As these volumes are not additive,
2556      * this is an overestimate, but it would only be significant in the limit
2557      * of small cells, where we anyhow need to split the lists into
2558      * as small parts as possible.
2559      */
2560
2561     for (int z = 0; z < zones->n; z++)
2562     {
2563         if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
2564         {
2565             cl = 0;
2566             ca = 1;
2567             za = 1;
2568             for (int d = 0; d < DIM; d++)
2569             {
2570                 if (zones->shift[z][d] == 0)
2571                 {
2572                     cl += 0.5*ls[d];
2573                     ca *= ls[d];
2574                     za *= zones->size[z].x1[d] - zones->size[z].x0[d];
2575                 }
2576             }
2577
2578             /* 4 octants of a sphere */
2579             vold_est  = 0.25*M_PI*r*r*r*r;
2580             /* 4 quarter pie slices on the edges */
2581             vold_est += 4*cl*M_PI/6.0*r*r*r;
2582             /* One rectangular volume on a face */
2583             vold_est += ca*0.5*r*r;
2584
2585             vol2_est_tot += vold_est*za;
2586         }
2587     }
2588
2589     return vol2_est_tot;
2590 }
2591
2592 /* Estimates the average size of a full j-list for super/sub setup */
2593 static void get_nsubpair_target(const nbnxn_search        *nbs,
2594                                 const InteractionLocality  iloc,
2595                                 const real                 rlist,
2596                                 const int                  min_ci_balanced,
2597                                 int                       *nsubpair_target,
2598                                 float                     *nsubpair_tot_est)
2599 {
2600     /* The target value of 36 seems to be the optimum for Kepler.
2601      * Maxwell is less sensitive to the exact value.
2602      */
2603     const int           nsubpair_target_min = 36;
2604     real                r_eff_sup, vol_est, nsp_est, nsp_est_nl;
2605
2606     const Grid         &grid = nbs->gridSet().grids()[0];
2607
2608     /* We don't need to balance list sizes if:
2609      * - We didn't request balancing.
2610      * - The number of grid cells >= the number of lists requested,
2611      *   since we will always generate at least #cells lists.
2612      * - We don't have any cells, since then there won't be any lists.
2613      */
2614     if (min_ci_balanced <= 0 || grid.numCells() >= min_ci_balanced || grid.numCells() == 0)
2615     {
2616         /* nsubpair_target==0 signals no balancing */
2617         *nsubpair_target  = 0;
2618         *nsubpair_tot_est = 0;
2619
2620         return;
2621     }
2622
2623     gmx::RVec               ls;
2624     const int               numAtomsCluster = grid.geometry().numAtomsICluster;
2625     const Grid::Dimensions &dims            = grid.dimensions();
2626
2627     ls[XX] = dims.cellSize[XX]/c_gpuNumClusterPerCellX;
2628     ls[YY] = dims.cellSize[YY]/c_gpuNumClusterPerCellY;
2629     ls[ZZ] = numAtomsCluster/(dims.atomDensity*ls[XX]*ls[YY]);
2630
2631     /* The formulas below are a heuristic estimate of the average nsj per si*/
2632     r_eff_sup = rlist + nbnxn_get_rlist_effective_inc(numAtomsCluster, ls);
2633
2634     if (!nbs->domainSetup().haveDomDec || nbs->domainSetup().zones->n == 1)
2635     {
2636         nsp_est_nl = 0;
2637     }
2638     else
2639     {
2640         nsp_est_nl =
2641             gmx::square(dims.atomDensity/numAtomsCluster)*
2642             nonlocal_vol2(nbs->domainSetup().zones, ls, r_eff_sup);
2643     }
2644
2645     if (iloc == InteractionLocality::Local)
2646     {
2647         /* Sub-cell interacts with itself */
2648         vol_est  = ls[XX]*ls[YY]*ls[ZZ];
2649         /* 6/2 rectangular volume on the faces */
2650         vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
2651         /* 12/2 quarter pie slices on the edges */
2652         vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*gmx::square(r_eff_sup);
2653         /* 4 octants of a sphere */
2654         vol_est += 0.5*4.0/3.0*M_PI*gmx::power3(r_eff_sup);
2655
2656         /* Estimate the number of cluster pairs as the local number of
2657          * clusters times the volume they interact with times the density.
2658          */
2659         nsp_est = grid.numClusters()*vol_est*dims.atomDensity/numAtomsCluster;
2660
2661         /* Subtract the non-local pair count */
2662         nsp_est -= nsp_est_nl;
2663
2664         /* For small cut-offs nsp_est will be an underesimate.
2665          * With DD nsp_est_nl is an overestimate so nsp_est can get negative.
2666          * So to avoid too small or negative nsp_est we set a minimum of
2667          * all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
2668          * This might be a slight overestimate for small non-periodic groups of
2669          * atoms as will occur for a local domain with DD, but for small
2670          * groups of atoms we'll anyhow be limited by nsubpair_target_min,
2671          * so this overestimation will not matter.
2672          */
2673         nsp_est = std::max(nsp_est, grid.numClusters()*14._real);
2674
2675         if (debug)
2676         {
2677             fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
2678                     nsp_est, nsp_est_nl);
2679         }
2680     }
2681     else
2682     {
2683         nsp_est = nsp_est_nl;
2684     }
2685
2686     /* Thus the (average) maximum j-list size should be as follows.
2687      * Since there is overhead, we shouldn't make the lists too small
2688      * (and we can't chop up j-groups) so we use a minimum target size of 36.
2689      */
2690     *nsubpair_target  = std::max(nsubpair_target_min,
2691                                  roundToInt(nsp_est/min_ci_balanced));
2692     *nsubpair_tot_est = static_cast<int>(nsp_est);
2693
2694     if (debug)
2695     {
2696         fprintf(debug, "nbl nsp estimate %.1f, nsubpair_target %d\n",
2697                 nsp_est, *nsubpair_target);
2698     }
2699 }
2700
2701 /* Debug list print function */
2702 static void print_nblist_ci_cj(FILE *fp, const NbnxnPairlistCpu *nbl)
2703 {
2704     for (const nbnxn_ci_t &ciEntry : nbl->ci)
2705     {
2706         fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
2707                 ciEntry.ci, ciEntry.shift,
2708                 ciEntry.cj_ind_end - ciEntry.cj_ind_start);
2709
2710         for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
2711         {
2712             fprintf(fp, "  cj %5d  imask %x\n",
2713                     nbl->cj[j].cj,
2714                     nbl->cj[j].excl);
2715         }
2716     }
2717 }
2718
2719 /* Debug list print function */
2720 static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
2721 {
2722     for (const nbnxn_sci_t &sci : nbl->sci)
2723     {
2724         fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
2725                 sci.sci, sci.shift,
2726                 sci.numJClusterGroups());
2727
2728         int ncp = 0;
2729         for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
2730         {
2731             for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
2732             {
2733                 fprintf(fp, "  sj %5d  imask %x\n",
2734                         nbl->cj4[j4].cj[j],
2735                         nbl->cj4[j4].imei[0].imask);
2736                 for (int si = 0; si < c_gpuNumClusterPerCell; si++)
2737                 {
2738                     if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
2739                     {
2740                         ncp++;
2741                     }
2742                 }
2743             }
2744         }
2745         fprintf(fp, "ci %4d  shift %2d  ncj4 %2d ncp %3d\n",
2746                 sci.sci, sci.shift,
2747                 sci.numJClusterGroups(),
2748                 ncp);
2749     }
2750 }
2751
2752 /* Combine pair lists *nbl generated on multiple threads nblc */
2753 static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
2754                             NbnxnPairlistGpu *nblc)
2755 {
2756     int nsci  = nblc->sci.size();
2757     int ncj4  = nblc->cj4.size();
2758     int nexcl = nblc->excl.size();
2759     for (int i = 0; i < nnbl; i++)
2760     {
2761         nsci  += nbl[i]->sci.size();
2762         ncj4  += nbl[i]->cj4.size();
2763         nexcl += nbl[i]->excl.size();
2764     }
2765
2766     /* Resize with the final, combined size, so we can fill in parallel */
2767     /* NOTE: For better performance we should use default initialization */
2768     nblc->sci.resize(nsci);
2769     nblc->cj4.resize(ncj4);
2770     nblc->excl.resize(nexcl);
2771
2772     /* Each thread should copy its own data to the combined arrays,
2773      * as otherwise data will go back and forth between different caches.
2774      */
2775 #if GMX_OPENMP && !(defined __clang_analyzer__)
2776     int nthreads = gmx_omp_nthreads_get(emntPairsearch);
2777 #endif
2778
2779 #pragma omp parallel for num_threads(nthreads) schedule(static)
2780     for (int n = 0; n < nnbl; n++)
2781     {
2782         try
2783         {
2784             /* Determine the offset in the combined data for our thread.
2785              * Note that the original sizes in nblc are lost.
2786              */
2787             int sci_offset  = nsci;
2788             int cj4_offset  = ncj4;
2789             int excl_offset = nexcl;
2790
2791             for (int i = n; i < nnbl; i++)
2792             {
2793                 sci_offset  -= nbl[i]->sci.size();
2794                 cj4_offset  -= nbl[i]->cj4.size();
2795                 excl_offset -= nbl[i]->excl.size();
2796             }
2797
2798             const NbnxnPairlistGpu &nbli = *nbl[n];
2799
2800             for (size_t i = 0; i < nbli.sci.size(); i++)
2801             {
2802                 nblc->sci[sci_offset + i]                = nbli.sci[i];
2803                 nblc->sci[sci_offset + i].cj4_ind_start += cj4_offset;
2804                 nblc->sci[sci_offset + i].cj4_ind_end   += cj4_offset;
2805             }
2806
2807             for (size_t j4 = 0; j4 < nbli.cj4.size(); j4++)
2808             {
2809                 nblc->cj4[cj4_offset + j4]                   = nbli.cj4[j4];
2810                 nblc->cj4[cj4_offset + j4].imei[0].excl_ind += excl_offset;
2811                 nblc->cj4[cj4_offset + j4].imei[1].excl_ind += excl_offset;
2812             }
2813
2814             for (size_t j4 = 0; j4 < nbli.excl.size(); j4++)
2815             {
2816                 nblc->excl[excl_offset + j4] = nbli.excl[j4];
2817             }
2818         }
2819         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2820     }
2821
2822     for (int n = 0; n < nnbl; n++)
2823     {
2824         nblc->nci_tot += nbl[n]->nci_tot;
2825     }
2826 }
2827
2828 static void balance_fep_lists(const nbnxn_search   *nbs,
2829                               nbnxn_pairlist_set_t *nbl_lists)
2830 {
2831     int       nnbl;
2832     int       nri_tot, nrj_tot, nrj_target;
2833     int       th_dest;
2834     t_nblist *nbld;
2835
2836     nnbl = nbl_lists->nnbl;
2837
2838     if (nnbl == 1)
2839     {
2840         /* Nothing to balance */
2841         return;
2842     }
2843
2844     /* Count the total i-lists and pairs */
2845     nri_tot = 0;
2846     nrj_tot = 0;
2847     for (int th = 0; th < nnbl; th++)
2848     {
2849         nri_tot += nbl_lists->nbl_fep[th]->nri;
2850         nrj_tot += nbl_lists->nbl_fep[th]->nrj;
2851     }
2852
2853     nrj_target = (nrj_tot + nnbl - 1)/nnbl;
2854
2855     assert(gmx_omp_nthreads_get(emntNonbonded) == nnbl);
2856
2857 #pragma omp parallel for schedule(static) num_threads(nnbl)
2858     for (int th = 0; th < nnbl; th++)
2859     {
2860         try
2861         {
2862             t_nblist *nbl = nbs->work[th].nbl_fep.get();
2863
2864             /* Note that here we allocate for the total size, instead of
2865              * a per-thread esimate (which is hard to obtain).
2866              */
2867             if (nri_tot > nbl->maxnri)
2868             {
2869                 nbl->maxnri = over_alloc_large(nri_tot);
2870                 reallocate_nblist(nbl);
2871             }
2872             if (nri_tot > nbl->maxnri || nrj_tot > nbl->maxnrj)
2873             {
2874                 nbl->maxnrj = over_alloc_small(nrj_tot);
2875                 srenew(nbl->jjnr, nbl->maxnrj);
2876                 srenew(nbl->excl_fep, nbl->maxnrj);
2877             }
2878
2879             clear_pairlist_fep(nbl);
2880         }
2881         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2882     }
2883
2884     /* Loop over the source lists and assign and copy i-entries */
2885     th_dest = 0;
2886     nbld    = nbs->work[th_dest].nbl_fep.get();
2887     for (int th = 0; th < nnbl; th++)
2888     {
2889         t_nblist *nbls;
2890
2891         nbls = nbl_lists->nbl_fep[th];
2892
2893         for (int i = 0; i < nbls->nri; i++)
2894         {
2895             int nrj;
2896
2897             /* The number of pairs in this i-entry */
2898             nrj = nbls->jindex[i+1] - nbls->jindex[i];
2899
2900             /* Decide if list th_dest is too large and we should procede
2901              * to the next destination list.
2902              */
2903             if (th_dest+1 < nnbl && nbld->nrj > 0 &&
2904                 nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
2905             {
2906                 th_dest++;
2907                 nbld = nbs->work[th_dest].nbl_fep.get();
2908             }
2909
2910             nbld->iinr[nbld->nri]  = nbls->iinr[i];
2911             nbld->gid[nbld->nri]   = nbls->gid[i];
2912             nbld->shift[nbld->nri] = nbls->shift[i];
2913
2914             for (int j = nbls->jindex[i]; j < nbls->jindex[i+1]; j++)
2915             {
2916                 nbld->jjnr[nbld->nrj]     = nbls->jjnr[j];
2917                 nbld->excl_fep[nbld->nrj] = nbls->excl_fep[j];
2918                 nbld->nrj++;
2919             }
2920             nbld->nri++;
2921             nbld->jindex[nbld->nri] = nbld->nrj;
2922         }
2923     }
2924
2925     /* Swap the list pointers */
2926     for (int th = 0; th < nnbl; th++)
2927     {
2928         t_nblist *nbl_tmp      = nbs->work[th].nbl_fep.release();
2929         nbs->work[th].nbl_fep.reset(nbl_lists->nbl_fep[th]);
2930         nbl_lists->nbl_fep[th] = nbl_tmp;
2931
2932         if (debug)
2933         {
2934             fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n",
2935                     th,
2936                     nbl_lists->nbl_fep[th]->nri,
2937                     nbl_lists->nbl_fep[th]->nrj);
2938         }
2939     }
2940 }
2941
2942 /* Returns the next ci to be processes by our thread */
2943 static gmx_bool next_ci(const Grid &grid,
2944                         int nth, int ci_block,
2945                         int *ci_x, int *ci_y,
2946                         int *ci_b, int *ci)
2947 {
2948     (*ci_b)++;
2949     (*ci)++;
2950
2951     if (*ci_b == ci_block)
2952     {
2953         /* Jump to the next block assigned to this task */
2954         *ci   += (nth - 1)*ci_block;
2955         *ci_b  = 0;
2956     }
2957
2958     if (*ci >= grid.numCells())
2959     {
2960         return FALSE;
2961     }
2962
2963     while (*ci >= grid.firstCellInColumn(*ci_x*grid.dimensions().numCells[YY] + *ci_y + 1))
2964     {
2965         *ci_y += 1;
2966         if (*ci_y == grid.dimensions().numCells[YY])
2967         {
2968             *ci_x += 1;
2969             *ci_y  = 0;
2970         }
2971     }
2972
2973     return TRUE;
2974 }
2975
2976 /* Returns the distance^2 for which we put cell pairs in the list
2977  * without checking atom pair distances. This is usually < rlist^2.
2978  */
2979 static float boundingbox_only_distance2(const Grid::Dimensions &iGridDims,
2980                                         const Grid::Dimensions &jGridDims,
2981                                         real                    rlist,
2982                                         gmx_bool                simple)
2983 {
2984     /* If the distance between two sub-cell bounding boxes is less
2985      * than this distance, do not check the distance between
2986      * all particle pairs in the sub-cell, since then it is likely
2987      * that the box pair has atom pairs within the cut-off.
2988      * We use the nblist cut-off minus 0.5 times the average x/y diagonal
2989      * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
2990      * Using more than 0.5 gains at most 0.5%.
2991      * If forces are calculated more than twice, the performance gain
2992      * in the force calculation outweighs the cost of checking.
2993      * Note that with subcell lists, the atom-pair distance check
2994      * is only performed when only 1 out of 8 sub-cells in within range,
2995      * this is because the GPU is much faster than the cpu.
2996      */
2997     real bbx, bby;
2998     real rbb2;
2999
3000     bbx = 0.5*(iGridDims.cellSize[XX] + jGridDims.cellSize[XX]);
3001     bby = 0.5*(iGridDims.cellSize[YY] + jGridDims.cellSize[YY]);
3002     if (!simple)
3003     {
3004         bbx /= c_gpuNumClusterPerCellX;
3005         bby /= c_gpuNumClusterPerCellY;
3006     }
3007
3008     rbb2 = std::max(0.0, rlist - 0.5*std::sqrt(bbx*bbx + bby*bby));
3009     rbb2 = rbb2 * rbb2;
3010
3011 #if !GMX_DOUBLE
3012     return rbb2;
3013 #else
3014     return (float)((1+GMX_FLOAT_EPS)*rbb2);
3015 #endif
3016 }
3017
3018 static int get_ci_block_size(const Grid &iGrid,
3019                              gmx_bool bDomDec, int nth)
3020 {
3021     const int ci_block_enum      = 5;
3022     const int ci_block_denom     = 11;
3023     const int ci_block_min_atoms = 16;
3024     int       ci_block;
3025
3026     /* Here we decide how to distribute the blocks over the threads.
3027      * We use prime numbers to try to avoid that the grid size becomes
3028      * a multiple of the number of threads, which would lead to some
3029      * threads getting "inner" pairs and others getting boundary pairs,
3030      * which in turns will lead to load imbalance between threads.
3031      * Set the block size as 5/11/ntask times the average number of cells
3032      * in a y,z slab. This should ensure a quite uniform distribution
3033      * of the grid parts of the different thread along all three grid
3034      * zone boundaries with 3D domain decomposition. At the same time
3035      * the blocks will not become too small.
3036      */
3037     ci_block = (iGrid.numCells()*ci_block_enum)/(ci_block_denom*iGrid.dimensions().numCells[XX]*nth);
3038
3039     const int numAtomsPerCell = iGrid.geometry().numAtomsPerCell;
3040
3041     /* Ensure the blocks are not too small: avoids cache invalidation */
3042     if (ci_block*numAtomsPerCell < ci_block_min_atoms)
3043     {
3044         ci_block = (ci_block_min_atoms + numAtomsPerCell - 1)/numAtomsPerCell;
3045     }
3046
3047     /* Without domain decomposition
3048      * or with less than 3 blocks per task, divide in nth blocks.
3049      */
3050     if (!bDomDec || nth*3*ci_block > iGrid.numCells())
3051     {
3052         ci_block = (iGrid.numCells() + nth - 1)/nth;
3053     }
3054
3055     if (ci_block > 1 && (nth - 1)*ci_block >= iGrid.numCells())
3056     {
3057         /* Some threads have no work. Although reducing the block size
3058          * does not decrease the block count on the first few threads,
3059          * with GPUs better mixing of "upper" cells that have more empty
3060          * clusters results in a somewhat lower max load over all threads.
3061          * Without GPUs the regime of so few atoms per thread is less
3062          * performance relevant, but with 8-wide SIMD the same reasoning
3063          * applies, since the pair list uses 4 i-atom "sub-clusters".
3064          */
3065         ci_block--;
3066     }
3067
3068     return ci_block;
3069 }
3070
3071 /* Returns the number of bits to right-shift a cluster index to obtain
3072  * the corresponding force buffer flag index.
3073  */
3074 static int getBufferFlagShift(int numAtomsPerCluster)
3075 {
3076     int bufferFlagShift = 0;
3077     while ((numAtomsPerCluster << bufferFlagShift) < NBNXN_BUFFERFLAG_SIZE)
3078     {
3079         bufferFlagShift++;
3080     }
3081
3082     return bufferFlagShift;
3083 }
3084
3085 static bool pairlistIsSimple(const NbnxnPairlistCpu gmx_unused &pairlist)
3086 {
3087     return true;
3088 }
3089
3090 static bool pairlistIsSimple(const NbnxnPairlistGpu gmx_unused &pairlist)
3091 {
3092     return false;
3093 }
3094
3095 static void makeClusterListWrapper(NbnxnPairlistCpu       *nbl,
3096                                    const Grid gmx_unused  &iGrid,
3097                                    const int               ci,
3098                                    const Grid             &jGrid,
3099                                    const int               firstCell,
3100                                    const int               lastCell,
3101                                    const bool              excludeSubDiagonal,
3102                                    const nbnxn_atomdata_t *nbat,
3103                                    const real              rlist2,
3104                                    const real              rbb2,
3105                                    const Nbnxm::KernelType kernelType,
3106                                    int                    *numDistanceChecks)
3107 {
3108     switch (kernelType)
3109     {
3110         case Nbnxm::KernelType::Cpu4x4_PlainC:
3111             makeClusterListSimple(jGrid,
3112                                   nbl, ci, firstCell, lastCell,
3113                                   excludeSubDiagonal,
3114                                   nbat->x().data(),
3115                                   rlist2, rbb2,
3116                                   numDistanceChecks);
3117             break;
3118 #ifdef GMX_NBNXN_SIMD_4XN
3119         case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
3120             makeClusterListSimd4xn(jGrid,
3121                                    nbl, ci, firstCell, lastCell,
3122                                    excludeSubDiagonal,
3123                                    nbat->x().data(),
3124                                    rlist2, rbb2,
3125                                    numDistanceChecks);
3126             break;
3127 #endif
3128 #ifdef GMX_NBNXN_SIMD_2XNN
3129         case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
3130             makeClusterListSimd2xnn(jGrid,
3131                                     nbl, ci, firstCell, lastCell,
3132                                     excludeSubDiagonal,
3133                                     nbat->x().data(),
3134                                     rlist2, rbb2,
3135                                     numDistanceChecks);
3136             break;
3137 #endif
3138         default:
3139             GMX_ASSERT(false, "Unhandled kernel type");
3140     }
3141 }
3142
3143 static void makeClusterListWrapper(NbnxnPairlistGpu             *nbl,
3144                                    const Grid &gmx_unused        iGrid,
3145                                    const int                     ci,
3146                                    const Grid                   &jGrid,
3147                                    const int                     firstCell,
3148                                    const int                     lastCell,
3149                                    const bool                    excludeSubDiagonal,
3150                                    const nbnxn_atomdata_t       *nbat,
3151                                    const real                    rlist2,
3152                                    const real                    rbb2,
3153                                    Nbnxm::KernelType gmx_unused  kernelType,
3154                                    int                          *numDistanceChecks)
3155 {
3156     for (int cj = firstCell; cj <= lastCell; cj++)
3157     {
3158         make_cluster_list_supersub(iGrid, jGrid,
3159                                    nbl, ci, cj,
3160                                    excludeSubDiagonal,
3161                                    nbat->xstride, nbat->x().data(),
3162                                    rlist2, rbb2,
3163                                    numDistanceChecks);
3164     }
3165 }
3166
3167 static int getNumSimpleJClustersInList(const NbnxnPairlistCpu &nbl)
3168 {
3169     return nbl.cj.size();
3170 }
3171
3172 static int getNumSimpleJClustersInList(const gmx_unused NbnxnPairlistGpu &nbl)
3173 {
3174     return 0;
3175 }
3176
3177 static void incrementNumSimpleJClustersInList(NbnxnPairlistCpu *nbl,
3178                                               int               ncj_old_j)
3179 {
3180     nbl->ncjInUse += nbl->cj.size() - ncj_old_j;
3181 }
3182
3183 static void incrementNumSimpleJClustersInList(NbnxnPairlistGpu gmx_unused *nbl,
3184                                               int              gmx_unused  ncj_old_j)
3185 {
3186 }
3187
3188 static void checkListSizeConsistency(const NbnxnPairlistCpu &nbl,
3189                                      const bool              haveFreeEnergy)
3190 {
3191     GMX_RELEASE_ASSERT(static_cast<size_t>(nbl.ncjInUse) == nbl.cj.size() || haveFreeEnergy,
3192                        "Without free-energy all cj pair-list entries should be in use. "
3193                        "Note that subsequent code does not make use of the equality, "
3194                        "this check is only here to catch bugs");
3195 }
3196
3197 static void checkListSizeConsistency(const NbnxnPairlistGpu gmx_unused &nbl,
3198                                      bool gmx_unused                    haveFreeEnergy)
3199 {
3200     /* We currently can not check consistency here */
3201 }
3202
3203 /* Set the buffer flags for newly added entries in the list */
3204 static void setBufferFlags(const NbnxnPairlistCpu &nbl,
3205                            const int               ncj_old_j,
3206                            const int               gridj_flag_shift,
3207                            gmx_bitmask_t          *gridj_flag,
3208                            const int               th)
3209 {
3210     if (gmx::ssize(nbl.cj) > ncj_old_j)
3211     {
3212         int cbFirst = nbl.cj[ncj_old_j].cj >> gridj_flag_shift;
3213         int cbLast  = nbl.cj.back().cj >> gridj_flag_shift;
3214         for (int cb = cbFirst; cb <= cbLast; cb++)
3215         {
3216             bitmask_init_bit(&gridj_flag[cb], th);
3217         }
3218     }
3219 }
3220
3221 static void setBufferFlags(const NbnxnPairlistGpu gmx_unused &nbl,
3222                            int gmx_unused                     ncj_old_j,
3223                            int gmx_unused                     gridj_flag_shift,
3224                            gmx_bitmask_t gmx_unused          *gridj_flag,
3225                            int gmx_unused                     th)
3226 {
3227     GMX_ASSERT(false, "This function should never be called");
3228 }
3229
3230 /* Generates the part of pair-list nbl assigned to our thread */
3231 template <typename T>
3232 static void nbnxn_make_pairlist_part(const nbnxn_search *nbs,
3233                                      const Grid &iGrid,
3234                                      const Grid &jGrid,
3235                                      nbnxn_search_work_t *work,
3236                                      const nbnxn_atomdata_t *nbat,
3237                                      const t_blocka &exclusions,
3238                                      real rlist,
3239                                      const Nbnxm::KernelType kernelType,
3240                                      int ci_block,
3241                                      gmx_bool bFBufferFlag,
3242                                      int nsubpair_max,
3243                                      gmx_bool progBal,
3244                                      float nsubpair_tot_est,
3245                                      int th, int nth,
3246                                      T *nbl,
3247                                      t_nblist *nbl_fep)
3248 {
3249     int               na_cj_2log;
3250     matrix            box;
3251     real              rl_fep2 = 0;
3252     float             rbb2;
3253     int               ci_b, ci, ci_x, ci_y, ci_xy;
3254     ivec              shp;
3255     real              bx0, bx1, by0, by1, bz0, bz1;
3256     real              bz1_frac;
3257     real              d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
3258     int               cxf, cxl, cyf, cyf_x, cyl;
3259     int               numDistanceChecks;
3260     int               gridi_flag_shift = 0, gridj_flag_shift = 0;
3261     gmx_bitmask_t    *gridj_flag       = nullptr;
3262     int               ncj_old_i, ncj_old_j;
3263
3264     nbs_cycle_start(&work->cc[enbsCCsearch]);
3265
3266     if (jGrid.geometry().isSimple != pairlistIsSimple(*nbl) ||
3267         iGrid.geometry().isSimple != pairlistIsSimple(*nbl))
3268     {
3269         gmx_incons("Grid incompatible with pair-list");
3270     }
3271
3272     sync_work(nbl);
3273     GMX_ASSERT(nbl->na_ci == jGrid.geometry().numAtomsICluster,
3274                "The cluster sizes in the list and grid should match");
3275     nbl->na_cj = Nbnxm::JClusterSizePerKernelType[kernelType];
3276     na_cj_2log = get_2log(nbl->na_cj);
3277
3278     nbl->rlist  = rlist;
3279
3280     if (bFBufferFlag)
3281     {
3282         /* Determine conversion of clusters to flag blocks */
3283         gridi_flag_shift = getBufferFlagShift(nbl->na_ci);
3284         gridj_flag_shift = getBufferFlagShift(nbl->na_cj);
3285
3286         gridj_flag       = work->buffer_flags.flag;
3287     }
3288
3289     const Nbnxm::GridSet &gridSet = nbs->gridSet();
3290
3291     gridSet.getBox(box);
3292
3293     const bool            haveFep = gridSet.haveFep();
3294
3295     const real            rlist2  = nbl->rlist*nbl->rlist;
3296
3297     if (haveFep && !pairlistIsSimple(*nbl))
3298     {
3299         /* Determine an atom-pair list cut-off distance for FEP atom pairs.
3300          * We should not simply use rlist, since then we would not have
3301          * the small, effective buffering of the NxN lists.
3302          * The buffer is on overestimate, but the resulting cost for pairs
3303          * beyond rlist is neglible compared to the FEP pairs within rlist.
3304          */
3305         rl_fep2 = nbl->rlist + effective_buffer_1x1_vs_MxN(iGrid, jGrid);
3306
3307         if (debug)
3308         {
3309             fprintf(debug, "nbl_fep atom-pair rlist %f\n", rl_fep2);
3310         }
3311         rl_fep2 = rl_fep2*rl_fep2;
3312     }
3313
3314     const Grid::Dimensions &iGridDims = iGrid.dimensions();
3315     const Grid::Dimensions &jGridDims = jGrid.dimensions();
3316
3317     rbb2 = boundingbox_only_distance2(iGridDims, jGridDims, nbl->rlist, pairlistIsSimple(*nbl));
3318
3319     if (debug)
3320     {
3321         fprintf(debug, "nbl bounding box only distance %f\n", std::sqrt(rbb2));
3322     }
3323
3324     const bool isIntraGridList = (&iGrid == &jGrid);
3325
3326     /* Set the shift range */
3327     for (int d = 0; d < DIM; d++)
3328     {
3329         /* Check if we need periodicity shifts.
3330          * Without PBC or with domain decomposition we don't need them.
3331          */
3332         if (d >= ePBC2npbcdim(nbs->domainSetup().ePBC) ||
3333             nbs->domainSetup().haveDomDecPerDim[d])
3334         {
3335             shp[d] = 0;
3336         }
3337         else
3338         {
3339             const real listRangeCellToCell =
3340                 listRangeForGridCellToGridCell(rlist, iGrid.dimensions(), jGrid.dimensions());
3341             if (d == XX &&
3342                 box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < listRangeCellToCell)
3343             {
3344                 shp[d] = 2;
3345             }
3346             else
3347             {
3348                 shp[d] = 1;
3349             }
3350         }
3351     }
3352     const bool bSimple = pairlistIsSimple(*nbl);
3353     gmx::ArrayRef<const BoundingBox> bb_i;
3354 #if NBNXN_BBXXXX
3355     gmx::ArrayRef<const float>       pbb_i;
3356     if (bSimple)
3357     {
3358         bb_i  = iGrid.iBoundingBoxes();
3359     }
3360     else
3361     {
3362         pbb_i = iGrid.packedBoundingBoxes();
3363     }
3364 #else
3365     /* We use the normal bounding box format for both grid types */
3366     bb_i  = iGrid.iBoundingBoxes();
3367 #endif
3368     gmx::ArrayRef<const BoundingBox1D> bbcz_i  = iGrid.zBoundingBoxes();
3369     gmx::ArrayRef<const int>           flags_i = iGrid.clusterFlags();
3370     gmx::ArrayRef<const BoundingBox1D> bbcz_j  = jGrid.zBoundingBoxes();
3371     int                                cell0_i = iGrid.cellOffset();
3372
3373     if (debug)
3374     {
3375         fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
3376                 iGrid.numCells(), iGrid.numCells()/static_cast<double>(iGrid.numColumns()), ci_block);
3377     }
3378
3379     numDistanceChecks = 0;
3380
3381     const real listRangeBBToJCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGrid.dimensions()));
3382
3383     /* Initially ci_b and ci to 1 before where we want them to start,
3384      * as they will both be incremented in next_ci.
3385      */
3386     ci_b = -1;
3387     ci   = th*ci_block - 1;
3388     ci_x = 0;
3389     ci_y = 0;
3390     while (next_ci(iGrid, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
3391     {
3392         if (bSimple && flags_i[ci] == 0)
3393         {
3394             continue;
3395         }
3396
3397         ncj_old_i = getNumSimpleJClustersInList(*nbl);
3398
3399         d2cx = 0;
3400         if (!isIntraGridList && shp[XX] == 0)
3401         {
3402             if (bSimple)
3403             {
3404                 bx1 = bb_i[ci].upper.x;
3405             }
3406             else
3407             {
3408                 bx1 = iGridDims.lowerCorner[XX] + (ci_x+1)*iGridDims.cellSize[XX];
3409             }
3410             if (bx1 < jGridDims.lowerCorner[XX])
3411             {
3412                 d2cx = gmx::square(jGridDims.lowerCorner[XX] - bx1);
3413
3414                 if (d2cx >= listRangeBBToJCell2)
3415                 {
3416                     continue;
3417                 }
3418             }
3419         }
3420
3421         ci_xy = ci_x*iGridDims.numCells[YY] + ci_y;
3422
3423         /* Loop over shift vectors in three dimensions */
3424         for (int tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
3425         {
3426             const real shz = tz*box[ZZ][ZZ];
3427
3428             bz0 = bbcz_i[ci].lower + shz;
3429             bz1 = bbcz_i[ci].upper + shz;
3430
3431             if (tz == 0)
3432             {
3433                 d2z = 0;
3434             }
3435             else if (tz < 0)
3436             {
3437                 d2z = gmx::square(bz1);
3438             }
3439             else
3440             {
3441                 d2z = gmx::square(bz0 - box[ZZ][ZZ]);
3442             }
3443
3444             d2z_cx = d2z + d2cx;
3445
3446             if (d2z_cx >= rlist2)
3447             {
3448                 continue;
3449             }
3450
3451             bz1_frac = bz1/iGrid.numCellsInColumn(ci_xy);
3452             if (bz1_frac < 0)
3453             {
3454                 bz1_frac = 0;
3455             }
3456             /* The check with bz1_frac close to or larger than 1 comes later */
3457
3458             for (int ty = -shp[YY]; ty <= shp[YY]; ty++)
3459             {
3460                 const real shy = ty*box[YY][YY] + tz*box[ZZ][YY];
3461
3462                 if (bSimple)
3463                 {
3464                     by0 = bb_i[ci].lower.y + shy;
3465                     by1 = bb_i[ci].upper.y + shy;
3466                 }
3467                 else
3468                 {
3469                     by0 = iGridDims.lowerCorner[YY] + (ci_y    )*iGridDims.cellSize[YY] + shy;
3470                     by1 = iGridDims.lowerCorner[YY] + (ci_y + 1)*iGridDims.cellSize[YY] + shy;
3471                 }
3472
3473                 get_cell_range<YY>(by0, by1,
3474                                    jGridDims,
3475                                    d2z_cx, rlist,
3476                                    &cyf, &cyl);
3477
3478                 if (cyf > cyl)
3479                 {
3480                     continue;
3481                 }
3482
3483                 d2z_cy = d2z;
3484                 if (by1 < jGridDims.lowerCorner[YY])
3485                 {
3486                     d2z_cy += gmx::square(jGridDims.lowerCorner[YY] - by1);
3487                 }
3488                 else if (by0 > jGridDims.upperCorner[YY])
3489                 {
3490                     d2z_cy += gmx::square(by0 - jGridDims.upperCorner[YY]);
3491                 }
3492
3493                 for (int tx = -shp[XX]; tx <= shp[XX]; tx++)
3494                 {
3495                     const int  shift              = XYZ2IS(tx, ty, tz);
3496
3497                     const bool excludeSubDiagonal = (isIntraGridList && shift == CENTRAL);
3498
3499                     if (c_pbcShiftBackward && isIntraGridList && shift > CENTRAL)
3500                     {
3501                         continue;
3502                     }
3503
3504                     const real shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
3505
3506                     if (bSimple)
3507                     {
3508                         bx0 = bb_i[ci].lower.x + shx;
3509                         bx1 = bb_i[ci].upper.x + shx;
3510                     }
3511                     else
3512                     {
3513                         bx0 = iGridDims.lowerCorner[XX] + (ci_x  )*iGridDims.cellSize[XX] + shx;
3514                         bx1 = iGridDims.lowerCorner[XX] + (ci_x+1)*iGridDims.cellSize[XX] + shx;
3515                     }
3516
3517                     get_cell_range<XX>(bx0, bx1,
3518                                        jGridDims,
3519                                        d2z_cy, rlist,
3520                                        &cxf, &cxl);
3521
3522                     if (cxf > cxl)
3523                     {
3524                         continue;
3525                     }
3526
3527                     addNewIEntry(nbl, cell0_i+ci, shift, flags_i[ci]);
3528
3529                     if ((!c_pbcShiftBackward || excludeSubDiagonal) &&
3530                         cxf < ci_x)
3531                     {
3532                         /* Leave the pairs with i > j.
3533                          * x is the major index, so skip half of it.
3534                          */
3535                         cxf = ci_x;
3536                     }
3537
3538                     set_icell_bb(iGrid, ci, shx, shy, shz,
3539                                  nbl->work);
3540
3541                     icell_set_x(cell0_i+ci, shx, shy, shz,
3542                                 nbat->xstride, nbat->x().data(),
3543                                 kernelType,
3544                                 nbl->work);
3545
3546                     for (int cx = cxf; cx <= cxl; cx++)
3547                     {
3548                         d2zx = d2z;
3549                         if (jGridDims.lowerCorner[XX] + cx*jGridDims.cellSize[XX] > bx1)
3550                         {
3551                             d2zx += gmx::square(jGridDims.lowerCorner[XX] + cx*jGridDims.cellSize[XX] - bx1);
3552                         }
3553                         else if (jGridDims.lowerCorner[XX] + (cx+1)*jGridDims.cellSize[XX] < bx0)
3554                         {
3555                             d2zx += gmx::square(jGridDims.lowerCorner[XX] + (cx+1)*jGridDims.cellSize[XX] - bx0);
3556                         }
3557
3558                         if (isIntraGridList &&
3559                             cx == 0 &&
3560                             (!c_pbcShiftBackward || shift == CENTRAL) &&
3561                             cyf < ci_y)
3562                         {
3563                             /* Leave the pairs with i > j.
3564                              * Skip half of y when i and j have the same x.
3565                              */
3566                             cyf_x = ci_y;
3567                         }
3568                         else
3569                         {
3570                             cyf_x = cyf;
3571                         }
3572
3573                         for (int cy = cyf_x; cy <= cyl; cy++)
3574                         {
3575                             const int columnStart = jGrid.firstCellInColumn(cx*jGridDims.numCells[YY] + cy);
3576                             const int columnEnd   = jGrid.firstCellInColumn(cx*jGridDims.numCells[YY] + cy + 1);
3577
3578                             d2zxy = d2zx;
3579                             if (jGridDims.lowerCorner[YY] + cy*jGridDims.cellSize[YY] > by1)
3580                             {
3581                                 d2zxy += gmx::square(jGridDims.lowerCorner[YY] + cy*jGridDims.cellSize[YY] - by1);
3582                             }
3583                             else if (jGridDims.lowerCorner[YY] + (cy + 1)*jGridDims.cellSize[YY] < by0)
3584                             {
3585                                 d2zxy += gmx::square(jGridDims.lowerCorner[YY] + (cy + 1)*jGridDims.cellSize[YY] - by0);
3586                             }
3587                             if (columnStart < columnEnd && d2zxy < listRangeBBToJCell2)
3588                             {
3589                                 /* To improve efficiency in the common case
3590                                  * of a homogeneous particle distribution,
3591                                  * we estimate the index of the middle cell
3592                                  * in range (midCell). We search down and up
3593                                  * starting from this index.
3594                                  *
3595                                  * Note that the bbcz_j array contains bounds
3596                                  * for i-clusters, thus for clusters of 4 atoms.
3597                                  * For the common case where the j-cluster size
3598                                  * is 8, we could step with a stride of 2,
3599                                  * but we do not do this because it would
3600                                  * complicate this code even more.
3601                                  */
3602                                 int midCell = columnStart + static_cast<int>(bz1_frac*(columnEnd - columnStart));
3603                                 if (midCell >= columnEnd)
3604                                 {
3605                                     midCell = columnEnd - 1;
3606                                 }
3607
3608                                 d2xy = d2zxy - d2z;
3609
3610                                 /* Find the lowest cell that can possibly
3611                                  * be within range.
3612                                  * Check if we hit the bottom of the grid,
3613                                  * if the j-cell is below the i-cell and if so,
3614                                  * if it is within range.
3615                                  */
3616                                 int downTestCell = midCell;
3617                                 while (downTestCell >= columnStart &&
3618                                        (bbcz_j[downTestCell].upper >= bz0 ||
3619                                         d2xy + gmx::square(bbcz_j[downTestCell].upper - bz0) < rlist2))
3620                                 {
3621                                     downTestCell--;
3622                                 }
3623                                 int firstCell = downTestCell + 1;
3624
3625                                 /* Find the highest cell that can possibly
3626                                  * be within range.
3627                                  * Check if we hit the top of the grid,
3628                                  * if the j-cell is above the i-cell and if so,
3629                                  * if it is within range.
3630                                  */
3631                                 int upTestCell = midCell + 1;
3632                                 while (upTestCell < columnEnd &&
3633                                        (bbcz_j[upTestCell].lower <= bz1 ||
3634                                         d2xy + gmx::square(bbcz_j[upTestCell].lower - bz1) < rlist2))
3635                                 {
3636                                     upTestCell++;
3637                                 }
3638                                 int lastCell = upTestCell - 1;
3639
3640 #define NBNXN_REFCODE 0
3641 #if NBNXN_REFCODE
3642                                 {
3643                                     /* Simple reference code, for debugging,
3644                                      * overrides the more complex code above.
3645                                      */
3646                                     firstCell = columnEnd;
3647                                     lastCell  = -1;
3648                                     for (int k = columnStart; k < columnEnd; k++)
3649                                     {
3650                                         if (d2xy + gmx::square(bbcz_j[k*NNBSBB_D + 1] - bz0) < rlist2 &&
3651                                             k < firstCell)
3652                                         {
3653                                             firstCell = k;
3654                                         }
3655                                         if (d2xy + gmx::square(bbcz_j[k*NNBSBB_D] - bz1) < rlist2 &&
3656                                             k > lastCell)
3657                                         {
3658                                             lastCell = k;
3659                                         }
3660                                     }
3661                                 }
3662 #endif
3663
3664                                 if (isIntraGridList)
3665                                 {
3666                                     /* We want each atom/cell pair only once,
3667                                      * only use cj >= ci.
3668                                      */
3669                                     if (!c_pbcShiftBackward || shift == CENTRAL)
3670                                     {
3671                                         firstCell = std::max(firstCell, ci);
3672                                     }
3673                                 }
3674
3675                                 if (firstCell <= lastCell)
3676                                 {
3677                                     GMX_ASSERT(firstCell >= columnStart && lastCell < columnEnd, "The range should reside within the current grid column");
3678
3679                                     /* For f buffer flags with simple lists */
3680                                     ncj_old_j = getNumSimpleJClustersInList(*nbl);
3681
3682                                     makeClusterListWrapper(nbl,
3683                                                            iGrid, ci,
3684                                                            jGrid, firstCell, lastCell,
3685                                                            excludeSubDiagonal,
3686                                                            nbat,
3687                                                            rlist2, rbb2,
3688                                                            kernelType,
3689                                                            &numDistanceChecks);
3690
3691                                     if (bFBufferFlag)
3692                                     {
3693                                         setBufferFlags(*nbl, ncj_old_j, gridj_flag_shift,
3694                                                        gridj_flag, th);
3695                                     }
3696
3697                                     incrementNumSimpleJClustersInList(nbl, ncj_old_j);
3698                                 }
3699                             }
3700                         }
3701                     }
3702
3703                     /* Set the exclusions for this ci list */
3704                     setExclusionsForIEntry(gridSet,
3705                                            nbl,
3706                                            excludeSubDiagonal,
3707                                            na_cj_2log,
3708                                            *getOpenIEntry(nbl),
3709                                            exclusions);
3710
3711                     if (haveFep)
3712                     {
3713                         make_fep_list(gridSet.atomIndices(), nbat, nbl,
3714                                       excludeSubDiagonal,
3715                                       getOpenIEntry(nbl),
3716                                       shx, shy, shz,
3717                                       rl_fep2,
3718                                       iGrid, jGrid, nbl_fep);
3719                     }
3720
3721                     /* Close this ci list */
3722                     closeIEntry(nbl,
3723                                 nsubpair_max,
3724                                 progBal, nsubpair_tot_est,
3725                                 th, nth);
3726                 }
3727             }
3728         }
3729
3730         if (bFBufferFlag && getNumSimpleJClustersInList(*nbl) > ncj_old_i)
3731         {
3732             bitmask_init_bit(&(work->buffer_flags.flag[(iGrid.cellOffset() + ci) >> gridi_flag_shift]), th);
3733         }
3734     }
3735
3736     work->ndistc = numDistanceChecks;
3737
3738     nbs_cycle_stop(&work->cc[enbsCCsearch]);
3739
3740     checkListSizeConsistency(*nbl, haveFep);
3741
3742     if (debug)
3743     {
3744         fprintf(debug, "number of distance checks %d\n", numDistanceChecks);
3745
3746         print_nblist_statistics(debug, nbl, nbs, rlist);
3747
3748         if (haveFep)
3749         {
3750             fprintf(debug, "nbl FEP list pairs: %d\n", nbl_fep->nrj);
3751         }
3752     }
3753 }
3754
3755 static void reduce_buffer_flags(const nbnxn_search         *nbs,
3756                                 int                         nsrc,
3757                                 const nbnxn_buffer_flags_t *dest)
3758 {
3759     for (int s = 0; s < nsrc; s++)
3760     {
3761         gmx_bitmask_t * flag = nbs->work[s].buffer_flags.flag;
3762
3763         for (int b = 0; b < dest->nflag; b++)
3764         {
3765             bitmask_union(&(dest->flag[b]), flag[b]);
3766         }
3767     }
3768 }
3769
3770 static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
3771 {
3772     int           nelem, nkeep, ncopy, nred, out;
3773     gmx_bitmask_t mask_0;
3774
3775     nelem = 0;
3776     nkeep = 0;
3777     ncopy = 0;
3778     nred  = 0;
3779     bitmask_init_bit(&mask_0, 0);
3780     for (int b = 0; b < flags->nflag; b++)
3781     {
3782         if (bitmask_is_equal(flags->flag[b], mask_0))
3783         {
3784             /* Only flag 0 is set, no copy of reduction required */
3785             nelem++;
3786             nkeep++;
3787         }
3788         else if (!bitmask_is_zero(flags->flag[b]))
3789         {
3790             int c = 0;
3791             for (out = 0; out < nout; out++)
3792             {
3793                 if (bitmask_is_set(flags->flag[b], out))
3794                 {
3795                     c++;
3796                 }
3797             }
3798             nelem += c;
3799             if (c == 1)
3800             {
3801                 ncopy++;
3802             }
3803             else
3804             {
3805                 nred += c;
3806             }
3807         }
3808     }
3809
3810     fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
3811             flags->nflag, nout,
3812             nelem/static_cast<double>(flags->nflag),
3813             nkeep/static_cast<double>(flags->nflag),
3814             ncopy/static_cast<double>(flags->nflag),
3815             nred/static_cast<double>(flags->nflag));
3816 }
3817
3818 /* Copies the list entries from src to dest when cjStart <= *cjGlobal < cjEnd.
3819  * *cjGlobal is updated with the cj count in src.
3820  * When setFlags==true, flag bit t is set in flag for all i and j clusters.
3821  */
3822 template<bool setFlags>
3823 static void copySelectedListRange(const nbnxn_ci_t * gmx_restrict srcCi,
3824                                   const NbnxnPairlistCpu * gmx_restrict src,
3825                                   NbnxnPairlistCpu * gmx_restrict dest,
3826                                   gmx_bitmask_t *flag,
3827                                   int iFlagShift, int jFlagShift, int t)
3828 {
3829     const int ncj = srcCi->cj_ind_end - srcCi->cj_ind_start;
3830
3831     dest->ci.push_back(*srcCi);
3832     dest->ci.back().cj_ind_start = dest->cj.size();
3833     dest->ci.back().cj_ind_end   = dest->cj.size() + ncj;
3834
3835     if (setFlags)
3836     {
3837         bitmask_init_bit(&flag[srcCi->ci >> iFlagShift], t);
3838     }
3839
3840     for (int j = srcCi->cj_ind_start; j < srcCi->cj_ind_end; j++)
3841     {
3842         dest->cj.push_back(src->cj[j]);
3843
3844         if (setFlags)
3845         {
3846             /* NOTE: This is relatively expensive, since this
3847              * operation is done for all elements in the list,
3848              * whereas at list generation this is done only
3849              * once for each flag entry.
3850              */
3851             bitmask_init_bit(&flag[src->cj[j].cj >> jFlagShift], t);
3852         }
3853     }
3854 }
3855
3856 /* This routine re-balances the pairlists such that all are nearly equally
3857  * sized. Only whole i-entries are moved between lists. These are moved
3858  * between the ends of the lists, such that the buffer reduction cost should
3859  * not change significantly.
3860  * Note that all original reduction flags are currently kept. This can lead
3861  * to reduction of parts of the force buffer that could be avoided. But since
3862  * the original lists are quite balanced, this will only give minor overhead.
3863  */
3864 static void rebalanceSimpleLists(int                                  numLists,
3865                                  NbnxnPairlistCpu * const * const     srcSet,
3866                                  NbnxnPairlistCpu                   **destSet,
3867                                  gmx::ArrayRef<nbnxn_search_work_t>   searchWork)
3868 {
3869     int ncjTotal = 0;
3870     for (int s = 0; s < numLists; s++)
3871     {
3872         ncjTotal += srcSet[s]->ncjInUse;
3873     }
3874     int ncjTarget = (ncjTotal + numLists - 1)/numLists;
3875
3876 #pragma omp parallel num_threads(numLists)
3877     {
3878         int t       = gmx_omp_get_thread_num();
3879
3880         int cjStart = ncjTarget* t;
3881         int cjEnd   = ncjTarget*(t + 1);
3882
3883         /* The destination pair-list for task/thread t */
3884         NbnxnPairlistCpu *dest = destSet[t];
3885
3886         clear_pairlist(dest);
3887         dest->na_cj   = srcSet[0]->na_cj;
3888
3889         /* Note that the flags in the work struct (still) contain flags
3890          * for all entries that are present in srcSet->nbl[t].
3891          */
3892         gmx_bitmask_t *flag       = searchWork[t].buffer_flags.flag;
3893
3894         int            iFlagShift = getBufferFlagShift(dest->na_ci);
3895         int            jFlagShift = getBufferFlagShift(dest->na_cj);
3896
3897         int            cjGlobal   = 0;
3898         for (int s = 0; s < numLists && cjGlobal < cjEnd; s++)
3899         {
3900             const NbnxnPairlistCpu *src = srcSet[s];
3901
3902             if (cjGlobal + src->ncjInUse > cjStart)
3903             {
3904                 for (gmx::index i = 0; i < gmx::ssize(src->ci) && cjGlobal < cjEnd; i++)
3905                 {
3906                     const nbnxn_ci_t *srcCi = &src->ci[i];
3907                     int               ncj   = srcCi->cj_ind_end - srcCi->cj_ind_start;
3908                     if (cjGlobal >= cjStart)
3909                     {
3910                         /* If the source list is not our own, we need to set
3911                          * extra flags (the template bool parameter).
3912                          */
3913                         if (s != t)
3914                         {
3915                             copySelectedListRange
3916                             <true>
3917                                 (srcCi, src, dest,
3918                                 flag, iFlagShift, jFlagShift, t);
3919                         }
3920                         else
3921                         {
3922                             copySelectedListRange
3923                             <false>
3924                                 (srcCi, src,
3925                                 dest, flag, iFlagShift, jFlagShift, t);
3926                         }
3927                     }
3928                     cjGlobal += ncj;
3929                 }
3930             }
3931             else
3932             {
3933                 cjGlobal += src->ncjInUse;
3934             }
3935         }
3936
3937         dest->ncjInUse = dest->cj.size();
3938     }
3939
3940 #ifndef NDEBUG
3941     int ncjTotalNew = 0;
3942     for (int s = 0; s < numLists; s++)
3943     {
3944         ncjTotalNew += destSet[s]->ncjInUse;
3945     }
3946     GMX_RELEASE_ASSERT(ncjTotalNew == ncjTotal, "The total size of the lists before and after rebalancing should match");
3947 #endif
3948 }
3949
3950 /* Returns if the pairlists are so imbalanced that it is worth rebalancing. */
3951 static bool checkRebalanceSimpleLists(const nbnxn_pairlist_set_t *listSet)
3952 {
3953     int numLists = listSet->nnbl;
3954     int ncjMax   = 0;
3955     int ncjTotal = 0;
3956     for (int s = 0; s < numLists; s++)
3957     {
3958         ncjMax    = std::max(ncjMax, listSet->nbl[s]->ncjInUse);
3959         ncjTotal += listSet->nbl[s]->ncjInUse;
3960     }
3961     if (debug)
3962     {
3963         fprintf(debug, "Pair-list ncjMax %d ncjTotal %d\n", ncjMax, ncjTotal);
3964     }
3965     /* The rebalancing adds 3% extra time to the search. Heuristically we
3966      * determined that under common conditions the non-bonded kernel balance
3967      * improvement will outweigh this when the imbalance is more than 3%.
3968      * But this will, obviously, depend on search vs kernel time and nstlist.
3969      */
3970     const real rebalanceTolerance = 1.03;
3971
3972     return numLists*ncjMax > ncjTotal*rebalanceTolerance;
3973 }
3974
3975 /* Perform a count (linear) sort to sort the smaller lists to the end.
3976  * This avoids load imbalance on the GPU, as large lists will be
3977  * scheduled and executed first and the smaller lists later.
3978  * Load balancing between multi-processors only happens at the end
3979  * and there smaller lists lead to more effective load balancing.
3980  * The sorting is done on the cj4 count, not on the actual pair counts.
3981  * Not only does this make the sort faster, but it also results in
3982  * better load balancing than using a list sorted on exact load.
3983  * This function swaps the pointer in the pair list to avoid a copy operation.
3984  */
3985 static void sort_sci(NbnxnPairlistGpu *nbl)
3986 {
3987     if (nbl->cj4.size() <= nbl->sci.size())
3988     {
3989         /* nsci = 0 or all sci have size 1, sorting won't change the order */
3990         return;
3991     }
3992
3993     NbnxnPairlistGpuWork &work = *nbl->work;
3994
3995     /* We will distinguish differences up to double the average */
3996     const int m = (2*nbl->cj4.size())/nbl->sci.size();
3997
3998     /* Resize work.sci_sort so we can sort into it */
3999     work.sci_sort.resize(nbl->sci.size());
4000
4001     std::vector<int> &sort = work.sortBuffer;
4002     /* Set up m + 1 entries in sort, initialized at 0 */
4003     sort.clear();
4004     sort.resize(m + 1, 0);
4005     /* Count the entries of each size */
4006     for (const nbnxn_sci_t &sci : nbl->sci)
4007     {
4008         int i = std::min(m, sci.numJClusterGroups());
4009         sort[i]++;
4010     }
4011     /* Calculate the offset for each count */
4012     int s0  = sort[m];
4013     sort[m] = 0;
4014     for (int i = m - 1; i >= 0; i--)
4015     {
4016         int s1  = sort[i];
4017         sort[i] = sort[i + 1] + s0;
4018         s0      = s1;
4019     }
4020
4021     /* Sort entries directly into place */
4022     gmx::ArrayRef<nbnxn_sci_t> sci_sort = work.sci_sort;
4023     for (const nbnxn_sci_t &sci : nbl->sci)
4024     {
4025         int i = std::min(m, sci.numJClusterGroups());
4026         sci_sort[sort[i]++] = sci;
4027     }
4028
4029     /* Swap the sci pointers so we use the new, sorted list */
4030     std::swap(nbl->sci, work.sci_sort);
4031 }
4032
4033 void
4034 nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality,
4035                                             nbnxn_search              *nbs,
4036                                             nbnxn_atomdata_t          *nbat,
4037                                             const t_blocka            *excl,
4038                                             const Nbnxm::KernelType    kernelType,
4039                                             const int64_t              step,
4040                                             t_nrnb                    *nrnb)
4041 {
4042     nbnxn_pairlist_set_t *nbl_list = &pairlistSet(iLocality);
4043
4044     const real            rlist    = nbl_list->params.rlistOuter;
4045
4046     int                nsubpair_target;
4047     float              nsubpair_tot_est;
4048     int                nnbl;
4049     int                ci_block;
4050     gmx_bool           CombineNBLists;
4051     gmx_bool           progBal;
4052     int                np_tot, np_noq, np_hlj, nap;
4053
4054     nnbl            = nbl_list->nnbl;
4055     CombineNBLists  = nbl_list->bCombined;
4056
4057     if (debug)
4058     {
4059         fprintf(debug, "ns making %d nblists\n", nnbl);
4060     }
4061
4062     nbat->bUseBufferFlags = (nbat->out.size() > 1);
4063     /* We should re-init the flags before making the first list */
4064     if (nbat->bUseBufferFlags && iLocality == InteractionLocality::Local)
4065     {
4066         init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
4067     }
4068
4069     int nzi;
4070     if (iLocality == InteractionLocality::Local)
4071     {
4072         /* Only zone (grid) 0 vs 0 */
4073         nzi = 1;
4074     }
4075     else
4076     {
4077         nzi = nbs->domainSetup().zones->nizone;
4078     }
4079
4080     if (!nbl_list->bSimple && minimumIlistCountForGpuBalancing_ > 0)
4081     {
4082         get_nsubpair_target(nbs, iLocality, rlist, minimumIlistCountForGpuBalancing_,
4083                             &nsubpair_target, &nsubpair_tot_est);
4084     }
4085     else
4086     {
4087         nsubpair_target  = 0;
4088         nsubpair_tot_est = 0;
4089     }
4090
4091     /* Clear all pair-lists */
4092     for (int th = 0; th < nnbl; th++)
4093     {
4094         if (nbl_list->bSimple)
4095         {
4096             clear_pairlist(nbl_list->nbl[th]);
4097         }
4098         else
4099         {
4100             clear_pairlist(nbl_list->nblGpu[th]);
4101         }
4102
4103         if (nbs->gridSet().haveFep())
4104         {
4105             clear_pairlist_fep(nbl_list->nbl_fep[th]);
4106         }
4107     }
4108
4109     const gmx_domdec_zones_t *ddZones = nbs->domainSetup().zones;
4110
4111     for (int zi = 0; zi < nzi; zi++)
4112     {
4113         const Grid &iGrid = nbs->gridSet().grids()[zi];
4114
4115         int                 zj0;
4116         int                 zj1;
4117         if (iLocality == InteractionLocality::Local)
4118         {
4119             zj0 = 0;
4120             zj1 = 1;
4121         }
4122         else
4123         {
4124             zj0 = ddZones->izone[zi].j0;
4125             zj1 = ddZones->izone[zi].j1;
4126             if (zi == 0)
4127             {
4128                 zj0++;
4129             }
4130         }
4131         for (int zj = zj0; zj < zj1; zj++)
4132         {
4133             const Grid &jGrid = nbs->gridSet().grids()[zj];
4134
4135             if (debug)
4136             {
4137                 fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
4138             }
4139
4140             nbs_cycle_start(&nbs->cc[enbsCCsearch]);
4141
4142             ci_block = get_ci_block_size(iGrid, nbs->domainSetup().haveDomDec, nnbl);
4143
4144             /* With GPU: generate progressively smaller lists for
4145              * load balancing for local only or non-local with 2 zones.
4146              */
4147             progBal = (iLocality == InteractionLocality::Local || ddZones->n <= 2);
4148
4149 #pragma omp parallel for num_threads(nnbl) schedule(static)
4150             for (int th = 0; th < nnbl; th++)
4151             {
4152                 try
4153                 {
4154                     /* Re-init the thread-local work flag data before making
4155                      * the first list (not an elegant conditional).
4156                      */
4157                     if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0)))
4158                     {
4159                         init_buffer_flags(&nbs->work[th].buffer_flags, nbat->numAtoms());
4160                     }
4161
4162                     if (CombineNBLists && th > 0)
4163                     {
4164                         GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
4165
4166                         clear_pairlist(nbl_list->nblGpu[th]);
4167                     }
4168
4169                     /* Divide the i super cell equally over the nblists */
4170                     if (nbl_list->bSimple)
4171                     {
4172                         nbnxn_make_pairlist_part(nbs, iGrid, jGrid,
4173                                                  &nbs->work[th], nbat, *excl,
4174                                                  rlist,
4175                                                  kernelType,
4176                                                  ci_block,
4177                                                  nbat->bUseBufferFlags,
4178                                                  nsubpair_target,
4179                                                  progBal, nsubpair_tot_est,
4180                                                  th, nnbl,
4181                                                  nbl_list->nbl[th],
4182                                                  nbl_list->nbl_fep[th]);
4183                     }
4184                     else
4185                     {
4186                         nbnxn_make_pairlist_part(nbs, iGrid, jGrid,
4187                                                  &nbs->work[th], nbat, *excl,
4188                                                  rlist,
4189                                                  kernelType,
4190                                                  ci_block,
4191                                                  nbat->bUseBufferFlags,
4192                                                  nsubpair_target,
4193                                                  progBal, nsubpair_tot_est,
4194                                                  th, nnbl,
4195                                                  nbl_list->nblGpu[th],
4196                                                  nbl_list->nbl_fep[th]);
4197                     }
4198                 }
4199                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4200             }
4201             nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
4202
4203             np_tot = 0;
4204             np_noq = 0;
4205             np_hlj = 0;
4206             for (int th = 0; th < nnbl; th++)
4207             {
4208                 inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);
4209
4210                 if (nbl_list->bSimple)
4211                 {
4212                     NbnxnPairlistCpu *nbl = nbl_list->nbl[th];
4213                     np_tot += nbl->cj.size();
4214                     np_noq += nbl->work->ncj_noq;
4215                     np_hlj += nbl->work->ncj_hlj;
4216                 }
4217                 else
4218                 {
4219                     NbnxnPairlistGpu *nbl = nbl_list->nblGpu[th];
4220                     /* This count ignores potential subsequent pair pruning */
4221                     np_tot += nbl->nci_tot;
4222                 }
4223             }
4224             if (nbl_list->bSimple)
4225             {
4226                 nap               = nbl_list->nbl[0]->na_ci*nbl_list->nbl[0]->na_cj;
4227             }
4228             else
4229             {
4230                 nap               = gmx::square(nbl_list->nblGpu[0]->na_ci);
4231             }
4232             nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
4233             nbl_list->natpair_lj  = np_noq*nap;
4234             nbl_list->natpair_q   = np_hlj*nap/2;
4235
4236             if (CombineNBLists && nnbl > 1)
4237             {
4238                 GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
4239                 NbnxnPairlistGpu **nbl = nbl_list->nblGpu;
4240
4241                 nbs_cycle_start(&nbs->cc[enbsCCcombine]);
4242
4243                 combine_nblists(nnbl-1, nbl+1, nbl[0]);
4244
4245                 nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
4246             }
4247         }
4248     }
4249
4250     if (nbl_list->bSimple)
4251     {
4252         if (nnbl > 1 && checkRebalanceSimpleLists(nbl_list))
4253         {
4254             rebalanceSimpleLists(nbl_list->nnbl, nbl_list->nbl, nbl_list->nbl_work, nbs->work);
4255
4256             /* Swap the pointer of the sets of pair lists */
4257             NbnxnPairlistCpu **tmp = nbl_list->nbl;
4258             nbl_list->nbl          = nbl_list->nbl_work;
4259             nbl_list->nbl_work     = tmp;
4260         }
4261     }
4262     else
4263     {
4264         /* Sort the entries on size, large ones first */
4265         if (CombineNBLists || nnbl == 1)
4266         {
4267             sort_sci(nbl_list->nblGpu[0]);
4268         }
4269         else
4270         {
4271 #pragma omp parallel for num_threads(nnbl) schedule(static)
4272             for (int th = 0; th < nnbl; th++)
4273             {
4274                 try
4275                 {
4276                     sort_sci(nbl_list->nblGpu[th]);
4277                 }
4278                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4279             }
4280         }
4281     }
4282
4283     if (nbat->bUseBufferFlags)
4284     {
4285         reduce_buffer_flags(nbs, nbl_list->nnbl, &nbat->buffer_flags);
4286     }
4287
4288     if (nbs->gridSet().haveFep())
4289     {
4290         /* Balance the free-energy lists over all the threads */
4291         balance_fep_lists(nbs, nbl_list);
4292     }
4293
4294     if (nbl_list->bSimple)
4295     {
4296         /* This is a fresh list, so not pruned, stored using ci.
4297          * ciOuter is invalid at this point.
4298          */
4299         GMX_ASSERT(nbl_list->nbl[0]->ciOuter.empty(), "ciOuter is invalid so it should be empty");
4300     }
4301
4302     if (iLocality == Nbnxm::InteractionLocality::Local)
4303     {
4304         outerListCreationStep_ = step;
4305     }
4306     else
4307     {
4308         GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
4309                            "Outer list should be created at the same step as the inner list");
4310     }
4311
4312     /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
4313     if (iLocality == InteractionLocality::Local)
4314     {
4315         nbs->search_count++;
4316     }
4317     if (nbs->print_cycles &&
4318         (!nbs->domainSetup().haveDomDec || iLocality == InteractionLocality::NonLocal) &&
4319         nbs->search_count % 100 == 0)
4320     {
4321         nbs_cycle_print(stderr, nbs);
4322     }
4323
4324     /* If we have more than one list, they either got rebalancing (CPU)
4325      * or combined (GPU), so we should dump the final result to debug.
4326      */
4327     if (debug && nbl_list->nnbl > 1)
4328     {
4329         if (nbl_list->bSimple)
4330         {
4331             for (int t = 0; t < nbl_list->nnbl; t++)
4332             {
4333                 print_nblist_statistics(debug, nbl_list->nbl[t], nbs, rlist);
4334             }
4335         }
4336         else
4337         {
4338             print_nblist_statistics(debug, nbl_list->nblGpu[0], nbs, rlist);
4339         }
4340     }
4341
4342     if (debug)
4343     {
4344         if (gmx_debug_at)
4345         {
4346             if (nbl_list->bSimple)
4347             {
4348                 for (int t = 0; t < nbl_list->nnbl; t++)
4349                 {
4350                     print_nblist_ci_cj(debug, nbl_list->nbl[t]);
4351                 }
4352             }
4353             else
4354             {
4355                 print_nblist_sci_cj(debug, nbl_list->nblGpu[0]);
4356             }
4357         }
4358
4359         if (nbat->bUseBufferFlags)
4360         {
4361             print_reduction_cost(&nbat->buffer_flags, nbl_list->nnbl);
4362         }
4363     }
4364
4365     if (params_.useDynamicPruning && nbl_list->bSimple)
4366     {
4367         nbnxnPrepareListForDynamicPruning(nbl_list);
4368     }
4369 }
4370
4371 void
4372 nonbonded_verlet_t::constructPairlist(const Nbnxm::InteractionLocality  iLocality,
4373                                       const t_blocka                   *excl,
4374                                       int64_t                           step,
4375                                       t_nrnb                           *nrnb)
4376 {
4377     pairlistSets_->construct(iLocality, nbs.get(), nbat.get(), excl,
4378                              kernelSetup_.kernelType,
4379                              step, nrnb);
4380
4381     if (useGpu())
4382     {
4383         /* Launch the transfer of the pairlist to the GPU.
4384          *
4385          * NOTE: The launch overhead is currently not timed separately
4386          */
4387         Nbnxm::gpu_init_pairlist(gpu_nbv,
4388                                  pairlistSets().pairlistSet(iLocality).nblGpu[0],
4389                                  iLocality);
4390     }
4391 }
4392
4393 void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet)
4394 {
4395     GMX_RELEASE_ASSERT(listSet->bSimple, "Should only be called for simple lists");
4396
4397     /* TODO: Restructure the lists so we have actual outer and inner
4398      *       list objects so we can set a single pointer instead of
4399      *       swapping several pointers.
4400      */
4401
4402     for (int i = 0; i < listSet->nnbl; i++)
4403     {
4404         NbnxnPairlistCpu &list = *listSet->nbl[i];
4405
4406         /* The search produced a list in ci/cj.
4407          * Swap the list pointers so we get the outer list is ciOuter,cjOuter
4408          * and we can prune that to get an inner list in ci/cj.
4409          */
4410         GMX_RELEASE_ASSERT(list.ciOuter.empty() && list.cjOuter.empty(),
4411                            "The outer lists should be empty before preparation");
4412
4413         std::swap(list.ci, list.ciOuter);
4414         std::swap(list.cj, list.cjOuter);
4415     }
4416 }