src/gromacs/mdlib/nbnxn_search.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "nbnxn_search.h"
  39
  40 #include "config.h"
  41
  42 #include <cassert>
  43 #include <cmath>
  44 #include <cstring>
  45
  46 #include <algorithm>
  47
  48 #include "gromacs/domdec/domdec_struct.h"
  49 #include "gromacs/gmxlib/nrnb.h"
  50 #include "gromacs/math/functions.h"
  51 #include "gromacs/math/utilities.h"
  52 #include "gromacs/math/vec.h"
  53 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  54 #include "gromacs/mdlib/nb_verlet.h"
  55 #include "gromacs/mdlib/nbnxn_atomdata.h"
  56 #include "gromacs/mdlib/nbnxn_consts.h"
  57 #include "gromacs/mdlib/nbnxn_grid.h"
  58 #include "gromacs/mdlib/nbnxn_internal.h"
  59 #include "gromacs/mdlib/nbnxn_simd.h"
  60 #include "gromacs/mdlib/nbnxn_util.h"
  61 #include "gromacs/mdlib/ns.h"
  62 #include "gromacs/mdtypes/group.h"
  63 #include "gromacs/mdtypes/md_enums.h"
  64 #include "gromacs/pbcutil/ishift.h"
  65 #include "gromacs/pbcutil/pbc.h"
  66 #include "gromacs/simd/simd.h"
  67 #include "gromacs/simd/vector_operations.h"
  68 #include "gromacs/topology/block.h"
  69 #include "gromacs/utility/exceptions.h"
  70 #include "gromacs/utility/fatalerror.h"
  71 #include "gromacs/utility/gmxomp.h"
  72 #include "gromacs/utility/smalloc.h"
  73
  74 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
  75
  76
  77 /* We shift the i-particles backward for PBC.
  78  * This leads to more conditionals than shifting forward.
  79  * We do this to get more balanced pair lists.
  80  */
  81 constexpr bool c_pbcShiftBackward = true;
  82
  83
  84 static void nbs_cycle_clear(nbnxn_cycle_t *cc)
  85 {
  86     for (int i = 0; i < enbsCCnr; i++)
  87     {
  88         cc[i].count = 0;
  89         cc[i].c     = 0;
  90     }
  91 }
  92
  93 static double Mcyc_av(const nbnxn_cycle_t *cc)
  94 {
  95     return static_cast<double>(cc->c)*1e-6/cc->count;
  96 }
  97
  98 static void nbs_cycle_print(FILE *fp, const nbnxn_search *nbs)
  99 {
 100     fprintf(fp, "\n");
 101     fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
 102             nbs->cc[enbsCCgrid].count,
 103             Mcyc_av(&nbs->cc[enbsCCgrid]),
 104             Mcyc_av(&nbs->cc[enbsCCsearch]),
 105             Mcyc_av(&nbs->cc[enbsCCreducef]));
 106
 107     if (nbs->work.size() > 1)
 108     {
 109         if (nbs->cc[enbsCCcombine].count > 0)
 110         {
 111             fprintf(fp, " comb %5.2f",
 112                     Mcyc_av(&nbs->cc[enbsCCcombine]));
 113         }
 114         fprintf(fp, " s. th");
 115         for (const nbnxn_search_work_t &work : nbs->work)
 116         {
 117             fprintf(fp, " %4.1f",
 118                     Mcyc_av(&work.cc[enbsCCsearch]));
 119         }
 120     }
 121     fprintf(fp, "\n");
 122 }
 123
 124 /* Layout for the nonbonded NxN pair lists */
 125 enum class NbnxnLayout
 126 {
 127     NoSimd4x4, // i-cluster size 4, j-cluster size 4
 128     Simd4xN,   // i-cluster size 4, j-cluster size SIMD width
 129     Simd2xNN,  // i-cluster size 4, j-cluster size half SIMD width
 130     Gpu8x8x8   // i-cluster size 8, j-cluster size 8 + super-clustering
 131 };
 132
 133 #if GMX_SIMD
 134 /* Returns the j-cluster size */
 135 template <NbnxnLayout layout>
 136 static constexpr int jClusterSize()
 137 {
 138     static_assert(layout == NbnxnLayout::NoSimd4x4 || layout == NbnxnLayout::Simd4xN || layout == NbnxnLayout::Simd2xNN, "Currently jClusterSize only supports CPU layouts");
 139
 140     return layout == NbnxnLayout::Simd4xN ? GMX_SIMD_REAL_WIDTH : (layout == NbnxnLayout::Simd2xNN ? GMX_SIMD_REAL_WIDTH/2 : c_nbnxnCpuIClusterSize);
 141 }
 142
 143 /*! \brief Returns the j-cluster index given the i-cluster index.
 144  *
 145  * \tparam    jClusterSize      The number of atoms in a j-cluster
 146  * \tparam    jSubClusterIndex  The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
 147  * \param[in] ci                The i-cluster index
 148  */
 149 template <int jClusterSize, int jSubClusterIndex>
 150 static inline int cjFromCi(int ci)
 151 {
 152     static_assert(jClusterSize == c_nbnxnCpuIClusterSize/2 || jClusterSize == c_nbnxnCpuIClusterSize || jClusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
 153
 154     static_assert(jSubClusterIndex == 0 || jSubClusterIndex == 1,
 155                   "Only sub-cluster indices 0 and 1 are supported");
 156
 157     if (jClusterSize == c_nbnxnCpuIClusterSize/2)
 158     {
 159         if (jSubClusterIndex == 0)
 160         {
 161             return ci << 1;
 162         }
 163         else
 164         {
 165             return ((ci + 1) << 1) - 1;
 166         }
 167     }
 168     else if (jClusterSize == c_nbnxnCpuIClusterSize)
 169     {
 170         return ci;
 171     }
 172     else
 173     {
 174         return ci >> 1;
 175     }
 176 }
 177
 178 /*! \brief Returns the j-cluster index given the i-cluster index.
 179  *
 180  * \tparam    layout            The pair-list layout
 181  * \tparam    jSubClusterIndex  The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
 182  * \param[in] ci                The i-cluster index
 183  */
 184 template <NbnxnLayout layout, int jSubClusterIndex>
 185 static inline int cjFromCi(int ci)
 186 {
 187     constexpr int clusterSize = jClusterSize<layout>();
 188
 189     return cjFromCi<clusterSize, jSubClusterIndex>(ci);
 190 }
 191
 192 /* Returns the nbnxn coordinate data index given the i-cluster index */
 193 template <NbnxnLayout layout>
 194 static inline int xIndexFromCi(int ci)
 195 {
 196     constexpr int clusterSize = jClusterSize<layout>();
 197
 198     static_assert(clusterSize == c_nbnxnCpuIClusterSize/2 || clusterSize == c_nbnxnCpuIClusterSize || clusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
 199
 200     if (clusterSize <= c_nbnxnCpuIClusterSize)
 201     {
 202         /* Coordinates are stored packed in groups of 4 */
 203         return ci*STRIDE_P4;
 204     }
 205     else
 206     {
 207         /* Coordinates packed in 8, i-cluster size is half the packing width */
 208         return (ci >> 1)*STRIDE_P8 + (ci & 1)*(c_packX8 >> 1);
 209     }
 210 }
 211
 212 /* Returns the nbnxn coordinate data index given the j-cluster index */
 213 template <NbnxnLayout layout>
 214 static inline int xIndexFromCj(int cj)
 215 {
 216     constexpr int clusterSize = jClusterSize<layout>();
 217
 218     static_assert(clusterSize == c_nbnxnCpuIClusterSize/2 || clusterSize == c_nbnxnCpuIClusterSize || clusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
 219
 220     if (clusterSize == c_nbnxnCpuIClusterSize/2)
 221     {
 222         /* Coordinates are stored packed in groups of 4 */
 223         return (cj >> 1)*STRIDE_P4 + (cj & 1)*(c_packX4 >> 1);
 224     }
 225     else if (clusterSize == c_nbnxnCpuIClusterSize)
 226     {
 227         /* Coordinates are stored packed in groups of 4 */
 228         return cj*STRIDE_P4;
 229     }
 230     else
 231     {
 232         /* Coordinates are stored packed in groups of 8 */
 233         return cj*STRIDE_P8;
 234     }
 235 }
 236 #endif //GMX_SIMD
 237
 238 gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
 239 {
 240     if (nb_kernel_type == nbnxnkNotSet)
 241     {
 242         gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
 243     }
 244
 245     switch (nb_kernel_type)
 246     {
 247         case nbnxnk8x8x8_GPU:
 248         case nbnxnk8x8x8_PlainC:
 249             return FALSE;
 250
 251         case nbnxnk4x4_PlainC:
 252         case nbnxnk4xN_SIMD_4xN:
 253         case nbnxnk4xN_SIMD_2xNN:
 254             return TRUE;
 255
 256         default:
 257             gmx_incons("Invalid nonbonded kernel type passed!");
 258             return FALSE;
 259     }
 260 }
 261
 262 /* Initializes a single nbnxn_pairlist_t data structure */
 263 static void nbnxn_init_pairlist_fep(t_nblist *nl)
 264 {
 265     nl->type        = GMX_NBLIST_INTERACTION_FREE_ENERGY;
 266     nl->igeometry   = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
 267     /* The interaction functions are set in the free energy kernel fuction */
 268     nl->ivdw        = -1;
 269     nl->ivdwmod     = -1;
 270     nl->ielec       = -1;
 271     nl->ielecmod    = -1;
 272
 273     nl->maxnri      = 0;
 274     nl->maxnrj      = 0;
 275     nl->nri         = 0;
 276     nl->nrj         = 0;
 277     nl->iinr        = nullptr;
 278     nl->gid         = nullptr;
 279     nl->shift       = nullptr;
 280     nl->jindex      = nullptr;
 281     nl->jjnr        = nullptr;
 282     nl->excl_fep    = nullptr;
 283
 284 }
 285
 286 static void free_nblist(t_nblist *nl)
 287 {
 288     sfree(nl->iinr);
 289     sfree(nl->gid);
 290     sfree(nl->shift);
 291     sfree(nl->jindex);
 292     sfree(nl->jjnr);
 293     sfree(nl->excl_fep);
 294 }
 295
 296 nbnxn_search_work_t::nbnxn_search_work_t() :
 297     cp0({{0}}
 298         ),
 299     buffer_flags({0, nullptr, 0}),
 300     ndistc(0),
 301     nbl_fep(new t_nblist),
 302     cp1({{0}})
 303 {
 304     nbnxn_init_pairlist_fep(nbl_fep.get());
 305
 306     nbs_cycle_clear(cc);
 307 }
 308
 309 nbnxn_search_work_t::~nbnxn_search_work_t()
 310 {
 311     sfree(buffer_flags.flag);
 312
 313     free_nblist(nbl_fep.get());
 314 }
 315
 316 nbnxn_search::nbnxn_search(const ivec               *n_dd_cells,
 317                            const gmx_domdec_zones_t *zones,
 318                            gmx_bool                  bFEP,
 319                            int                       nthread_max) :
 320     bFEP(bFEP),
 321     ePBC(epbcNONE), // The correct value will be set during the gridding
 322     zones(zones),
 323     natoms_local(0),
 324     natoms_nonlocal(0),
 325     search_count(0),
 326     work(nthread_max)
 327 {
 328     // The correct value will be set during the gridding
 329     clear_mat(box);
 330     clear_ivec(dd_dim);
 331     int numGrids = 1;
 332     DomDec = n_dd_cells != nullptr;
 333     if (DomDec)
 334     {
 335         for (int d = 0; d < DIM; d++)
 336         {
 337             if ((*n_dd_cells)[d] > 1)
 338             {
 339                 dd_dim[d] = 1;
 340                 /* Each grid matches a DD zone */
 341                 numGrids *= 2;
 342             }
 343         }
 344     }
 345
 346     grid.resize(numGrids);
 347
 348     /* Initialize detailed nbsearch cycle counting */
 349     print_cycles = (getenv("GMX_NBNXN_CYCLE") != nullptr);
 350     nbs_cycle_clear(cc);
 351 }
 352
 353 nbnxn_search *nbnxn_init_search(const ivec                *n_dd_cells,
 354                                 const gmx_domdec_zones_t  *zones,
 355                                 gmx_bool                   bFEP,
 356                                 int                        nthread_max)
 357 {
 358     return new nbnxn_search(n_dd_cells, zones, bFEP, nthread_max);
 359 }
 360
 361 static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
 362                               int                   natoms)
 363 {
 364     flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
 365     if (flags->nflag > flags->flag_nalloc)
 366     {
 367         flags->flag_nalloc = over_alloc_large(flags->nflag);
 368         srenew(flags->flag, flags->flag_nalloc);
 369     }
 370     for (int b = 0; b < flags->nflag; b++)
 371     {
 372         bitmask_clear(&(flags->flag[b]));
 373     }
 374 }
 375
 376 /* Returns the pair-list cutoff between a bounding box and a grid cell given an atom-to-atom pair-list cutoff
 377  *
 378  * Given a cutoff distance between atoms, this functions returns the cutoff
 379  * distance2 between a bounding box of a group of atoms and a grid cell.
 380  * Since atoms can be geometrically outside of the cell they have been
 381  * assigned to (when atom groups instead of individual atoms are assigned
 382  * to cells), this distance returned can be larger than the input.
 383  */
 384 static real listRangeForBoundingBoxToGridCell(real                rlist,
 385                                               const nbnxn_grid_t &grid)
 386 {
 387     return rlist + grid.maxAtomGroupRadius;
 388
 389 }
 390 /* Returns the pair-list cutoff between a grid cells given an atom-to-atom pair-list cutoff
 391  *
 392  * Given a cutoff distance between atoms, this functions returns the cutoff
 393  * distance2 between two grid cells.
 394  * Since atoms can be geometrically outside of the cell they have been
 395  * assigned to (when atom groups instead of individual atoms are assigned
 396  * to cells), this distance returned can be larger than the input.
 397  */
 398 static real listRangeForGridCellToGridCell(real                rlist,
 399                                            const nbnxn_grid_t &iGrid,
 400                                            const nbnxn_grid_t &jGrid)
 401 {
 402     return rlist + iGrid.maxAtomGroupRadius + jGrid.maxAtomGroupRadius;
 403 }
 404
 405 /* Determines the cell range along one dimension that
 406  * the bounding box b0 - b1 sees.
 407  */
 408 template<int dim>
 409 static void get_cell_range(real b0, real b1,
 410                            const nbnxn_grid_t &jGrid,
 411                            real d2, real rlist, int *cf, int *cl)
 412 {
 413     real listRangeBBToCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGrid));
 414     real distanceInCells    = (b0 - jGrid.c0[dim])*jGrid.invCellSize[dim];
 415     *cf                     = std::max(static_cast<int>(distanceInCells), 0);
 416
 417     while (*cf > 0 &&
 418            d2 + gmx::square((b0 - jGrid.c0[dim]) - (*cf - 1 + 1)*jGrid.cellSize[dim]) < listRangeBBToCell2)
 419     {
 420         (*cf)--;
 421     }
 422
 423     *cl = std::min(static_cast<int>((b1 - jGrid.c0[dim])*jGrid.invCellSize[dim]), jGrid.numCells[dim] - 1);
 424     while (*cl < jGrid.numCells[dim] - 1 &&
 425            d2 + gmx::square((*cl + 1)*jGrid.cellSize[dim] - (b1 - jGrid.c0[dim])) < listRangeBBToCell2)
 426     {
 427         (*cl)++;
 428     }
 429 }
 430
 431 /* Reference code calculating the distance^2 between two bounding boxes */
 432 /*
 433    static float box_dist2(float bx0, float bx1, float by0,
 434                        float by1, float bz0, float bz1,
 435                        const nbnxn_bb_t *bb)
 436    {
 437     float d2;
 438     float dl, dh, dm, dm0;
 439
 440     d2 = 0;
 441
 442     dl  = bx0 - bb->upper[BB_X];
 443     dh  = bb->lower[BB_X] - bx1;
 444     dm  = std::max(dl, dh);
 445     dm0 = std::max(dm, 0.0f);
 446     d2 += dm0*dm0;
 447
 448     dl  = by0 - bb->upper[BB_Y];
 449     dh  = bb->lower[BB_Y] - by1;
 450     dm  = std::max(dl, dh);
 451     dm0 = std::max(dm, 0.0f);
 452     d2 += dm0*dm0;
 453
 454     dl  = bz0 - bb->upper[BB_Z];
 455     dh  = bb->lower[BB_Z] - bz1;
 456     dm  = std::max(dl, dh);
 457     dm0 = std::max(dm, 0.0f);
 458     d2 += dm0*dm0;
 459
 460     return d2;
 461    }
 462  */
 463
 464 /* Plain C code calculating the distance^2 between two bounding boxes */
 465 static float subc_bb_dist2(int                              si,
 466                            const nbnxn_bb_t                *bb_i_ci,
 467                            int                              csj,
 468                            gmx::ArrayRef<const nbnxn_bb_t>  bb_j_all)
 469 {
 470     const nbnxn_bb_t *bb_i = bb_i_ci         +  si;
 471     const nbnxn_bb_t *bb_j = bb_j_all.data() + csj;
 472
 473     float             d2 = 0;
 474     float             dl, dh, dm, dm0;
 475
 476     dl  = bb_i->lower[BB_X] - bb_j->upper[BB_X];
 477     dh  = bb_j->lower[BB_X] - bb_i->upper[BB_X];
 478     dm  = std::max(dl, dh);
 479     dm0 = std::max(dm, 0.0f);
 480     d2 += dm0*dm0;
 481
 482     dl  = bb_i->lower[BB_Y] - bb_j->upper[BB_Y];
 483     dh  = bb_j->lower[BB_Y] - bb_i->upper[BB_Y];
 484     dm  = std::max(dl, dh);
 485     dm0 = std::max(dm, 0.0f);
 486     d2 += dm0*dm0;
 487
 488     dl  = bb_i->lower[BB_Z] - bb_j->upper[BB_Z];
 489     dh  = bb_j->lower[BB_Z] - bb_i->upper[BB_Z];
 490     dm  = std::max(dl, dh);
 491     dm0 = std::max(dm, 0.0f);
 492     d2 += dm0*dm0;
 493
 494     return d2;
 495 }
 496
 497 #if NBNXN_SEARCH_BB_SIMD4
 498
 499 /* 4-wide SIMD code for bb distance for bb format xyz0 */
 500 static float subc_bb_dist2_simd4(int                              si,
 501                                  const nbnxn_bb_t                *bb_i_ci,
 502                                  int                              csj,
 503                                  gmx::ArrayRef<const nbnxn_bb_t>  bb_j_all)
 504 {
 505     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 506     using namespace gmx;
 507
 508     Simd4Float bb_i_S0, bb_i_S1;
 509     Simd4Float bb_j_S0, bb_j_S1;
 510     Simd4Float dl_S;
 511     Simd4Float dh_S;
 512     Simd4Float dm_S;
 513     Simd4Float dm0_S;
 514
 515     bb_i_S0 = load4(&bb_i_ci[si].lower[0]);
 516     bb_i_S1 = load4(&bb_i_ci[si].upper[0]);
 517     bb_j_S0 = load4(&bb_j_all[csj].lower[0]);
 518     bb_j_S1 = load4(&bb_j_all[csj].upper[0]);
 519
 520     dl_S    = bb_i_S0 - bb_j_S1;
 521     dh_S    = bb_j_S0 - bb_i_S1;
 522
 523     dm_S    = max(dl_S, dh_S);
 524     dm0_S   = max(dm_S, simd4SetZeroF());
 525
 526     return dotProduct(dm0_S, dm0_S);
 527 }
 528
 529 /* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2 */
 530 #define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \
 531     {                                                \
 532         int               shi;                                  \
 533                                                  \
 534         Simd4Float        dx_0, dy_0, dz_0;                    \
 535         Simd4Float        dx_1, dy_1, dz_1;                    \
 536                                                  \
 537         Simd4Float        mx, my, mz;                          \
 538         Simd4Float        m0x, m0y, m0z;                       \
 539                                                  \
 540         Simd4Float        d2x, d2y, d2z;                       \
 541         Simd4Float        d2s, d2t;                            \
 542                                                  \
 543         shi = (si)*NNBSBB_D*DIM;                       \
 544                                                  \
 545         xi_l = load4((bb_i)+shi+0*STRIDE_PBB);   \
 546         yi_l = load4((bb_i)+shi+1*STRIDE_PBB);   \
 547         zi_l = load4((bb_i)+shi+2*STRIDE_PBB);   \
 548         xi_h = load4((bb_i)+shi+3*STRIDE_PBB);   \
 549         yi_h = load4((bb_i)+shi+4*STRIDE_PBB);   \
 550         zi_h = load4((bb_i)+shi+5*STRIDE_PBB);   \
 551                                                  \
 552         dx_0 = xi_l - xj_h;                 \
 553         dy_0 = yi_l - yj_h;                 \
 554         dz_0 = zi_l - zj_h;                 \
 555                                                  \
 556         dx_1 = xj_l - xi_h;                 \
 557         dy_1 = yj_l - yi_h;                 \
 558         dz_1 = zj_l - zi_h;                 \
 559                                                  \
 560         mx   = max(dx_0, dx_1);                 \
 561         my   = max(dy_0, dy_1);                 \
 562         mz   = max(dz_0, dz_1);                 \
 563                                                  \
 564         m0x  = max(mx, zero);                   \
 565         m0y  = max(my, zero);                   \
 566         m0z  = max(mz, zero);                   \
 567                                                  \
 568         d2x  = m0x * m0x;                   \
 569         d2y  = m0y * m0y;                   \
 570         d2z  = m0z * m0z;                   \
 571                                                  \
 572         d2s  = d2x + d2y;                   \
 573         d2t  = d2s + d2z;                   \
 574                                                  \
 575         store4((d2)+(si), d2t);                      \
 576     }
 577
 578 /* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
 579 static void subc_bb_dist2_simd4_xxxx(const float *bb_j,
 580                                      int nsi, const float *bb_i,
 581                                      float *d2)
 582 {
 583     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 584     using namespace gmx;
 585
 586     Simd4Float xj_l, yj_l, zj_l;
 587     Simd4Float xj_h, yj_h, zj_h;
 588     Simd4Float xi_l, yi_l, zi_l;
 589     Simd4Float xi_h, yi_h, zi_h;
 590
 591     Simd4Float zero;
 592
 593     zero = setZero();
 594
 595     xj_l = Simd4Float(bb_j[0*STRIDE_PBB]);
 596     yj_l = Simd4Float(bb_j[1*STRIDE_PBB]);
 597     zj_l = Simd4Float(bb_j[2*STRIDE_PBB]);
 598     xj_h = Simd4Float(bb_j[3*STRIDE_PBB]);
 599     yj_h = Simd4Float(bb_j[4*STRIDE_PBB]);
 600     zj_h = Simd4Float(bb_j[5*STRIDE_PBB]);
 601
 602     /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
 603      * But as we know the number of iterations is 1 or 2, we unroll manually.
 604      */
 605     SUBC_BB_DIST2_SIMD4_XXXX_INNER(0, bb_i, d2);
 606     if (STRIDE_PBB < nsi)
 607     {
 608         SUBC_BB_DIST2_SIMD4_XXXX_INNER(STRIDE_PBB, bb_i, d2);
 609     }
 610 }
 611
 612 #endif /* NBNXN_SEARCH_BB_SIMD4 */
 613
 614
 615 /* Returns if any atom pair from two clusters is within distance sqrt(rlist2) */
 616 static inline gmx_bool
 617 clusterpair_in_range(const NbnxnPairlistGpuWork &work,
 618                      int si,
 619                      int csj, int stride, const real *x_j,
 620                      real rlist2)
 621 {
 622 #if !GMX_SIMD4_HAVE_REAL
 623
 624     /* Plain C version.
 625      * All coordinates are stored as xyzxyz...
 626      */
 627
 628     const real *x_i = work.iSuperClusterData.x.data();
 629
 630     for (int i = 0; i < c_nbnxnGpuClusterSize; i++)
 631     {
 632         int i0 = (si*c_nbnxnGpuClusterSize + i)*DIM;
 633         for (int j = 0; j < c_nbnxnGpuClusterSize; j++)
 634         {
 635             int  j0 = (csj*c_nbnxnGpuClusterSize + j)*stride;
 636
 637             real d2 = gmx::square(x_i[i0  ] - x_j[j0  ]) + gmx::square(x_i[i0+1] - x_j[j0+1]) + gmx::square(x_i[i0+2] - x_j[j0+2]);
 638
 639             if (d2 < rlist2)
 640             {
 641                 return TRUE;
 642             }
 643         }
 644     }
 645
 646     return FALSE;
 647
 648 #else /* !GMX_SIMD4_HAVE_REAL */
 649
 650     /* 4-wide SIMD version.
 651      * The coordinates x_i are stored as xxxxyyyy..., x_j is stored xyzxyz...
 652      * Using 8-wide AVX(2) is not faster on Intel Sandy Bridge and Haswell.
 653      */
 654     static_assert(c_nbnxnGpuClusterSize == 8 || c_nbnxnGpuClusterSize == 4,
 655                   "A cluster is hard-coded to 4/8 atoms.");
 656
 657     Simd4Real   rc2_S      = Simd4Real(rlist2);
 658
 659     const real *x_i        = work.iSuperClusterData.xSimd.data();
 660
 661     int         dim_stride = c_nbnxnGpuClusterSize*DIM;
 662     Simd4Real   ix_S0      = load4(x_i + si*dim_stride + 0*GMX_SIMD4_WIDTH);
 663     Simd4Real   iy_S0      = load4(x_i + si*dim_stride + 1*GMX_SIMD4_WIDTH);
 664     Simd4Real   iz_S0      = load4(x_i + si*dim_stride + 2*GMX_SIMD4_WIDTH);
 665
 666     Simd4Real   ix_S1, iy_S1, iz_S1;
 667     if (c_nbnxnGpuClusterSize == 8)
 668     {
 669         ix_S1      = load4(x_i + si*dim_stride + 3*GMX_SIMD4_WIDTH);
 670         iy_S1      = load4(x_i + si*dim_stride + 4*GMX_SIMD4_WIDTH);
 671         iz_S1      = load4(x_i + si*dim_stride + 5*GMX_SIMD4_WIDTH);
 672     }
 673     /* We loop from the outer to the inner particles to maximize
 674      * the chance that we find a pair in range quickly and return.
 675      */
 676     int j0 = csj*c_nbnxnGpuClusterSize;
 677     int j1 = j0 + c_nbnxnGpuClusterSize - 1;
 678     while (j0 < j1)
 679     {
 680         Simd4Real jx0_S, jy0_S, jz0_S;
 681         Simd4Real jx1_S, jy1_S, jz1_S;
 682
 683         Simd4Real dx_S0, dy_S0, dz_S0;
 684         Simd4Real dx_S1, dy_S1, dz_S1;
 685         Simd4Real dx_S2, dy_S2, dz_S2;
 686         Simd4Real dx_S3, dy_S3, dz_S3;
 687
 688         Simd4Real rsq_S0;
 689         Simd4Real rsq_S1;
 690         Simd4Real rsq_S2;
 691         Simd4Real rsq_S3;
 692
 693         Simd4Bool wco_S0;
 694         Simd4Bool wco_S1;
 695         Simd4Bool wco_S2;
 696         Simd4Bool wco_S3;
 697         Simd4Bool wco_any_S01, wco_any_S23, wco_any_S;
 698
 699         jx0_S = Simd4Real(x_j[j0*stride+0]);
 700         jy0_S = Simd4Real(x_j[j0*stride+1]);
 701         jz0_S = Simd4Real(x_j[j0*stride+2]);
 702
 703         jx1_S = Simd4Real(x_j[j1*stride+0]);
 704         jy1_S = Simd4Real(x_j[j1*stride+1]);
 705         jz1_S = Simd4Real(x_j[j1*stride+2]);
 706
 707         /* Calculate distance */
 708         dx_S0            = ix_S0 - jx0_S;
 709         dy_S0            = iy_S0 - jy0_S;
 710         dz_S0            = iz_S0 - jz0_S;
 711         dx_S2            = ix_S0 - jx1_S;
 712         dy_S2            = iy_S0 - jy1_S;
 713         dz_S2            = iz_S0 - jz1_S;
 714         if (c_nbnxnGpuClusterSize == 8)
 715         {
 716             dx_S1            = ix_S1 - jx0_S;
 717             dy_S1            = iy_S1 - jy0_S;
 718             dz_S1            = iz_S1 - jz0_S;
 719             dx_S3            = ix_S1 - jx1_S;
 720             dy_S3            = iy_S1 - jy1_S;
 721             dz_S3            = iz_S1 - jz1_S;
 722         }
 723
 724         /* rsq = dx*dx+dy*dy+dz*dz */
 725         rsq_S0           = norm2(dx_S0, dy_S0, dz_S0);
 726         rsq_S2           = norm2(dx_S2, dy_S2, dz_S2);
 727         if (c_nbnxnGpuClusterSize == 8)
 728         {
 729             rsq_S1           = norm2(dx_S1, dy_S1, dz_S1);
 730             rsq_S3           = norm2(dx_S3, dy_S3, dz_S3);
 731         }
 732
 733         wco_S0           = (rsq_S0 < rc2_S);
 734         wco_S2           = (rsq_S2 < rc2_S);
 735         if (c_nbnxnGpuClusterSize == 8)
 736         {
 737             wco_S1           = (rsq_S1 < rc2_S);
 738             wco_S3           = (rsq_S3 < rc2_S);
 739         }
 740         if (c_nbnxnGpuClusterSize == 8)
 741         {
 742             wco_any_S01      = wco_S0 || wco_S1;
 743             wco_any_S23      = wco_S2 || wco_S3;
 744             wco_any_S        = wco_any_S01 || wco_any_S23;
 745         }
 746         else
 747         {
 748             wco_any_S = wco_S0 || wco_S2;
 749         }
 750
 751         if (anyTrue(wco_any_S))
 752         {
 753             return TRUE;
 754         }
 755
 756         j0++;
 757         j1--;
 758     }
 759
 760     return FALSE;
 761
 762 #endif /* !GMX_SIMD4_HAVE_REAL */
 763 }
 764
 765 /* Returns the j-cluster index for index cjIndex in a cj list */
 766 static inline int nblCj(const nbnxn_cj_t *cjList, int cjIndex)
 767 {
 768     return cjList[cjIndex].cj;
 769 }
 770
 771 /* Returns the j-cluster index for index cjIndex in a cj4 list */
 772 static inline int nblCj(const nbnxn_cj4_t *cj4List, int cjIndex)
 773 {
 774     return cj4List[cjIndex/c_nbnxnGpuJgroupSize].cj[cjIndex & (c_nbnxnGpuJgroupSize - 1)];
 775 }
 776
 777 /* Returns the i-interaction mask of the j sub-cell for index cj_ind */
 778 static unsigned int nbl_imask0(const NbnxnPairlistGpu *nbl, int cj_ind)
 779 {
 780     return nbl->cj4[cj_ind/c_nbnxnGpuJgroupSize].imei[0].imask;
 781 }
 782
 783 /* Ensures there is enough space for extra extra exclusion masks */
 784 static void check_excl_space(NbnxnPairlistGpu *nbl, int extra)
 785 {
 786     if (nbl->nexcl+extra > nbl->excl_nalloc)
 787     {
 788         nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
 789         nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->excl),
 790                            nbl->nexcl*sizeof(*nbl->excl),
 791                            nbl->excl_nalloc*sizeof(*nbl->excl),
 792                            nbl->alloc, nbl->free);
 793     }
 794 }
 795
 796 /* Ensures there is enough space for ncell extra j-cells in the list */
 797 static void check_cell_list_space(NbnxnPairlistCpu gmx_unused *nbl,
 798                                   int gmx_unused               ncell)
 799 {
 800 }
 801
 802 /* Ensures there is enough space for ncell extra j-cells in the list */
 803 static void check_cell_list_space(NbnxnPairlistGpu *nbl,
 804                                   int               ncell)
 805 {
 806     int ncj4_max, w;
 807
 808     /* We can have maximally nsupercell*c_gpuNumClusterPerCell sj lists */
 809     /* We can store 4 j-subcell - i-supercell pairs in one struct.
 810      * since we round down, we need one extra entry.
 811      */
 812     ncj4_max = ((nbl->work->cj_ind + ncell*c_gpuNumClusterPerCell + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize);
 813
 814     if (ncj4_max > nbl->cj4_nalloc)
 815     {
 816         nbl->cj4_nalloc = over_alloc_small(ncj4_max);
 817         nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->cj4),
 818                            nbl->work->cj4_init*sizeof(*nbl->cj4),
 819                            nbl->cj4_nalloc*sizeof(*nbl->cj4),
 820                            nbl->alloc, nbl->free);
 821     }
 822
 823     if (ncj4_max > nbl->work->cj4_init)
 824     {
 825         for (int j4 = nbl->work->cj4_init; j4 < ncj4_max; j4++)
 826         {
 827             /* No i-subcells and no excl's in the list initially */
 828             for (w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
 829             {
 830                 nbl->cj4[j4].imei[w].imask    = 0U;
 831                 nbl->cj4[j4].imei[w].excl_ind = 0;
 832
 833             }
 834         }
 835         nbl->work->cj4_init = ncj4_max;
 836     }
 837 }
 838
 839 /* Set all excl masks for one GPU warp no exclusions */
 840 static void set_no_excls(nbnxn_excl_t *excl)
 841 {
 842     for (int t = 0; t < c_nbnxnGpuExclSize; t++)
 843     {
 844         /* Turn all interaction bits on */
 845         excl->pair[t] = NBNXN_INTERACTION_MASK_ALL;
 846     }
 847 }
 848
 849 /* Initializes a single NbnxnPairlistCpu data structure */
 850 static void nbnxn_init_pairlist(NbnxnPairlistCpu *nbl)
 851 {
 852     nbl->na_ci       = c_nbnxnCpuIClusterSize;
 853     nbl->na_cj       = 0;
 854     nbl->ci.clear();
 855     nbl->ciOuter.clear();
 856     nbl->ncjInUse    = 0;
 857     nbl->cj.clear();
 858     nbl->cjOuter.clear();
 859     nbl->nci_tot     = 0;
 860
 861     nbl->work        = new NbnxnPairlistCpuWork();
 862 }
 863
 864 /* Initializes a single NbnxnPairlistGpu data structure */
 865 static void nbnxn_init_pairlist(NbnxnPairlistGpu *nbl,
 866                                 nbnxn_alloc_t    *alloc,
 867                                 nbnxn_free_t     *free)
 868 {
 869     if (alloc == nullptr)
 870     {
 871         nbl->alloc = nbnxn_alloc_aligned;
 872     }
 873     else
 874     {
 875         nbl->alloc = alloc;
 876     }
 877     if (free == nullptr)
 878     {
 879         nbl->free = nbnxn_free_aligned;
 880     }
 881     else
 882     {
 883         nbl->free = free;
 884     }
 885
 886     nbl->na_ci       = c_nbnxnGpuClusterSize;
 887     nbl->na_cj       = c_nbnxnGpuClusterSize;
 888     nbl->na_sc       = c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize;
 889     nbl->nsci        = 0;
 890     nbl->sci         = nullptr;
 891     nbl->sci_nalloc  = 0;
 892     nbl->ncj4        = 0;
 893     /* We need one element extra in sj, so alloc initially with 1 */
 894     nbl->cj4_nalloc  = 0;
 895     nbl->cj4         = nullptr;
 896     nbl->nci_tot     = 0;
 897
 898     GMX_ASSERT(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell, "The search code assumes that the a super-cluster matches a search grid cell");
 899
 900     GMX_ASSERT(sizeof(nbl->cj4[0].imei[0].imask)*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell, "The i super-cluster cluster interaction mask does not contain a sufficient number of bits");
 901     GMX_ASSERT(sizeof(nbl->excl[0])*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell, "The GPU exclusion mask does not contain a sufficient number of bits");
 902
 903     nbl->excl        = nullptr;
 904     nbl->excl_nalloc = 0;
 905     nbl->nexcl       = 0;
 906     check_excl_space(nbl, 1);
 907     nbl->nexcl       = 1;
 908     set_no_excls(&nbl->excl[0]);
 909
 910     nbl->work        = new NbnxnPairlistGpuWork();
 911 }
 912
 913 void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
 914                              gmx_bool bSimple, gmx_bool bCombined,
 915                              nbnxn_alloc_t *alloc,
 916                              nbnxn_free_t  *free)
 917 {
 918     GMX_RELEASE_ASSERT(!bSimple || !bCombined, "Can only combine non-simple lists");
 919
 920     nbl_list->bSimple   = bSimple;
 921     nbl_list->bCombined = bCombined;
 922
 923     nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
 924
 925     if (!nbl_list->bCombined &&
 926         nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
 927     {
 928         gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
 929                   nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
 930     }
 931
 932     if (bSimple)
 933     {
 934         snew(nbl_list->nbl, nbl_list->nnbl);
 935         if (nbl_list->nnbl > 1)
 936         {
 937             snew(nbl_list->nbl_work, nbl_list->nnbl);
 938         }
 939     }
 940     else
 941     {
 942         snew(nbl_list->nblGpu, nbl_list->nnbl);
 943     }
 944     snew(nbl_list->nbl_fep, nbl_list->nnbl);
 945     /* Execute in order to avoid memory interleaving between threads */
 946 #pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
 947     for (int i = 0; i < nbl_list->nnbl; i++)
 948     {
 949         try
 950         {
 951             /* Allocate the nblist data structure locally on each thread
 952              * to optimize memory access for NUMA architectures.
 953              */
 954             if (bSimple)
 955             {
 956                 nbl_list->nbl[i] = new NbnxnPairlistCpu();
 957
 958                 nbnxn_init_pairlist(nbl_list->nbl[i]);
 959                 if (nbl_list->nnbl > 1)
 960                 {
 961                     nbl_list->nbl_work[i] = new NbnxnPairlistCpu();
 962                     nbnxn_init_pairlist(nbl_list->nbl_work[i]);
 963                 }
 964             }
 965             else
 966             {
 967                 snew(nbl_list->nblGpu[i], 1);
 968
 969                 /* Only list 0 is used on the GPU, use normal allocation for i>0 */
 970                 if (i == 0)
 971                 {
 972                     nbnxn_init_pairlist(nbl_list->nblGpu[i], alloc, free);
 973                 }
 974                 else
 975                 {
 976                     nbnxn_init_pairlist(nbl_list->nblGpu[i], nullptr, nullptr);
 977                 }
 978             }
 979
 980             snew(nbl_list->nbl_fep[i], 1);
 981             nbnxn_init_pairlist_fep(nbl_list->nbl_fep[i]);
 982         }
 983         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 984     }
 985 }
 986
 987 /* Print statistics of a pair list, used for debug output */
 988 static void print_nblist_statistics(FILE *fp, const NbnxnPairlistCpu *nbl,
 989                                     const nbnxn_search *nbs, real rl)
 990 {
 991     const nbnxn_grid_t *grid;
 992     int                 cs[SHIFTS];
 993     int                 npexcl;
 994
 995     grid = &nbs->grid[0];
 996
 997     fprintf(fp, "nbl nci %zu ncj %d\n",
 998             nbl->ci.size(), nbl->ncjInUse);
 999     fprintf(fp, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
1000             nbl->na_cj, rl, nbl->ncjInUse, nbl->ncjInUse/static_cast<double>(grid->nc),
1001             nbl->ncjInUse/static_cast<double>(grid->nc)*grid->na_cj,
1002             nbl->ncjInUse/static_cast<double>(grid->nc)*grid->na_cj/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_cj/(grid->size[XX]*grid->size[YY]*grid->size[ZZ])));
1003
1004     fprintf(fp, "nbl average j cell list length %.1f\n",
1005             0.25*nbl->ncjInUse/std::max(static_cast<double>(nbl->ci.size()), 1.0));
1006
1007     for (int s = 0; s < SHIFTS; s++)
1008     {
1009         cs[s] = 0;
1010     }
1011     npexcl = 0;
1012     for (const nbnxn_ci_t &ciEntry : nbl->ci)
1013     {
1014         cs[ciEntry.shift & NBNXN_CI_SHIFT] +=
1015             ciEntry.cj_ind_end - ciEntry.cj_ind_start;
1016
1017         int j = ciEntry.cj_ind_start;
1018         while (j < ciEntry.cj_ind_end &&
1019                nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
1020         {
1021             npexcl++;
1022             j++;
1023         }
1024     }
1025     fprintf(fp, "nbl cell pairs, total: %zu excl: %d %.1f%%\n",
1026             nbl->cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl->cj.size()), 1.0));
1027     for (int s = 0; s < SHIFTS; s++)
1028     {
1029         if (cs[s] > 0)
1030         {
1031             fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
1032         }
1033     }
1034 }
1035
1036 /* Print statistics of a pair lists, used for debug output */
1037 static void print_nblist_statistics(FILE *fp, const NbnxnPairlistGpu *nbl,
1038                                     const nbnxn_search *nbs, real rl)
1039 {
1040     const nbnxn_grid_t *grid;
1041     int                 b;
1042     int                 c[c_gpuNumClusterPerCell + 1];
1043     double              sum_nsp, sum_nsp2;
1044     int                 nsp_max;
1045
1046     /* This code only produces correct statistics with domain decomposition */
1047     grid = &nbs->grid[0];
1048
1049     fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
1050             nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
1051     fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
1052             nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/static_cast<double>(grid->nsubc_tot),
1053             nbl->nci_tot/static_cast<double>(grid->nsubc_tot)*grid->na_c,
1054             nbl->nci_tot/static_cast<double>(grid->nsubc_tot)*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/(grid->size[XX]*grid->size[YY]*grid->size[ZZ])));
1055
1056     sum_nsp  = 0;
1057     sum_nsp2 = 0;
1058     nsp_max  = 0;
1059     for (int si = 0; si <= c_gpuNumClusterPerCell; si++)
1060     {
1061         c[si] = 0;
1062     }
1063     for (int i = 0; i < nbl->nsci; i++)
1064     {
1065         int nsp;
1066
1067         nsp = 0;
1068         for (int j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
1069         {
1070             for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
1071             {
1072                 b = 0;
1073                 for (int si = 0; si < c_gpuNumClusterPerCell; si++)
1074                 {
1075                     if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
1076                     {
1077                         b++;
1078                     }
1079                 }
1080                 nsp += b;
1081                 c[b]++;
1082             }
1083         }
1084         sum_nsp  += nsp;
1085         sum_nsp2 += nsp*nsp;
1086         nsp_max   = std::max(nsp_max, nsp);
1087     }
1088     if (nbl->nsci > 0)
1089     {
1090         sum_nsp  /= nbl->nsci;
1091         sum_nsp2 /= nbl->nsci;
1092     }
1093     fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
1094             sum_nsp, std::sqrt(sum_nsp2 - sum_nsp*sum_nsp), nsp_max);
1095
1096     if (nbl->ncj4 > 0)
1097     {
1098         for (b = 0; b <= c_gpuNumClusterPerCell; b++)
1099         {
1100             fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
1101                     b, c[b],
1102                     100.0*c[b]/int{nbl->ncj4*c_nbnxnGpuJgroupSize});
1103         }
1104     }
1105 }
1106
1107 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
1108 static void low_get_nbl_exclusions(NbnxnPairlistGpu *nbl, int cj4,
1109                                    int warp, nbnxn_excl_t **excl)
1110 {
1111     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
1112     {
1113         /* No exclusions set, make a new list entry */
1114         nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
1115         nbl->nexcl++;
1116         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
1117         set_no_excls(*excl);
1118     }
1119     else
1120     {
1121         /* We already have some exclusions, new ones can be added to the list */
1122         *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
1123     }
1124 }
1125
1126 /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
1127  * generates a new element and allocates extra memory, if necessary.
1128  */
1129 static void get_nbl_exclusions_1(NbnxnPairlistGpu *nbl, int cj4,
1130                                  int warp, nbnxn_excl_t **excl)
1131 {
1132     if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
1133     {
1134         /* We need to make a new list entry, check if we have space */
1135         check_excl_space(nbl, 1);
1136     }
1137     low_get_nbl_exclusions(nbl, cj4, warp, excl);
1138 }
1139
1140 /* Returns pointers to the exclusion masks for cj4-unit cj4 for both warps,
1141  * generates a new element and allocates extra memory, if necessary.
1142  */
1143 static void get_nbl_exclusions_2(NbnxnPairlistGpu *nbl, int cj4,
1144                                  nbnxn_excl_t **excl_w0,
1145                                  nbnxn_excl_t **excl_w1)
1146 {
1147     /* Check for space we might need */
1148     check_excl_space(nbl, 2);
1149
1150     low_get_nbl_exclusions(nbl, cj4, 0, excl_w0);
1151     low_get_nbl_exclusions(nbl, cj4, 1, excl_w1);
1152 }
1153
1154 /* Sets the self exclusions i=j and pair exclusions i>j */
1155 static void set_self_and_newton_excls_supersub(NbnxnPairlistGpu *nbl,
1156                                                int cj4_ind, int sj_offset,
1157                                                int i_cluster_in_cell)
1158 {
1159     nbnxn_excl_t *excl[c_nbnxnGpuClusterpairSplit];
1160
1161     /* Here we only set the set self and double pair exclusions */
1162
1163     static_assert(c_nbnxnGpuClusterpairSplit == 2, "");
1164
1165     get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
1166
1167     /* Only minor < major bits set */
1168     for (int ej = 0; ej < nbl->na_ci; ej++)
1169     {
1170         int w = (ej>>2);
1171         for (int ei = ej; ei < nbl->na_ci; ei++)
1172         {
1173             excl[w]->pair[(ej & (c_nbnxnGpuJgroupSize-1))*nbl->na_ci + ei] &=
1174                 ~(1U << (sj_offset*c_gpuNumClusterPerCell + i_cluster_in_cell));
1175         }
1176     }
1177 }
1178
1179 /* Returns a diagonal or off-diagonal interaction mask for plain C lists */
1180 static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
1181 {
1182     return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
1183 }
1184
1185 /* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
1186 gmx_unused static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
1187 {
1188     return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
1189             (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
1190              NBNXN_INTERACTION_MASK_ALL));
1191 }
1192
1193 /* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
1194 gmx_unused static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
1195 {
1196     return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
1197 }
1198
1199 /* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
1200 gmx_unused static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
1201 {
1202     return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
1203             (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
1204              NBNXN_INTERACTION_MASK_ALL));
1205 }
1206
1207 #if GMX_SIMD
1208 #if GMX_SIMD_REAL_WIDTH == 2
1209 #define get_imask_simd_4xn  get_imask_simd_j2
1210 #endif
1211 #if GMX_SIMD_REAL_WIDTH == 4
1212 #define get_imask_simd_4xn  get_imask_simd_j4
1213 #endif
1214 #if GMX_SIMD_REAL_WIDTH == 8
1215 #define get_imask_simd_4xn  get_imask_simd_j8
1216 #define get_imask_simd_2xnn get_imask_simd_j4
1217 #endif
1218 #if GMX_SIMD_REAL_WIDTH == 16
1219 #define get_imask_simd_2xnn get_imask_simd_j8
1220 #endif
1221 #endif
1222
1223 /* Plain C code for checking and adding cluster-pairs to the list.
1224  *
1225  * \param[in]     gridj               The j-grid
1226  * \param[in,out] nbl                 The pair-list to store the cluster pairs in
1227  * \param[in]     icluster            The index of the i-cluster
1228  * \param[in]     jclusterFirst       The first cluster in the j-range
1229  * \param[in]     jclusterLast        The last cluster in the j-range
1230  * \param[in]     excludeSubDiagonal  Exclude atom pairs with i-index > j-index
1231  * \param[in]     x_j                 Coordinates for the j-atom, in xyz format
1232  * \param[in]     rlist2              The squared list cut-off
1233  * \param[in]     rbb2                The squared cut-off for putting cluster-pairs in the list based on bounding box distance only
1234  * \param[in,out] numDistanceChecks   The number of distance checks performed
1235  */
1236 static void
1237 makeClusterListSimple(const nbnxn_grid_t       &jGrid,
1238                       NbnxnPairlistCpu *        nbl,
1239                       int                       icluster,
1240                       int                       jclusterFirst,
1241                       int                       jclusterLast,
1242                       bool                      excludeSubDiagonal,
1243                       const real * gmx_restrict x_j,
1244                       real                      rlist2,
1245                       float                     rbb2,
1246                       int * gmx_restrict        numDistanceChecks)
1247 {
1248     const nbnxn_bb_t * gmx_restrict bb_ci = nbl->work->iClusterData.bb.data();
1249     const real * gmx_restrict       x_ci  = nbl->work->iClusterData.x.data();
1250
1251     gmx_bool                        InRange;
1252
1253     InRange = FALSE;
1254     while (!InRange && jclusterFirst <= jclusterLast)
1255     {
1256         real d2  = subc_bb_dist2(0, bb_ci, jclusterFirst, jGrid.bb);
1257         *numDistanceChecks += 2;
1258
1259         /* Check if the distance is within the distance where
1260          * we use only the bounding box distance rbb,
1261          * or within the cut-off and there is at least one atom pair
1262          * within the cut-off.
1263          */
1264         if (d2 < rbb2)
1265         {
1266             InRange = TRUE;
1267         }
1268         else if (d2 < rlist2)
1269         {
1270             int cjf_gl = jGrid.cell0 + jclusterFirst;
1271             for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
1272             {
1273                 for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
1274                 {
1275                     InRange = InRange ||
1276                         (gmx::square(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+XX]) +
1277                          gmx::square(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+YY]) +
1278                          gmx::square(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+ZZ]) < rlist2);
1279                 }
1280             }
1281             *numDistanceChecks += c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize;
1282         }
1283         if (!InRange)
1284         {
1285             jclusterFirst++;
1286         }
1287     }
1288     if (!InRange)
1289     {
1290         return;
1291     }
1292
1293     InRange = FALSE;
1294     while (!InRange && jclusterLast > jclusterFirst)
1295     {
1296         real d2  = subc_bb_dist2(0, bb_ci, jclusterLast, jGrid.bb);
1297         *numDistanceChecks += 2;
1298
1299         /* Check if the distance is within the distance where
1300          * we use only the bounding box distance rbb,
1301          * or within the cut-off and there is at least one atom pair
1302          * within the cut-off.
1303          */
1304         if (d2 < rbb2)
1305         {
1306             InRange = TRUE;
1307         }
1308         else if (d2 < rlist2)
1309         {
1310             int cjl_gl = jGrid.cell0 + jclusterLast;
1311             for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
1312             {
1313                 for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
1314                 {
1315                     InRange = InRange ||
1316                         (gmx::square(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+XX]) +
1317                          gmx::square(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+YY]) +
1318                          gmx::square(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+ZZ]) < rlist2);
1319                 }
1320             }
1321             *numDistanceChecks += c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize;
1322         }
1323         if (!InRange)
1324         {
1325             jclusterLast--;
1326         }
1327     }
1328
1329     if (jclusterFirst <= jclusterLast)
1330     {
1331         for (int jcluster = jclusterFirst; jcluster <= jclusterLast; jcluster++)
1332         {
1333             /* Store cj and the interaction mask */
1334             nbnxn_cj_t cjEntry;
1335             cjEntry.cj   = jGrid.cell0 + jcluster;
1336             cjEntry.excl = get_imask(excludeSubDiagonal, icluster, jcluster);
1337             nbl->cj.push_back(cjEntry);
1338         }
1339         /* Increase the closing index in the i list */
1340         nbl->ci.back().cj_ind_end = nbl->cj.size();
1341     }
1342 }
1343
1344 #ifdef GMX_NBNXN_SIMD_4XN
1345 #include "gromacs/mdlib/nbnxn_search_simd_4xn.h"
1346 #endif
1347 #ifdef GMX_NBNXN_SIMD_2XNN
1348 #include "gromacs/mdlib/nbnxn_search_simd_2xnn.h"
1349 #endif
1350
1351 /* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
1352  * Checks bounding box distances and possibly atom pair distances.
1353  */
1354 static void make_cluster_list_supersub(const nbnxn_grid_t &iGrid,
1355                                        const nbnxn_grid_t &jGrid,
1356                                        NbnxnPairlistGpu   *nbl,
1357                                        const int           sci,
1358                                        const int           scj,
1359                                        const bool          excludeSubDiagonal,
1360                                        const int           stride,
1361                                        const real         *x,
1362                                        const real          rlist2,
1363                                        const float         rbb2,
1364                                        int                *numDistanceChecks)
1365 {
1366     NbnxnPairlistGpuWork &work   = *nbl->work;
1367
1368 #if NBNXN_BBXXXX
1369     const float          *pbb_ci = work.iSuperClusterData.bbPacked.data();
1370 #else
1371     const nbnxn_bb_t     *bb_ci  = work.iSuperClusterData.bb.data();
1372 #endif
1373
1374     assert(c_nbnxnGpuClusterSize == iGrid.na_c);
1375     assert(c_nbnxnGpuClusterSize == jGrid.na_c);
1376
1377     /* We generate the pairlist mainly based on bounding-box distances
1378      * and do atom pair distance based pruning on the GPU.
1379      * Only if a j-group contains a single cluster-pair, we try to prune
1380      * that pair based on atom distances on the CPU to avoid empty j-groups.
1381      */
1382 #define PRUNE_LIST_CPU_ONE 1
1383 #define PRUNE_LIST_CPU_ALL 0
1384
1385 #if PRUNE_LIST_CPU_ONE
1386     int  ci_last = -1;
1387 #endif
1388
1389     float *d2l = work.distanceBuffer.data();
1390
1391     for (int subc = 0; subc < jGrid.nsubc[scj]; subc++)
1392     {
1393         const int    cj4_ind   = work.cj_ind/c_nbnxnGpuJgroupSize;
1394         const int    cj_offset = work.cj_ind - cj4_ind*c_nbnxnGpuJgroupSize;
1395         nbnxn_cj4_t *cj4       = &nbl->cj4[cj4_ind];
1396
1397         const int    cj        = scj*c_gpuNumClusterPerCell + subc;
1398
1399         const int    cj_gl     = jGrid.cell0*c_gpuNumClusterPerCell + cj;
1400
1401         /* Initialize this j-subcell i-subcell list */
1402         cj4->cj[cj_offset] = cj_gl;
1403
1404         int ci1;
1405         if (excludeSubDiagonal && sci == scj)
1406         {
1407             ci1 = subc + 1;
1408         }
1409         else
1410         {
1411             ci1 = iGrid.nsubc[sci];
1412         }
1413
1414 #if NBNXN_BBXXXX
1415         /* Determine all ci1 bb distances in one call with SIMD4 */
1416         subc_bb_dist2_simd4_xxxx(jGrid.pbb.data() + (cj >> STRIDE_PBB_2LOG)*NNBSBB_XXXX + (cj & (STRIDE_PBB-1)),
1417                                  ci1, pbb_ci, d2l);
1418         *numDistanceChecks += c_nbnxnGpuClusterSize*2;
1419 #endif
1420
1421         int          npair = 0;
1422         unsigned int imask = 0;
1423         /* We use a fixed upper-bound instead of ci1 to help optimization */
1424         for (int ci = 0; ci < c_gpuNumClusterPerCell; ci++)
1425         {
1426             if (ci == ci1)
1427             {
1428                 break;
1429             }
1430
1431 #if !NBNXN_BBXXXX
1432             /* Determine the bb distance between ci and cj */
1433             d2l[ci]             = subc_bb_dist2(ci, bb_ci, cj, jGrid.bb);
1434             *numDistanceChecks += 2;
1435 #endif
1436             float d2 = d2l[ci];
1437
1438 #if PRUNE_LIST_CPU_ALL
1439             /* Check if the distance is within the distance where
1440              * we use only the bounding box distance rbb,
1441              * or within the cut-off and there is at least one atom pair
1442              * within the cut-off. This check is very costly.
1443              */
1444             *numDistanceChecks += c_nbnxnGpuClusterSize*c_nbnxnGpuClusterSize;
1445             if (d2 < rbb2 ||
1446                 (d2 < rlist2 &&
1447                  clusterpair_in_range(work, ci, cj_gl, stride, x, rlist2)))
1448 #else
1449             /* Check if the distance between the two bounding boxes
1450              * in within the pair-list cut-off.
1451              */
1452             if (d2 < rlist2)
1453 #endif
1454             {
1455                 /* Flag this i-subcell to be taken into account */
1456                 imask |= (1U << (cj_offset*c_gpuNumClusterPerCell + ci));
1457
1458 #if PRUNE_LIST_CPU_ONE
1459                 ci_last = ci;
1460 #endif
1461
1462                 npair++;
1463             }
1464         }
1465
1466 #if PRUNE_LIST_CPU_ONE
1467         /* If we only found 1 pair, check if any atoms are actually
1468          * within the cut-off, so we could get rid of it.
1469          */
1470         if (npair == 1 && d2l[ci_last] >= rbb2 &&
1471             !clusterpair_in_range(work, ci_last, cj_gl, stride, x, rlist2))
1472         {
1473             imask &= ~(1U << (cj_offset*c_gpuNumClusterPerCell + ci_last));
1474             npair--;
1475         }
1476 #endif
1477
1478         if (npair > 0)
1479         {
1480             /* We have a useful sj entry, close it now */
1481
1482             /* Set the exclusions for the ci==sj entry.
1483              * Here we don't bother to check if this entry is actually flagged,
1484              * as it will nearly always be in the list.
1485              */
1486             if (excludeSubDiagonal && sci == scj)
1487             {
1488                 set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, subc);
1489             }
1490
1491             /* Copy the cluster interaction mask to the list */
1492             for (int w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
1493             {
1494                 cj4->imei[w].imask |= imask;
1495             }
1496
1497             nbl->work->cj_ind++;
1498
1499             /* Keep the count */
1500             nbl->nci_tot += npair;
1501
1502             /* Increase the closing index in i super-cell list */
1503             nbl->sci[nbl->nsci - 1].cj4_ind_end =
1504                 (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
1505         }
1506     }
1507 }
1508
1509 /* Returns how many contiguous j-clusters we have starting in the i-list */
1510 template <typename CjListType>
1511 static int numContiguousJClusters(const int         cjIndexStart,
1512                                   const int         cjIndexEnd,
1513                                   const CjListType *cjList)
1514 {
1515     const int firstJCluster = nblCj(cjList, cjIndexStart);
1516
1517     int       numContiguous = 0;
1518
1519     while (cjIndexStart + numContiguous < cjIndexEnd &&
1520            nblCj(cjList, cjIndexStart + numContiguous) == firstJCluster + numContiguous)
1521     {
1522         numContiguous++;
1523     }
1524
1525     return numContiguous;
1526 }
1527
1528 /* Helper struct for efficient searching for excluded atoms in a j-list */
1529 struct JListRanges
1530 {
1531     /* Constructor */
1532     template <typename CjListType>
1533     JListRanges(int               cjIndexStart,
1534                 int               cjIndexEnd,
1535                 const CjListType &cjList);
1536
1537     int cjIndexStart; // The start index in the j-list
1538     int cjIndexEnd;   // The end index in the j-list
1539     int cjFirst;      // The j-cluster with index cjIndexStart
1540     int cjLast;       // The j-cluster with index cjIndexEnd-1
1541     int numDirect;    // Up to cjIndexStart+numDirect the j-clusters are cjFirst + the index offset
1542 };
1543
1544 template <typename CjListType>
1545 JListRanges::JListRanges(int               cjIndexStart,
1546                          int               cjIndexEnd,
1547                          const CjListType &cjList) :
1548     cjIndexStart(cjIndexStart),
1549     cjIndexEnd(cjIndexEnd)
1550 {
1551     GMX_ASSERT(cjIndexEnd > cjIndexStart, "JListRanges should only be called with non-empty lists");
1552
1553     cjFirst   = nblCj(cjList, cjIndexStart);
1554     cjLast    = nblCj(cjList, cjIndexEnd - 1);
1555
1556     /* Determine how many contiguous j-cells we have starting
1557      * from the first i-cell. This number can be used to directly
1558      * calculate j-cell indices for excluded atoms.
1559      */
1560     numDirect = numContiguousJClusters(cjIndexStart, cjIndexEnd, cjList);
1561 }
1562
1563 /* Return the index of \p jCluster in the given range or -1 when not present
1564  *
1565  * Note: This code is executed very often and therefore performance is
1566  *       important. It should be inlined and fully optimized.
1567  */
1568 template <typename CjListType>
1569 static inline int findJClusterInJList(int                jCluster,
1570                                       const JListRanges &ranges,
1571                                       const CjListType  &cjList)
1572 {
1573     int index;
1574
1575     if (jCluster < ranges.cjFirst + ranges.numDirect)
1576     {
1577         /* We can calculate the index directly using the offset */
1578         index = ranges.cjIndexStart + jCluster - ranges.cjFirst;
1579     }
1580     else
1581     {
1582         /* Search for jCluster using bisection */
1583         index           = -1;
1584         int rangeStart  = ranges.cjIndexStart + ranges.numDirect;
1585         int rangeEnd    = ranges.cjIndexEnd;
1586         int rangeMiddle;
1587         while (index == -1 && rangeStart < rangeEnd)
1588         {
1589             rangeMiddle = (rangeStart + rangeEnd) >> 1;
1590
1591             const int clusterMiddle = nblCj(cjList, rangeMiddle);
1592
1593             if (jCluster == clusterMiddle)
1594             {
1595                 index      = rangeMiddle;
1596             }
1597             else if (jCluster < clusterMiddle)
1598             {
1599                 rangeEnd   = rangeMiddle;
1600             }
1601             else
1602             {
1603                 rangeStart = rangeMiddle + 1;
1604             }
1605         }
1606     }
1607
1608     return index;
1609 }
1610
1611 // TODO: Get rid of the two functions below by renaming sci to ci (or something better)
1612
1613 /* Return the i-entry in the list we are currently operating on */
1614 static nbnxn_ci_t *getOpenIEntry(NbnxnPairlistCpu *nbl)
1615 {
1616     return &nbl->ci.back();
1617 }
1618
1619 /* Return the i-entry in the list we are currently operating on */
1620 static nbnxn_sci_t *getOpenIEntry(NbnxnPairlistGpu *nbl)
1621 {
1622     return &nbl->sci[nbl->nsci - 1];
1623 }
1624
1625 /* Set all atom-pair exclusions for a simple type list i-entry
1626  *
1627  * Set all atom-pair exclusions from the topology stored in exclusions
1628  * as masks in the pair-list for simple list entry iEntry.
1629  */
1630 static void
1631 setExclusionsForIEntry(const nbnxn_search   *nbs,
1632                        NbnxnPairlistCpu     *nbl,
1633                        gmx_bool              diagRemoved,
1634                        int                   na_cj_2log,
1635                        const nbnxn_ci_t     &iEntry,
1636                        const t_blocka       &exclusions)
1637 {
1638     if (iEntry.cj_ind_end == iEntry.cj_ind_start)
1639     {
1640         /* Empty list: no exclusions */
1641         return;
1642     }
1643
1644     const JListRanges        ranges(iEntry.cj_ind_start, iEntry.cj_ind_end, nbl->cj.data());
1645
1646     const int                iCluster = iEntry.ci;
1647
1648     gmx::ArrayRef<const int> cell = nbs->cell;
1649
1650     /* Loop over the atoms in the i-cluster */
1651     for (int i = 0; i < nbl->na_ci; i++)
1652     {
1653         const int iIndex = iCluster*nbl->na_ci + i;
1654         const int iAtom  = nbs->a[iIndex];
1655         if (iAtom >= 0)
1656         {
1657             /* Loop over the topology-based exclusions for this i-atom */
1658             for (int exclIndex = exclusions.index[iAtom]; exclIndex < exclusions.index[iAtom + 1]; exclIndex++)
1659             {
1660                 const int jAtom = exclusions.a[exclIndex];
1661
1662                 if (jAtom == iAtom)
1663                 {
1664                     /* The self exclusion are already set, save some time */
1665                     continue;
1666                 }
1667
1668                 /* Get the index of the j-atom in the nbnxn atom data */
1669                 const int jIndex = cell[jAtom];
1670
1671                 /* Without shifts we only calculate interactions j>i
1672                  * for one-way pair-lists.
1673                  */
1674                 if (diagRemoved && jIndex <= iIndex)
1675                 {
1676                     continue;
1677                 }
1678
1679                 const int jCluster = (jIndex >> na_cj_2log);
1680
1681                 /* Could the cluster se be in our list? */
1682                 if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
1683                 {
1684                     const int index =
1685                         findJClusterInJList(jCluster, ranges, nbl->cj.data());
1686
1687                     if (index >= 0)
1688                     {
1689                         /* We found an exclusion, clear the corresponding
1690                          * interaction bit.
1691                          */
1692                         const int innerJ     = jIndex - (jCluster << na_cj_2log);
1693
1694                         nbl->cj[index].excl &= ~(1U << ((i << na_cj_2log) + innerJ));
1695                     }
1696                 }
1697             }
1698         }
1699     }
1700 }
1701
1702 /* Add a new i-entry to the FEP list and copy the i-properties */
1703 static inline void fep_list_new_nri_copy(t_nblist *nlist)
1704 {
1705     /* Add a new i-entry */
1706     nlist->nri++;
1707
1708     assert(nlist->nri < nlist->maxnri);
1709
1710     /* Duplicate the last i-entry, except for jindex, which continues */
1711     nlist->iinr[nlist->nri]   = nlist->iinr[nlist->nri-1];
1712     nlist->shift[nlist->nri]  = nlist->shift[nlist->nri-1];
1713     nlist->gid[nlist->nri]    = nlist->gid[nlist->nri-1];
1714     nlist->jindex[nlist->nri] = nlist->nrj;
1715 }
1716
1717 /* For load balancing of the free-energy lists over threads, we set
1718  * the maximum nrj size of an i-entry to 40. This leads to good
1719  * load balancing in the worst case scenario of a single perturbed
1720  * particle on 16 threads, while not introducing significant overhead.
1721  * Note that half of the perturbed pairs will anyhow end up in very small lists,
1722  * since non perturbed i-particles will see few perturbed j-particles).
1723  */
1724 const int max_nrj_fep = 40;
1725
1726 /* Exclude the perturbed pairs from the Verlet list. This is only done to avoid
1727  * singularities for overlapping particles (0/0), since the charges and
1728  * LJ parameters have been zeroed in the nbnxn data structure.
1729  * Simultaneously make a group pair list for the perturbed pairs.
1730  */
1731 static void make_fep_list(const nbnxn_search     *nbs,
1732                           const nbnxn_atomdata_t *nbat,
1733                           NbnxnPairlistCpu       *nbl,
1734                           gmx_bool                bDiagRemoved,
1735                           nbnxn_ci_t             *nbl_ci,
1736                           real gmx_unused         shx,
1737                           real gmx_unused         shy,
1738                           real gmx_unused         shz,
1739                           real gmx_unused         rlist_fep2,
1740                           const nbnxn_grid_t     &iGrid,
1741                           const nbnxn_grid_t     &jGrid,
1742                           t_nblist               *nlist)
1743 {
1744     int      ci, cj_ind_start, cj_ind_end, cja, cjr;
1745     int      nri_max;
1746     int      ngid, gid_i = 0, gid_j, gid;
1747     int      egp_shift, egp_mask;
1748     int      gid_cj = 0;
1749     int      ind_i, ind_j, ai, aj;
1750     int      nri;
1751     gmx_bool bFEP_i, bFEP_i_all;
1752
1753     if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
1754     {
1755         /* Empty list */
1756         return;
1757     }
1758
1759     ci = nbl_ci->ci;
1760
1761     cj_ind_start = nbl_ci->cj_ind_start;
1762     cj_ind_end   = nbl_ci->cj_ind_end;
1763
1764     /* In worst case we have alternating energy groups
1765      * and create #atom-pair lists, which means we need the size
1766      * of a cluster pair (na_ci*na_cj) times the number of cj's.
1767      */
1768     nri_max = nbl->na_ci*nbl->na_cj*(cj_ind_end - cj_ind_start);
1769     if (nlist->nri + nri_max > nlist->maxnri)
1770     {
1771         nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
1772         reallocate_nblist(nlist);
1773     }
1774
1775     const nbnxn_atomdata_t::Params &nbatParams = nbat->params();
1776
1777     ngid = nbatParams.nenergrp;
1778
1779     if (ngid*jGrid.na_cj > gmx::index(sizeof(gid_cj)*8))
1780     {
1781         gmx_fatal(FARGS, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %zu energy groups",
1782                   iGrid.na_c, jGrid.na_cj, (sizeof(gid_cj)*8)/jGrid.na_cj);
1783     }
1784
1785     egp_shift = nbatParams.neg_2log;
1786     egp_mask  = (1 << egp_shift) - 1;
1787
1788     /* Loop over the atoms in the i sub-cell */
1789     bFEP_i_all = TRUE;
1790     for (int i = 0; i < nbl->na_ci; i++)
1791     {
1792         ind_i = ci*nbl->na_ci + i;
1793         ai    = nbs->a[ind_i];
1794         if (ai >= 0)
1795         {
1796             nri                  = nlist->nri;
1797             nlist->jindex[nri+1] = nlist->jindex[nri];
1798             nlist->iinr[nri]     = ai;
1799             /* The actual energy group pair index is set later */
1800             nlist->gid[nri]      = 0;
1801             nlist->shift[nri]    = nbl_ci->shift & NBNXN_CI_SHIFT;
1802
1803             bFEP_i = ((iGrid.fep[ci - iGrid.cell0] & (1 << i)) != 0u);
1804
1805             bFEP_i_all = bFEP_i_all && bFEP_i;
1806
1807             if (nlist->nrj + (cj_ind_end - cj_ind_start)*nbl->na_cj > nlist->maxnrj)
1808             {
1809                 nlist->maxnrj = over_alloc_small(nlist->nrj + (cj_ind_end - cj_ind_start)*nbl->na_cj);
1810                 srenew(nlist->jjnr,     nlist->maxnrj);
1811                 srenew(nlist->excl_fep, nlist->maxnrj);
1812             }
1813
1814             if (ngid > 1)
1815             {
1816                 gid_i = (nbatParams.energrp[ci] >> (egp_shift*i)) & egp_mask;
1817             }
1818
1819             for (int cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++)
1820             {
1821                 unsigned int fep_cj;
1822
1823                 cja = nbl->cj[cj_ind].cj;
1824
1825                 if (jGrid.na_cj == jGrid.na_c)
1826                 {
1827                     cjr    = cja - jGrid.cell0;
1828                     fep_cj = jGrid.fep[cjr];
1829                     if (ngid > 1)
1830                     {
1831                         gid_cj = nbatParams.energrp[cja];
1832                     }
1833                 }
1834                 else if (2*jGrid.na_cj == jGrid.na_c)
1835                 {
1836                     cjr    = cja - jGrid.cell0*2;
1837                     /* Extract half of the ci fep/energrp mask */
1838                     fep_cj = (jGrid.fep[cjr>>1] >> ((cjr&1)*jGrid.na_cj)) & ((1<<jGrid.na_cj) - 1);
1839                     if (ngid > 1)
1840                     {
1841                         gid_cj = nbatParams.energrp[cja>>1] >> ((cja&1)*jGrid.na_cj*egp_shift) & ((1<<(jGrid.na_cj*egp_shift)) - 1);
1842                     }
1843                 }
1844                 else
1845                 {
1846                     cjr    = cja - (jGrid.cell0>>1);
1847                     /* Combine two ci fep masks/energrp */
1848                     fep_cj = jGrid.fep[cjr*2] + (jGrid.fep[cjr*2+1] << jGrid.na_c);
1849                     if (ngid > 1)
1850                     {
1851                         gid_cj = nbatParams.energrp[cja*2] + (nbatParams.energrp[cja*2+1] << (jGrid.na_c*egp_shift));
1852                     }
1853                 }
1854
1855                 if (bFEP_i || fep_cj != 0)
1856                 {
1857                     for (int j = 0; j < nbl->na_cj; j++)
1858                     {
1859                         /* Is this interaction perturbed and not excluded? */
1860                         ind_j = cja*nbl->na_cj + j;
1861                         aj    = nbs->a[ind_j];
1862                         if (aj >= 0 &&
1863                             (bFEP_i || (fep_cj & (1 << j))) &&
1864                             (!bDiagRemoved || ind_j >= ind_i))
1865                         {
1866                             if (ngid > 1)
1867                             {
1868                                 gid_j = (gid_cj >> (j*egp_shift)) & egp_mask;
1869                                 gid   = GID(gid_i, gid_j, ngid);
1870
1871                                 if (nlist->nrj > nlist->jindex[nri] &&
1872                                     nlist->gid[nri] != gid)
1873                                 {
1874                                     /* Energy group pair changed: new list */
1875                                     fep_list_new_nri_copy(nlist);
1876                                     nri = nlist->nri;
1877                                 }
1878                                 nlist->gid[nri] = gid;
1879                             }
1880
1881                             if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
1882                             {
1883                                 fep_list_new_nri_copy(nlist);
1884                                 nri = nlist->nri;
1885                             }
1886
1887                             /* Add it to the FEP list */
1888                             nlist->jjnr[nlist->nrj]     = aj;
1889                             nlist->excl_fep[nlist->nrj] = (nbl->cj[cj_ind].excl >> (i*nbl->na_cj + j)) & 1;
1890                             nlist->nrj++;
1891
1892                             /* Exclude it from the normal list.
1893                              * Note that the charge has been set to zero,
1894                              * but we need to avoid 0/0, as perturbed atoms
1895                              * can be on top of each other.
1896                              */
1897                             nbl->cj[cj_ind].excl &= ~(1U << (i*nbl->na_cj + j));
1898                         }
1899                     }
1900                 }
1901             }
1902
1903             if (nlist->nrj > nlist->jindex[nri])
1904             {
1905                 /* Actually add this new, non-empty, list */
1906                 nlist->nri++;
1907                 nlist->jindex[nlist->nri] = nlist->nrj;
1908             }
1909         }
1910     }
1911
1912     if (bFEP_i_all)
1913     {
1914         /* All interactions are perturbed, we can skip this entry */
1915         nbl_ci->cj_ind_end = cj_ind_start;
1916         nbl->ncjInUse     -= cj_ind_end - cj_ind_start;
1917     }
1918 }
1919
1920 /* Return the index of atom a within a cluster */
1921 static inline int cj_mod_cj4(int cj)
1922 {
1923     return cj & (c_nbnxnGpuJgroupSize - 1);
1924 }
1925
1926 /* Convert a j-cluster to a cj4 group */
1927 static inline int cj_to_cj4(int cj)
1928 {
1929     return cj/c_nbnxnGpuJgroupSize;
1930 }
1931
1932 /* Return the index of an j-atom within a warp */
1933 static inline int a_mod_wj(int a)
1934 {
1935     return a & (c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit - 1);
1936 }
1937
1938 /* As make_fep_list above, but for super/sub lists. */
1939 static void make_fep_list(const nbnxn_search     *nbs,
1940                           const nbnxn_atomdata_t *nbat,
1941                           NbnxnPairlistGpu       *nbl,
1942                           gmx_bool                bDiagRemoved,
1943                           const nbnxn_sci_t      *nbl_sci,
1944                           real                    shx,
1945                           real                    shy,
1946                           real                    shz,
1947                           real                    rlist_fep2,
1948                           const nbnxn_grid_t     &iGrid,
1949                           const nbnxn_grid_t     &jGrid,
1950                           t_nblist               *nlist)
1951 {
1952     int                sci, cj4_ind_start, cj4_ind_end, cjr;
1953     int                nri_max;
1954     int                c_abs;
1955     int                ind_i, ind_j, ai, aj;
1956     int                nri;
1957     gmx_bool           bFEP_i;
1958     real               xi, yi, zi;
1959     const nbnxn_cj4_t *cj4;
1960
1961     if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
1962     {
1963         /* Empty list */
1964         return;
1965     }
1966
1967     sci = nbl_sci->sci;
1968
1969     cj4_ind_start = nbl_sci->cj4_ind_start;
1970     cj4_ind_end   = nbl_sci->cj4_ind_end;
1971
1972     /* Here we process one super-cell, max #atoms na_sc, versus a list
1973      * cj4 entries, each with max c_nbnxnGpuJgroupSize cj's, each
1974      * of size na_cj atoms.
1975      * On the GPU we don't support energy groups (yet).
1976      * So for each of the na_sc i-atoms, we need max one FEP list
1977      * for each max_nrj_fep j-atoms.
1978      */
1979     nri_max = nbl->na_sc*nbl->na_cj*(1 + ((cj4_ind_end - cj4_ind_start)*c_nbnxnGpuJgroupSize)/max_nrj_fep);
1980     if (nlist->nri + nri_max > nlist->maxnri)
1981     {
1982         nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
1983         reallocate_nblist(nlist);
1984     }
1985
1986     /* Loop over the atoms in the i super-cluster */
1987     for (int c = 0; c < c_gpuNumClusterPerCell; c++)
1988     {
1989         c_abs = sci*c_gpuNumClusterPerCell + c;
1990
1991         for (int i = 0; i < nbl->na_ci; i++)
1992         {
1993             ind_i = c_abs*nbl->na_ci + i;
1994             ai    = nbs->a[ind_i];
1995             if (ai >= 0)
1996             {
1997                 nri                  = nlist->nri;
1998                 nlist->jindex[nri+1] = nlist->jindex[nri];
1999                 nlist->iinr[nri]     = ai;
2000                 /* With GPUs, energy groups are not supported */
2001                 nlist->gid[nri]      = 0;
2002                 nlist->shift[nri]    = nbl_sci->shift & NBNXN_CI_SHIFT;
2003
2004                 bFEP_i = ((iGrid.fep[c_abs - iGrid.cell0*c_gpuNumClusterPerCell] & (1 << i)) != 0u);
2005
2006                 xi = nbat->x()[ind_i*nbat->xstride+XX] + shx;
2007                 yi = nbat->x()[ind_i*nbat->xstride+YY] + shy;
2008                 zi = nbat->x()[ind_i*nbat->xstride+ZZ] + shz;
2009
2010                 if ((nlist->nrj + cj4_ind_end - cj4_ind_start)*c_nbnxnGpuJgroupSize*nbl->na_cj > nlist->maxnrj)
2011                 {
2012                     nlist->maxnrj = over_alloc_small((nlist->nrj + cj4_ind_end - cj4_ind_start)*c_nbnxnGpuJgroupSize*nbl->na_cj);
2013                     srenew(nlist->jjnr,     nlist->maxnrj);
2014                     srenew(nlist->excl_fep, nlist->maxnrj);
2015                 }
2016
2017                 for (int cj4_ind = cj4_ind_start; cj4_ind < cj4_ind_end; cj4_ind++)
2018                 {
2019                     cj4 = &nbl->cj4[cj4_ind];
2020
2021                     for (int gcj = 0; gcj < c_nbnxnGpuJgroupSize; gcj++)
2022                     {
2023                         unsigned int fep_cj;
2024
2025                         if ((cj4->imei[0].imask & (1U << (gcj*c_gpuNumClusterPerCell + c))) == 0)
2026                         {
2027                             /* Skip this ci for this cj */
2028                             continue;
2029                         }
2030
2031                         cjr = cj4->cj[gcj] - jGrid.cell0*c_gpuNumClusterPerCell;
2032
2033                         fep_cj = jGrid.fep[cjr];
2034
2035                         if (bFEP_i || fep_cj != 0)
2036                         {
2037                             for (int j = 0; j < nbl->na_cj; j++)
2038                             {
2039                                 /* Is this interaction perturbed and not excluded? */
2040                                 ind_j = (jGrid.cell0*c_gpuNumClusterPerCell + cjr)*nbl->na_cj + j;
2041                                 aj    = nbs->a[ind_j];
2042                                 if (aj >= 0 &&
2043                                     (bFEP_i || (fep_cj & (1 << j))) &&
2044                                     (!bDiagRemoved || ind_j >= ind_i))
2045                                 {
2046                                     nbnxn_excl_t *excl;
2047                                     int           excl_pair;
2048                                     unsigned int  excl_bit;
2049                                     real          dx, dy, dz;
2050
2051                                     const int     jHalf = j/(c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit);
2052                                     get_nbl_exclusions_1(nbl, cj4_ind, jHalf, &excl);
2053
2054                                     excl_pair = a_mod_wj(j)*nbl->na_ci + i;
2055                                     excl_bit  = (1U << (gcj*c_gpuNumClusterPerCell + c));
2056
2057                                     dx = nbat->x()[ind_j*nbat->xstride+XX] - xi;
2058                                     dy = nbat->x()[ind_j*nbat->xstride+YY] - yi;
2059                                     dz = nbat->x()[ind_j*nbat->xstride+ZZ] - zi;
2060
2061                                     /* The unpruned GPU list has more than 2/3
2062                                      * of the atom pairs beyond rlist. Using
2063                                      * this list will cause a lot of overhead
2064                                      * in the CPU FEP kernels, especially
2065                                      * relative to the fast GPU kernels.
2066                                      * So we prune the FEP list here.
2067                                      */
2068                                     if (dx*dx + dy*dy + dz*dz < rlist_fep2)
2069                                     {
2070                                         if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
2071                                         {
2072                                             fep_list_new_nri_copy(nlist);
2073                                             nri = nlist->nri;
2074                                         }
2075
2076                                         /* Add it to the FEP list */
2077                                         nlist->jjnr[nlist->nrj]     = aj;
2078                                         nlist->excl_fep[nlist->nrj] = (excl->pair[excl_pair] & excl_bit) ? 1 : 0;
2079                                         nlist->nrj++;
2080                                     }
2081
2082                                     /* Exclude it from the normal list.
2083                                      * Note that the charge and LJ parameters have
2084                                      * been set to zero, but we need to avoid 0/0,
2085                                      * as perturbed atoms can be on top of each other.
2086                                      */
2087                                     excl->pair[excl_pair] &= ~excl_bit;
2088                                 }
2089                             }
2090
2091                             /* Note that we could mask out this pair in imask
2092                              * if all i- and/or all j-particles are perturbed.
2093                              * But since the perturbed pairs on the CPU will
2094                              * take an order of magnitude more time, the GPU
2095                              * will finish before the CPU and there is no gain.
2096                              */
2097                         }
2098                     }
2099                 }
2100
2101                 if (nlist->nrj > nlist->jindex[nri])
2102                 {
2103                     /* Actually add this new, non-empty, list */
2104                     nlist->nri++;
2105                     nlist->jindex[nlist->nri] = nlist->nrj;
2106                 }
2107             }
2108         }
2109     }
2110 }
2111
2112 /* Set all atom-pair exclusions for a GPU type list i-entry
2113  *
2114  * Sets all atom-pair exclusions from the topology stored in exclusions
2115  * as masks in the pair-list for i-super-cluster list entry iEntry.
2116  */
2117 static void
2118 setExclusionsForIEntry(const nbnxn_search   *nbs,
2119                        NbnxnPairlistGpu     *nbl,
2120                        gmx_bool              diagRemoved,
2121                        int gmx_unused        na_cj_2log,
2122                        const nbnxn_sci_t    &iEntry,
2123                        const t_blocka       &exclusions)
2124 {
2125     if (iEntry.cj4_ind_end == iEntry.cj4_ind_start)
2126     {
2127         /* Empty list */
2128         return;
2129     }
2130
2131     /* Set the search ranges using start and end j-cluster indices.
2132      * Note that here we can not use cj4_ind_end, since the last cj4
2133      * can be only partially filled, so we use cj_ind.
2134      */
2135     const JListRanges ranges(iEntry.cj4_ind_start*c_nbnxnGpuJgroupSize,
2136                              nbl->work->cj_ind,
2137                              nbl->cj4);
2138
2139     GMX_ASSERT(nbl->na_ci == c_nbnxnGpuClusterSize, "na_ci should match the GPU cluster size");
2140     constexpr int            c_clusterSize      = c_nbnxnGpuClusterSize;
2141     constexpr int            c_superClusterSize = c_nbnxnGpuNumClusterPerSupercluster*c_nbnxnGpuClusterSize;
2142
2143     const int                iSuperCluster = iEntry.sci;
2144
2145     gmx::ArrayRef<const int> cell = nbs->cell;
2146
2147     /* Loop over the atoms in the i super-cluster */
2148     for (int i = 0; i < c_superClusterSize; i++)
2149     {
2150         const int iIndex = iSuperCluster*c_superClusterSize + i;
2151         const int iAtom  = nbs->a[iIndex];
2152         if (iAtom >= 0)
2153         {
2154             const int iCluster = i/c_clusterSize;
2155
2156             /* Loop over the topology-based exclusions for this i-atom */
2157             for (int exclIndex = exclusions.index[iAtom]; exclIndex < exclusions.index[iAtom + 1]; exclIndex++)
2158             {
2159                 const int jAtom = exclusions.a[exclIndex];
2160
2161                 if (jAtom == iAtom)
2162                 {
2163                     /* The self exclusions are already set, save some time */
2164                     continue;
2165                 }
2166
2167                 /* Get the index of the j-atom in the nbnxn atom data */
2168                 const int jIndex = cell[jAtom];
2169
2170                 /* Without shifts we only calculate interactions j>i
2171                  * for one-way pair-lists.
2172                  */
2173                 /* NOTE: We would like to use iIndex on the right hand side,
2174                  * but that makes this routine 25% slower with gcc6/7.
2175                  * Even using c_superClusterSize makes it slower.
2176                  * Either of these changes triggers peeling of the exclIndex
2177                  * loop, which apparently leads to far less efficient code.
2178                  */
2179                 if (diagRemoved && jIndex <= iSuperCluster*nbl->na_sc + i)
2180                 {
2181                     continue;
2182                 }
2183
2184                 const int jCluster = jIndex/c_clusterSize;
2185
2186                 /* Check whether the cluster is in our list? */
2187                 if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
2188                 {
2189                     const int index =
2190                         findJClusterInJList(jCluster, ranges, nbl->cj4);
2191
2192                     if (index >= 0)
2193                     {
2194                         /* We found an exclusion, clear the corresponding
2195                          * interaction bit.
2196                          */
2197                         const unsigned int pairMask = (1U << (cj_mod_cj4(index)*c_gpuNumClusterPerCell + iCluster));
2198                         /* Check if the i-cluster interacts with the j-cluster */
2199                         if (nbl_imask0(nbl, index) & pairMask)
2200                         {
2201                             const int innerI = (i      & (c_clusterSize - 1));
2202                             const int innerJ = (jIndex & (c_clusterSize - 1));
2203
2204                             /* Determine which j-half (CUDA warp) we are in */
2205                             const int     jHalf = innerJ/(c_clusterSize/c_nbnxnGpuClusterpairSplit);
2206
2207                             nbnxn_excl_t *interactionMask;
2208                             get_nbl_exclusions_1(nbl, cj_to_cj4(index), jHalf, &interactionMask);
2209
2210                             interactionMask->pair[a_mod_wj(innerJ)*c_clusterSize + innerI] &= ~pairMask;
2211                         }
2212                     }
2213                 }
2214             }
2215         }
2216     }
2217 }
2218
2219 /* Reallocate the super-cell sci list for at least n entries */
2220 static void nb_realloc_sci(NbnxnPairlistGpu *nbl, int n)
2221 {
2222     nbl->sci_nalloc = over_alloc_small(n);
2223     nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->sci),
2224                        nbl->nsci*sizeof(*nbl->sci),
2225                        nbl->sci_nalloc*sizeof(*nbl->sci),
2226                        nbl->alloc, nbl->free);
2227 }
2228
2229 /* Make a new ci entry at the back of nbl->ci */
2230 static void addNewIEntry(NbnxnPairlistCpu *nbl, int ci, int shift, int flags)
2231 {
2232     nbnxn_ci_t ciEntry;
2233     ciEntry.ci            = ci;
2234     ciEntry.shift         = shift;
2235     /* Store the interaction flags along with the shift */
2236     ciEntry.shift        |= flags;
2237     ciEntry.cj_ind_start  = nbl->cj.size();
2238     ciEntry.cj_ind_end    = nbl->cj.size();
2239     nbl->ci.push_back(ciEntry);
2240 }
2241
2242 /* Make a new sci entry at index nbl->nsci */
2243 static void addNewIEntry(NbnxnPairlistGpu *nbl, int sci, int shift, int gmx_unused flags)
2244 {
2245     if (nbl->nsci + 1 > nbl->sci_nalloc)
2246     {
2247         nb_realloc_sci(nbl, nbl->nsci + 1);
2248     }
2249     nbl->nsci++;
2250     nbnxn_sci_t &sciEntry = nbl->sci[nbl->nsci - 1];
2251
2252     sciEntry.sci           = sci;
2253     sciEntry.shift         = shift;
2254     sciEntry.cj4_ind_start = nbl->ncj4;
2255     sciEntry.cj4_ind_end   = nbl->ncj4;
2256 }
2257
2258 /* Sort the simple j-list cj on exclusions.
2259  * Entries with exclusions will all be sorted to the beginning of the list.
2260  */
2261 static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
2262                          NbnxnPairlistCpuWork *work)
2263 {
2264     work->cj.resize(ncj);
2265
2266     /* Make a list of the j-cells involving exclusions */
2267     int jnew = 0;
2268     for (int j = 0; j < ncj; j++)
2269     {
2270         if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
2271         {
2272             work->cj[jnew++] = cj[j];
2273         }
2274     }
2275     /* Check if there are exclusions at all or not just the first entry */
2276     if (!((jnew == 0) ||
2277           (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
2278     {
2279         for (int j = 0; j < ncj; j++)
2280         {
2281             if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
2282             {
2283                 work->cj[jnew++] = cj[j];
2284             }
2285         }
2286         for (int j = 0; j < ncj; j++)
2287         {
2288             cj[j] = work->cj[j];
2289         }
2290     }
2291 }
2292
2293 /* Close this simple list i entry */
2294 static void closeIEntry(NbnxnPairlistCpu    *nbl,
2295                         int gmx_unused       sp_max_av,
2296                         gmx_bool gmx_unused  progBal,
2297                         float gmx_unused     nsp_tot_est,
2298                         int gmx_unused       thread,
2299                         int gmx_unused       nthread)
2300 {
2301     nbnxn_ci_t &ciEntry = nbl->ci.back();
2302
2303     /* All content of the new ci entry have already been filled correctly,
2304      * we only need to sort and increase counts or remove the entry when empty.
2305      */
2306     const int jlen = ciEntry.cj_ind_end - ciEntry.cj_ind_start;
2307     if (jlen > 0)
2308     {
2309         sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work);
2310
2311         /* The counts below are used for non-bonded pair/flop counts
2312          * and should therefore match the available kernel setups.
2313          */
2314         if (!(ciEntry.shift & NBNXN_CI_DO_COUL(0)))
2315         {
2316             nbl->work->ncj_noq += jlen;
2317         }
2318         else if ((ciEntry.shift & NBNXN_CI_HALF_LJ(0)) ||
2319                  !(ciEntry.shift & NBNXN_CI_DO_LJ(0)))
2320         {
2321             nbl->work->ncj_hlj += jlen;
2322         }
2323     }
2324     else
2325     {
2326         /* Entry is empty: remove it  */
2327         nbl->ci.pop_back();
2328     }
2329 }
2330
2331 /* Split sci entry for load balancing on the GPU.
2332  * Splitting ensures we have enough lists to fully utilize the whole GPU.
2333  * With progBal we generate progressively smaller lists, which improves
2334  * load balancing. As we only know the current count on our own thread,
2335  * we will need to estimate the current total amount of i-entries.
2336  * As the lists get concatenated later, this estimate depends
2337  * both on nthread and our own thread index.
2338  */
2339 static void split_sci_entry(NbnxnPairlistGpu *nbl,
2340                             int nsp_target_av,
2341                             gmx_bool progBal, float nsp_tot_est,
2342                             int thread, int nthread)
2343 {
2344     int nsp_max;
2345     int cj4_start, cj4_end, j4len;
2346     int sci;
2347     int nsp, nsp_sci, nsp_cj4, nsp_cj4_e, nsp_cj4_p;
2348
2349     if (progBal)
2350     {
2351         float nsp_est;
2352
2353         /* Estimate the total numbers of ci's of the nblist combined
2354          * over all threads using the target number of ci's.
2355          */
2356         nsp_est = (nsp_tot_est*thread)/nthread + nbl->nci_tot;
2357
2358         /* The first ci blocks should be larger, to avoid overhead.
2359          * The last ci blocks should be smaller, to improve load balancing.
2360          * The factor 3/2 makes the first block 3/2 times the target average
2361          * and ensures that the total number of blocks end up equal to
2362          * that of equally sized blocks of size nsp_target_av.
2363          */
2364         nsp_max = static_cast<int>(nsp_target_av*(nsp_tot_est*1.5/(nsp_est + nsp_tot_est)));
2365     }
2366     else
2367     {
2368         nsp_max = nsp_target_av;
2369     }
2370
2371     cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
2372     cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
2373     j4len     = cj4_end - cj4_start;
2374
2375     if (j4len > 1 && j4len*c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize > nsp_max)
2376     {
2377         /* Remove the last ci entry and process the cj4's again */
2378         nbl->nsci -= 1;
2379
2380         sci        = nbl->nsci;
2381         nsp        = 0;
2382         nsp_sci    = 0;
2383         nsp_cj4_e  = 0;
2384         nsp_cj4    = 0;
2385         for (int cj4 = cj4_start; cj4 < cj4_end; cj4++)
2386         {
2387             nsp_cj4_p = nsp_cj4;
2388             /* Count the number of cluster pairs in this cj4 group */
2389             nsp_cj4   = 0;
2390             for (int p = 0; p < c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize; p++)
2391             {
2392                 nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
2393             }
2394
2395             /* If adding the current cj4 with nsp_cj4 pairs get us further
2396              * away from our target nsp_max, split the list before this cj4.
2397              */
2398             if (nsp > 0 && nsp_max - nsp < nsp + nsp_cj4 - nsp_max)
2399             {
2400                 /* Split the list at cj4 */
2401                 nbl->sci[sci].cj4_ind_end = cj4;
2402                 /* Create a new sci entry */
2403                 sci++;
2404                 nbl->nsci++;
2405                 if (nbl->nsci+1 > nbl->sci_nalloc)
2406                 {
2407                     nb_realloc_sci(nbl, nbl->nsci+1);
2408                 }
2409                 nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
2410                 nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
2411                 nbl->sci[sci].cj4_ind_start = cj4;
2412                 nsp_sci                     = nsp;
2413                 nsp_cj4_e                   = nsp_cj4_p;
2414                 nsp                         = 0;
2415             }
2416             nsp += nsp_cj4;
2417         }
2418
2419         /* Put the remaining cj4's in the last sci entry */
2420         nbl->sci[sci].cj4_ind_end = cj4_end;
2421
2422         /* Possibly balance out the last two sci's
2423          * by moving the last cj4 of the second last sci.
2424          */
2425         if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
2426         {
2427             nbl->sci[sci-1].cj4_ind_end--;
2428             nbl->sci[sci].cj4_ind_start--;
2429         }
2430
2431         nbl->nsci++;
2432     }
2433 }
2434
2435 /* Clost this super/sub list i entry */
2436 static void closeIEntry(NbnxnPairlistGpu *nbl,
2437                         int nsp_max_av,
2438                         gmx_bool progBal, float nsp_tot_est,
2439                         int thread, int nthread)
2440 {
2441     nbnxn_sci_t &sciEntry = nbl->sci[nbl->nsci - 1];
2442
2443     /* All content of the new ci entry have already been filled correctly,
2444      * we only need to, potentially, split or remove the entry when empty.
2445      */
2446     int j4len = sciEntry.cj4_ind_end - sciEntry.cj4_ind_start;
2447     if (j4len > 0)
2448     {
2449         /* We can only have complete blocks of 4 j-entries in a list,
2450          * so round the count up before closing.
2451          */
2452         nbl->ncj4         = (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
2453         nbl->work->cj_ind = nbl->ncj4*c_nbnxnGpuJgroupSize;
2454
2455         if (nsp_max_av > 0)
2456         {
2457             /* Measure the size of the new entry and potentially split it */
2458             split_sci_entry(nbl, nsp_max_av, progBal, nsp_tot_est,
2459                             thread, nthread);
2460         }
2461     }
2462     else
2463     {
2464         /* Entry is empty: remove it  */
2465         nbl->nsci--;
2466     }
2467 }
2468
2469 /* Syncs the working array before adding another grid pair to the GPU list */
2470 static void sync_work(NbnxnPairlistCpu gmx_unused *nbl)
2471 {
2472 }
2473
2474 /* Syncs the working array before adding another grid pair to the GPU list */
2475 static void sync_work(NbnxnPairlistGpu *nbl)
2476 {
2477     nbl->work->cj_ind   = nbl->ncj4*c_nbnxnGpuJgroupSize;
2478     nbl->work->cj4_init = nbl->ncj4;
2479 }
2480
2481 /* Clears an NbnxnPairlistCpu data structure */
2482 static void clear_pairlist(NbnxnPairlistCpu *nbl)
2483 {
2484     nbl->ci.clear();
2485     nbl->cj.clear();
2486     nbl->ncjInUse      = 0;
2487     nbl->nci_tot       = 0;
2488     nbl->ciOuter.clear();
2489     nbl->cjOuter.clear();
2490
2491     nbl->work->ncj_noq = 0;
2492     nbl->work->ncj_hlj = 0;
2493 }
2494
2495 /* Clears an NbnxnPairlistGpu data structure */
2496 static void clear_pairlist(NbnxnPairlistGpu *nbl)
2497 {
2498     nbl->nsci          = 0;
2499     nbl->ncj4          = 0;
2500     nbl->nci_tot       = 0;
2501     nbl->nexcl         = 1;
2502 }
2503
2504 /* Clears a group scheme pair list */
2505 static void clear_pairlist_fep(t_nblist *nl)
2506 {
2507     nl->nri = 0;
2508     nl->nrj = 0;
2509     if (nl->jindex == nullptr)
2510     {
2511         snew(nl->jindex, 1);
2512     }
2513     nl->jindex[0] = 0;
2514 }
2515
2516 /* Sets a simple list i-cell bounding box, including PBC shift */
2517 static inline void set_icell_bb_simple(gmx::ArrayRef<const nbnxn_bb_t> bb,
2518                                        int ci,
2519                                        real shx, real shy, real shz,
2520                                        nbnxn_bb_t *bb_ci)
2521 {
2522     bb_ci->lower[BB_X] = bb[ci].lower[BB_X] + shx;
2523     bb_ci->lower[BB_Y] = bb[ci].lower[BB_Y] + shy;
2524     bb_ci->lower[BB_Z] = bb[ci].lower[BB_Z] + shz;
2525     bb_ci->upper[BB_X] = bb[ci].upper[BB_X] + shx;
2526     bb_ci->upper[BB_Y] = bb[ci].upper[BB_Y] + shy;
2527     bb_ci->upper[BB_Z] = bb[ci].upper[BB_Z] + shz;
2528 }
2529
2530 /* Sets a simple list i-cell bounding box, including PBC shift */
2531 static inline void set_icell_bb(const nbnxn_grid_t &iGrid,
2532                                 int ci,
2533                                 real shx, real shy, real shz,
2534                                 NbnxnPairlistCpuWork *work)
2535 {
2536     set_icell_bb_simple(iGrid.bb, ci, shx, shy, shz, &work->iClusterData.bb[0]);
2537 }
2538
2539 #if NBNXN_BBXXXX
2540 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2541 static void set_icell_bbxxxx_supersub(gmx::ArrayRef<const float> bb,
2542                                       int ci,
2543                                       real shx, real shy, real shz,
2544                                       float *bb_ci)
2545 {
2546     int ia = ci*(c_gpuNumClusterPerCell >> STRIDE_PBB_2LOG)*NNBSBB_XXXX;
2547     for (int m = 0; m < (c_gpuNumClusterPerCell >> STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
2548     {
2549         for (int i = 0; i < STRIDE_PBB; i++)
2550         {
2551             bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
2552             bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
2553             bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
2554             bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
2555             bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
2556             bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
2557         }
2558     }
2559 }
2560 #endif
2561
2562 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2563 gmx_unused static void set_icell_bb_supersub(gmx::ArrayRef<const nbnxn_bb_t> bb,
2564                                              int ci,
2565                                              real shx, real shy, real shz,
2566                                              nbnxn_bb_t *bb_ci)
2567 {
2568     for (int i = 0; i < c_gpuNumClusterPerCell; i++)
2569     {
2570         set_icell_bb_simple(bb, ci*c_gpuNumClusterPerCell+i,
2571                             shx, shy, shz,
2572                             &bb_ci[i]);
2573     }
2574 }
2575
2576 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
2577 gmx_unused static void set_icell_bb(const nbnxn_grid_t &iGrid,
2578                                     int ci,
2579                                     real shx, real shy, real shz,
2580                                     NbnxnPairlistGpuWork *work)
2581 {
2582 #if NBNXN_BBXXXX
2583     set_icell_bbxxxx_supersub(iGrid.pbb, ci, shx, shy, shz,
2584                               work->iSuperClusterData.bbPacked.data());
2585 #else
2586     set_icell_bb_supersub(iGrid.bb, ci, shx, shy, shz,
2587                           work->iSuperClusterData.bb.data());
2588 #endif
2589 }
2590
2591 /* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
2592 static void icell_set_x_simple(int ci,
2593                                real shx, real shy, real shz,
2594                                int stride, const real *x,
2595                                NbnxnPairlistCpuWork::IClusterData *iClusterData)
2596 {
2597     const int ia = ci*c_nbnxnCpuIClusterSize;
2598
2599     for (int i = 0; i < c_nbnxnCpuIClusterSize; i++)
2600     {
2601         iClusterData->x[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
2602         iClusterData->x[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
2603         iClusterData->x[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
2604     }
2605 }
2606
2607 static void icell_set_x(int ci,
2608                         real shx, real shy, real shz,
2609                         int stride, const real *x,
2610                         int nb_kernel_type,
2611                         NbnxnPairlistCpuWork *work)
2612 {
2613     switch (nb_kernel_type)
2614     {
2615 #if GMX_SIMD
2616 #ifdef GMX_NBNXN_SIMD_4XN
2617         case nbnxnk4xN_SIMD_4xN:
2618             icell_set_x_simd_4xn(ci, shx, shy, shz, stride, x, work);
2619             break;
2620 #endif
2621 #ifdef GMX_NBNXN_SIMD_2XNN
2622         case nbnxnk4xN_SIMD_2xNN:
2623             icell_set_x_simd_2xnn(ci, shx, shy, shz, stride, x, work);
2624             break;
2625 #endif
2626 #endif
2627         case nbnxnk4x4_PlainC:
2628             icell_set_x_simple(ci, shx, shy, shz, stride, x, &work->iClusterData);
2629             break;
2630         default:
2631             GMX_ASSERT(false, "Unhandled case");
2632             break;
2633     }
2634 }
2635
2636 /* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
2637 static void icell_set_x(int ci,
2638                         real shx, real shy, real shz,
2639                         int stride, const real *x,
2640                         int gmx_unused nb_kernel_type,
2641                         NbnxnPairlistGpuWork *work)
2642 {
2643 #if !GMX_SIMD4_HAVE_REAL
2644
2645     real * x_ci = work->iSuperClusterData.x.data();
2646
2647     int    ia = ci*c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize;
2648     for (int i = 0; i < c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize; i++)
2649     {
2650         x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
2651         x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
2652         x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
2653     }
2654
2655 #else /* !GMX_SIMD4_HAVE_REAL */
2656
2657     real * x_ci = work->iSuperClusterData.xSimd.data();
2658
2659     for (int si = 0; si < c_gpuNumClusterPerCell; si++)
2660     {
2661         for (int i = 0; i < c_nbnxnGpuClusterSize; i += GMX_SIMD4_WIDTH)
2662         {
2663             int io = si*c_nbnxnGpuClusterSize + i;
2664             int ia = ci*c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize + io;
2665             for (int j = 0; j < GMX_SIMD4_WIDTH; j++)
2666             {
2667                 x_ci[io*DIM + j + XX*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + XX] + shx;
2668                 x_ci[io*DIM + j + YY*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + YY] + shy;
2669                 x_ci[io*DIM + j + ZZ*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + ZZ] + shz;
2670             }
2671         }
2672     }
2673
2674 #endif /* !GMX_SIMD4_HAVE_REAL */
2675 }
2676
2677 static real minimum_subgrid_size_xy(const nbnxn_grid_t &grid)
2678 {
2679     if (grid.bSimple)
2680     {
2681         return std::min(grid.cellSize[XX], grid.cellSize[YY]);
2682     }
2683     else
2684     {
2685         return std::min(grid.cellSize[XX]/c_gpuNumClusterPerCellX,
2686                         grid.cellSize[YY]/c_gpuNumClusterPerCellY);
2687     }
2688 }
2689
2690 static real effective_buffer_1x1_vs_MxN(const nbnxn_grid_t &iGrid,
2691                                         const nbnxn_grid_t &jGrid)
2692 {
2693     const real eff_1x1_buffer_fac_overest = 0.1;
2694
2695     /* Determine an atom-pair list cut-off buffer size for atom pairs,
2696      * to be added to rlist (including buffer) used for MxN.
2697      * This is for converting an MxN list to a 1x1 list. This means we can't
2698      * use the normal buffer estimate, as we have an MxN list in which
2699      * some atom pairs beyond rlist are missing. We want to capture
2700      * the beneficial effect of buffering by extra pairs just outside rlist,
2701      * while removing the useless pairs that are further away from rlist.
2702      * (Also the buffer could have been set manually not using the estimate.)
2703      * This buffer size is an overestimate.
2704      * We add 10% of the smallest grid sub-cell dimensions.
2705      * Note that the z-size differs per cell and we don't use this,
2706      * so we overestimate.
2707      * With PME, the 10% value gives a buffer that is somewhat larger
2708      * than the effective buffer with a tolerance of 0.005 kJ/mol/ps.
2709      * Smaller tolerances or using RF lead to a smaller effective buffer,
2710      * so 10% gives a safe overestimate.
2711      */
2712     return eff_1x1_buffer_fac_overest*(minimum_subgrid_size_xy(iGrid) +
2713                                        minimum_subgrid_size_xy(jGrid));
2714 }
2715
2716 /* Clusters at the cut-off only increase rlist by 60% of their size */
2717 static real nbnxn_rlist_inc_outside_fac = 0.6;
2718
2719 /* Due to the cluster size the effective pair-list is longer than
2720  * that of a simple atom pair-list. This function gives the extra distance.
2721  */
2722 real nbnxn_get_rlist_effective_inc(int cluster_size_j, real atom_density)
2723 {
2724     int  cluster_size_i;
2725     real vol_inc_i, vol_inc_j;
2726
2727     /* We should get this from the setup, but currently it's the same for
2728      * all setups, including GPUs.
2729      */
2730     cluster_size_i = c_nbnxnCpuIClusterSize;
2731
2732     vol_inc_i = (cluster_size_i - 1)/atom_density;
2733     vol_inc_j = (cluster_size_j - 1)/atom_density;
2734
2735     return nbnxn_rlist_inc_outside_fac*std::cbrt(vol_inc_i + vol_inc_j);
2736 }
2737
2738 /* Estimates the interaction volume^2 for non-local interactions */
2739 static real nonlocal_vol2(const struct gmx_domdec_zones_t *zones, const rvec ls, real r)
2740 {
2741     real cl, ca, za;
2742     real vold_est;
2743     real vol2_est_tot;
2744
2745     vol2_est_tot = 0;
2746
2747     /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
2748      * not home interaction volume^2. As these volumes are not additive,
2749      * this is an overestimate, but it would only be significant in the limit
2750      * of small cells, where we anyhow need to split the lists into
2751      * as small parts as possible.
2752      */
2753
2754     for (int z = 0; z < zones->n; z++)
2755     {
2756         if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
2757         {
2758             cl = 0;
2759             ca = 1;
2760             za = 1;
2761             for (int d = 0; d < DIM; d++)
2762             {
2763                 if (zones->shift[z][d] == 0)
2764                 {
2765                     cl += 0.5*ls[d];
2766                     ca *= ls[d];
2767                     za *= zones->size[z].x1[d] - zones->size[z].x0[d];
2768                 }
2769             }
2770
2771             /* 4 octants of a sphere */
2772             vold_est  = 0.25*M_PI*r*r*r*r;
2773             /* 4 quarter pie slices on the edges */
2774             vold_est += 4*cl*M_PI/6.0*r*r*r;
2775             /* One rectangular volume on a face */
2776             vold_est += ca*0.5*r*r;
2777
2778             vol2_est_tot += vold_est*za;
2779         }
2780     }
2781
2782     return vol2_est_tot;
2783 }
2784
2785 /* Estimates the average size of a full j-list for super/sub setup */
2786 static void get_nsubpair_target(const nbnxn_search   *nbs,
2787                                 int                   iloc,
2788                                 real                  rlist,
2789                                 int                   min_ci_balanced,
2790                                 int                  *nsubpair_target,
2791                                 float                *nsubpair_tot_est)
2792 {
2793     /* The target value of 36 seems to be the optimum for Kepler.
2794      * Maxwell is less sensitive to the exact value.
2795      */
2796     const int           nsubpair_target_min = 36;
2797     rvec                ls;
2798     real                r_eff_sup, vol_est, nsp_est, nsp_est_nl;
2799
2800     const nbnxn_grid_t &grid = nbs->grid[0];
2801
2802     /* We don't need to balance list sizes if:
2803      * - We didn't request balancing.
2804      * - The number of grid cells >= the number of lists requested,
2805      *   since we will always generate at least #cells lists.
2806      * - We don't have any cells, since then there won't be any lists.
2807      */
2808     if (min_ci_balanced <= 0 || grid.nc >= min_ci_balanced || grid.nc == 0)
2809     {
2810         /* nsubpair_target==0 signals no balancing */
2811         *nsubpair_target  = 0;
2812         *nsubpair_tot_est = 0;
2813
2814         return;
2815     }
2816
2817     ls[XX] = (grid.c1[XX] - grid.c0[XX])/(grid.numCells[XX]*c_gpuNumClusterPerCellX);
2818     ls[YY] = (grid.c1[YY] - grid.c0[YY])/(grid.numCells[YY]*c_gpuNumClusterPerCellY);
2819     ls[ZZ] = grid.na_c/(grid.atom_density*ls[XX]*ls[YY]);
2820
2821     /* The average length of the diagonal of a sub cell */
2822     real diagonal = std::sqrt(ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ]);
2823
2824     /* The formulas below are a heuristic estimate of the average nsj per si*/
2825     r_eff_sup = rlist + nbnxn_rlist_inc_outside_fac*gmx::square((grid.na_c - 1.0)/grid.na_c)*0.5*diagonal;
2826
2827     if (!nbs->DomDec || nbs->zones->n == 1)
2828     {
2829         nsp_est_nl = 0;
2830     }
2831     else
2832     {
2833         nsp_est_nl =
2834             gmx::square(grid.atom_density/grid.na_c)*
2835             nonlocal_vol2(nbs->zones, ls, r_eff_sup);
2836     }
2837
2838     if (LOCAL_I(iloc))
2839     {
2840         /* Sub-cell interacts with itself */
2841         vol_est  = ls[XX]*ls[YY]*ls[ZZ];
2842         /* 6/2 rectangular volume on the faces */
2843         vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
2844         /* 12/2 quarter pie slices on the edges */
2845         vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*gmx::square(r_eff_sup);
2846         /* 4 octants of a sphere */
2847         vol_est += 0.5*4.0/3.0*M_PI*gmx::power3(r_eff_sup);
2848
2849         /* Estimate the number of cluster pairs as the local number of
2850          * clusters times the volume they interact with times the density.
2851          */
2852         nsp_est = grid.nsubc_tot*vol_est*grid.atom_density/grid.na_c;
2853
2854         /* Subtract the non-local pair count */
2855         nsp_est -= nsp_est_nl;
2856
2857         /* For small cut-offs nsp_est will be an underesimate.
2858          * With DD nsp_est_nl is an overestimate so nsp_est can get negative.
2859          * So to avoid too small or negative nsp_est we set a minimum of
2860          * all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
2861          * This might be a slight overestimate for small non-periodic groups of
2862          * atoms as will occur for a local domain with DD, but for small
2863          * groups of atoms we'll anyhow be limited by nsubpair_target_min,
2864          * so this overestimation will not matter.
2865          */
2866         nsp_est = std::max(nsp_est, grid.nsubc_tot*14._real);
2867
2868         if (debug)
2869         {
2870             fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
2871                     nsp_est, nsp_est_nl);
2872         }
2873     }
2874     else
2875     {
2876         nsp_est = nsp_est_nl;
2877     }
2878
2879     /* Thus the (average) maximum j-list size should be as follows.
2880      * Since there is overhead, we shouldn't make the lists too small
2881      * (and we can't chop up j-groups) so we use a minimum target size of 36.
2882      */
2883     *nsubpair_target  = std::max(nsubpair_target_min,
2884                                  roundToInt(nsp_est/min_ci_balanced));
2885     *nsubpair_tot_est = static_cast<int>(nsp_est);
2886
2887     if (debug)
2888     {
2889         fprintf(debug, "nbl nsp estimate %.1f, nsubpair_target %d\n",
2890                 nsp_est, *nsubpair_target);
2891     }
2892 }
2893
2894 /* Debug list print function */
2895 static void print_nblist_ci_cj(FILE *fp, const NbnxnPairlistCpu *nbl)
2896 {
2897     for (const nbnxn_ci_t &ciEntry : nbl->ci)
2898     {
2899         fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
2900                 ciEntry.ci, ciEntry.shift,
2901                 ciEntry.cj_ind_end - ciEntry.cj_ind_start);
2902
2903         for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
2904         {
2905             fprintf(fp, "  cj %5d  imask %x\n",
2906                     nbl->cj[j].cj,
2907                     nbl->cj[j].excl);
2908         }
2909     }
2910 }
2911
2912 /* Debug list print function */
2913 static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
2914 {
2915     for (int i = 0; i < nbl->nsci; i++)
2916     {
2917         fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
2918                 nbl->sci[i].sci, nbl->sci[i].shift,
2919                 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
2920
2921         int ncp = 0;
2922         for (int j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
2923         {
2924             for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
2925             {
2926                 fprintf(fp, "  sj %5d  imask %x\n",
2927                         nbl->cj4[j4].cj[j],
2928                         nbl->cj4[j4].imei[0].imask);
2929                 for (int si = 0; si < c_gpuNumClusterPerCell; si++)
2930                 {
2931                     if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
2932                     {
2933                         ncp++;
2934                     }
2935                 }
2936             }
2937         }
2938         fprintf(fp, "ci %4d  shift %2d  ncj4 %2d ncp %3d\n",
2939                 nbl->sci[i].sci, nbl->sci[i].shift,
2940                 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start,
2941                 ncp);
2942     }
2943 }
2944
2945 /* Combine pair lists *nbl generated on multiple threads nblc */
2946 static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
2947                             NbnxnPairlistGpu *nblc)
2948 {
2949     int nsci, ncj4, nexcl;
2950
2951     nsci  = nblc->nsci;
2952     ncj4  = nblc->ncj4;
2953     nexcl = nblc->nexcl;
2954     for (int i = 0; i < nnbl; i++)
2955     {
2956         nsci  += nbl[i]->nsci;
2957         ncj4  += nbl[i]->ncj4;
2958         nexcl += nbl[i]->nexcl;
2959     }
2960
2961     if (nsci > nblc->sci_nalloc)
2962     {
2963         nb_realloc_sci(nblc, nsci);
2964     }
2965     if (ncj4 > nblc->cj4_nalloc)
2966     {
2967         nblc->cj4_nalloc = over_alloc_small(ncj4);
2968         nbnxn_realloc_void(reinterpret_cast<void **>(&nblc->cj4),
2969                            nblc->ncj4*sizeof(*nblc->cj4),
2970                            nblc->cj4_nalloc*sizeof(*nblc->cj4),
2971                            nblc->alloc, nblc->free);
2972     }
2973     if (nexcl > nblc->excl_nalloc)
2974     {
2975         nblc->excl_nalloc = over_alloc_small(nexcl);
2976         nbnxn_realloc_void(reinterpret_cast<void **>(&nblc->excl),
2977                            nblc->nexcl*sizeof(*nblc->excl),
2978                            nblc->excl_nalloc*sizeof(*nblc->excl),
2979                            nblc->alloc, nblc->free);
2980     }
2981
2982     /* Each thread should copy its own data to the combined arrays,
2983      * as otherwise data will go back and forth between different caches.
2984      */
2985 #if GMX_OPENMP && !(defined __clang_analyzer__)
2986     int nthreads = gmx_omp_nthreads_get(emntPairsearch);
2987 #endif
2988
2989 #pragma omp parallel for num_threads(nthreads) schedule(static)
2990     for (int n = 0; n < nnbl; n++)
2991     {
2992         try
2993         {
2994             int                     sci_offset;
2995             int                     cj4_offset;
2996             int                     excl_offset;
2997             const NbnxnPairlistGpu *nbli;
2998
2999             /* Determine the offset in the combined data for our thread */
3000             sci_offset  = nblc->nsci;
3001             cj4_offset  = nblc->ncj4;
3002             excl_offset = nblc->nexcl;
3003
3004             for (int i = 0; i < n; i++)
3005             {
3006                 sci_offset  += nbl[i]->nsci;
3007                 cj4_offset  += nbl[i]->ncj4;
3008                 excl_offset += nbl[i]->nexcl;
3009             }
3010
3011             nbli = nbl[n];
3012
3013             for (int i = 0; i < nbli->nsci; i++)
3014             {
3015                 nblc->sci[sci_offset+i]                = nbli->sci[i];
3016                 nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
3017                 nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
3018             }
3019
3020             for (int j4 = 0; j4 < nbli->ncj4; j4++)
3021             {
3022                 nblc->cj4[cj4_offset+j4]                   = nbli->cj4[j4];
3023                 nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
3024                 nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
3025             }
3026
3027             for (int j4 = 0; j4 < nbli->nexcl; j4++)
3028             {
3029                 nblc->excl[excl_offset+j4] = nbli->excl[j4];
3030             }
3031         }
3032         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
3033     }
3034
3035     for (int n = 0; n < nnbl; n++)
3036     {
3037         nblc->nsci    += nbl[n]->nsci;
3038         nblc->ncj4    += nbl[n]->ncj4;
3039         nblc->nci_tot += nbl[n]->nci_tot;
3040         nblc->nexcl   += nbl[n]->nexcl;
3041     }
3042 }
3043
3044 static void balance_fep_lists(const nbnxn_search   *nbs,
3045                               nbnxn_pairlist_set_t *nbl_lists)
3046 {
3047     int       nnbl;
3048     int       nri_tot, nrj_tot, nrj_target;
3049     int       th_dest;
3050     t_nblist *nbld;
3051
3052     nnbl = nbl_lists->nnbl;
3053
3054     if (nnbl == 1)
3055     {
3056         /* Nothing to balance */
3057         return;
3058     }
3059
3060     /* Count the total i-lists and pairs */
3061     nri_tot = 0;
3062     nrj_tot = 0;
3063     for (int th = 0; th < nnbl; th++)
3064     {
3065         nri_tot += nbl_lists->nbl_fep[th]->nri;
3066         nrj_tot += nbl_lists->nbl_fep[th]->nrj;
3067     }
3068
3069     nrj_target = (nrj_tot + nnbl - 1)/nnbl;
3070
3071     assert(gmx_omp_nthreads_get(emntNonbonded) == nnbl);
3072
3073 #pragma omp parallel for schedule(static) num_threads(nnbl)
3074     for (int th = 0; th < nnbl; th++)
3075     {
3076         try
3077         {
3078             t_nblist *nbl = nbs->work[th].nbl_fep.get();
3079
3080             /* Note that here we allocate for the total size, instead of
3081              * a per-thread esimate (which is hard to obtain).
3082              */
3083             if (nri_tot > nbl->maxnri)
3084             {
3085                 nbl->maxnri = over_alloc_large(nri_tot);
3086                 reallocate_nblist(nbl);
3087             }
3088             if (nri_tot > nbl->maxnri || nrj_tot > nbl->maxnrj)
3089             {
3090                 nbl->maxnrj = over_alloc_small(nrj_tot);
3091                 srenew(nbl->jjnr, nbl->maxnrj);
3092                 srenew(nbl->excl_fep, nbl->maxnrj);
3093             }
3094
3095             clear_pairlist_fep(nbl);
3096         }
3097         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
3098     }
3099
3100     /* Loop over the source lists and assign and copy i-entries */
3101     th_dest = 0;
3102     nbld    = nbs->work[th_dest].nbl_fep.get();
3103     for (int th = 0; th < nnbl; th++)
3104     {
3105         t_nblist *nbls;
3106
3107         nbls = nbl_lists->nbl_fep[th];
3108
3109         for (int i = 0; i < nbls->nri; i++)
3110         {
3111             int nrj;
3112
3113             /* The number of pairs in this i-entry */
3114             nrj = nbls->jindex[i+1] - nbls->jindex[i];
3115
3116             /* Decide if list th_dest is too large and we should procede
3117              * to the next destination list.
3118              */
3119             if (th_dest+1 < nnbl && nbld->nrj > 0 &&
3120                 nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
3121             {
3122                 th_dest++;
3123                 nbld = nbs->work[th_dest].nbl_fep.get();
3124             }
3125
3126             nbld->iinr[nbld->nri]  = nbls->iinr[i];
3127             nbld->gid[nbld->nri]   = nbls->gid[i];
3128             nbld->shift[nbld->nri] = nbls->shift[i];
3129
3130             for (int j = nbls->jindex[i]; j < nbls->jindex[i+1]; j++)
3131             {
3132                 nbld->jjnr[nbld->nrj]     = nbls->jjnr[j];
3133                 nbld->excl_fep[nbld->nrj] = nbls->excl_fep[j];
3134                 nbld->nrj++;
3135             }
3136             nbld->nri++;
3137             nbld->jindex[nbld->nri] = nbld->nrj;
3138         }
3139     }
3140
3141     /* Swap the list pointers */
3142     for (int th = 0; th < nnbl; th++)
3143     {
3144         t_nblist *nbl_tmp      = nbs->work[th].nbl_fep.release();
3145         nbs->work[th].nbl_fep.reset(nbl_lists->nbl_fep[th]);
3146         nbl_lists->nbl_fep[th] = nbl_tmp;
3147
3148         if (debug)
3149         {
3150             fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n",
3151                     th,
3152                     nbl_lists->nbl_fep[th]->nri,
3153                     nbl_lists->nbl_fep[th]->nrj);
3154         }
3155     }
3156 }
3157
3158 /* Returns the next ci to be processes by our thread */
3159 static gmx_bool next_ci(const nbnxn_grid_t &grid,
3160                         int nth, int ci_block,
3161                         int *ci_x, int *ci_y,
3162                         int *ci_b, int *ci)
3163 {
3164     (*ci_b)++;
3165     (*ci)++;
3166
3167     if (*ci_b == ci_block)
3168     {
3169         /* Jump to the next block assigned to this task */
3170         *ci   += (nth - 1)*ci_block;
3171         *ci_b  = 0;
3172     }
3173
3174     if (*ci >= grid.nc)
3175     {
3176         return FALSE;
3177     }
3178
3179     while (*ci >= grid.cxy_ind[*ci_x*grid.numCells[YY] + *ci_y + 1])
3180     {
3181         *ci_y += 1;
3182         if (*ci_y == grid.numCells[YY])
3183         {
3184             *ci_x += 1;
3185             *ci_y  = 0;
3186         }
3187     }
3188
3189     return TRUE;
3190 }
3191
3192 /* Returns the distance^2 for which we put cell pairs in the list
3193  * without checking atom pair distances. This is usually < rlist^2.
3194  */
3195 static float boundingbox_only_distance2(const nbnxn_grid_t &iGrid,
3196                                         const nbnxn_grid_t &jGrid,
3197                                         real                rlist,
3198                                         gmx_bool            simple)
3199 {
3200     /* If the distance between two sub-cell bounding boxes is less
3201      * than this distance, do not check the distance between
3202      * all particle pairs in the sub-cell, since then it is likely
3203      * that the box pair has atom pairs within the cut-off.
3204      * We use the nblist cut-off minus 0.5 times the average x/y diagonal
3205      * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
3206      * Using more than 0.5 gains at most 0.5%.
3207      * If forces are calculated more than twice, the performance gain
3208      * in the force calculation outweighs the cost of checking.
3209      * Note that with subcell lists, the atom-pair distance check
3210      * is only performed when only 1 out of 8 sub-cells in within range,
3211      * this is because the GPU is much faster than the cpu.
3212      */
3213     real bbx, bby;
3214     real rbb2;
3215
3216     bbx = 0.5*(iGrid.cellSize[XX] + jGrid.cellSize[XX]);
3217     bby = 0.5*(iGrid.cellSize[YY] + jGrid.cellSize[YY]);
3218     if (!simple)
3219     {
3220         bbx /= c_gpuNumClusterPerCellX;
3221         bby /= c_gpuNumClusterPerCellY;
3222     }
3223
3224     rbb2 = std::max(0.0, rlist - 0.5*std::sqrt(bbx*bbx + bby*bby));
3225     rbb2 = rbb2 * rbb2;
3226
3227 #if !GMX_DOUBLE
3228     return rbb2;
3229 #else
3230     return (float)((1+GMX_FLOAT_EPS)*rbb2);
3231 #endif
3232 }
3233
3234 static int get_ci_block_size(const nbnxn_grid_t &iGrid,
3235                              gmx_bool bDomDec, int nth)
3236 {
3237     const int ci_block_enum      = 5;
3238     const int ci_block_denom     = 11;
3239     const int ci_block_min_atoms = 16;
3240     int       ci_block;
3241
3242     /* Here we decide how to distribute the blocks over the threads.
3243      * We use prime numbers to try to avoid that the grid size becomes
3244      * a multiple of the number of threads, which would lead to some
3245      * threads getting "inner" pairs and others getting boundary pairs,
3246      * which in turns will lead to load imbalance between threads.
3247      * Set the block size as 5/11/ntask times the average number of cells
3248      * in a y,z slab. This should ensure a quite uniform distribution
3249      * of the grid parts of the different thread along all three grid
3250      * zone boundaries with 3D domain decomposition. At the same time
3251      * the blocks will not become too small.
3252      */
3253     ci_block = (iGrid.nc*ci_block_enum)/(ci_block_denom*iGrid.numCells[XX]*nth);
3254
3255     /* Ensure the blocks are not too small: avoids cache invalidation */
3256     if (ci_block*iGrid.na_sc < ci_block_min_atoms)
3257     {
3258         ci_block = (ci_block_min_atoms + iGrid.na_sc - 1)/iGrid.na_sc;
3259     }
3260
3261     /* Without domain decomposition
3262      * or with less than 3 blocks per task, divide in nth blocks.
3263      */
3264     if (!bDomDec || nth*3*ci_block > iGrid.nc)
3265     {
3266         ci_block = (iGrid.nc + nth - 1)/nth;
3267     }
3268
3269     if (ci_block > 1 && (nth - 1)*ci_block >= iGrid.nc)
3270     {
3271         /* Some threads have no work. Although reducing the block size
3272          * does not decrease the block count on the first few threads,
3273          * with GPUs better mixing of "upper" cells that have more empty
3274          * clusters results in a somewhat lower max load over all threads.
3275          * Without GPUs the regime of so few atoms per thread is less
3276          * performance relevant, but with 8-wide SIMD the same reasoning
3277          * applies, since the pair list uses 4 i-atom "sub-clusters".
3278          */
3279         ci_block--;
3280     }
3281
3282     return ci_block;
3283 }
3284
3285 /* Returns the number of bits to right-shift a cluster index to obtain
3286  * the corresponding force buffer flag index.
3287  */
3288 static int getBufferFlagShift(int numAtomsPerCluster)
3289 {
3290     int bufferFlagShift = 0;
3291     while ((numAtomsPerCluster << bufferFlagShift) < NBNXN_BUFFERFLAG_SIZE)
3292     {
3293         bufferFlagShift++;
3294     }
3295
3296     return bufferFlagShift;
3297 }
3298
3299 static bool pairlistIsSimple(const NbnxnPairlistCpu gmx_unused &pairlist)
3300 {
3301     return true;
3302 }
3303
3304 static bool pairlistIsSimple(const NbnxnPairlistGpu gmx_unused &pairlist)
3305 {
3306     return false;
3307 }
3308
3309 static void makeClusterListWrapper(NbnxnPairlistCpu              *nbl,
3310                                    const nbnxn_grid_t gmx_unused &iGrid,
3311                                    const int                      ci,
3312                                    const nbnxn_grid_t            &jGrid,
3313                                    const int                      firstCell,
3314                                    const int                      lastCell,
3315                                    const bool                     excludeSubDiagonal,
3316                                    const nbnxn_atomdata_t        *nbat,
3317                                    const real                     rlist2,
3318                                    const real                     rbb2,
3319                                    const int                      nb_kernel_type,
3320                                    int                           *numDistanceChecks)
3321 {
3322     switch (nb_kernel_type)
3323     {
3324         case nbnxnk4x4_PlainC:
3325             makeClusterListSimple(jGrid,
3326                                   nbl, ci, firstCell, lastCell,
3327                                   excludeSubDiagonal,
3328                                   nbat->x().data(),
3329                                   rlist2, rbb2,
3330                                   numDistanceChecks);
3331             break;
3332 #ifdef GMX_NBNXN_SIMD_4XN
3333         case nbnxnk4xN_SIMD_4xN:
3334             makeClusterListSimd4xn(jGrid,
3335                                    nbl, ci, firstCell, lastCell,
3336                                    excludeSubDiagonal,
3337                                    nbat->x().data(),
3338                                    rlist2, rbb2,
3339                                    numDistanceChecks);
3340             break;
3341 #endif
3342 #ifdef GMX_NBNXN_SIMD_2XNN
3343         case nbnxnk4xN_SIMD_2xNN:
3344             makeClusterListSimd2xnn(jGrid,
3345                                     nbl, ci, firstCell, lastCell,
3346                                     excludeSubDiagonal,
3347                                     nbat->x().data(),
3348                                     rlist2, rbb2,
3349                                     numDistanceChecks);
3350             break;
3351 #endif
3352     }
3353 }
3354
3355 static void makeClusterListWrapper(NbnxnPairlistGpu              *nbl,
3356                                    const nbnxn_grid_t &gmx_unused iGrid,
3357                                    const int                      ci,
3358                                    const nbnxn_grid_t            &jGrid,
3359                                    const int                      firstCell,
3360                                    const int                      lastCell,
3361                                    const bool                     excludeSubDiagonal,
3362                                    const nbnxn_atomdata_t        *nbat,
3363                                    const real                     rlist2,
3364                                    const real                     rbb2,
3365                                    const int gmx_unused           nb_kernel_type,
3366                                    int                           *numDistanceChecks)
3367 {
3368     for (int cj = firstCell; cj <= lastCell; cj++)
3369     {
3370         make_cluster_list_supersub(iGrid, jGrid,
3371                                    nbl, ci, cj,
3372                                    excludeSubDiagonal,
3373                                    nbat->xstride, nbat->x().data(),
3374                                    rlist2, rbb2,
3375                                    numDistanceChecks);
3376     }
3377 }
3378
3379 static int getNumSimpleJClustersInList(const NbnxnPairlistCpu &nbl)
3380 {
3381     return nbl.cj.size();
3382 }
3383
3384 static int getNumSimpleJClustersInList(const gmx_unused NbnxnPairlistGpu &nbl)
3385 {
3386     return 0;
3387 }
3388
3389 static void incrementNumSimpleJClustersInList(NbnxnPairlistCpu *nbl,
3390                                               int               ncj_old_j)
3391 {
3392     nbl->ncjInUse += nbl->cj.size() - ncj_old_j;
3393 }
3394
3395 static void incrementNumSimpleJClustersInList(NbnxnPairlistGpu gmx_unused *nbl,
3396                                               int              gmx_unused  ncj_old_j)
3397 {
3398 }
3399
3400 static void checkListSizeConsistency(const NbnxnPairlistCpu &nbl,
3401                                      const bool              haveFreeEnergy)
3402 {
3403     GMX_RELEASE_ASSERT(static_cast<size_t>(nbl.ncjInUse) == nbl.cj.size() || haveFreeEnergy,
3404                        "Without free-energy all cj pair-list entries should be in use. "
3405                        "Note that subsequent code does not make use of the equality, "
3406                        "this check is only here to catch bugs");
3407 }
3408
3409 static void checkListSizeConsistency(const NbnxnPairlistGpu gmx_unused &nbl,
3410                                      bool gmx_unused                    haveFreeEnergy)
3411 {
3412     /* We currently can not check consistency here */
3413 }
3414
3415 /* Set the buffer flags for newly added entries in the list */
3416 static void setBufferFlags(const NbnxnPairlistCpu &nbl,
3417                            const int               ncj_old_j,
3418                            const int               gridj_flag_shift,
3419                            gmx_bitmask_t          *gridj_flag,
3420                            const int               th)
3421 {
3422     if (static_cast<gmx::index>(nbl.cj.size()) > ncj_old_j)
3423     {
3424         int cbFirst = nbl.cj[ncj_old_j].cj >> gridj_flag_shift;
3425         int cbLast  = nbl.cj.back().cj >> gridj_flag_shift;
3426         for (int cb = cbFirst; cb <= cbLast; cb++)
3427         {
3428             bitmask_init_bit(&gridj_flag[cb], th);
3429         }
3430     }
3431 }
3432
3433 static void setBufferFlags(const NbnxnPairlistGpu gmx_unused &nbl,
3434                            int gmx_unused                     ncj_old_j,
3435                            int gmx_unused                     gridj_flag_shift,
3436                            gmx_bitmask_t gmx_unused          *gridj_flag,
3437                            int gmx_unused                     th)
3438 {
3439     GMX_ASSERT(false, "This function should never be called");
3440 }
3441
3442 /* Generates the part of pair-list nbl assigned to our thread */
3443 template <typename T>
3444 static void nbnxn_make_pairlist_part(const nbnxn_search *nbs,
3445                                      const nbnxn_grid_t &iGrid,
3446                                      const nbnxn_grid_t &jGrid,
3447                                      nbnxn_search_work_t *work,
3448                                      const nbnxn_atomdata_t *nbat,
3449                                      const t_blocka &exclusions,
3450                                      real rlist,
3451                                      int nb_kernel_type,
3452                                      int ci_block,
3453                                      gmx_bool bFBufferFlag,
3454                                      int nsubpair_max,
3455                                      gmx_bool progBal,
3456                                      float nsubpair_tot_est,
3457                                      int th, int nth,
3458                                      T *nbl,
3459                                      t_nblist *nbl_fep)
3460 {
3461     int               na_cj_2log;
3462     matrix            box;
3463     real              rlist2, rl_fep2 = 0;
3464     float             rbb2;
3465     int               ci_b, ci, ci_x, ci_y, ci_xy;
3466     ivec              shp;
3467     real              bx0, bx1, by0, by1, bz0, bz1;
3468     real              bz1_frac;
3469     real              d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
3470     int               cxf, cxl, cyf, cyf_x, cyl;
3471     int               numDistanceChecks;
3472     int               gridi_flag_shift = 0, gridj_flag_shift = 0;
3473     gmx_bitmask_t    *gridj_flag       = nullptr;
3474     int               ncj_old_i, ncj_old_j;
3475
3476     nbs_cycle_start(&work->cc[enbsCCsearch]);
3477
3478     if (jGrid.bSimple != pairlistIsSimple(*nbl) ||
3479         iGrid.bSimple != pairlistIsSimple(*nbl))
3480     {
3481         gmx_incons("Grid incompatible with pair-list");
3482     }
3483
3484     sync_work(nbl);
3485     GMX_ASSERT(nbl->na_ci == jGrid.na_c, "The cluster sizes in the list and grid should match");
3486     nbl->na_cj = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
3487     na_cj_2log = get_2log(nbl->na_cj);
3488
3489     nbl->rlist  = rlist;
3490
3491     if (bFBufferFlag)
3492     {
3493         /* Determine conversion of clusters to flag blocks */
3494         gridi_flag_shift = getBufferFlagShift(nbl->na_ci);
3495         gridj_flag_shift = getBufferFlagShift(nbl->na_cj);
3496
3497         gridj_flag       = work->buffer_flags.flag;
3498     }
3499
3500     copy_mat(nbs->box, box);
3501
3502     rlist2 = nbl->rlist*nbl->rlist;
3503
3504     if (nbs->bFEP && !pairlistIsSimple(*nbl))
3505     {
3506         /* Determine an atom-pair list cut-off distance for FEP atom pairs.
3507          * We should not simply use rlist, since then we would not have
3508          * the small, effective buffering of the NxN lists.
3509          * The buffer is on overestimate, but the resulting cost for pairs
3510          * beyond rlist is neglible compared to the FEP pairs within rlist.
3511          */
3512         rl_fep2 = nbl->rlist + effective_buffer_1x1_vs_MxN(iGrid, jGrid);
3513
3514         if (debug)
3515         {
3516             fprintf(debug, "nbl_fep atom-pair rlist %f\n", rl_fep2);
3517         }
3518         rl_fep2 = rl_fep2*rl_fep2;
3519     }
3520
3521     rbb2 = boundingbox_only_distance2(iGrid, jGrid, nbl->rlist, pairlistIsSimple(*nbl));
3522
3523     if (debug)
3524     {
3525         fprintf(debug, "nbl bounding box only distance %f\n", std::sqrt(rbb2));
3526     }
3527
3528     const bool isIntraGridList = (&iGrid == &jGrid);
3529
3530     /* Set the shift range */
3531     for (int d = 0; d < DIM; d++)
3532     {
3533         /* Check if we need periodicity shifts.
3534          * Without PBC or with domain decomposition we don't need them.
3535          */
3536         if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
3537         {
3538             shp[d] = 0;
3539         }
3540         else
3541         {
3542             const real listRangeCellToCell = listRangeForGridCellToGridCell(rlist, iGrid, jGrid);
3543             if (d == XX &&
3544                 box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < listRangeCellToCell)
3545             {
3546                 shp[d] = 2;
3547             }
3548             else
3549             {
3550                 shp[d] = 1;
3551             }
3552         }
3553     }
3554     const bool bSimple = pairlistIsSimple(*nbl);
3555     gmx::ArrayRef<const nbnxn_bb_t> bb_i;
3556 #if NBNXN_BBXXXX
3557     gmx::ArrayRef<const float>      pbb_i;
3558     if (bSimple)
3559     {
3560         bb_i  = iGrid.bb;
3561     }
3562     else
3563     {
3564         pbb_i = iGrid.pbb;
3565     }
3566 #else
3567     /* We use the normal bounding box format for both grid types */
3568     bb_i  = iGrid.bb;
3569 #endif
3570     gmx::ArrayRef<const float> bbcz_i  = iGrid.bbcz;
3571     gmx::ArrayRef<const int>   flags_i = iGrid.flags;
3572     gmx::ArrayRef<const float> bbcz_j  = jGrid.bbcz;
3573     int                        cell0_i = iGrid.cell0;
3574
3575     if (debug)
3576     {
3577         fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
3578                 iGrid.nc, iGrid.nc/static_cast<double>(iGrid.numCells[XX]*iGrid.numCells[YY]), ci_block);
3579     }
3580
3581     numDistanceChecks = 0;
3582
3583     const real listRangeBBToJCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGrid));
3584
3585     /* Initially ci_b and ci to 1 before where we want them to start,
3586      * as they will both be incremented in next_ci.
3587      */
3588     ci_b = -1;
3589     ci   = th*ci_block - 1;
3590     ci_x = 0;
3591     ci_y = 0;
3592     while (next_ci(iGrid, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
3593     {
3594         if (bSimple && flags_i[ci] == 0)
3595         {
3596             continue;
3597         }
3598
3599         ncj_old_i = getNumSimpleJClustersInList(*nbl);
3600
3601         d2cx = 0;
3602         if (!isIntraGridList && shp[XX] == 0)
3603         {
3604             if (bSimple)
3605             {
3606                 bx1 = bb_i[ci].upper[BB_X];
3607             }
3608             else
3609             {
3610                 bx1 = iGrid.c0[XX] + (ci_x+1)*iGrid.cellSize[XX];
3611             }
3612             if (bx1 < jGrid.c0[XX])
3613             {
3614                 d2cx = gmx::square(jGrid.c0[XX] - bx1);
3615
3616                 if (d2cx >= listRangeBBToJCell2)
3617                 {
3618                     continue;
3619                 }
3620             }
3621         }
3622
3623         ci_xy = ci_x*iGrid.numCells[YY] + ci_y;
3624
3625         /* Loop over shift vectors in three dimensions */
3626         for (int tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
3627         {
3628             const real shz = tz*box[ZZ][ZZ];
3629
3630             bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
3631             bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
3632
3633             if (tz == 0)
3634             {
3635                 d2z = 0;
3636             }
3637             else if (tz < 0)
3638             {
3639                 d2z = gmx::square(bz1);
3640             }
3641             else
3642             {
3643                 d2z = gmx::square(bz0 - box[ZZ][ZZ]);
3644             }
3645
3646             d2z_cx = d2z + d2cx;
3647
3648             if (d2z_cx >= rlist2)
3649             {
3650                 continue;
3651             }
3652
3653             bz1_frac = bz1/(iGrid.cxy_ind[ci_xy+1] - iGrid.cxy_ind[ci_xy]);
3654             if (bz1_frac < 0)
3655             {
3656                 bz1_frac = 0;
3657             }
3658             /* The check with bz1_frac close to or larger than 1 comes later */
3659
3660             for (int ty = -shp[YY]; ty <= shp[YY]; ty++)
3661             {
3662                 const real shy = ty*box[YY][YY] + tz*box[ZZ][YY];
3663
3664                 if (bSimple)
3665                 {
3666                     by0 = bb_i[ci].lower[BB_Y] + shy;
3667                     by1 = bb_i[ci].upper[BB_Y] + shy;
3668                 }
3669                 else
3670                 {
3671                     by0 = iGrid.c0[YY] + (ci_y  )*iGrid.cellSize[YY] + shy;
3672                     by1 = iGrid.c0[YY] + (ci_y+1)*iGrid.cellSize[YY] + shy;
3673                 }
3674
3675                 get_cell_range<YY>(by0, by1,
3676                                    jGrid,
3677                                    d2z_cx, rlist,
3678                                    &cyf, &cyl);
3679
3680                 if (cyf > cyl)
3681                 {
3682                     continue;
3683                 }
3684
3685                 d2z_cy = d2z;
3686                 if (by1 < jGrid.c0[YY])
3687                 {
3688                     d2z_cy += gmx::square(jGrid.c0[YY] - by1);
3689                 }
3690                 else if (by0 > jGrid.c1[YY])
3691                 {
3692                     d2z_cy += gmx::square(by0 - jGrid.c1[YY]);
3693                 }
3694
3695                 for (int tx = -shp[XX]; tx <= shp[XX]; tx++)
3696                 {
3697                     const int  shift              = XYZ2IS(tx, ty, tz);
3698
3699                     const bool excludeSubDiagonal = (isIntraGridList && shift == CENTRAL);
3700
3701                     if (c_pbcShiftBackward && isIntraGridList && shift > CENTRAL)
3702                     {
3703                         continue;
3704                     }
3705
3706                     const real shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
3707
3708                     if (bSimple)
3709                     {
3710                         bx0 = bb_i[ci].lower[BB_X] + shx;
3711                         bx1 = bb_i[ci].upper[BB_X] + shx;
3712                     }
3713                     else
3714                     {
3715                         bx0 = iGrid.c0[XX] + (ci_x  )*iGrid.cellSize[XX] + shx;
3716                         bx1 = iGrid.c0[XX] + (ci_x+1)*iGrid.cellSize[XX] + shx;
3717                     }
3718
3719                     get_cell_range<XX>(bx0, bx1,
3720                                        jGrid,
3721                                        d2z_cy, rlist,
3722                                        &cxf, &cxl);
3723
3724                     if (cxf > cxl)
3725                     {
3726                         continue;
3727                     }
3728
3729                     addNewIEntry(nbl, cell0_i+ci, shift, flags_i[ci]);
3730
3731                     if ((!c_pbcShiftBackward || excludeSubDiagonal) &&
3732                         cxf < ci_x)
3733                     {
3734                         /* Leave the pairs with i > j.
3735                          * x is the major index, so skip half of it.
3736                          */
3737                         cxf = ci_x;
3738                     }
3739
3740                     set_icell_bb(iGrid, ci, shx, shy, shz,
3741                                  nbl->work);
3742
3743                     icell_set_x(cell0_i+ci, shx, shy, shz,
3744                                 nbat->xstride, nbat->x().data(),
3745                                 nb_kernel_type,
3746                                 nbl->work);
3747
3748                     for (int cx = cxf; cx <= cxl; cx++)
3749                     {
3750                         d2zx = d2z;
3751                         if (jGrid.c0[XX] + cx*jGrid.cellSize[XX] > bx1)
3752                         {
3753                             d2zx += gmx::square(jGrid.c0[XX] + cx*jGrid.cellSize[XX] - bx1);
3754                         }
3755                         else if (jGrid.c0[XX] + (cx+1)*jGrid.cellSize[XX] < bx0)
3756                         {
3757                             d2zx += gmx::square(jGrid.c0[XX] + (cx+1)*jGrid.cellSize[XX] - bx0);
3758                         }
3759
3760                         if (isIntraGridList &&
3761                             cx == 0 &&
3762                             (!c_pbcShiftBackward || shift == CENTRAL) &&
3763                             cyf < ci_y)
3764                         {
3765                             /* Leave the pairs with i > j.
3766                              * Skip half of y when i and j have the same x.
3767                              */
3768                             cyf_x = ci_y;
3769                         }
3770                         else
3771                         {
3772                             cyf_x = cyf;
3773                         }
3774
3775                         for (int cy = cyf_x; cy <= cyl; cy++)
3776                         {
3777                             const int columnStart = jGrid.cxy_ind[cx*jGrid.numCells[YY] + cy];
3778                             const int columnEnd   = jGrid.cxy_ind[cx*jGrid.numCells[YY] + cy + 1];
3779
3780                             d2zxy = d2zx;
3781                             if (jGrid.c0[YY] + cy*jGrid.cellSize[YY] > by1)
3782                             {
3783                                 d2zxy += gmx::square(jGrid.c0[YY] + cy*jGrid.cellSize[YY] - by1);
3784                             }
3785                             else if (jGrid.c0[YY] + (cy+1)*jGrid.cellSize[YY] < by0)
3786                             {
3787                                 d2zxy += gmx::square(jGrid.c0[YY] + (cy+1)*jGrid.cellSize[YY] - by0);
3788                             }
3789                             if (columnStart < columnEnd && d2zxy < listRangeBBToJCell2)
3790                             {
3791                                 /* To improve efficiency in the common case
3792                                  * of a homogeneous particle distribution,
3793                                  * we estimate the index of the middle cell
3794                                  * in range (midCell). We search down and up
3795                                  * starting from this index.
3796                                  *
3797                                  * Note that the bbcz_j array contains bounds
3798                                  * for i-clusters, thus for clusters of 4 atoms.
3799                                  * For the common case where the j-cluster size
3800                                  * is 8, we could step with a stride of 2,
3801                                  * but we do not do this because it would
3802                                  * complicate this code even more.
3803                                  */
3804                                 int midCell = columnStart + static_cast<int>(bz1_frac*(columnEnd - columnStart));
3805                                 if (midCell >= columnEnd)
3806                                 {
3807                                     midCell = columnEnd - 1;
3808                                 }
3809
3810                                 d2xy = d2zxy - d2z;
3811
3812                                 /* Find the lowest cell that can possibly
3813                                  * be within range.
3814                                  * Check if we hit the bottom of the grid,
3815                                  * if the j-cell is below the i-cell and if so,
3816                                  * if it is within range.
3817                                  */
3818                                 int downTestCell = midCell;
3819                                 while (downTestCell >= columnStart &&
3820                                        (bbcz_j[downTestCell*NNBSBB_D + 1] >= bz0 ||
3821                                         d2xy + gmx::square(bbcz_j[downTestCell*NNBSBB_D + 1] - bz0) < rlist2))
3822                                 {
3823                                     downTestCell--;
3824                                 }
3825                                 int firstCell = downTestCell + 1;
3826
3827                                 /* Find the highest cell that can possibly
3828                                  * be within range.
3829                                  * Check if we hit the top of the grid,
3830                                  * if the j-cell is above the i-cell and if so,
3831                                  * if it is within range.
3832                                  */
3833                                 int upTestCell = midCell + 1;
3834                                 while (upTestCell < columnEnd &&
3835                                        (bbcz_j[upTestCell*NNBSBB_D] <= bz1 ||
3836                                         d2xy + gmx::square(bbcz_j[upTestCell*NNBSBB_D] - bz1) < rlist2))
3837                                 {
3838                                     upTestCell++;
3839                                 }
3840                                 int lastCell = upTestCell - 1;
3841
3842 #define NBNXN_REFCODE 0
3843 #if NBNXN_REFCODE
3844                                 {
3845                                     /* Simple reference code, for debugging,
3846                                      * overrides the more complex code above.
3847                                      */
3848                                     firstCell = columnEnd;
3849                                     lastCell  = -1;
3850                                     for (int k = columnStart; k < columnEnd; k++)
3851                                     {
3852                                         if (d2xy + gmx::square(bbcz_j[k*NNBSBB_D + 1] - bz0) < rlist2 &&
3853                                             k < firstCell)
3854                                         {
3855                                             firstCell = k;
3856                                         }
3857                                         if (d2xy + gmx::square(bbcz_j[k*NNBSBB_D] - bz1) < rlist2 &&
3858                                             k > lastCell)
3859                                         {
3860                                             lastCell = k;
3861                                         }
3862                                     }
3863                                 }
3864 #endif
3865
3866                                 if (isIntraGridList)
3867                                 {
3868                                     /* We want each atom/cell pair only once,
3869                                      * only use cj >= ci.
3870                                      */
3871                                     if (!c_pbcShiftBackward || shift == CENTRAL)
3872                                     {
3873                                         firstCell = std::max(firstCell, ci);
3874                                     }
3875                                 }
3876
3877                                 if (firstCell <= lastCell)
3878                                 {
3879                                     GMX_ASSERT(firstCell >= columnStart && lastCell < columnEnd, "The range should reside within the current grid column");
3880
3881                                     /* For f buffer flags with simple lists */
3882                                     ncj_old_j = getNumSimpleJClustersInList(*nbl);
3883
3884                                     check_cell_list_space(nbl, lastCell - firstCell + 1);
3885
3886                                     makeClusterListWrapper(nbl,
3887                                                            iGrid, ci,
3888                                                            jGrid, firstCell, lastCell,
3889                                                            excludeSubDiagonal,
3890                                                            nbat,
3891                                                            rlist2, rbb2,
3892                                                            nb_kernel_type,
3893                                                            &numDistanceChecks);
3894
3895                                     if (bFBufferFlag)
3896                                     {
3897                                         setBufferFlags(*nbl, ncj_old_j, gridj_flag_shift,
3898                                                        gridj_flag, th);
3899                                     }
3900
3901                                     incrementNumSimpleJClustersInList(nbl, ncj_old_j);
3902                                 }
3903                             }
3904                         }
3905                     }
3906
3907                     /* Set the exclusions for this ci list */
3908                     setExclusionsForIEntry(nbs,
3909                                            nbl,
3910                                            excludeSubDiagonal,
3911                                            na_cj_2log,
3912                                            *getOpenIEntry(nbl),
3913                                            exclusions);
3914
3915                     if (nbs->bFEP)
3916                     {
3917                         make_fep_list(nbs, nbat, nbl,
3918                                       excludeSubDiagonal,
3919                                       getOpenIEntry(nbl),
3920                                       shx, shy, shz,
3921                                       rl_fep2,
3922                                       iGrid, jGrid, nbl_fep);
3923                     }
3924
3925                     /* Close this ci list */
3926                     closeIEntry(nbl,
3927                                 nsubpair_max,
3928                                 progBal, nsubpair_tot_est,
3929                                 th, nth);
3930                 }
3931             }
3932         }
3933
3934         if (bFBufferFlag && getNumSimpleJClustersInList(*nbl) > ncj_old_i)
3935         {
3936             bitmask_init_bit(&(work->buffer_flags.flag[(iGrid.cell0+ci) >> gridi_flag_shift]), th);
3937         }
3938     }
3939
3940     work->ndistc = numDistanceChecks;
3941
3942     nbs_cycle_stop(&work->cc[enbsCCsearch]);
3943
3944     checkListSizeConsistency(*nbl, nbs->bFEP);
3945
3946     if (debug)
3947     {
3948         fprintf(debug, "number of distance checks %d\n", numDistanceChecks);
3949
3950         print_nblist_statistics(debug, nbl, nbs, rlist);
3951
3952         if (nbs->bFEP)
3953         {
3954             fprintf(debug, "nbl FEP list pairs: %d\n", nbl_fep->nrj);
3955         }
3956     }
3957 }
3958
3959 static void reduce_buffer_flags(const nbnxn_search         *nbs,
3960                                 int                         nsrc,
3961                                 const nbnxn_buffer_flags_t *dest)
3962 {
3963     for (int s = 0; s < nsrc; s++)
3964     {
3965         gmx_bitmask_t * flag = nbs->work[s].buffer_flags.flag;
3966
3967         for (int b = 0; b < dest->nflag; b++)
3968         {
3969             bitmask_union(&(dest->flag[b]), flag[b]);
3970         }
3971     }
3972 }
3973
3974 static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
3975 {
3976     int           nelem, nkeep, ncopy, nred, out;
3977     gmx_bitmask_t mask_0;
3978
3979     nelem = 0;
3980     nkeep = 0;
3981     ncopy = 0;
3982     nred  = 0;
3983     bitmask_init_bit(&mask_0, 0);
3984     for (int b = 0; b < flags->nflag; b++)
3985     {
3986         if (bitmask_is_equal(flags->flag[b], mask_0))
3987         {
3988             /* Only flag 0 is set, no copy of reduction required */
3989             nelem++;
3990             nkeep++;
3991         }
3992         else if (!bitmask_is_zero(flags->flag[b]))
3993         {
3994             int c = 0;
3995             for (out = 0; out < nout; out++)
3996             {
3997                 if (bitmask_is_set(flags->flag[b], out))
3998                 {
3999                     c++;
4000                 }
4001             }
4002             nelem += c;
4003             if (c == 1)
4004             {
4005                 ncopy++;
4006             }
4007             else
4008             {
4009                 nred += c;
4010             }
4011         }
4012     }
4013
4014     fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
4015             flags->nflag, nout,
4016             nelem/static_cast<double>(flags->nflag),
4017             nkeep/static_cast<double>(flags->nflag),
4018             ncopy/static_cast<double>(flags->nflag),
4019             nred/static_cast<double>(flags->nflag));
4020 }
4021
4022 /* Copies the list entries from src to dest when cjStart <= *cjGlobal < cjEnd.
4023  * *cjGlobal is updated with the cj count in src.
4024  * When setFlags==true, flag bit t is set in flag for all i and j clusters.
4025  */
4026 template<bool setFlags>
4027 static void copySelectedListRange(const nbnxn_ci_t * gmx_restrict srcCi,
4028                                   const NbnxnPairlistCpu * gmx_restrict src,
4029                                   NbnxnPairlistCpu * gmx_restrict dest,
4030                                   gmx_bitmask_t *flag,
4031                                   int iFlagShift, int jFlagShift, int t)
4032 {
4033     const int ncj = srcCi->cj_ind_end - srcCi->cj_ind_start;
4034
4035     dest->ci.push_back(*srcCi);
4036     dest->ci.back().cj_ind_start = dest->cj.size();
4037     dest->ci.back().cj_ind_end   = dest->cj.size() + ncj;
4038
4039     if (setFlags)
4040     {
4041         bitmask_init_bit(&flag[srcCi->ci >> iFlagShift], t);
4042     }
4043
4044     for (int j = srcCi->cj_ind_start; j < srcCi->cj_ind_end; j++)
4045     {
4046         dest->cj.push_back(src->cj[j]);
4047
4048         if (setFlags)
4049         {
4050             /* NOTE: This is relatively expensive, since this
4051              * operation is done for all elements in the list,
4052              * whereas at list generation this is done only
4053              * once for each flag entry.
4054              */
4055             bitmask_init_bit(&flag[src->cj[j].cj >> jFlagShift], t);
4056         }
4057     }
4058 }
4059
4060 /* This routine re-balances the pairlists such that all are nearly equally
4061  * sized. Only whole i-entries are moved between lists. These are moved
4062  * between the ends of the lists, such that the buffer reduction cost should
4063  * not change significantly.
4064  * Note that all original reduction flags are currently kept. This can lead
4065  * to reduction of parts of the force buffer that could be avoided. But since
4066  * the original lists are quite balanced, this will only give minor overhead.
4067  */
4068 static void rebalanceSimpleLists(int                                  numLists,
4069                                  NbnxnPairlistCpu * const * const     srcSet,
4070                                  NbnxnPairlistCpu                   **destSet,
4071                                  gmx::ArrayRef<nbnxn_search_work_t>   searchWork)
4072 {
4073     int ncjTotal = 0;
4074     for (int s = 0; s < numLists; s++)
4075     {
4076         ncjTotal += srcSet[s]->ncjInUse;
4077     }
4078     int ncjTarget = (ncjTotal + numLists - 1)/numLists;
4079
4080 #pragma omp parallel num_threads(numLists)
4081     {
4082         int t       = gmx_omp_get_thread_num();
4083
4084         int cjStart = ncjTarget* t;
4085         int cjEnd   = ncjTarget*(t + 1);
4086
4087         /* The destination pair-list for task/thread t */
4088         NbnxnPairlistCpu *dest = destSet[t];
4089
4090         clear_pairlist(dest);
4091         dest->na_cj   = srcSet[0]->na_cj;
4092
4093         /* Note that the flags in the work struct (still) contain flags
4094          * for all entries that are present in srcSet->nbl[t].
4095          */
4096         gmx_bitmask_t *flag       = searchWork[t].buffer_flags.flag;
4097
4098         int            iFlagShift = getBufferFlagShift(dest->na_ci);
4099         int            jFlagShift = getBufferFlagShift(dest->na_cj);
4100
4101         int            cjGlobal   = 0;
4102         for (int s = 0; s < numLists && cjGlobal < cjEnd; s++)
4103         {
4104             const NbnxnPairlistCpu *src = srcSet[s];
4105
4106             if (cjGlobal + src->ncjInUse > cjStart)
4107             {
4108                 for (gmx::index i = 0; i < static_cast<gmx::index>(src->ci.size()) && cjGlobal < cjEnd; i++)
4109                 {
4110                     const nbnxn_ci_t *srcCi = &src->ci[i];
4111                     int               ncj   = srcCi->cj_ind_end - srcCi->cj_ind_start;
4112                     if (cjGlobal >= cjStart)
4113                     {
4114                         /* If the source list is not our own, we need to set
4115                          * extra flags (the template bool parameter).
4116                          */
4117                         if (s != t)
4118                         {
4119                             copySelectedListRange
4120                             <true>
4121                                 (srcCi, src, dest,
4122                                 flag, iFlagShift, jFlagShift, t);
4123                         }
4124                         else
4125                         {
4126                             copySelectedListRange
4127                             <false>
4128                                 (srcCi, src,
4129                                 dest, flag, iFlagShift, jFlagShift, t);
4130                         }
4131                     }
4132                     cjGlobal += ncj;
4133                 }
4134             }
4135             else
4136             {
4137                 cjGlobal += src->ncjInUse;
4138             }
4139         }
4140
4141         dest->ncjInUse = dest->cj.size();
4142     }
4143
4144 #ifndef NDEBUG
4145     int ncjTotalNew = 0;
4146     for (int s = 0; s < numLists; s++)
4147     {
4148         ncjTotalNew += destSet[s]->ncjInUse;
4149     }
4150     GMX_RELEASE_ASSERT(ncjTotalNew == ncjTotal, "The total size of the lists before and after rebalancing should match");
4151 #endif
4152 }
4153
4154 /* Returns if the pairlists are so imbalanced that it is worth rebalancing. */
4155 static bool checkRebalanceSimpleLists(const nbnxn_pairlist_set_t *listSet)
4156 {
4157     int numLists = listSet->nnbl;
4158     int ncjMax   = 0;
4159     int ncjTotal = 0;
4160     for (int s = 0; s < numLists; s++)
4161     {
4162         ncjMax    = std::max(ncjMax, listSet->nbl[s]->ncjInUse);
4163         ncjTotal += listSet->nbl[s]->ncjInUse;
4164     }
4165     if (debug)
4166     {
4167         fprintf(debug, "Pair-list ncjMax %d ncjTotal %d\n", ncjMax, ncjTotal);
4168     }
4169     /* The rebalancing adds 3% extra time to the search. Heuristically we
4170      * determined that under common conditions the non-bonded kernel balance
4171      * improvement will outweigh this when the imbalance is more than 3%.
4172      * But this will, obviously, depend on search vs kernel time and nstlist.
4173      */
4174     const real rebalanceTolerance = 1.03;
4175
4176     return numLists*ncjMax > ncjTotal*rebalanceTolerance;
4177 }
4178
4179 /* Perform a count (linear) sort to sort the smaller lists to the end.
4180  * This avoids load imbalance on the GPU, as large lists will be
4181  * scheduled and executed first and the smaller lists later.
4182  * Load balancing between multi-processors only happens at the end
4183  * and there smaller lists lead to more effective load balancing.
4184  * The sorting is done on the cj4 count, not on the actual pair counts.
4185  * Not only does this make the sort faster, but it also results in
4186  * better load balancing than using a list sorted on exact load.
4187  * This function swaps the pointer in the pair list to avoid a copy operation.
4188  */
4189 static void sort_sci(NbnxnPairlistGpu *nbl)
4190 {
4191     if (nbl->ncj4 <= nbl->nsci)
4192     {
4193         /* nsci = 0 or all sci have size 1, sorting won't change the order */
4194         return;
4195     }
4196
4197     NbnxnPairlistGpuWork &work = *nbl->work;
4198
4199     /* We will distinguish differences up to double the average */
4200     const int m = (2*nbl->ncj4)/nbl->nsci;
4201
4202     if (work.sci_sort_nalloc != nbl->sci_nalloc)
4203     {
4204         work.sci_sort_nalloc = nbl->sci_nalloc;
4205         nbnxn_realloc_void(reinterpret_cast<void **>(&work.sci_sort),
4206                            0,
4207                            work.sci_sort_nalloc*sizeof(*work.sci_sort),
4208                            nbl->alloc, nbl->free);
4209     }
4210
4211     std::vector<int> &sort = work.sortBuffer;
4212     /* Set up m + 1 entries in sort, initialized at 0 */
4213     sort.clear();
4214     sort.resize(m + 1, 0);
4215     /* Count the entries of each size */
4216     for (int s = 0; s < nbl->nsci; s++)
4217     {
4218         int i = std::min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
4219         sort[i]++;
4220     }
4221     /* Calculate the offset for each count */
4222     int s0  = sort[m];
4223     sort[m] = 0;
4224     for (int i = m - 1; i >= 0; i--)
4225     {
4226         int s1  = sort[i];
4227         sort[i] = sort[i + 1] + s0;
4228         s0      = s1;
4229     }
4230
4231     /* Sort entries directly into place */
4232     nbnxn_sci_t *sci_sort = work.sci_sort;
4233     for (int s = 0; s < nbl->nsci; s++)
4234     {
4235         int i = std::min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
4236         sci_sort[sort[i]++] = nbl->sci[s];
4237     }
4238
4239     /* Swap the sci pointers so we use the new, sorted list */
4240     std::swap(nbl->sci, work.sci_sort);
4241 }
4242
4243 /* Make a local or non-local pair-list, depending on iloc */
4244 void nbnxn_make_pairlist(nbnxn_search         *nbs,
4245                          nbnxn_atomdata_t     *nbat,
4246                          const t_blocka       *excl,
4247                          real                  rlist,
4248                          int                   min_ci_balanced,
4249                          nbnxn_pairlist_set_t *nbl_list,
4250                          int                   iloc,
4251                          int                   nb_kernel_type,
4252                          t_nrnb               *nrnb)
4253 {
4254     int                nsubpair_target;
4255     float              nsubpair_tot_est;
4256     int                nnbl;
4257     int                ci_block;
4258     gmx_bool           CombineNBLists;
4259     gmx_bool           progBal;
4260     int                np_tot, np_noq, np_hlj, nap;
4261
4262     nnbl            = nbl_list->nnbl;
4263     CombineNBLists  = nbl_list->bCombined;
4264
4265     if (debug)
4266     {
4267         fprintf(debug, "ns making %d nblists\n", nnbl);
4268     }
4269
4270     nbat->bUseBufferFlags = (nbat->out.size() > 1);
4271     /* We should re-init the flags before making the first list */
4272     if (nbat->bUseBufferFlags && LOCAL_I(iloc))
4273     {
4274         init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
4275     }
4276
4277     int nzi;
4278     if (LOCAL_I(iloc))
4279     {
4280         /* Only zone (grid) 0 vs 0 */
4281         nzi = 1;
4282     }
4283     else
4284     {
4285         nzi = nbs->zones->nizone;
4286     }
4287
4288     if (!nbl_list->bSimple && min_ci_balanced > 0)
4289     {
4290         get_nsubpair_target(nbs, iloc, rlist, min_ci_balanced,
4291                             &nsubpair_target, &nsubpair_tot_est);
4292     }
4293     else
4294     {
4295         nsubpair_target  = 0;
4296         nsubpair_tot_est = 0;
4297     }
4298
4299     /* Clear all pair-lists */
4300     for (int th = 0; th < nnbl; th++)
4301     {
4302         if (nbl_list->bSimple)
4303         {
4304             clear_pairlist(nbl_list->nbl[th]);
4305         }
4306         else
4307         {
4308             clear_pairlist(nbl_list->nblGpu[th]);
4309         }
4310
4311         if (nbs->bFEP)
4312         {
4313             clear_pairlist_fep(nbl_list->nbl_fep[th]);
4314         }
4315     }
4316
4317     for (int zi = 0; zi < nzi; zi++)
4318     {
4319         const nbnxn_grid_t &iGrid = nbs->grid[zi];
4320
4321         int                 zj0;
4322         int                 zj1;
4323         if (LOCAL_I(iloc))
4324         {
4325             zj0 = 0;
4326             zj1 = 1;
4327         }
4328         else
4329         {
4330             zj0 = nbs->zones->izone[zi].j0;
4331             zj1 = nbs->zones->izone[zi].j1;
4332             if (zi == 0)
4333             {
4334                 zj0++;
4335             }
4336         }
4337         for (int zj = zj0; zj < zj1; zj++)
4338         {
4339             const nbnxn_grid_t &jGrid = nbs->grid[zj];
4340
4341             if (debug)
4342             {
4343                 fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
4344             }
4345
4346             nbs_cycle_start(&nbs->cc[enbsCCsearch]);
4347
4348             ci_block = get_ci_block_size(iGrid, nbs->DomDec, nnbl);
4349
4350             /* With GPU: generate progressively smaller lists for
4351              * load balancing for local only or non-local with 2 zones.
4352              */
4353             progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);
4354
4355 #pragma omp parallel for num_threads(nnbl) schedule(static)
4356             for (int th = 0; th < nnbl; th++)
4357             {
4358                 try
4359                 {
4360                     /* Re-init the thread-local work flag data before making
4361                      * the first list (not an elegant conditional).
4362                      */
4363                     if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0)))
4364                     {
4365                         init_buffer_flags(&nbs->work[th].buffer_flags, nbat->numAtoms());
4366                     }
4367
4368                     if (CombineNBLists && th > 0)
4369                     {
4370                         GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
4371
4372                         clear_pairlist(nbl_list->nblGpu[th]);
4373                     }
4374
4375                     /* Divide the i super cell equally over the nblists */
4376                     if (nbl_list->bSimple)
4377                     {
4378                         nbnxn_make_pairlist_part(nbs, iGrid, jGrid,
4379                                                  &nbs->work[th], nbat, *excl,
4380                                                  rlist,
4381                                                  nb_kernel_type,
4382                                                  ci_block,
4383                                                  nbat->bUseBufferFlags,
4384                                                  nsubpair_target,
4385                                                  progBal, nsubpair_tot_est,
4386                                                  th, nnbl,
4387                                                  nbl_list->nbl[th],
4388                                                  nbl_list->nbl_fep[th]);
4389                     }
4390                     else
4391                     {
4392                         nbnxn_make_pairlist_part(nbs, iGrid, jGrid,
4393                                                  &nbs->work[th], nbat, *excl,
4394                                                  rlist,
4395                                                  nb_kernel_type,
4396                                                  ci_block,
4397                                                  nbat->bUseBufferFlags,
4398                                                  nsubpair_target,
4399                                                  progBal, nsubpair_tot_est,
4400                                                  th, nnbl,
4401                                                  nbl_list->nblGpu[th],
4402                                                  nbl_list->nbl_fep[th]);
4403                     }
4404                 }
4405                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4406             }
4407             nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
4408
4409             np_tot = 0;
4410             np_noq = 0;
4411             np_hlj = 0;
4412             for (int th = 0; th < nnbl; th++)
4413             {
4414                 inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);
4415
4416                 if (nbl_list->bSimple)
4417                 {
4418                     NbnxnPairlistCpu *nbl = nbl_list->nbl[th];
4419                     np_tot += nbl->cj.size();
4420                     np_noq += nbl->work->ncj_noq;
4421                     np_hlj += nbl->work->ncj_hlj;
4422                 }
4423                 else
4424                 {
4425                     NbnxnPairlistGpu *nbl = nbl_list->nblGpu[th];
4426                     /* This count ignores potential subsequent pair pruning */
4427                     np_tot += nbl->nci_tot;
4428                 }
4429             }
4430             if (nbl_list->bSimple)
4431             {
4432                 nap               = nbl_list->nbl[0]->na_ci*nbl_list->nbl[0]->na_cj;
4433             }
4434             else
4435             {
4436                 nap               = gmx::square(nbl_list->nblGpu[0]->na_ci);
4437             }
4438             nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
4439             nbl_list->natpair_lj  = np_noq*nap;
4440             nbl_list->natpair_q   = np_hlj*nap/2;
4441
4442             if (CombineNBLists && nnbl > 1)
4443             {
4444                 GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
4445                 NbnxnPairlistGpu **nbl = nbl_list->nblGpu;
4446
4447                 nbs_cycle_start(&nbs->cc[enbsCCcombine]);
4448
4449                 combine_nblists(nnbl-1, nbl+1, nbl[0]);
4450
4451                 nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
4452             }
4453         }
4454     }
4455
4456     if (nbl_list->bSimple)
4457     {
4458         if (nnbl > 1 && checkRebalanceSimpleLists(nbl_list))
4459         {
4460             rebalanceSimpleLists(nbl_list->nnbl, nbl_list->nbl, nbl_list->nbl_work, nbs->work);
4461
4462             /* Swap the pointer of the sets of pair lists */
4463             NbnxnPairlistCpu **tmp = nbl_list->nbl;
4464             nbl_list->nbl          = nbl_list->nbl_work;
4465             nbl_list->nbl_work     = tmp;
4466         }
4467     }
4468     else
4469     {
4470         /* Sort the entries on size, large ones first */
4471         if (CombineNBLists || nnbl == 1)
4472         {
4473             sort_sci(nbl_list->nblGpu[0]);
4474         }
4475         else
4476         {
4477 #pragma omp parallel for num_threads(nnbl) schedule(static)
4478             for (int th = 0; th < nnbl; th++)
4479             {
4480                 try
4481                 {
4482                     sort_sci(nbl_list->nblGpu[th]);
4483                 }
4484                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
4485             }
4486         }
4487     }
4488
4489     if (nbat->bUseBufferFlags)
4490     {
4491         reduce_buffer_flags(nbs, nbl_list->nnbl, &nbat->buffer_flags);
4492     }
4493
4494     if (nbs->bFEP)
4495     {
4496         /* Balance the free-energy lists over all the threads */
4497         balance_fep_lists(nbs, nbl_list);
4498     }
4499
4500     if (nbl_list->bSimple)
4501     {
4502         /* This is a fresh list, so not pruned, stored using ci.
4503          * ciOuter is invalid at this point.
4504          */
4505         GMX_ASSERT(nbl_list->nbl[0]->ciOuter.empty(), "ciOuter is invalid so it should be empty");
4506     }
4507
4508     /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
4509     if (LOCAL_I(iloc))
4510     {
4511         nbs->search_count++;
4512     }
4513     if (nbs->print_cycles &&
4514         (!nbs->DomDec || !LOCAL_I(iloc)) &&
4515         nbs->search_count % 100 == 0)
4516     {
4517         nbs_cycle_print(stderr, nbs);
4518     }
4519
4520     /* If we have more than one list, they either got rebalancing (CPU)
4521      * or combined (GPU), so we should dump the final result to debug.
4522      */
4523     if (debug && nbl_list->nnbl > 1)
4524     {
4525         if (nbl_list->bSimple)
4526         {
4527             for (int t = 0; t < nbl_list->nnbl; t++)
4528             {
4529                 print_nblist_statistics(debug, nbl_list->nbl[t], nbs, rlist);
4530             }
4531         }
4532         else
4533         {
4534             print_nblist_statistics(debug, nbl_list->nblGpu[0], nbs, rlist);
4535         }
4536     }
4537
4538     if (debug)
4539     {
4540         if (gmx_debug_at)
4541         {
4542             if (nbl_list->bSimple)
4543             {
4544                 for (int t = 0; t < nbl_list->nnbl; t++)
4545                 {
4546                     print_nblist_ci_cj(debug, nbl_list->nbl[t]);
4547                 }
4548             }
4549             else
4550             {
4551                 print_nblist_sci_cj(debug, nbl_list->nblGpu[0]);
4552             }
4553         }
4554
4555         if (nbat->bUseBufferFlags)
4556         {
4557             print_reduction_cost(&nbat->buffer_flags, nbl_list->nnbl);
4558         }
4559     }
4560 }
4561
4562 void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet)
4563 {
4564     GMX_RELEASE_ASSERT(listSet->bSimple, "Should only be called for simple lists");
4565
4566     /* TODO: Restructure the lists so we have actual outer and inner
4567      *       list objects so we can set a single pointer instead of
4568      *       swapping several pointers.
4569      */
4570
4571     for (int i = 0; i < listSet->nnbl; i++)
4572     {
4573         NbnxnPairlistCpu &list = *listSet->nbl[i];
4574
4575         /* The search produced a list in ci/cj.
4576          * Swap the list pointers so we get the outer list is ciOuter,cjOuter
4577          * and we can prune that to get an inner list in ci/cj.
4578          */
4579         GMX_RELEASE_ASSERT(list.ciOuter.empty() && list.cjOuter.empty(),
4580                            "The outer lists should be empty before preparation");
4581
4582         std::swap(list.ci, list.ciOuter);
4583         std::swap(list.cj, list.cjOuter);
4584     }
4585 }