src/gromacs/nbnxm/grid.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief
  39  * Implements the Grid class.
  40  *
  41  * \author Berk Hess <hess@kth.se>
  42  * \ingroup module_nbnxm
  43  */
  44
  45 #include "gmxpre.h"
  46
  47 #include "grid.h"
  48
  49 #include <cmath>
  50 #include <cstring>
  51
  52 #include <algorithm>
  53
  54 #include "gromacs/math/utilities.h"
  55 #include "gromacs/math/vec.h"
  56 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  57 #include "gromacs/mdlib/updategroupscog.h"
  58 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
  59 #include "gromacs/simd/simd.h"
  60 #include "gromacs/simd/vector_operations.h"
  61
  62 #include "atomdata.h"
  63 #include "boundingboxes.h"
  64 #include "gridsetdata.h"
  65 #include "nbnxm_geometry.h"
  66 #include "pairlistparams.h"
  67
  68 namespace Nbnxm
  69 {
  70
  71 Grid::Geometry::Geometry(const PairlistType pairlistType) :
  72     isSimple(pairlistType != PairlistType::HierarchicalNxN),
  73     numAtomsICluster(IClusterSizePerListType[pairlistType]),
  74     numAtomsJCluster(JClusterSizePerListType[pairlistType]),
  75     numAtomsPerCell((isSimple ? 1 : c_gpuNumClusterPerCell) * numAtomsICluster),
  76     numAtomsICluster2Log(get_2log(numAtomsICluster))
  77 {
  78 }
  79
  80 Grid::Grid(const PairlistType pairlistType, const bool& haveFep) :
  81     geometry_(pairlistType),
  82     haveFep_(haveFep)
  83 {
  84 }
  85
  86 /*! \brief Returns the atom density (> 0) of a rectangular grid */
  87 static real gridAtomDensity(int numAtoms, const rvec lowerCorner, const rvec upperCorner)
  88 {
  89     rvec size;
  90
  91     if (numAtoms == 0)
  92     {
  93         /* To avoid zero density we use a minimum of 1 atom */
  94         numAtoms = 1;
  95     }
  96
  97     rvec_sub(upperCorner, lowerCorner, size);
  98
  99     return static_cast<real>(numAtoms) / (size[XX] * size[YY] * size[ZZ]);
 100 }
 101
 102 void Grid::setDimensions(const int          ddZone,
 103                          const int          numAtoms,
 104                          gmx::RVec          lowerCorner,
 105                          gmx::RVec          upperCorner,
 106                          real               atomDensity,
 107                          const real         maxAtomGroupRadius,
 108                          const bool         haveFep,
 109                          gmx::PinningPolicy pinningPolicy)
 110 {
 111     /* We allow passing lowerCorner=upperCorner, in which case we need to
 112      * create a finite sized bounding box to avoid division by zero.
 113      * We use a minimum size such that the volume fits in float with some
 114      * margin for computing and using the atom number density.
 115      */
 116     constexpr real c_minimumGridSize = 1e-10;
 117     for (int d = 0; d < DIM; d++)
 118     {
 119         GMX_ASSERT(upperCorner[d] >= lowerCorner[d],
 120                    "Upper corner should be larger than the lower corner");
 121         if (upperCorner[d] - lowerCorner[d] < c_minimumGridSize)
 122         {
 123             /* Ensure we apply a correction to the bounding box */
 124             real correction =
 125                     std::max(std::abs(lowerCorner[d]) * GMX_REAL_EPS, 0.5_real * c_minimumGridSize);
 126             lowerCorner[d] -= correction;
 127             upperCorner[d] += correction;
 128         }
 129     }
 130
 131     /* For the home zone we compute the density when not set (=-1) or when =0 */
 132     if (ddZone == 0 && atomDensity <= 0)
 133     {
 134         atomDensity = gridAtomDensity(numAtoms, lowerCorner, upperCorner);
 135     }
 136
 137     dimensions_.atomDensity        = atomDensity;
 138     dimensions_.maxAtomGroupRadius = maxAtomGroupRadius;
 139
 140     rvec size;
 141     rvec_sub(upperCorner, lowerCorner, size);
 142
 143     if (numAtoms > geometry_.numAtomsPerCell)
 144     {
 145         GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
 146
 147         /* target cell length */
 148         real tlen_x;
 149         real tlen_y;
 150         if (geometry_.isSimple)
 151         {
 152             /* To minimize the zero interactions, we should make
 153              * the largest of the i/j cell cubic.
 154              */
 155             int numAtomsInCell = std::max(geometry_.numAtomsICluster, geometry_.numAtomsJCluster);
 156
 157             /* Approximately cubic cells */
 158             real tlen = std::cbrt(numAtomsInCell / atomDensity);
 159             tlen_x    = tlen;
 160             tlen_y    = tlen;
 161         }
 162         else
 163         {
 164             /* Approximately cubic sub cells */
 165             real tlen = std::cbrt(geometry_.numAtomsICluster / atomDensity);
 166             tlen_x    = tlen * c_gpuNumClusterPerCellX;
 167             tlen_y    = tlen * c_gpuNumClusterPerCellY;
 168         }
 169         /* We round ncx and ncy down, because we get less cell pairs
 170          * in the pairlist when the fixed cell dimensions (x,y) are
 171          * larger than the variable one (z) than the other way around.
 172          */
 173         dimensions_.numCells[XX] = std::max(1, static_cast<int>(size[XX] / tlen_x));
 174         dimensions_.numCells[YY] = std::max(1, static_cast<int>(size[YY] / tlen_y));
 175     }
 176     else
 177     {
 178         dimensions_.numCells[XX] = 1;
 179         dimensions_.numCells[YY] = 1;
 180     }
 181
 182     for (int d = 0; d < DIM - 1; d++)
 183     {
 184         dimensions_.cellSize[d]    = size[d] / dimensions_.numCells[d];
 185         dimensions_.invCellSize[d] = 1 / dimensions_.cellSize[d];
 186     }
 187
 188     if (ddZone > 0)
 189     {
 190         /* This is a non-home zone, add an extra row of cells
 191          * for particles communicated for bonded interactions.
 192          * These can be beyond the cut-off. It doesn't matter where
 193          * they end up on the grid, but for performance it's better
 194          * if they don't end up in cells that can be within cut-off range.
 195          */
 196         dimensions_.numCells[XX]++;
 197         dimensions_.numCells[YY]++;
 198     }
 199
 200     /* We need one additional cell entry for particles moved by DD */
 201     cxy_na_.resize(numColumns() + 1);
 202     cxy_ind_.resize(numColumns() + 2);
 203     changePinningPolicy(&cxy_na_, pinningPolicy);
 204     changePinningPolicy(&cxy_ind_, pinningPolicy);
 205
 206     /* Worst case scenario of 1 atom in each last cell */
 207     int maxNumCells;
 208     if (geometry_.numAtomsJCluster <= geometry_.numAtomsICluster)
 209     {
 210         maxNumCells = numAtoms / geometry_.numAtomsPerCell + numColumns();
 211     }
 212     else
 213     {
 214         maxNumCells = numAtoms / geometry_.numAtomsPerCell
 215                       + numColumns() * geometry_.numAtomsJCluster / geometry_.numAtomsICluster;
 216     }
 217
 218     if (!geometry_.isSimple)
 219     {
 220         numClusters_.resize(maxNumCells);
 221     }
 222     bbcz_.resize(maxNumCells);
 223
 224     /* This resize also zeros the contents, this avoid possible
 225      * floating exceptions in SIMD with the unused bb elements.
 226      */
 227     if (geometry_.isSimple)
 228     {
 229         bb_.resize(maxNumCells);
 230     }
 231     else
 232     {
 233 #if NBNXN_BBXXXX
 234         pbb_.resize(packedBoundingBoxesIndex(maxNumCells * c_gpuNumClusterPerCell));
 235 #else
 236         bb_.resize(maxNumCells * c_gpuNumClusterPerCell);
 237 #endif
 238     }
 239
 240     if (geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 241     {
 242         bbj_ = bb_;
 243     }
 244     else
 245     {
 246         GMX_ASSERT(geometry_.isSimple, "Only CPU lists should have different i/j cluster sizes");
 247
 248         bbjStorage_.resize(maxNumCells * geometry_.numAtomsICluster / geometry_.numAtomsJCluster);
 249         bbj_ = bbjStorage_;
 250     }
 251
 252     flags_.resize(maxNumCells);
 253     if (haveFep)
 254     {
 255         fep_.resize(maxNumCells * geometry_.numAtomsPerCell / geometry_.numAtomsICluster);
 256     }
 257
 258     copy_rvec(lowerCorner, dimensions_.lowerCorner);
 259     copy_rvec(upperCorner, dimensions_.upperCorner);
 260     copy_rvec(size, dimensions_.gridSize);
 261 }
 262
 263 /* We need to sort paricles in grid columns on z-coordinate.
 264  * As particle are very often distributed homogeneously, we use a sorting
 265  * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 266  * by a factor, cast to an int and try to store in that hole. If the hole
 267  * is full, we move this or another particle. A second pass is needed to make
 268  * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 269  * 4 is the optimal value for homogeneous particle distribution and allows
 270  * for an O(#particles) sort up till distributions were all particles are
 271  * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 272  * as it can be expensive to detect imhomogeneous particle distributions.
 273  */
 274 /*! \brief Ratio of grid cells to atoms */
 275 static constexpr int c_sortGridRatio = 4;
 276 /*! \brief Maximum ratio of holes used, in the worst case all particles
 277  * end up in the last hole and we need num. atoms extra holes at the end.
 278  */
 279 static constexpr int c_sortGridMaxSizeFactor = c_sortGridRatio + 1;
 280
 281 /*! \brief Sorts particle index a on coordinates x along dim.
 282  *
 283  * Backwards tells if we want decreasing iso increasing coordinates.
 284  * h0 is the minimum of the coordinate range.
 285  * invh is the 1/length of the sorting range.
 286  * n_per_h (>=n) is the expected average number of particles per 1/invh
 287  * sort is the sorting work array.
 288  * sort should have a size of at least n_per_h*c_sortGridRatio + n,
 289  * or easier, allocate at least n*c_sortGridMaxSizeFactor elements.
 290  */
 291 static void sort_atoms(int      dim,
 292                        gmx_bool Backwards,
 293                        int gmx_unused dd_zone,
 294                        bool gmx_unused                relevantAtomsAreWithinGridBounds,
 295                        int*                           a,
 296                        int                            n,
 297                        gmx::ArrayRef<const gmx::RVec> x,
 298                        real                           h0,
 299                        real                           invh,
 300                        int                            n_per_h,
 301                        gmx::ArrayRef<int>             sort)
 302 {
 303     if (n <= 1)
 304     {
 305         /* Nothing to do */
 306         return;
 307     }
 308
 309     GMX_ASSERT(n <= n_per_h, "We require n <= n_per_h");
 310
 311     /* Transform the inverse range height into the inverse hole height */
 312     invh *= n_per_h * c_sortGridRatio;
 313
 314     /* Set nsort to the maximum possible number of holes used.
 315      * In worst case all n elements end up in the last bin.
 316      */
 317     int nsort = n_per_h * c_sortGridRatio + n;
 318
 319     /* Determine the index range used, so we can limit it for the second pass */
 320     int zi_min = INT_MAX;
 321     int zi_max = -1;
 322
 323     /* Sort the particles using a simple index sort */
 324     for (int i = 0; i < n; i++)
 325     {
 326         /* The cast takes care of float-point rounding effects below zero.
 327          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 328          * times the box height out of the box.
 329          */
 330         int zi = static_cast<int>((x[a[i]][dim] - h0) * invh);
 331
 332 #ifndef NDEBUG
 333         /* As we can have rounding effect, we use > iso >= here */
 334         if (relevantAtomsAreWithinGridBounds && (zi < 0 || (dd_zone == 0 && zi > n_per_h * c_sortGridRatio)))
 335         {
 336             gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n", a[i],
 337                       'x' + dim, x[a[i]][dim], h0, invh, zi, n_per_h, c_sortGridRatio);
 338         }
 339 #endif
 340         if (zi < 0)
 341         {
 342             zi = 0;
 343         }
 344
 345         /* In a non-local domain, particles communcated for bonded interactions
 346          * can be far beyond the grid size, which is set by the non-bonded
 347          * cut-off distance. We sort such particles into the last cell.
 348          */
 349         if (zi > n_per_h * c_sortGridRatio)
 350         {
 351             zi = n_per_h * c_sortGridRatio;
 352         }
 353
 354         /* Ideally this particle should go in sort cell zi,
 355          * but that might already be in use,
 356          * in that case find the first empty cell higher up
 357          */
 358         if (sort[zi] < 0)
 359         {
 360             sort[zi] = a[i];
 361             zi_min   = std::min(zi_min, zi);
 362             zi_max   = std::max(zi_max, zi);
 363         }
 364         else
 365         {
 366             /* We have multiple atoms in the same sorting slot.
 367              * Sort on real z for minimal bounding box size.
 368              * There is an extra check for identical z to ensure
 369              * well-defined output order, independent of input order
 370              * to ensure binary reproducibility after restarts.
 371              */
 372             while (sort[zi] >= 0
 373                    && (x[a[i]][dim] > x[sort[zi]][dim]
 374                        || (x[a[i]][dim] == x[sort[zi]][dim] && a[i] > sort[zi])))
 375             {
 376                 zi++;
 377             }
 378
 379             if (sort[zi] >= 0)
 380             {
 381                 /* Shift all elements by one slot until we find an empty slot */
 382                 int cp  = sort[zi];
 383                 int zim = zi + 1;
 384                 while (sort[zim] >= 0)
 385                 {
 386                     int tmp   = sort[zim];
 387                     sort[zim] = cp;
 388                     cp        = tmp;
 389                     zim++;
 390                 }
 391                 sort[zim] = cp;
 392                 zi_max    = std::max(zi_max, zim);
 393             }
 394             sort[zi] = a[i];
 395             zi_max   = std::max(zi_max, zi);
 396         }
 397     }
 398
 399     int c = 0;
 400     if (!Backwards)
 401     {
 402         for (int zi = 0; zi < nsort; zi++)
 403         {
 404             if (sort[zi] >= 0)
 405             {
 406                 a[c++]   = sort[zi];
 407                 sort[zi] = -1;
 408             }
 409         }
 410     }
 411     else
 412     {
 413         for (int zi = zi_max; zi >= zi_min; zi--)
 414         {
 415             if (sort[zi] >= 0)
 416             {
 417                 a[c++]   = sort[zi];
 418                 sort[zi] = -1;
 419             }
 420         }
 421     }
 422     if (c < n)
 423     {
 424         gmx_incons("Lost particles while sorting");
 425     }
 426 }
 427
 428 #if GMX_DOUBLE
 429 //! Returns double up to one least significant float bit smaller than x
 430 static double R2F_D(const float x)
 431 {
 432     return static_cast<float>(x >= 0 ? (1 - GMX_FLOAT_EPS) * x : (1 + GMX_FLOAT_EPS) * x);
 433 }
 434 //! Returns double up to one least significant float bit larger than x
 435 static double R2F_U(const float x)
 436 {
 437     return static_cast<float>(x >= 0 ? (1 + GMX_FLOAT_EPS) * x : (1 - GMX_FLOAT_EPS) * x);
 438 }
 439 #else
 440 //! Returns x
 441 static float R2F_D(const float x)
 442 {
 443     return x;
 444 }
 445 //! Returns x
 446 static float R2F_U(const float x)
 447 {
 448     return x;
 449 }
 450 #endif
 451
 452 //! Computes the bounding box for na coordinates in order x,y,z, bb order xyz0
 453 static void calc_bounding_box(int na, int stride, const real* x, BoundingBox* bb)
 454 {
 455     int  i;
 456     real xl, xh, yl, yh, zl, zh;
 457
 458     i  = 0;
 459     xl = x[i + XX];
 460     xh = x[i + XX];
 461     yl = x[i + YY];
 462     yh = x[i + YY];
 463     zl = x[i + ZZ];
 464     zh = x[i + ZZ];
 465     i += stride;
 466     for (int j = 1; j < na; j++)
 467     {
 468         xl = std::min(xl, x[i + XX]);
 469         xh = std::max(xh, x[i + XX]);
 470         yl = std::min(yl, x[i + YY]);
 471         yh = std::max(yh, x[i + YY]);
 472         zl = std::min(zl, x[i + ZZ]);
 473         zh = std::max(zh, x[i + ZZ]);
 474         i += stride;
 475     }
 476     /* Note: possible double to float conversion here */
 477     bb->lower.x = R2F_D(xl);
 478     bb->lower.y = R2F_D(yl);
 479     bb->lower.z = R2F_D(zl);
 480     bb->upper.x = R2F_U(xh);
 481     bb->upper.y = R2F_U(yh);
 482     bb->upper.z = R2F_U(zh);
 483 }
 484
 485 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 486 static void calc_bounding_box_x_x4(int na, const real* x, BoundingBox* bb)
 487 {
 488     real xl, xh, yl, yh, zl, zh;
 489
 490     xl = x[XX * c_packX4];
 491     xh = x[XX * c_packX4];
 492     yl = x[YY * c_packX4];
 493     yh = x[YY * c_packX4];
 494     zl = x[ZZ * c_packX4];
 495     zh = x[ZZ * c_packX4];
 496     for (int j = 1; j < na; j++)
 497     {
 498         xl = std::min(xl, x[j + XX * c_packX4]);
 499         xh = std::max(xh, x[j + XX * c_packX4]);
 500         yl = std::min(yl, x[j + YY * c_packX4]);
 501         yh = std::max(yh, x[j + YY * c_packX4]);
 502         zl = std::min(zl, x[j + ZZ * c_packX4]);
 503         zh = std::max(zh, x[j + ZZ * c_packX4]);
 504     }
 505     /* Note: possible double to float conversion here */
 506     bb->lower.x = R2F_D(xl);
 507     bb->lower.y = R2F_D(yl);
 508     bb->lower.z = R2F_D(zl);
 509     bb->upper.x = R2F_U(xh);
 510     bb->upper.y = R2F_U(yh);
 511     bb->upper.z = R2F_U(zh);
 512 }
 513
 514 /*! \brief Computes the bounding box for na coordinates, bb order xyz0 */
 515 static void calc_bounding_box_x_x8(int na, const real* x, BoundingBox* bb)
 516 {
 517     real xl, xh, yl, yh, zl, zh;
 518
 519     xl = x[XX * c_packX8];
 520     xh = x[XX * c_packX8];
 521     yl = x[YY * c_packX8];
 522     yh = x[YY * c_packX8];
 523     zl = x[ZZ * c_packX8];
 524     zh = x[ZZ * c_packX8];
 525     for (int j = 1; j < na; j++)
 526     {
 527         xl = std::min(xl, x[j + XX * c_packX8]);
 528         xh = std::max(xh, x[j + XX * c_packX8]);
 529         yl = std::min(yl, x[j + YY * c_packX8]);
 530         yh = std::max(yh, x[j + YY * c_packX8]);
 531         zl = std::min(zl, x[j + ZZ * c_packX8]);
 532         zh = std::max(zh, x[j + ZZ * c_packX8]);
 533     }
 534     /* Note: possible double to float conversion here */
 535     bb->lower.x = R2F_D(xl);
 536     bb->lower.y = R2F_D(yl);
 537     bb->lower.z = R2F_D(zl);
 538     bb->upper.x = R2F_U(xh);
 539     bb->upper.y = R2F_U(yh);
 540     bb->upper.z = R2F_U(zh);
 541 }
 542
 543 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 544 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real* x, BoundingBox* bb, BoundingBox* bbj)
 545 {
 546     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 547     using namespace gmx;
 548
 549     calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
 550
 551     if (na > 2)
 552     {
 553         calc_bounding_box_x_x4(std::min(na - 2, 2), x + (c_packX4 >> 1), bbj + 1);
 554     }
 555     else
 556     {
 557         /* Set the "empty" bounding box to the same as the first one,
 558          * so we don't need to treat special cases in the rest of the code.
 559          */
 560 #if NBNXN_SEARCH_BB_SIMD4
 561         store4(bbj[1].lower.ptr(), load4(bbj[0].lower.ptr()));
 562         store4(bbj[1].upper.ptr(), load4(bbj[0].upper.ptr()));
 563 #else
 564         bbj[1] = bbj[0];
 565 #endif
 566     }
 567
 568 #if NBNXN_SEARCH_BB_SIMD4
 569     store4(bb->lower.ptr(), min(load4(bbj[0].lower.ptr()), load4(bbj[1].lower.ptr())));
 570     store4(bb->upper.ptr(), max(load4(bbj[0].upper.ptr()), load4(bbj[1].upper.ptr())));
 571 #else
 572     {
 573         bb->lower = BoundingBox::Corner::min(bbj[0].lower, bbj[1].lower);
 574         bb->upper = BoundingBox::Corner::max(bbj[0].upper, bbj[1].upper);
 575     }
 576 #endif
 577 }
 578
 579 #if NBNXN_BBXXXX
 580
 581 /*! \brief Computes the bounding box for na coordinates in order xyz, bb order xxxxyyyyzzzz */
 582 static void calc_bounding_box_xxxx(int na, int stride, const real* x, float* bb)
 583 {
 584     int  i;
 585     real xl, xh, yl, yh, zl, zh;
 586
 587     i  = 0;
 588     xl = x[i + XX];
 589     xh = x[i + XX];
 590     yl = x[i + YY];
 591     yh = x[i + YY];
 592     zl = x[i + ZZ];
 593     zh = x[i + ZZ];
 594     i += stride;
 595     for (int j = 1; j < na; j++)
 596     {
 597         xl = std::min(xl, x[i + XX]);
 598         xh = std::max(xh, x[i + XX]);
 599         yl = std::min(yl, x[i + YY]);
 600         yh = std::max(yh, x[i + YY]);
 601         zl = std::min(zl, x[i + ZZ]);
 602         zh = std::max(zh, x[i + ZZ]);
 603         i += stride;
 604     }
 605     /* Note: possible double to float conversion here */
 606     bb[0 * c_packedBoundingBoxesDimSize] = R2F_D(xl);
 607     bb[1 * c_packedBoundingBoxesDimSize] = R2F_D(yl);
 608     bb[2 * c_packedBoundingBoxesDimSize] = R2F_D(zl);
 609     bb[3 * c_packedBoundingBoxesDimSize] = R2F_U(xh);
 610     bb[4 * c_packedBoundingBoxesDimSize] = R2F_U(yh);
 611     bb[5 * c_packedBoundingBoxesDimSize] = R2F_U(zh);
 612 }
 613
 614 #endif /* NBNXN_BBXXXX */
 615
 616 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 617
 618 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xyz0 */
 619 static void calc_bounding_box_simd4(int na, const float* x, BoundingBox* bb)
 620 {
 621     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 622     using namespace gmx;
 623
 624     static_assert(sizeof(BoundingBox::Corner) == GMX_SIMD4_WIDTH * sizeof(float),
 625                   "The Corner struct should hold exactly 4 floats");
 626
 627     Simd4Float bb_0_S = load4(x);
 628     Simd4Float bb_1_S = bb_0_S;
 629
 630     for (int i = 1; i < na; i++)
 631     {
 632         Simd4Float x_S = load4(x + i * GMX_SIMD4_WIDTH);
 633         bb_0_S         = min(bb_0_S, x_S);
 634         bb_1_S         = max(bb_1_S, x_S);
 635     }
 636
 637     store4(bb->lower.ptr(), bb_0_S);
 638     store4(bb->upper.ptr(), bb_1_S);
 639 }
 640
 641 #    if NBNXN_BBXXXX
 642
 643 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xxxxyyyyzzzz */
 644 static void calc_bounding_box_xxxx_simd4(int na, const float* x, BoundingBox* bb_work_aligned, real* bb)
 645 {
 646     calc_bounding_box_simd4(na, x, bb_work_aligned);
 647
 648     bb[0 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.x;
 649     bb[1 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.y;
 650     bb[2 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.z;
 651     bb[3 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.x;
 652     bb[4 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.y;
 653     bb[5 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.z;
 654 }
 655
 656 #    endif /* NBNXN_BBXXXX */
 657
 658 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
 659
 660
 661 /*! \brief Combines pairs of consecutive bounding boxes */
 662 static void combine_bounding_box_pairs(const Grid&                      grid,
 663                                        gmx::ArrayRef<const BoundingBox> bb,
 664                                        gmx::ArrayRef<BoundingBox>       bbj)
 665 {
 666     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 667     using namespace gmx;
 668
 669     for (int i = 0; i < grid.numColumns(); i++)
 670     {
 671         /* Starting bb in a column is expected to be 2-aligned */
 672         const int sc2 = grid.firstCellInColumn(i) >> 1;
 673         /* For odd numbers skip the last bb here */
 674         const int nc2 = (grid.numAtomsInColumn(i) + 3) >> (2 + 1);
 675         int       c2;
 676         for (c2 = sc2; c2 < sc2 + nc2; c2++)
 677         {
 678 #if NBNXN_SEARCH_BB_SIMD4
 679             Simd4Float min_S, max_S;
 680
 681             min_S = min(load4(bb[c2 * 2 + 0].lower.ptr()), load4(bb[c2 * 2 + 1].lower.ptr()));
 682             max_S = max(load4(bb[c2 * 2 + 0].upper.ptr()), load4(bb[c2 * 2 + 1].upper.ptr()));
 683             store4(bbj[c2].lower.ptr(), min_S);
 684             store4(bbj[c2].upper.ptr(), max_S);
 685 #else
 686             bbj[c2].lower = BoundingBox::Corner::min(bb[c2 * 2 + 0].lower, bb[c2 * 2 + 1].lower);
 687             bbj[c2].upper = BoundingBox::Corner::max(bb[c2 * 2 + 0].upper, bb[c2 * 2 + 1].upper);
 688 #endif
 689         }
 690         if (((grid.numAtomsInColumn(i) + 3) >> 2) & 1)
 691         {
 692             /* The bb count in this column is odd: duplicate the last bb */
 693             bbj[c2].lower = bb[c2 * 2].lower;
 694             bbj[c2].upper = bb[c2 * 2].upper;
 695         }
 696     }
 697 }
 698
 699
 700 /*! \brief Prints the average bb size, used for debug output */
 701 static void print_bbsizes_simple(FILE* fp, const Grid& grid)
 702 {
 703     dvec ba = { 0 };
 704     for (int c = 0; c < grid.numCells(); c++)
 705     {
 706         const BoundingBox& bb = grid.iBoundingBoxes()[c];
 707         ba[XX] += bb.upper.x - bb.lower.x;
 708         ba[YY] += bb.upper.y - bb.lower.y;
 709         ba[ZZ] += bb.upper.z - bb.lower.z;
 710     }
 711     dsvmul(1.0 / grid.numCells(), ba, ba);
 712
 713     const Grid::Dimensions& dims = grid.dimensions();
 714     real                    avgCellSizeZ =
 715             (dims.atomDensity > 0 ? grid.geometry().numAtomsICluster
 716                                             / (dims.atomDensity * dims.cellSize[XX] * dims.cellSize[YY])
 717                                   : 0.0);
 718
 719     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 720             dims.cellSize[XX], dims.cellSize[YY], avgCellSizeZ, ba[XX], ba[YY], ba[ZZ],
 721             ba[XX] * dims.invCellSize[XX], ba[YY] * dims.invCellSize[YY],
 722             dims.atomDensity > 0 ? ba[ZZ] / avgCellSizeZ : 0.0);
 723 }
 724
 725 /*! \brief Prints the average bb size, used for debug output */
 726 static void print_bbsizes_supersub(FILE* fp, const Grid& grid)
 727 {
 728     int  ns;
 729     dvec ba;
 730
 731     clear_dvec(ba);
 732     ns = 0;
 733     for (int c = 0; c < grid.numCells(); c++)
 734     {
 735 #if NBNXN_BBXXXX
 736         for (int s = 0; s < grid.numClustersPerCell()[c]; s += c_packedBoundingBoxesDimSize)
 737         {
 738             int  cs_w          = (c * c_gpuNumClusterPerCell + s) / c_packedBoundingBoxesDimSize;
 739             auto boundingBoxes = grid.packedBoundingBoxes().subArray(
 740                     cs_w * c_packedBoundingBoxesSize, c_packedBoundingBoxesSize);
 741             for (int i = 0; i < c_packedBoundingBoxesDimSize; i++)
 742             {
 743                 for (int d = 0; d < DIM; d++)
 744                 {
 745                     ba[d] += boundingBoxes[(DIM + d) * c_packedBoundingBoxesDimSize + i]
 746                              - boundingBoxes[(0 + d) * c_packedBoundingBoxesDimSize + i];
 747                 }
 748             }
 749         }
 750 #else
 751         for (int s = 0; s < grid.numClustersPerCell()[c]; s++)
 752         {
 753             const BoundingBox& bb = grid.iBoundingBoxes()[c * c_gpuNumClusterPerCell + s];
 754             ba[XX] += bb.upper.x - bb.lower.x;
 755             ba[YY] += bb.upper.y - bb.lower.y;
 756             ba[ZZ] += bb.upper.z - bb.lower.z;
 757         }
 758 #endif
 759         ns += grid.numClustersPerCell()[c];
 760     }
 761     dsvmul(1.0 / ns, ba, ba);
 762
 763     const Grid::Dimensions& dims = grid.dimensions();
 764     const real              avgClusterSizeZ =
 765             (dims.atomDensity > 0 ? grid.geometry().numAtomsPerCell
 766                                             / (dims.atomDensity * dims.cellSize[XX]
 767                                                * dims.cellSize[YY] * c_gpuNumClusterPerCellZ)
 768                                   : 0.0);
 769
 770     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 771             dims.cellSize[XX] / c_gpuNumClusterPerCellX,
 772             dims.cellSize[YY] / c_gpuNumClusterPerCellY, avgClusterSizeZ, ba[XX], ba[YY], ba[ZZ],
 773             ba[XX] * c_gpuNumClusterPerCellX * dims.invCellSize[XX],
 774             ba[YY] * c_gpuNumClusterPerCellY * dims.invCellSize[YY],
 775             dims.atomDensity > 0 ? ba[ZZ] / avgClusterSizeZ : 0.0);
 776 }
 777
 778 /*!\brief Set non-bonded interaction flags for the current cluster.
 779  *
 780  * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 781  */
 782 static void sort_cluster_on_flag(int                numAtomsInCluster,
 783                                  int                atomStart,
 784                                  int                atomEnd,
 785                                  const int*         atinfo,
 786                                  gmx::ArrayRef<int> order,
 787                                  int*               flags)
 788 {
 789     constexpr int c_maxNumAtomsInCluster = 8;
 790     int           sort1[c_maxNumAtomsInCluster];
 791     int           sort2[c_maxNumAtomsInCluster];
 792
 793     GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster,
 794                "Need to increase c_maxNumAtomsInCluster to support larger clusters");
 795
 796     *flags = 0;
 797
 798     int subc = 0;
 799     for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
 800     {
 801         /* Make lists for this (sub-)cell on atoms with and without LJ */
 802         int      n1       = 0;
 803         int      n2       = 0;
 804         gmx_bool haveQ    = FALSE;
 805         int      a_lj_max = -1;
 806         for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
 807         {
 808             haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 809
 810             if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 811             {
 812                 sort1[n1++] = order[a];
 813                 a_lj_max    = a;
 814             }
 815             else
 816             {
 817                 sort2[n2++] = order[a];
 818             }
 819         }
 820
 821         /* If we don't have atoms with LJ, there's nothing to sort */
 822         if (n1 > 0)
 823         {
 824             *flags |= NBNXN_CI_DO_LJ(subc);
 825
 826             if (2 * n1 <= numAtomsInCluster)
 827             {
 828                 /* Only sort when strictly necessary. Ordering particles
 829                  * Ordering particles can lead to less accurate summation
 830                  * due to rounding, both for LJ and Coulomb interactions.
 831                  */
 832                 if (2 * (a_lj_max - s) >= numAtomsInCluster)
 833                 {
 834                     for (int i = 0; i < n1; i++)
 835                     {
 836                         order[atomStart + i] = sort1[i];
 837                     }
 838                     for (int j = 0; j < n2; j++)
 839                     {
 840                         order[atomStart + n1 + j] = sort2[j];
 841                     }
 842                 }
 843
 844                 *flags |= NBNXN_CI_HALF_LJ(subc);
 845             }
 846         }
 847         if (haveQ)
 848         {
 849             *flags |= NBNXN_CI_DO_COUL(subc);
 850         }
 851         subc++;
 852     }
 853 }
 854
 855 /*! \brief Fill a pair search cell with atoms.
 856  *
 857  * Potentially sorts atoms and sets the interaction flags.
 858  */
 859 void Grid::fillCell(GridSetData*                   gridSetData,
 860                     nbnxn_atomdata_t*              nbat,
 861                     int                            atomStart,
 862                     int                            atomEnd,
 863                     const int*                     atinfo,
 864                     gmx::ArrayRef<const gmx::RVec> x,
 865                     BoundingBox gmx_unused* bb_work_aligned)
 866 {
 867     const int numAtoms = atomEnd - atomStart;
 868
 869     const gmx::ArrayRef<int>& cells       = gridSetData->cells;
 870     const gmx::ArrayRef<int>& atomIndices = gridSetData->atomIndices;
 871
 872     if (geometry_.isSimple)
 873     {
 874         /* Note that non-local grids are already sorted.
 875          * Then sort_cluster_on_flag will only set the flags and the sorting
 876          * will not affect the atom order.
 877          */
 878         sort_cluster_on_flag(geometry_.numAtomsICluster, atomStart, atomEnd, atinfo, atomIndices,
 879                              flags_.data() + atomToCluster(atomStart) - cellOffset_);
 880     }
 881
 882     if (haveFep_)
 883     {
 884         /* Set the fep flag for perturbed atoms in this (sub-)cell */
 885
 886         /* The grid-local cluster/(sub-)cell index */
 887         int cell = atomToCluster(atomStart)
 888                    - cellOffset_ * (geometry_.isSimple ? 1 : c_gpuNumClusterPerCell);
 889         fep_[cell] = 0;
 890         for (int at = atomStart; at < atomEnd; at++)
 891         {
 892             if (atomIndices[at] >= 0 && GET_CGINFO_FEP(atinfo[atomIndices[at]]))
 893             {
 894                 fep_[cell] |= (1 << (at - atomStart));
 895             }
 896         }
 897     }
 898
 899     /* Now we have sorted the atoms, set the cell indices */
 900     for (int at = atomStart; at < atomEnd; at++)
 901     {
 902         cells[atomIndices[at]] = at;
 903     }
 904
 905     copy_rvec_to_nbat_real(atomIndices.data() + atomStart, numAtoms, geometry_.numAtomsICluster,
 906                            as_rvec_array(x.data()), nbat->XFormat, nbat->x().data(), atomStart);
 907
 908     if (nbat->XFormat == nbatX4)
 909     {
 910         /* Store the bounding boxes as xyz.xyz. */
 911         size_t       offset = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsICluster);
 912         BoundingBox* bb_ptr = bb_.data() + offset;
 913
 914 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
 915         if (2 * geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 916         {
 917             calc_bounding_box_x_x4_halves(numAtoms,
 918                                           nbat->x().data() + atom_to_x_index<c_packX4>(atomStart),
 919                                           bb_ptr, bbj_.data() + offset * 2);
 920         }
 921         else
 922 #endif
 923         {
 924             calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
 925         }
 926     }
 927     else if (nbat->XFormat == nbatX8)
 928     {
 929         /* Store the bounding boxes as xyz.xyz. */
 930         size_t       offset = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsICluster);
 931         BoundingBox* bb_ptr = bb_.data() + offset;
 932
 933         calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
 934     }
 935 #if NBNXN_BBXXXX
 936     else if (!geometry_.isSimple)
 937     {
 938         /* Store the bounding boxes in a format convenient
 939          * for SIMD4 calculations: xxxxyyyyzzzz...
 940          */
 941         const int clusterIndex = ((atomStart - cellOffset_ * geometry_.numAtomsPerCell)
 942                                   >> geometry_.numAtomsICluster2Log);
 943         float*    pbb_ptr      = pbb_.data() + packedBoundingBoxesIndex(clusterIndex)
 944                          + (clusterIndex & (c_packedBoundingBoxesDimSize - 1));
 945
 946 #    if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 947         if (nbat->XFormat == nbatXYZQ)
 948         {
 949             GMX_ASSERT(bb_work_aligned != nullptr, "Must have valid aligned work structure");
 950             calc_bounding_box_xxxx_simd4(numAtoms, nbat->x().data() + atomStart * nbat->xstride,
 951                                          bb_work_aligned, pbb_ptr);
 952         }
 953         else
 954 #    endif
 955         {
 956             calc_bounding_box_xxxx(numAtoms, nbat->xstride,
 957                                    nbat->x().data() + atomStart * nbat->xstride, pbb_ptr);
 958         }
 959         if (gmx_debug_at)
 960         {
 961             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 962                     atomToCluster(atomStart), pbb_ptr[0 * c_packedBoundingBoxesDimSize],
 963                     pbb_ptr[3 * c_packedBoundingBoxesDimSize], pbb_ptr[1 * c_packedBoundingBoxesDimSize],
 964                     pbb_ptr[4 * c_packedBoundingBoxesDimSize], pbb_ptr[2 * c_packedBoundingBoxesDimSize],
 965                     pbb_ptr[5 * c_packedBoundingBoxesDimSize]);
 966         }
 967     }
 968 #endif
 969     else
 970     {
 971         /* Store the bounding boxes as xyz.xyz. */
 972         BoundingBox* bb_ptr =
 973                 bb_.data() + atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsPerCell);
 974
 975         calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart * nbat->xstride, bb_ptr);
 976
 977         if (gmx_debug_at)
 978         {
 979             int bbo = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsPerCell);
 980             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 981                     atomToCluster(atomStart), bb_[bbo].lower.x, bb_[bbo].lower.y, bb_[bbo].lower.z,
 982                     bb_[bbo].upper.x, bb_[bbo].upper.y, bb_[bbo].upper.z);
 983         }
 984     }
 985 }
 986
 987 void Grid::sortColumnsCpuGeometry(GridSetData*                   gridSetData,
 988                                   int                            dd_zone,
 989                                   const int*                     atinfo,
 990                                   gmx::ArrayRef<const gmx::RVec> x,
 991                                   nbnxn_atomdata_t*              nbat,
 992                                   const gmx::Range<int>          columnRange,
 993                                   gmx::ArrayRef<int>             sort_work)
 994 {
 995     if (debug)
 996     {
 997         fprintf(debug, "cell_offset %d sorting columns %d - %d\n", cellOffset_,
 998                 *columnRange.begin(), *columnRange.end());
 999     }
1000
1001     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1002
1003     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1004
1005     /* Sort the atoms within each x,y column in 3 dimensions */
1006     for (int cxy : columnRange)
1007     {
1008         const int numAtoms   = numAtomsInColumn(cxy);
1009         const int numCellsZ  = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1010         const int atomOffset = firstAtomInColumn(cxy);
1011
1012         /* Sort the atoms within each x,y column on z coordinate */
1013         sort_atoms(ZZ, FALSE, dd_zone, relevantAtomsAreWithinGridBounds,
1014                    gridSetData->atomIndices.data() + atomOffset, numAtoms, x, dimensions_.lowerCorner[ZZ],
1015                    1.0 / dimensions_.gridSize[ZZ], numCellsZ * numAtomsPerCell, sort_work);
1016
1017         /* Fill the ncz cells in this column */
1018         const int firstCell  = firstCellInColumn(cxy);
1019         int       cellFilled = firstCell;
1020         for (int cellZ = 0; cellZ < numCellsZ; cellZ++)
1021         {
1022             const int cell = firstCell + cellZ;
1023
1024             const int atomOffsetCell = atomOffset + cellZ * numAtomsPerCell;
1025             const int numAtomsCell = std::min(numAtomsPerCell, numAtoms - (atomOffsetCell - atomOffset));
1026
1027             fillCell(gridSetData, nbat, atomOffsetCell, atomOffsetCell + numAtomsCell, atinfo, x, nullptr);
1028
1029             /* This copy to bbcz is not really necessary.
1030              * But it allows to use the same grid search code
1031              * for the simple and supersub cell setups.
1032              */
1033             if (numAtomsCell > 0)
1034             {
1035                 cellFilled = cell;
1036             }
1037             bbcz_[cell].lower = bb_[cellFilled].lower.z;
1038             bbcz_[cell].upper = bb_[cellFilled].upper.z;
1039         }
1040
1041         /* Set the unused atom indices to -1 */
1042         for (int ind = numAtoms; ind < numCellsZ * numAtomsPerCell; ind++)
1043         {
1044             gridSetData->atomIndices[atomOffset + ind] = -1;
1045         }
1046     }
1047 }
1048
1049 /* Spatially sort the atoms within one grid column */
1050 void Grid::sortColumnsGpuGeometry(GridSetData*                   gridSetData,
1051                                   int                            dd_zone,
1052                                   const int*                     atinfo,
1053                                   gmx::ArrayRef<const gmx::RVec> x,
1054                                   nbnxn_atomdata_t*              nbat,
1055                                   const gmx::Range<int>          columnRange,
1056                                   gmx::ArrayRef<int>             sort_work)
1057 {
1058     BoundingBox  bb_work_array[2];
1059     BoundingBox* bb_work_aligned = reinterpret_cast<BoundingBox*>(
1060             (reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1061
1062     if (debug)
1063     {
1064         fprintf(debug, "cell_offset %d sorting columns %d - %d\n", cellOffset_,
1065                 *columnRange.begin(), *columnRange.end());
1066     }
1067
1068     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1069
1070     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1071
1072     const int subdiv_x = geometry_.numAtomsICluster;
1073     const int subdiv_y = c_gpuNumClusterPerCellX * subdiv_x;
1074     const int subdiv_z = c_gpuNumClusterPerCellY * subdiv_y;
1075
1076     /* Extract the atom index array that will be filled here */
1077     const gmx::ArrayRef<int>& atomIndices = gridSetData->atomIndices;
1078
1079     /* Sort the atoms within each x,y column in 3 dimensions.
1080      * Loop over all columns on the x/y grid.
1081      */
1082     for (int cxy : columnRange)
1083     {
1084         const int gridX = cxy / dimensions_.numCells[YY];
1085         const int gridY = cxy - gridX * dimensions_.numCells[YY];
1086
1087         const int numAtomsInColumn = cxy_na_[cxy];
1088         const int numCellsInColumn = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1089         const int atomOffset       = firstAtomInColumn(cxy);
1090
1091         /* Sort the atoms within each x,y column on z coordinate */
1092         sort_atoms(ZZ, FALSE, dd_zone, relevantAtomsAreWithinGridBounds,
1093                    atomIndices.data() + atomOffset, numAtomsInColumn, x, dimensions_.lowerCorner[ZZ],
1094                    1.0 / dimensions_.gridSize[ZZ], numCellsInColumn * numAtomsPerCell, sort_work);
1095
1096         /* This loop goes over the cells and clusters along z at once */
1097         for (int sub_z = 0; sub_z < numCellsInColumn * c_gpuNumClusterPerCellZ; sub_z++)
1098         {
1099             const int atomOffsetZ = atomOffset + sub_z * subdiv_z;
1100             const int numAtomsZ = std::min(subdiv_z, numAtomsInColumn - (atomOffsetZ - atomOffset));
1101             int       cz        = -1;
1102             /* We have already sorted on z */
1103
1104             if (sub_z % c_gpuNumClusterPerCellZ == 0)
1105             {
1106                 cz             = sub_z / c_gpuNumClusterPerCellZ;
1107                 const int cell = cxy_ind_[cxy] + cz;
1108
1109                 /* The number of atoms in this cell/super-cluster */
1110                 const int numAtoms =
1111                         std::min(numAtomsPerCell, numAtomsInColumn - (atomOffsetZ - atomOffset));
1112
1113                 numClusters_[cell] =
1114                         std::min(c_gpuNumClusterPerCell, (numAtoms + geometry_.numAtomsICluster - 1)
1115                                                                  / geometry_.numAtomsICluster);
1116
1117                 /* Store the z-boundaries of the bounding box of the cell */
1118                 bbcz_[cell].lower = x[atomIndices[atomOffsetZ]][ZZ];
1119                 bbcz_[cell].upper = x[atomIndices[atomOffsetZ + numAtoms - 1]][ZZ];
1120             }
1121
1122             if (c_gpuNumClusterPerCellY > 1)
1123             {
1124                 /* Sort the atoms along y */
1125                 sort_atoms(YY, (sub_z & 1) != 0, dd_zone, relevantAtomsAreWithinGridBounds,
1126                            atomIndices.data() + atomOffsetZ, numAtomsZ, x,
1127                            dimensions_.lowerCorner[YY] + gridY * dimensions_.cellSize[YY],
1128                            dimensions_.invCellSize[YY], subdiv_z, sort_work);
1129             }
1130
1131             for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1132             {
1133                 const int atomOffsetY = atomOffsetZ + sub_y * subdiv_y;
1134                 const int numAtomsY = std::min(subdiv_y, numAtomsInColumn - (atomOffsetY - atomOffset));
1135
1136                 if (c_gpuNumClusterPerCellX > 1)
1137                 {
1138                     /* Sort the atoms along x */
1139                     sort_atoms(XX, ((cz * c_gpuNumClusterPerCellY + sub_y) & 1) != 0, dd_zone,
1140                                relevantAtomsAreWithinGridBounds, atomIndices.data() + atomOffsetY, numAtomsY,
1141                                x, dimensions_.lowerCorner[XX] + gridX * dimensions_.cellSize[XX],
1142                                dimensions_.invCellSize[XX], subdiv_y, sort_work);
1143                 }
1144
1145                 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1146                 {
1147                     const int atomOffsetX = atomOffsetY + sub_x * subdiv_x;
1148                     const int numAtomsX =
1149                             std::min(subdiv_x, numAtomsInColumn - (atomOffsetX - atomOffset));
1150
1151                     fillCell(gridSetData, nbat, atomOffsetX, atomOffsetX + numAtomsX, atinfo, x,
1152                              bb_work_aligned);
1153                 }
1154             }
1155         }
1156
1157         /* Set the unused atom indices to -1 */
1158         for (int ind = numAtomsInColumn; ind < numCellsInColumn * numAtomsPerCell; ind++)
1159         {
1160             atomIndices[atomOffset + ind] = -1;
1161         }
1162     }
1163 }
1164
1165 /*! \brief Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
1166 static void setCellAndAtomCount(gmx::ArrayRef<int> cell, int cellIndex, gmx::ArrayRef<int> cxy_na, int atomIndex)
1167 {
1168     cell[atomIndex] = cellIndex;
1169     cxy_na[cellIndex] += 1;
1170 }
1171
1172 void Grid::calcColumnIndices(const Grid::Dimensions&        gridDims,
1173                              const gmx::UpdateGroupsCog*    updateGroupsCog,
1174                              const gmx::Range<int>          atomRange,
1175                              gmx::ArrayRef<const gmx::RVec> x,
1176                              const int                      dd_zone,
1177                              const int*                     move,
1178                              const int                      thread,
1179                              const int                      nthread,
1180                              gmx::ArrayRef<int>             cell,
1181                              gmx::ArrayRef<int>             cxy_na)
1182 {
1183     const int numColumns = gridDims.numCells[XX] * gridDims.numCells[YY];
1184
1185     /* We add one extra cell for particles which moved during DD */
1186     for (int i = 0; i < numColumns; i++)
1187     {
1188         cxy_na[i] = 0;
1189     }
1190
1191     int taskAtomStart = *atomRange.begin() + static_cast<int>((thread + 0) * atomRange.size()) / nthread;
1192     int taskAtomEnd = *atomRange.begin() + static_cast<int>((thread + 1) * atomRange.size()) / nthread;
1193
1194     if (dd_zone == 0)
1195     {
1196         /* Home zone */
1197         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1198         {
1199             if (move == nullptr || move[i] >= 0)
1200             {
1201
1202                 const gmx::RVec& coord = (updateGroupsCog ? updateGroupsCog->cogForAtom(i) : x[i]);
1203
1204                 /* We need to be careful with rounding,
1205                  * particles might be a few bits outside the local zone.
1206                  * The int cast takes care of the lower bound,
1207                  * we will explicitly take care of the upper bound.
1208                  */
1209                 int cx = static_cast<int>((coord[XX] - gridDims.lowerCorner[XX])
1210                                           * gridDims.invCellSize[XX]);
1211                 int cy = static_cast<int>((coord[YY] - gridDims.lowerCorner[YY])
1212                                           * gridDims.invCellSize[YY]);
1213
1214 #ifndef NDEBUG
1215                 if (cx < 0 || cx > gridDims.numCells[XX] || cy < 0 || cy > gridDims.numCells[YY])
1216                 {
1217                     gmx_fatal(FARGS,
1218                               "grid cell cx %d cy %d out of range (max %d %d)\n"
1219                               "atom %f %f %f, grid->c0 %f %f",
1220                               cx, cy, gridDims.numCells[XX], gridDims.numCells[YY], x[i][XX],
1221                               x[i][YY], x[i][ZZ], gridDims.lowerCorner[XX], gridDims.lowerCorner[YY]);
1222                 }
1223 #endif
1224                 /* Take care of potential rouding issues */
1225                 cx = std::min(cx, gridDims.numCells[XX] - 1);
1226                 cy = std::min(cy, gridDims.numCells[YY] - 1);
1227
1228                 /* For the moment cell will contain only the, grid local,
1229                  * x and y indices, not z.
1230                  */
1231                 setCellAndAtomCount(cell, cx * gridDims.numCells[YY] + cy, cxy_na, i);
1232             }
1233             else
1234             {
1235                 /* Put this moved particle after the end of the grid,
1236                  * so we can process it later without using conditionals.
1237                  */
1238                 setCellAndAtomCount(cell, numColumns, cxy_na, i);
1239             }
1240         }
1241     }
1242     else
1243     {
1244         /* Non-home zone */
1245         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1246         {
1247             int cx = static_cast<int>((x[i][XX] - gridDims.lowerCorner[XX]) * gridDims.invCellSize[XX]);
1248             int cy = static_cast<int>((x[i][YY] - gridDims.lowerCorner[YY]) * gridDims.invCellSize[YY]);
1249
1250             /* For non-home zones there could be particles outside
1251              * the non-bonded cut-off range, which have been communicated
1252              * for bonded interactions only. For the result it doesn't
1253              * matter where these end up on the grid. For performance
1254              * we put them in an extra row at the border.
1255              */
1256             cx = std::max(cx, 0);
1257             cx = std::min(cx, gridDims.numCells[XX] - 1);
1258             cy = std::max(cy, 0);
1259             cy = std::min(cy, gridDims.numCells[YY] - 1);
1260
1261             /* For the moment cell will contain only the, grid local,
1262              * x and y indices, not z.
1263              */
1264             setCellAndAtomCount(cell, cx * gridDims.numCells[YY] + cy, cxy_na, i);
1265         }
1266     }
1267 }
1268
1269 /*! \brief Resizes grid and atom data which depend on the number of cells */
1270 static void resizeForNumberOfCells(const int         numNbnxnAtoms,
1271                                    const int         numAtomsMoved,
1272                                    GridSetData*      gridSetData,
1273                                    nbnxn_atomdata_t* nbat)
1274 {
1275     /* Note: gridSetData->cellIndices was already resized before */
1276
1277     /* To avoid conditionals we store the moved particles at the end of a,
1278      * make sure we have enough space.
1279      */
1280     gridSetData->atomIndices.resize(numNbnxnAtoms + numAtomsMoved);
1281
1282     /* Make space in nbat for storing the atom coordinates */
1283     nbat->resizeCoordinateBuffer(numNbnxnAtoms);
1284 }
1285
1286 void Grid::setCellIndices(int                            ddZone,
1287                           int                            cellOffset,
1288                           GridSetData*                   gridSetData,
1289                           gmx::ArrayRef<GridWork>        gridWork,
1290                           const gmx::Range<int>          atomRange,
1291                           const int*                     atinfo,
1292                           gmx::ArrayRef<const gmx::RVec> x,
1293                           const int                      numAtomsMoved,
1294                           nbnxn_atomdata_t*              nbat)
1295 {
1296     cellOffset_ = cellOffset;
1297
1298     srcAtomBegin_ = *atomRange.begin();
1299     srcAtomEnd_   = *atomRange.end();
1300
1301     const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1302
1303     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1304
1305     /* Make the cell index as a function of x and y */
1306     int ncz_max = 0;
1307     int ncz     = 0;
1308     cxy_ind_[0] = 0;
1309     for (int i = 0; i < numColumns() + 1; i++)
1310     {
1311         /* We set ncz_max at the beginning of the loop iso at the end
1312          * to skip i=grid->ncx*grid->numCells[YY] which are moved particles
1313          * that do not need to be ordered on the grid.
1314          */
1315         if (ncz > ncz_max)
1316         {
1317             ncz_max = ncz;
1318         }
1319         int cxy_na_i = gridWork[0].numAtomsPerColumn[i];
1320         for (int thread = 1; thread < nthread; thread++)
1321         {
1322             cxy_na_i += gridWork[thread].numAtomsPerColumn[i];
1323         }
1324         ncz = (cxy_na_i + numAtomsPerCell - 1) / numAtomsPerCell;
1325         if (nbat->XFormat == nbatX8)
1326         {
1327             /* Make the number of cell a multiple of 2 */
1328             ncz = (ncz + 1) & ~1;
1329         }
1330         cxy_ind_[i + 1] = cxy_ind_[i] + ncz;
1331         /* Clear cxy_na_, so we can reuse the array below */
1332         cxy_na_[i] = 0;
1333     }
1334     numCellsTotal_     = cxy_ind_[numColumns()] - cxy_ind_[0];
1335     numCellsColumnMax_ = ncz_max;
1336
1337     /* Resize grid and atom data which depend on the number of cells */
1338     resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, gridSetData, nbat);
1339
1340     if (debug)
1341     {
1342         fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1343                 numAtomsPerCell, geometry_.numAtomsICluster, numCellsTotal_, dimensions_.numCells[XX],
1344                 dimensions_.numCells[YY], numCellsTotal_ / (static_cast<double>(numColumns())), ncz_max);
1345         if (gmx_debug_at)
1346         {
1347             int i = 0;
1348             for (int cy = 0; cy < dimensions_.numCells[YY]; cy++)
1349             {
1350                 for (int cx = 0; cx < dimensions_.numCells[XX]; cx++)
1351                 {
1352                     fprintf(debug, " %2d", cxy_ind_[i + 1] - cxy_ind_[i]);
1353                     i++;
1354                 }
1355                 fprintf(debug, "\n");
1356             }
1357         }
1358     }
1359
1360     /* Make sure the work array for sorting is large enough */
1361     const int worstCaseSortBufferSize = ncz_max * numAtomsPerCell * c_sortGridMaxSizeFactor;
1362     if (worstCaseSortBufferSize > gmx::index(gridWork[0].sortBuffer.size()))
1363     {
1364         for (GridWork& work : gridWork)
1365         {
1366             /* Elements not in use should be -1 */
1367             work.sortBuffer.resize(worstCaseSortBufferSize, -1);
1368         }
1369     }
1370
1371     /* Now we know the dimensions we can fill the grid.
1372      * This is the first, unsorted fill. We sort the columns after this.
1373      */
1374     gmx::ArrayRef<int> cells       = gridSetData->cells;
1375     gmx::ArrayRef<int> atomIndices = gridSetData->atomIndices;
1376     for (int i : atomRange)
1377     {
1378         /* At this point nbs->cell contains the local grid x,y indices */
1379         const int cxy                                        = cells[i];
1380         atomIndices[firstAtomInColumn(cxy) + cxy_na_[cxy]++] = i;
1381     }
1382
1383     if (ddZone == 0)
1384     {
1385         /* Set the cell indices for the moved particles */
1386         int n0 = numCellsTotal_ * numAtomsPerCell;
1387         int n1 = numCellsTotal_ * numAtomsPerCell + cxy_na_[numColumns()];
1388         for (int i = n0; i < n1; i++)
1389         {
1390             cells[atomIndices[i]] = i;
1391         }
1392     }
1393
1394     /* Sort the super-cell columns along z into the sub-cells. */
1395 #pragma omp parallel for num_threads(nthread) schedule(static)
1396     for (int thread = 0; thread < nthread; thread++)
1397     {
1398         try
1399         {
1400             gmx::Range<int> columnRange(((thread + 0) * numColumns()) / nthread,
1401                                         ((thread + 1) * numColumns()) / nthread);
1402             if (geometry_.isSimple)
1403             {
1404                 sortColumnsCpuGeometry(gridSetData, ddZone, atinfo, x, nbat, columnRange,
1405                                        gridWork[thread].sortBuffer);
1406             }
1407             else
1408             {
1409                 sortColumnsGpuGeometry(gridSetData, ddZone, atinfo, x, nbat, columnRange,
1410                                        gridWork[thread].sortBuffer);
1411             }
1412         }
1413         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1414     }
1415
1416     if (geometry_.isSimple && nbat->XFormat == nbatX8)
1417     {
1418         combine_bounding_box_pairs(*this, bb_, bbj_);
1419     }
1420
1421     if (!geometry_.isSimple)
1422     {
1423         numClustersTotal_ = 0;
1424         for (int i = 0; i < numCellsTotal_; i++)
1425         {
1426             numClustersTotal_ += numClusters_[i];
1427         }
1428     }
1429
1430     if (debug)
1431     {
1432         if (geometry_.isSimple)
1433         {
1434             print_bbsizes_simple(debug, *this);
1435         }
1436         else
1437         {
1438             fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n", numClustersTotal_,
1439                     atomRange.size() / static_cast<double>(numClustersTotal_));
1440
1441             print_bbsizes_supersub(debug, *this);
1442         }
1443     }
1444 }
1445
1446 } // namespace Nbnxm