src/gromacs/nbnxm/grid.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 /*! \internal \file
  38  *
  39  * \brief
  40  * Implements the Grid class.
  41  *
  42  * \author Berk Hess <hess@kth.se>
  43  * \ingroup module_nbnxm
  44  */
  45
  46 #include "gmxpre.h"
  47
  48 #include "grid.h"
  49
  50 #include <cmath>
  51 #include <cstring>
  52
  53 #include <algorithm>
  54
  55 #include "gromacs/math/utilities.h"
  56 #include "gromacs/math/vec.h"
  57 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  58 #include "gromacs/mdlib/updategroupscog.h"
  59 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
  60 #include "gromacs/nbnxm/atomdata.h"
  61 #include "gromacs/simd/simd.h"
  62 #include "gromacs/simd/vector_operations.h"
  63
  64 #include "boundingboxes.h"
  65 #include "gridsetdata.h"
  66 #include "nbnxm_geometry.h"
  67 #include "pairlistparams.h"
  68
  69 namespace Nbnxm
  70 {
  71
  72 Grid::Geometry::Geometry(const PairlistType pairlistType) :
  73     isSimple(pairlistType != PairlistType::HierarchicalNxN),
  74     numAtomsICluster(IClusterSizePerListType[pairlistType]),
  75     numAtomsJCluster(JClusterSizePerListType[pairlistType]),
  76     numAtomsPerCell((isSimple ? 1 : c_gpuNumClusterPerCell) * numAtomsICluster),
  77     numAtomsICluster2Log(get_2log(numAtomsICluster))
  78 {
  79 }
  80
  81 Grid::Grid(const PairlistType pairlistType, const bool& haveFep) :
  82     geometry_(pairlistType),
  83     haveFep_(haveFep)
  84 {
  85 }
  86
  87 /*! \brief Returns the atom density (> 0) of a rectangular grid */
  88 static real gridAtomDensity(int numAtoms, const rvec lowerCorner, const rvec upperCorner)
  89 {
  90     rvec size;
  91
  92     if (numAtoms == 0)
  93     {
  94         /* To avoid zero density we use a minimum of 1 atom */
  95         numAtoms = 1;
  96     }
  97
  98     rvec_sub(upperCorner, lowerCorner, size);
  99
 100     return static_cast<real>(numAtoms) / (size[XX] * size[YY] * size[ZZ]);
 101 }
 102
 103 void Grid::setDimensions(const int          ddZone,
 104                          const int          numAtoms,
 105                          gmx::RVec          lowerCorner,
 106                          gmx::RVec          upperCorner,
 107                          real               atomDensity,
 108                          const real         maxAtomGroupRadius,
 109                          const bool         haveFep,
 110                          gmx::PinningPolicy pinningPolicy)
 111 {
 112     /* We allow passing lowerCorner=upperCorner, in which case we need to
 113      * create a finite sized bounding box to avoid division by zero.
 114      * We use a minimum size such that the volume fits in float with some
 115      * margin for computing and using the atom number density.
 116      */
 117     constexpr real c_minimumGridSize = 1e-10;
 118     for (int d = 0; d < DIM; d++)
 119     {
 120         GMX_ASSERT(upperCorner[d] >= lowerCorner[d],
 121                    "Upper corner should be larger than the lower corner");
 122         if (upperCorner[d] - lowerCorner[d] < c_minimumGridSize)
 123         {
 124             /* Ensure we apply a correction to the bounding box */
 125             real correction =
 126                     std::max(std::abs(lowerCorner[d]) * GMX_REAL_EPS, 0.5_real * c_minimumGridSize);
 127             lowerCorner[d] -= correction;
 128             upperCorner[d] += correction;
 129         }
 130     }
 131
 132     /* For the home zone we compute the density when not set (=-1) or when =0 */
 133     if (ddZone == 0 && atomDensity <= 0)
 134     {
 135         atomDensity = gridAtomDensity(numAtoms, lowerCorner, upperCorner);
 136     }
 137
 138     dimensions_.atomDensity        = atomDensity;
 139     dimensions_.maxAtomGroupRadius = maxAtomGroupRadius;
 140
 141     rvec size;
 142     rvec_sub(upperCorner, lowerCorner, size);
 143
 144     if (numAtoms > geometry_.numAtomsPerCell)
 145     {
 146         GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
 147
 148         /* target cell length */
 149         real tlen_x;
 150         real tlen_y;
 151         if (geometry_.isSimple)
 152         {
 153             /* To minimize the zero interactions, we should make
 154              * the largest of the i/j cell cubic.
 155              */
 156             int numAtomsInCell = std::max(geometry_.numAtomsICluster, geometry_.numAtomsJCluster);
 157
 158             /* Approximately cubic cells */
 159             real tlen = std::cbrt(numAtomsInCell / atomDensity);
 160             tlen_x    = tlen;
 161             tlen_y    = tlen;
 162         }
 163         else
 164         {
 165             /* Approximately cubic sub cells */
 166             real tlen = std::cbrt(geometry_.numAtomsICluster / atomDensity);
 167             tlen_x    = tlen * c_gpuNumClusterPerCellX;
 168             tlen_y    = tlen * c_gpuNumClusterPerCellY;
 169         }
 170         /* We round ncx and ncy down, because we get less cell pairs
 171          * in the pairlist when the fixed cell dimensions (x,y) are
 172          * larger than the variable one (z) than the other way around.
 173          */
 174         dimensions_.numCells[XX] = std::max(1, static_cast<int>(size[XX] / tlen_x));
 175         dimensions_.numCells[YY] = std::max(1, static_cast<int>(size[YY] / tlen_y));
 176     }
 177     else
 178     {
 179         dimensions_.numCells[XX] = 1;
 180         dimensions_.numCells[YY] = 1;
 181     }
 182
 183     for (int d = 0; d < DIM - 1; d++)
 184     {
 185         dimensions_.cellSize[d]    = size[d] / dimensions_.numCells[d];
 186         dimensions_.invCellSize[d] = 1 / dimensions_.cellSize[d];
 187     }
 188
 189     if (ddZone > 0)
 190     {
 191         /* This is a non-home zone, add an extra row of cells
 192          * for particles communicated for bonded interactions.
 193          * These can be beyond the cut-off. It doesn't matter where
 194          * they end up on the grid, but for performance it's better
 195          * if they don't end up in cells that can be within cut-off range.
 196          */
 197         dimensions_.numCells[XX]++;
 198         dimensions_.numCells[YY]++;
 199     }
 200
 201     /* We need one additional cell entry for particles moved by DD */
 202     cxy_na_.resize(numColumns() + 1);
 203     cxy_ind_.resize(numColumns() + 2);
 204     changePinningPolicy(&cxy_na_, pinningPolicy);
 205     changePinningPolicy(&cxy_ind_, pinningPolicy);
 206
 207     /* Worst case scenario of 1 atom in each last cell */
 208     int maxNumCells;
 209     if (geometry_.numAtomsJCluster <= geometry_.numAtomsICluster)
 210     {
 211         maxNumCells = numAtoms / geometry_.numAtomsPerCell + numColumns();
 212     }
 213     else
 214     {
 215         maxNumCells = numAtoms / geometry_.numAtomsPerCell
 216                       + numColumns() * geometry_.numAtomsJCluster / geometry_.numAtomsICluster;
 217     }
 218
 219     if (!geometry_.isSimple)
 220     {
 221         numClusters_.resize(maxNumCells);
 222     }
 223     bbcz_.resize(maxNumCells);
 224
 225     /* This resize also zeros the contents, this avoid possible
 226      * floating exceptions in SIMD with the unused bb elements.
 227      */
 228     if (geometry_.isSimple)
 229     {
 230         bb_.resize(maxNumCells);
 231     }
 232     else
 233     {
 234 #if NBNXN_BBXXXX
 235         pbb_.resize(packedBoundingBoxesIndex(maxNumCells * c_gpuNumClusterPerCell));
 236 #else
 237         bb_.resize(maxNumCells * c_gpuNumClusterPerCell);
 238 #endif
 239     }
 240
 241     if (geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 242     {
 243         bbj_ = bb_;
 244     }
 245     else
 246     {
 247         GMX_ASSERT(geometry_.isSimple, "Only CPU lists should have different i/j cluster sizes");
 248
 249         bbjStorage_.resize(maxNumCells * geometry_.numAtomsICluster / geometry_.numAtomsJCluster);
 250         bbj_ = bbjStorage_;
 251     }
 252
 253     flags_.resize(maxNumCells);
 254     if (haveFep)
 255     {
 256         fep_.resize(maxNumCells * geometry_.numAtomsPerCell / geometry_.numAtomsICluster);
 257     }
 258
 259     copy_rvec(lowerCorner, dimensions_.lowerCorner);
 260     copy_rvec(upperCorner, dimensions_.upperCorner);
 261     copy_rvec(size, dimensions_.gridSize);
 262 }
 263
 264 /* We need to sort paricles in grid columns on z-coordinate.
 265  * As particle are very often distributed homogeneously, we use a sorting
 266  * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 267  * by a factor, cast to an int and try to store in that hole. If the hole
 268  * is full, we move this or another particle. A second pass is needed to make
 269  * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 270  * 4 is the optimal value for homogeneous particle distribution and allows
 271  * for an O(#particles) sort up till distributions were all particles are
 272  * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 273  * as it can be expensive to detect imhomogeneous particle distributions.
 274  */
 275 /*! \brief Ratio of grid cells to atoms */
 276 static constexpr int c_sortGridRatio = 4;
 277 /*! \brief Maximum ratio of holes used, in the worst case all particles
 278  * end up in the last hole and we need num. atoms extra holes at the end.
 279  */
 280 static constexpr int c_sortGridMaxSizeFactor = c_sortGridRatio + 1;
 281
 282 /*! \brief Sorts particle index a on coordinates x along dim.
 283  *
 284  * Backwards tells if we want decreasing iso increasing coordinates.
 285  * h0 is the minimum of the coordinate range.
 286  * invh is the 1/length of the sorting range.
 287  * n_per_h (>=n) is the expected average number of particles per 1/invh
 288  * sort is the sorting work array.
 289  * sort should have a size of at least n_per_h*c_sortGridRatio + n,
 290  * or easier, allocate at least n*c_sortGridMaxSizeFactor elements.
 291  */
 292 static void sort_atoms(int      dim,
 293                        gmx_bool Backwards,
 294                        int gmx_unused dd_zone,
 295                        bool gmx_unused                relevantAtomsAreWithinGridBounds,
 296                        int*                           a,
 297                        int                            n,
 298                        gmx::ArrayRef<const gmx::RVec> x,
 299                        real                           h0,
 300                        real                           invh,
 301                        int                            n_per_h,
 302                        gmx::ArrayRef<int>             sort)
 303 {
 304     if (n <= 1)
 305     {
 306         /* Nothing to do */
 307         return;
 308     }
 309
 310     GMX_ASSERT(n <= n_per_h, "We require n <= n_per_h");
 311
 312     /* Transform the inverse range height into the inverse hole height */
 313     invh *= n_per_h * c_sortGridRatio;
 314
 315     /* Set nsort to the maximum possible number of holes used.
 316      * In worst case all n elements end up in the last bin.
 317      */
 318     int nsort = n_per_h * c_sortGridRatio + n;
 319
 320     /* Determine the index range used, so we can limit it for the second pass */
 321     int zi_min = INT_MAX;
 322     int zi_max = -1;
 323
 324     /* Sort the particles using a simple index sort */
 325     for (int i = 0; i < n; i++)
 326     {
 327         /* The cast takes care of float-point rounding effects below zero.
 328          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 329          * times the box height out of the box.
 330          */
 331         int zi = static_cast<int>((x[a[i]][dim] - h0) * invh);
 332
 333 #ifndef NDEBUG
 334         /* As we can have rounding effect, we use > iso >= here */
 335         if (relevantAtomsAreWithinGridBounds && (zi < 0 || (dd_zone == 0 && zi > n_per_h * c_sortGridRatio)))
 336         {
 337             gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n", a[i],
 338                       'x' + dim, x[a[i]][dim], h0, invh, zi, n_per_h, c_sortGridRatio);
 339         }
 340 #endif
 341         if (zi < 0)
 342         {
 343             zi = 0;
 344         }
 345
 346         /* In a non-local domain, particles communcated for bonded interactions
 347          * can be far beyond the grid size, which is set by the non-bonded
 348          * cut-off distance. We sort such particles into the last cell.
 349          */
 350         if (zi > n_per_h * c_sortGridRatio)
 351         {
 352             zi = n_per_h * c_sortGridRatio;
 353         }
 354
 355         /* Ideally this particle should go in sort cell zi,
 356          * but that might already be in use,
 357          * in that case find the first empty cell higher up
 358          */
 359         if (sort[zi] < 0)
 360         {
 361             sort[zi] = a[i];
 362             zi_min   = std::min(zi_min, zi);
 363             zi_max   = std::max(zi_max, zi);
 364         }
 365         else
 366         {
 367             /* We have multiple atoms in the same sorting slot.
 368              * Sort on real z for minimal bounding box size.
 369              * There is an extra check for identical z to ensure
 370              * well-defined output order, independent of input order
 371              * to ensure binary reproducibility after restarts.
 372              */
 373             while (sort[zi] >= 0
 374                    && (x[a[i]][dim] > x[sort[zi]][dim]
 375                        || (x[a[i]][dim] == x[sort[zi]][dim] && a[i] > sort[zi])))
 376             {
 377                 zi++;
 378             }
 379
 380             if (sort[zi] >= 0)
 381             {
 382                 /* Shift all elements by one slot until we find an empty slot */
 383                 int cp  = sort[zi];
 384                 int zim = zi + 1;
 385                 while (sort[zim] >= 0)
 386                 {
 387                     int tmp   = sort[zim];
 388                     sort[zim] = cp;
 389                     cp        = tmp;
 390                     zim++;
 391                 }
 392                 sort[zim] = cp;
 393                 zi_max    = std::max(zi_max, zim);
 394             }
 395             sort[zi] = a[i];
 396             zi_max   = std::max(zi_max, zi);
 397         }
 398     }
 399
 400     int c = 0;
 401     if (!Backwards)
 402     {
 403         for (int zi = 0; zi < nsort; zi++)
 404         {
 405             if (sort[zi] >= 0)
 406             {
 407                 a[c++]   = sort[zi];
 408                 sort[zi] = -1;
 409             }
 410         }
 411     }
 412     else
 413     {
 414         for (int zi = zi_max; zi >= zi_min; zi--)
 415         {
 416             if (sort[zi] >= 0)
 417             {
 418                 a[c++]   = sort[zi];
 419                 sort[zi] = -1;
 420             }
 421         }
 422     }
 423     if (c < n)
 424     {
 425         gmx_incons("Lost particles while sorting");
 426     }
 427 }
 428
 429 #if GMX_DOUBLE
 430 //! Returns double up to one least significant float bit smaller than x
 431 static double R2F_D(const float x)
 432 {
 433     return static_cast<float>(x >= 0 ? (1 - GMX_FLOAT_EPS) * x : (1 + GMX_FLOAT_EPS) * x);
 434 }
 435 //! Returns double up to one least significant float bit larger than x
 436 static double R2F_U(const float x)
 437 {
 438     return static_cast<float>(x >= 0 ? (1 + GMX_FLOAT_EPS) * x : (1 - GMX_FLOAT_EPS) * x);
 439 }
 440 #else
 441 //! Returns x
 442 static float R2F_D(const float x)
 443 {
 444     return x;
 445 }
 446 //! Returns x
 447 static float R2F_U(const float x)
 448 {
 449     return x;
 450 }
 451 #endif
 452
 453 //! Computes the bounding box for na coordinates in order x,y,z, bb order xyz0
 454 static void calc_bounding_box(int na, int stride, const real* x, BoundingBox* bb)
 455 {
 456     int  i;
 457     real xl, xh, yl, yh, zl, zh;
 458
 459     i  = 0;
 460     xl = x[i + XX];
 461     xh = x[i + XX];
 462     yl = x[i + YY];
 463     yh = x[i + YY];
 464     zl = x[i + ZZ];
 465     zh = x[i + ZZ];
 466     i += stride;
 467     for (int j = 1; j < na; j++)
 468     {
 469         xl = std::min(xl, x[i + XX]);
 470         xh = std::max(xh, x[i + XX]);
 471         yl = std::min(yl, x[i + YY]);
 472         yh = std::max(yh, x[i + YY]);
 473         zl = std::min(zl, x[i + ZZ]);
 474         zh = std::max(zh, x[i + ZZ]);
 475         i += stride;
 476     }
 477     /* Note: possible double to float conversion here */
 478     bb->lower.x = R2F_D(xl);
 479     bb->lower.y = R2F_D(yl);
 480     bb->lower.z = R2F_D(zl);
 481     bb->upper.x = R2F_U(xh);
 482     bb->upper.y = R2F_U(yh);
 483     bb->upper.z = R2F_U(zh);
 484 }
 485
 486 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 487 static void calc_bounding_box_x_x4(int na, const real* x, BoundingBox* bb)
 488 {
 489     real xl, xh, yl, yh, zl, zh;
 490
 491     xl = x[XX * c_packX4];
 492     xh = x[XX * c_packX4];
 493     yl = x[YY * c_packX4];
 494     yh = x[YY * c_packX4];
 495     zl = x[ZZ * c_packX4];
 496     zh = x[ZZ * c_packX4];
 497     for (int j = 1; j < na; j++)
 498     {
 499         xl = std::min(xl, x[j + XX * c_packX4]);
 500         xh = std::max(xh, x[j + XX * c_packX4]);
 501         yl = std::min(yl, x[j + YY * c_packX4]);
 502         yh = std::max(yh, x[j + YY * c_packX4]);
 503         zl = std::min(zl, x[j + ZZ * c_packX4]);
 504         zh = std::max(zh, x[j + ZZ * c_packX4]);
 505     }
 506     /* Note: possible double to float conversion here */
 507     bb->lower.x = R2F_D(xl);
 508     bb->lower.y = R2F_D(yl);
 509     bb->lower.z = R2F_D(zl);
 510     bb->upper.x = R2F_U(xh);
 511     bb->upper.y = R2F_U(yh);
 512     bb->upper.z = R2F_U(zh);
 513 }
 514
 515 /*! \brief Computes the bounding box for na coordinates, bb order xyz0 */
 516 static void calc_bounding_box_x_x8(int na, const real* x, BoundingBox* bb)
 517 {
 518     real xl, xh, yl, yh, zl, zh;
 519
 520     xl = x[XX * c_packX8];
 521     xh = x[XX * c_packX8];
 522     yl = x[YY * c_packX8];
 523     yh = x[YY * c_packX8];
 524     zl = x[ZZ * c_packX8];
 525     zh = x[ZZ * c_packX8];
 526     for (int j = 1; j < na; j++)
 527     {
 528         xl = std::min(xl, x[j + XX * c_packX8]);
 529         xh = std::max(xh, x[j + XX * c_packX8]);
 530         yl = std::min(yl, x[j + YY * c_packX8]);
 531         yh = std::max(yh, x[j + YY * c_packX8]);
 532         zl = std::min(zl, x[j + ZZ * c_packX8]);
 533         zh = std::max(zh, x[j + ZZ * c_packX8]);
 534     }
 535     /* Note: possible double to float conversion here */
 536     bb->lower.x = R2F_D(xl);
 537     bb->lower.y = R2F_D(yl);
 538     bb->lower.z = R2F_D(zl);
 539     bb->upper.x = R2F_U(xh);
 540     bb->upper.y = R2F_U(yh);
 541     bb->upper.z = R2F_U(zh);
 542 }
 543
 544 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 545 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real* x, BoundingBox* bb, BoundingBox* bbj)
 546 {
 547     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 548     using namespace gmx;
 549
 550     calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
 551
 552     if (na > 2)
 553     {
 554         calc_bounding_box_x_x4(std::min(na - 2, 2), x + (c_packX4 >> 1), bbj + 1);
 555     }
 556     else
 557     {
 558         /* Set the "empty" bounding box to the same as the first one,
 559          * so we don't need to treat special cases in the rest of the code.
 560          */
 561 #if NBNXN_SEARCH_BB_SIMD4
 562         store4(bbj[1].lower.ptr(), load4(bbj[0].lower.ptr()));
 563         store4(bbj[1].upper.ptr(), load4(bbj[0].upper.ptr()));
 564 #else
 565         bbj[1] = bbj[0];
 566 #endif
 567     }
 568
 569 #if NBNXN_SEARCH_BB_SIMD4
 570     store4(bb->lower.ptr(), min(load4(bbj[0].lower.ptr()), load4(bbj[1].lower.ptr())));
 571     store4(bb->upper.ptr(), max(load4(bbj[0].upper.ptr()), load4(bbj[1].upper.ptr())));
 572 #else
 573     {
 574         bb->lower = BoundingBox::Corner::min(bbj[0].lower, bbj[1].lower);
 575         bb->upper = BoundingBox::Corner::max(bbj[0].upper, bbj[1].upper);
 576     }
 577 #endif
 578 }
 579
 580 #if NBNXN_BBXXXX
 581
 582 /*! \brief Computes the bounding box for na coordinates in order xyz, bb order xxxxyyyyzzzz */
 583 static void calc_bounding_box_xxxx(int na, int stride, const real* x, float* bb)
 584 {
 585     int  i;
 586     real xl, xh, yl, yh, zl, zh;
 587
 588     i  = 0;
 589     xl = x[i + XX];
 590     xh = x[i + XX];
 591     yl = x[i + YY];
 592     yh = x[i + YY];
 593     zl = x[i + ZZ];
 594     zh = x[i + ZZ];
 595     i += stride;
 596     for (int j = 1; j < na; j++)
 597     {
 598         xl = std::min(xl, x[i + XX]);
 599         xh = std::max(xh, x[i + XX]);
 600         yl = std::min(yl, x[i + YY]);
 601         yh = std::max(yh, x[i + YY]);
 602         zl = std::min(zl, x[i + ZZ]);
 603         zh = std::max(zh, x[i + ZZ]);
 604         i += stride;
 605     }
 606     /* Note: possible double to float conversion here */
 607     bb[0 * c_packedBoundingBoxesDimSize] = R2F_D(xl);
 608     bb[1 * c_packedBoundingBoxesDimSize] = R2F_D(yl);
 609     bb[2 * c_packedBoundingBoxesDimSize] = R2F_D(zl);
 610     bb[3 * c_packedBoundingBoxesDimSize] = R2F_U(xh);
 611     bb[4 * c_packedBoundingBoxesDimSize] = R2F_U(yh);
 612     bb[5 * c_packedBoundingBoxesDimSize] = R2F_U(zh);
 613 }
 614
 615 #endif /* NBNXN_BBXXXX */
 616
 617 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 618
 619 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xyz0 */
 620 static void calc_bounding_box_simd4(int na, const float* x, BoundingBox* bb)
 621 {
 622     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 623     using namespace gmx;
 624
 625     static_assert(sizeof(BoundingBox::Corner) == GMX_SIMD4_WIDTH * sizeof(float),
 626                   "The Corner struct should hold exactly 4 floats");
 627
 628     Simd4Float bb_0_S = load4(x);
 629     Simd4Float bb_1_S = bb_0_S;
 630
 631     for (int i = 1; i < na; i++)
 632     {
 633         Simd4Float x_S = load4(x + i * GMX_SIMD4_WIDTH);
 634         bb_0_S         = min(bb_0_S, x_S);
 635         bb_1_S         = max(bb_1_S, x_S);
 636     }
 637
 638     store4(bb->lower.ptr(), bb_0_S);
 639     store4(bb->upper.ptr(), bb_1_S);
 640 }
 641
 642 #    if NBNXN_BBXXXX
 643
 644 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xxxxyyyyzzzz */
 645 static void calc_bounding_box_xxxx_simd4(int na, const float* x, BoundingBox* bb_work_aligned, real* bb)
 646 {
 647     calc_bounding_box_simd4(na, x, bb_work_aligned);
 648
 649     bb[0 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.x;
 650     bb[1 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.y;
 651     bb[2 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.z;
 652     bb[3 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.x;
 653     bb[4 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.y;
 654     bb[5 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.z;
 655 }
 656
 657 #    endif /* NBNXN_BBXXXX */
 658
 659 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
 660
 661
 662 /*! \brief Combines pairs of consecutive bounding boxes */
 663 static void combine_bounding_box_pairs(const Grid&                      grid,
 664                                        gmx::ArrayRef<const BoundingBox> bb,
 665                                        gmx::ArrayRef<BoundingBox>       bbj)
 666 {
 667     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 668     using namespace gmx;
 669
 670     for (int i = 0; i < grid.numColumns(); i++)
 671     {
 672         /* Starting bb in a column is expected to be 2-aligned */
 673         const int sc2 = grid.firstCellInColumn(i) >> 1;
 674         /* For odd numbers skip the last bb here */
 675         const int nc2 = (grid.numAtomsInColumn(i) + 3) >> (2 + 1);
 676         int       c2;
 677         for (c2 = sc2; c2 < sc2 + nc2; c2++)
 678         {
 679 #if NBNXN_SEARCH_BB_SIMD4
 680             Simd4Float min_S, max_S;
 681
 682             min_S = min(load4(bb[c2 * 2 + 0].lower.ptr()), load4(bb[c2 * 2 + 1].lower.ptr()));
 683             max_S = max(load4(bb[c2 * 2 + 0].upper.ptr()), load4(bb[c2 * 2 + 1].upper.ptr()));
 684             store4(bbj[c2].lower.ptr(), min_S);
 685             store4(bbj[c2].upper.ptr(), max_S);
 686 #else
 687             bbj[c2].lower = BoundingBox::Corner::min(bb[c2 * 2 + 0].lower, bb[c2 * 2 + 1].lower);
 688             bbj[c2].upper = BoundingBox::Corner::max(bb[c2 * 2 + 0].upper, bb[c2 * 2 + 1].upper);
 689 #endif
 690         }
 691         if (((grid.numAtomsInColumn(i) + 3) >> 2) & 1)
 692         {
 693             /* The bb count in this column is odd: duplicate the last bb */
 694             bbj[c2].lower = bb[c2 * 2].lower;
 695             bbj[c2].upper = bb[c2 * 2].upper;
 696         }
 697     }
 698 }
 699
 700
 701 /*! \brief Prints the average bb size, used for debug output */
 702 static void print_bbsizes_simple(FILE* fp, const Grid& grid)
 703 {
 704     dvec ba = { 0 };
 705     for (int c = 0; c < grid.numCells(); c++)
 706     {
 707         const BoundingBox& bb = grid.iBoundingBoxes()[c];
 708         ba[XX] += bb.upper.x - bb.lower.x;
 709         ba[YY] += bb.upper.y - bb.lower.y;
 710         ba[ZZ] += bb.upper.z - bb.lower.z;
 711     }
 712     dsvmul(1.0 / grid.numCells(), ba, ba);
 713
 714     const Grid::Dimensions& dims = grid.dimensions();
 715     real                    avgCellSizeZ =
 716             (dims.atomDensity > 0 ? grid.geometry().numAtomsICluster
 717                                             / (dims.atomDensity * dims.cellSize[XX] * dims.cellSize[YY])
 718                                   : 0.0);
 719
 720     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 721             dims.cellSize[XX], dims.cellSize[YY], avgCellSizeZ, ba[XX], ba[YY], ba[ZZ],
 722             ba[XX] * dims.invCellSize[XX], ba[YY] * dims.invCellSize[YY],
 723             dims.atomDensity > 0 ? ba[ZZ] / avgCellSizeZ : 0.0);
 724 }
 725
 726 /*! \brief Prints the average bb size, used for debug output */
 727 static void print_bbsizes_supersub(FILE* fp, const Grid& grid)
 728 {
 729     int  ns;
 730     dvec ba;
 731
 732     clear_dvec(ba);
 733     ns = 0;
 734     for (int c = 0; c < grid.numCells(); c++)
 735     {
 736 #if NBNXN_BBXXXX
 737         for (int s = 0; s < grid.numClustersPerCell()[c]; s += c_packedBoundingBoxesDimSize)
 738         {
 739             int  cs_w          = (c * c_gpuNumClusterPerCell + s) / c_packedBoundingBoxesDimSize;
 740             auto boundingBoxes = grid.packedBoundingBoxes().subArray(
 741                     cs_w * c_packedBoundingBoxesSize, c_packedBoundingBoxesSize);
 742             for (int i = 0; i < c_packedBoundingBoxesDimSize; i++)
 743             {
 744                 for (int d = 0; d < DIM; d++)
 745                 {
 746                     ba[d] += boundingBoxes[(DIM + d) * c_packedBoundingBoxesDimSize + i]
 747                              - boundingBoxes[(0 + d) * c_packedBoundingBoxesDimSize + i];
 748                 }
 749             }
 750         }
 751 #else
 752         for (int s = 0; s < grid.numClustersPerCell()[c]; s++)
 753         {
 754             const BoundingBox& bb = grid.iBoundingBoxes()[c * c_gpuNumClusterPerCell + s];
 755             ba[XX] += bb.upper.x - bb.lower.x;
 756             ba[YY] += bb.upper.y - bb.lower.y;
 757             ba[ZZ] += bb.upper.z - bb.lower.z;
 758         }
 759 #endif
 760         ns += grid.numClustersPerCell()[c];
 761     }
 762     dsvmul(1.0 / ns, ba, ba);
 763
 764     const Grid::Dimensions& dims = grid.dimensions();
 765     const real              avgClusterSizeZ =
 766             (dims.atomDensity > 0 ? grid.geometry().numAtomsPerCell
 767                                             / (dims.atomDensity * dims.cellSize[XX]
 768                                                * dims.cellSize[YY] * c_gpuNumClusterPerCellZ)
 769                                   : 0.0);
 770
 771     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 772             dims.cellSize[XX] / c_gpuNumClusterPerCellX,
 773             dims.cellSize[YY] / c_gpuNumClusterPerCellY, avgClusterSizeZ, ba[XX], ba[YY], ba[ZZ],
 774             ba[XX] * c_gpuNumClusterPerCellX * dims.invCellSize[XX],
 775             ba[YY] * c_gpuNumClusterPerCellY * dims.invCellSize[YY],
 776             dims.atomDensity > 0 ? ba[ZZ] / avgClusterSizeZ : 0.0);
 777 }
 778
 779 /*!\brief Set non-bonded interaction flags for the current cluster.
 780  *
 781  * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 782  */
 783 static void sort_cluster_on_flag(int                numAtomsInCluster,
 784                                  int                atomStart,
 785                                  int                atomEnd,
 786                                  const int*         atinfo,
 787                                  gmx::ArrayRef<int> order,
 788                                  int*               flags)
 789 {
 790     constexpr int c_maxNumAtomsInCluster = 8;
 791     int           sort1[c_maxNumAtomsInCluster];
 792     int           sort2[c_maxNumAtomsInCluster];
 793
 794     GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster,
 795                "Need to increase c_maxNumAtomsInCluster to support larger clusters");
 796
 797     *flags = 0;
 798
 799     int subc = 0;
 800     for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
 801     {
 802         /* Make lists for this (sub-)cell on atoms with and without LJ */
 803         int      n1       = 0;
 804         int      n2       = 0;
 805         gmx_bool haveQ    = FALSE;
 806         int      a_lj_max = -1;
 807         for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
 808         {
 809             haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 810
 811             if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 812             {
 813                 sort1[n1++] = order[a];
 814                 a_lj_max    = a;
 815             }
 816             else
 817             {
 818                 sort2[n2++] = order[a];
 819             }
 820         }
 821
 822         /* If we don't have atoms with LJ, there's nothing to sort */
 823         if (n1 > 0)
 824         {
 825             *flags |= NBNXN_CI_DO_LJ(subc);
 826
 827             if (2 * n1 <= numAtomsInCluster)
 828             {
 829                 /* Only sort when strictly necessary. Ordering particles
 830                  * Ordering particles can lead to less accurate summation
 831                  * due to rounding, both for LJ and Coulomb interactions.
 832                  */
 833                 if (2 * (a_lj_max - s) >= numAtomsInCluster)
 834                 {
 835                     for (int i = 0; i < n1; i++)
 836                     {
 837                         order[atomStart + i] = sort1[i];
 838                     }
 839                     for (int j = 0; j < n2; j++)
 840                     {
 841                         order[atomStart + n1 + j] = sort2[j];
 842                     }
 843                 }
 844
 845                 *flags |= NBNXN_CI_HALF_LJ(subc);
 846             }
 847         }
 848         if (haveQ)
 849         {
 850             *flags |= NBNXN_CI_DO_COUL(subc);
 851         }
 852         subc++;
 853     }
 854 }
 855
 856 /*! \brief Fill a pair search cell with atoms.
 857  *
 858  * Potentially sorts atoms and sets the interaction flags.
 859  */
 860 void Grid::fillCell(GridSetData*                   gridSetData,
 861                     nbnxn_atomdata_t*              nbat,
 862                     int                            atomStart,
 863                     int                            atomEnd,
 864                     const int*                     atinfo,
 865                     gmx::ArrayRef<const gmx::RVec> x,
 866                     BoundingBox gmx_unused* bb_work_aligned)
 867 {
 868     const int numAtoms = atomEnd - atomStart;
 869
 870     const gmx::ArrayRef<int>& cells       = gridSetData->cells;
 871     const gmx::ArrayRef<int>& atomIndices = gridSetData->atomIndices;
 872
 873     if (geometry_.isSimple)
 874     {
 875         /* Note that non-local grids are already sorted.
 876          * Then sort_cluster_on_flag will only set the flags and the sorting
 877          * will not affect the atom order.
 878          */
 879         sort_cluster_on_flag(geometry_.numAtomsICluster, atomStart, atomEnd, atinfo, atomIndices,
 880                              flags_.data() + atomToCluster(atomStart) - cellOffset_);
 881     }
 882
 883     if (haveFep_)
 884     {
 885         /* Set the fep flag for perturbed atoms in this (sub-)cell */
 886
 887         /* The grid-local cluster/(sub-)cell index */
 888         int cell = atomToCluster(atomStart)
 889                    - cellOffset_ * (geometry_.isSimple ? 1 : c_gpuNumClusterPerCell);
 890         fep_[cell] = 0;
 891         for (int at = atomStart; at < atomEnd; at++)
 892         {
 893             if (atomIndices[at] >= 0 && GET_CGINFO_FEP(atinfo[atomIndices[at]]))
 894             {
 895                 fep_[cell] |= (1 << (at - atomStart));
 896             }
 897         }
 898     }
 899
 900     /* Now we have sorted the atoms, set the cell indices */
 901     for (int at = atomStart; at < atomEnd; at++)
 902     {
 903         cells[atomIndices[at]] = at;
 904     }
 905
 906     copy_rvec_to_nbat_real(atomIndices.data() + atomStart, numAtoms, geometry_.numAtomsICluster,
 907                            as_rvec_array(x.data()), nbat->XFormat, nbat->x().data(), atomStart);
 908
 909     if (nbat->XFormat == nbatX4)
 910     {
 911         /* Store the bounding boxes as xyz.xyz. */
 912         size_t       offset = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsICluster);
 913         BoundingBox* bb_ptr = bb_.data() + offset;
 914
 915 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
 916         if (2 * geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 917         {
 918             calc_bounding_box_x_x4_halves(numAtoms,
 919                                           nbat->x().data() + atom_to_x_index<c_packX4>(atomStart),
 920                                           bb_ptr, bbj_.data() + offset * 2);
 921         }
 922         else
 923 #endif
 924         {
 925             calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
 926         }
 927     }
 928     else if (nbat->XFormat == nbatX8)
 929     {
 930         /* Store the bounding boxes as xyz.xyz. */
 931         size_t       offset = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsICluster);
 932         BoundingBox* bb_ptr = bb_.data() + offset;
 933
 934         calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
 935     }
 936 #if NBNXN_BBXXXX
 937     else if (!geometry_.isSimple)
 938     {
 939         /* Store the bounding boxes in a format convenient
 940          * for SIMD4 calculations: xxxxyyyyzzzz...
 941          */
 942         const int clusterIndex = ((atomStart - cellOffset_ * geometry_.numAtomsPerCell)
 943                                   >> geometry_.numAtomsICluster2Log);
 944         float*    pbb_ptr      = pbb_.data() + packedBoundingBoxesIndex(clusterIndex)
 945                          + (clusterIndex & (c_packedBoundingBoxesDimSize - 1));
 946
 947 #    if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 948         if (nbat->XFormat == nbatXYZQ)
 949         {
 950             GMX_ASSERT(bb_work_aligned != nullptr, "Must have valid aligned work structure");
 951             calc_bounding_box_xxxx_simd4(numAtoms, nbat->x().data() + atomStart * nbat->xstride,
 952                                          bb_work_aligned, pbb_ptr);
 953         }
 954         else
 955 #    endif
 956         {
 957             calc_bounding_box_xxxx(numAtoms, nbat->xstride,
 958                                    nbat->x().data() + atomStart * nbat->xstride, pbb_ptr);
 959         }
 960         if (gmx_debug_at)
 961         {
 962             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 963                     atomToCluster(atomStart), pbb_ptr[0 * c_packedBoundingBoxesDimSize],
 964                     pbb_ptr[3 * c_packedBoundingBoxesDimSize], pbb_ptr[1 * c_packedBoundingBoxesDimSize],
 965                     pbb_ptr[4 * c_packedBoundingBoxesDimSize], pbb_ptr[2 * c_packedBoundingBoxesDimSize],
 966                     pbb_ptr[5 * c_packedBoundingBoxesDimSize]);
 967         }
 968     }
 969 #endif
 970     else
 971     {
 972         /* Store the bounding boxes as xyz.xyz. */
 973         BoundingBox* bb_ptr =
 974                 bb_.data() + atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsPerCell);
 975
 976         calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart * nbat->xstride, bb_ptr);
 977
 978         if (gmx_debug_at)
 979         {
 980             int bbo = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsPerCell);
 981             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 982                     atomToCluster(atomStart), bb_[bbo].lower.x, bb_[bbo].lower.y, bb_[bbo].lower.z,
 983                     bb_[bbo].upper.x, bb_[bbo].upper.y, bb_[bbo].upper.z);
 984         }
 985     }
 986 }
 987
 988 void Grid::sortColumnsCpuGeometry(GridSetData*                   gridSetData,
 989                                   int                            dd_zone,
 990                                   const int*                     atinfo,
 991                                   gmx::ArrayRef<const gmx::RVec> x,
 992                                   nbnxn_atomdata_t*              nbat,
 993                                   const gmx::Range<int>          columnRange,
 994                                   gmx::ArrayRef<int>             sort_work)
 995 {
 996     if (debug)
 997     {
 998         fprintf(debug, "cell_offset %d sorting columns %d - %d\n", cellOffset_,
 999                 *columnRange.begin(), *columnRange.end());
1000     }
1001
1002     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1003
1004     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1005
1006     /* Sort the atoms within each x,y column in 3 dimensions */
1007     for (int cxy : columnRange)
1008     {
1009         const int numAtoms   = numAtomsInColumn(cxy);
1010         const int numCellsZ  = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1011         const int atomOffset = firstAtomInColumn(cxy);
1012
1013         /* Sort the atoms within each x,y column on z coordinate */
1014         sort_atoms(ZZ, FALSE, dd_zone, relevantAtomsAreWithinGridBounds,
1015                    gridSetData->atomIndices.data() + atomOffset, numAtoms, x, dimensions_.lowerCorner[ZZ],
1016                    1.0 / dimensions_.gridSize[ZZ], numCellsZ * numAtomsPerCell, sort_work);
1017
1018         /* Fill the ncz cells in this column */
1019         const int firstCell  = firstCellInColumn(cxy);
1020         int       cellFilled = firstCell;
1021         for (int cellZ = 0; cellZ < numCellsZ; cellZ++)
1022         {
1023             const int cell = firstCell + cellZ;
1024
1025             const int atomOffsetCell = atomOffset + cellZ * numAtomsPerCell;
1026             const int numAtomsCell = std::min(numAtomsPerCell, numAtoms - (atomOffsetCell - atomOffset));
1027
1028             fillCell(gridSetData, nbat, atomOffsetCell, atomOffsetCell + numAtomsCell, atinfo, x, nullptr);
1029
1030             /* This copy to bbcz is not really necessary.
1031              * But it allows to use the same grid search code
1032              * for the simple and supersub cell setups.
1033              */
1034             if (numAtomsCell > 0)
1035             {
1036                 cellFilled = cell;
1037             }
1038             bbcz_[cell].lower = bb_[cellFilled].lower.z;
1039             bbcz_[cell].upper = bb_[cellFilled].upper.z;
1040         }
1041
1042         /* Set the unused atom indices to -1 */
1043         for (int ind = numAtoms; ind < numCellsZ * numAtomsPerCell; ind++)
1044         {
1045             gridSetData->atomIndices[atomOffset + ind] = -1;
1046         }
1047     }
1048 }
1049
1050 /* Spatially sort the atoms within one grid column */
1051 void Grid::sortColumnsGpuGeometry(GridSetData*                   gridSetData,
1052                                   int                            dd_zone,
1053                                   const int*                     atinfo,
1054                                   gmx::ArrayRef<const gmx::RVec> x,
1055                                   nbnxn_atomdata_t*              nbat,
1056                                   const gmx::Range<int>          columnRange,
1057                                   gmx::ArrayRef<int>             sort_work)
1058 {
1059     BoundingBox  bb_work_array[2];
1060     BoundingBox* bb_work_aligned = reinterpret_cast<BoundingBox*>(
1061             (reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1062
1063     if (debug)
1064     {
1065         fprintf(debug, "cell_offset %d sorting columns %d - %d\n", cellOffset_,
1066                 *columnRange.begin(), *columnRange.end());
1067     }
1068
1069     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1070
1071     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1072
1073     const int subdiv_x = geometry_.numAtomsICluster;
1074     const int subdiv_y = c_gpuNumClusterPerCellX * subdiv_x;
1075     const int subdiv_z = c_gpuNumClusterPerCellY * subdiv_y;
1076
1077     /* Extract the atom index array that will be filled here */
1078     const gmx::ArrayRef<int>& atomIndices = gridSetData->atomIndices;
1079
1080     /* Sort the atoms within each x,y column in 3 dimensions.
1081      * Loop over all columns on the x/y grid.
1082      */
1083     for (int cxy : columnRange)
1084     {
1085         const int gridX = cxy / dimensions_.numCells[YY];
1086         const int gridY = cxy - gridX * dimensions_.numCells[YY];
1087
1088         const int numAtomsInColumn = cxy_na_[cxy];
1089         const int numCellsInColumn = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1090         const int atomOffset       = firstAtomInColumn(cxy);
1091
1092         /* Sort the atoms within each x,y column on z coordinate */
1093         sort_atoms(ZZ, FALSE, dd_zone, relevantAtomsAreWithinGridBounds,
1094                    atomIndices.data() + atomOffset, numAtomsInColumn, x, dimensions_.lowerCorner[ZZ],
1095                    1.0 / dimensions_.gridSize[ZZ], numCellsInColumn * numAtomsPerCell, sort_work);
1096
1097         /* This loop goes over the cells and clusters along z at once */
1098         for (int sub_z = 0; sub_z < numCellsInColumn * c_gpuNumClusterPerCellZ; sub_z++)
1099         {
1100             const int atomOffsetZ = atomOffset + sub_z * subdiv_z;
1101             const int numAtomsZ = std::min(subdiv_z, numAtomsInColumn - (atomOffsetZ - atomOffset));
1102             int       cz        = -1;
1103             /* We have already sorted on z */
1104
1105             if (sub_z % c_gpuNumClusterPerCellZ == 0)
1106             {
1107                 cz             = sub_z / c_gpuNumClusterPerCellZ;
1108                 const int cell = cxy_ind_[cxy] + cz;
1109
1110                 /* The number of atoms in this cell/super-cluster */
1111                 const int numAtoms =
1112                         std::min(numAtomsPerCell, numAtomsInColumn - (atomOffsetZ - atomOffset));
1113
1114                 numClusters_[cell] =
1115                         std::min(c_gpuNumClusterPerCell, (numAtoms + geometry_.numAtomsICluster - 1)
1116                                                                  / geometry_.numAtomsICluster);
1117
1118                 /* Store the z-boundaries of the bounding box of the cell */
1119                 bbcz_[cell].lower = x[atomIndices[atomOffsetZ]][ZZ];
1120                 bbcz_[cell].upper = x[atomIndices[atomOffsetZ + numAtoms - 1]][ZZ];
1121             }
1122
1123             if (c_gpuNumClusterPerCellY > 1)
1124             {
1125                 /* Sort the atoms along y */
1126                 sort_atoms(YY, (sub_z & 1) != 0, dd_zone, relevantAtomsAreWithinGridBounds,
1127                            atomIndices.data() + atomOffsetZ, numAtomsZ, x,
1128                            dimensions_.lowerCorner[YY] + gridY * dimensions_.cellSize[YY],
1129                            dimensions_.invCellSize[YY], subdiv_z, sort_work);
1130             }
1131
1132             for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1133             {
1134                 const int atomOffsetY = atomOffsetZ + sub_y * subdiv_y;
1135                 const int numAtomsY = std::min(subdiv_y, numAtomsInColumn - (atomOffsetY - atomOffset));
1136
1137                 if (c_gpuNumClusterPerCellX > 1)
1138                 {
1139                     /* Sort the atoms along x */
1140                     sort_atoms(XX, ((cz * c_gpuNumClusterPerCellY + sub_y) & 1) != 0, dd_zone,
1141                                relevantAtomsAreWithinGridBounds, atomIndices.data() + atomOffsetY, numAtomsY,
1142                                x, dimensions_.lowerCorner[XX] + gridX * dimensions_.cellSize[XX],
1143                                dimensions_.invCellSize[XX], subdiv_y, sort_work);
1144                 }
1145
1146                 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1147                 {
1148                     const int atomOffsetX = atomOffsetY + sub_x * subdiv_x;
1149                     const int numAtomsX =
1150                             std::min(subdiv_x, numAtomsInColumn - (atomOffsetX - atomOffset));
1151
1152                     fillCell(gridSetData, nbat, atomOffsetX, atomOffsetX + numAtomsX, atinfo, x,
1153                              bb_work_aligned);
1154                 }
1155             }
1156         }
1157
1158         /* Set the unused atom indices to -1 */
1159         for (int ind = numAtomsInColumn; ind < numCellsInColumn * numAtomsPerCell; ind++)
1160         {
1161             atomIndices[atomOffset + ind] = -1;
1162         }
1163     }
1164 }
1165
1166 /*! \brief Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
1167 static void setCellAndAtomCount(gmx::ArrayRef<int> cell, int cellIndex, gmx::ArrayRef<int> cxy_na, int atomIndex)
1168 {
1169     cell[atomIndex] = cellIndex;
1170     cxy_na[cellIndex] += 1;
1171 }
1172
1173 void Grid::calcColumnIndices(const Grid::Dimensions&        gridDims,
1174                              const gmx::UpdateGroupsCog*    updateGroupsCog,
1175                              const gmx::Range<int>          atomRange,
1176                              gmx::ArrayRef<const gmx::RVec> x,
1177                              const int                      dd_zone,
1178                              const int*                     move,
1179                              const int                      thread,
1180                              const int                      nthread,
1181                              gmx::ArrayRef<int>             cell,
1182                              gmx::ArrayRef<int>             cxy_na)
1183 {
1184     const int numColumns = gridDims.numCells[XX] * gridDims.numCells[YY];
1185
1186     /* We add one extra cell for particles which moved during DD */
1187     for (int i = 0; i < numColumns; i++)
1188     {
1189         cxy_na[i] = 0;
1190     }
1191
1192     int taskAtomStart = *atomRange.begin() + static_cast<int>((thread + 0) * atomRange.size()) / nthread;
1193     int taskAtomEnd = *atomRange.begin() + static_cast<int>((thread + 1) * atomRange.size()) / nthread;
1194
1195     if (dd_zone == 0)
1196     {
1197         /* Home zone */
1198         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1199         {
1200             if (move == nullptr || move[i] >= 0)
1201             {
1202
1203                 const gmx::RVec& coord = (updateGroupsCog ? updateGroupsCog->cogForAtom(i) : x[i]);
1204
1205                 /* We need to be careful with rounding,
1206                  * particles might be a few bits outside the local zone.
1207                  * The int cast takes care of the lower bound,
1208                  * we will explicitly take care of the upper bound.
1209                  */
1210                 int cx = static_cast<int>((coord[XX] - gridDims.lowerCorner[XX])
1211                                           * gridDims.invCellSize[XX]);
1212                 int cy = static_cast<int>((coord[YY] - gridDims.lowerCorner[YY])
1213                                           * gridDims.invCellSize[YY]);
1214
1215 #ifndef NDEBUG
1216                 if (cx < 0 || cx > gridDims.numCells[XX] || cy < 0 || cy > gridDims.numCells[YY])
1217                 {
1218                     gmx_fatal(FARGS,
1219                               "grid cell cx %d cy %d out of range (max %d %d)\n"
1220                               "atom %f %f %f, grid->c0 %f %f",
1221                               cx, cy, gridDims.numCells[XX], gridDims.numCells[YY], x[i][XX],
1222                               x[i][YY], x[i][ZZ], gridDims.lowerCorner[XX], gridDims.lowerCorner[YY]);
1223                 }
1224 #endif
1225                 /* Take care of potential rouding issues */
1226                 cx = std::min(cx, gridDims.numCells[XX] - 1);
1227                 cy = std::min(cy, gridDims.numCells[YY] - 1);
1228
1229                 /* For the moment cell will contain only the, grid local,
1230                  * x and y indices, not z.
1231                  */
1232                 setCellAndAtomCount(cell, cx * gridDims.numCells[YY] + cy, cxy_na, i);
1233             }
1234             else
1235             {
1236                 /* Put this moved particle after the end of the grid,
1237                  * so we can process it later without using conditionals.
1238                  */
1239                 setCellAndAtomCount(cell, numColumns, cxy_na, i);
1240             }
1241         }
1242     }
1243     else
1244     {
1245         /* Non-home zone */
1246         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1247         {
1248             int cx = static_cast<int>((x[i][XX] - gridDims.lowerCorner[XX]) * gridDims.invCellSize[XX]);
1249             int cy = static_cast<int>((x[i][YY] - gridDims.lowerCorner[YY]) * gridDims.invCellSize[YY]);
1250
1251             /* For non-home zones there could be particles outside
1252              * the non-bonded cut-off range, which have been communicated
1253              * for bonded interactions only. For the result it doesn't
1254              * matter where these end up on the grid. For performance
1255              * we put them in an extra row at the border.
1256              */
1257             cx = std::max(cx, 0);
1258             cx = std::min(cx, gridDims.numCells[XX] - 1);
1259             cy = std::max(cy, 0);
1260             cy = std::min(cy, gridDims.numCells[YY] - 1);
1261
1262             /* For the moment cell will contain only the, grid local,
1263              * x and y indices, not z.
1264              */
1265             setCellAndAtomCount(cell, cx * gridDims.numCells[YY] + cy, cxy_na, i);
1266         }
1267     }
1268 }
1269
1270 /*! \brief Resizes grid and atom data which depend on the number of cells */
1271 static void resizeForNumberOfCells(const int         numNbnxnAtoms,
1272                                    const int         numAtomsMoved,
1273                                    GridSetData*      gridSetData,
1274                                    nbnxn_atomdata_t* nbat)
1275 {
1276     /* Note: gridSetData->cellIndices was already resized before */
1277
1278     /* To avoid conditionals we store the moved particles at the end of a,
1279      * make sure we have enough space.
1280      */
1281     gridSetData->atomIndices.resize(numNbnxnAtoms + numAtomsMoved);
1282
1283     /* Make space in nbat for storing the atom coordinates */
1284     nbat->resizeCoordinateBuffer(numNbnxnAtoms);
1285 }
1286
1287 void Grid::setCellIndices(int                            ddZone,
1288                           int                            cellOffset,
1289                           GridSetData*                   gridSetData,
1290                           gmx::ArrayRef<GridWork>        gridWork,
1291                           const gmx::Range<int>          atomRange,
1292                           const int*                     atinfo,
1293                           gmx::ArrayRef<const gmx::RVec> x,
1294                           const int                      numAtomsMoved,
1295                           nbnxn_atomdata_t*              nbat)
1296 {
1297     cellOffset_ = cellOffset;
1298
1299     srcAtomBegin_ = *atomRange.begin();
1300     srcAtomEnd_   = *atomRange.end();
1301
1302     const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1303
1304     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1305
1306     /* Make the cell index as a function of x and y */
1307     int ncz_max = 0;
1308     int ncz     = 0;
1309     cxy_ind_[0] = 0;
1310     for (int i = 0; i < numColumns() + 1; i++)
1311     {
1312         /* We set ncz_max at the beginning of the loop iso at the end
1313          * to skip i=grid->ncx*grid->numCells[YY] which are moved particles
1314          * that do not need to be ordered on the grid.
1315          */
1316         if (ncz > ncz_max)
1317         {
1318             ncz_max = ncz;
1319         }
1320         int cxy_na_i = gridWork[0].numAtomsPerColumn[i];
1321         for (int thread = 1; thread < nthread; thread++)
1322         {
1323             cxy_na_i += gridWork[thread].numAtomsPerColumn[i];
1324         }
1325         ncz = (cxy_na_i + numAtomsPerCell - 1) / numAtomsPerCell;
1326         if (nbat->XFormat == nbatX8)
1327         {
1328             /* Make the number of cell a multiple of 2 */
1329             ncz = (ncz + 1) & ~1;
1330         }
1331         cxy_ind_[i + 1] = cxy_ind_[i] + ncz;
1332         /* Clear cxy_na_, so we can reuse the array below */
1333         cxy_na_[i] = 0;
1334     }
1335     numCellsTotal_     = cxy_ind_[numColumns()] - cxy_ind_[0];
1336     numCellsColumnMax_ = ncz_max;
1337
1338     /* Resize grid and atom data which depend on the number of cells */
1339     resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, gridSetData, nbat);
1340
1341     if (debug)
1342     {
1343         fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1344                 numAtomsPerCell, geometry_.numAtomsICluster, numCellsTotal_, dimensions_.numCells[XX],
1345                 dimensions_.numCells[YY], numCellsTotal_ / (static_cast<double>(numColumns())), ncz_max);
1346         if (gmx_debug_at)
1347         {
1348             int i = 0;
1349             for (int cy = 0; cy < dimensions_.numCells[YY]; cy++)
1350             {
1351                 for (int cx = 0; cx < dimensions_.numCells[XX]; cx++)
1352                 {
1353                     fprintf(debug, " %2d", cxy_ind_[i + 1] - cxy_ind_[i]);
1354                     i++;
1355                 }
1356                 fprintf(debug, "\n");
1357             }
1358         }
1359     }
1360
1361     /* Make sure the work array for sorting is large enough */
1362     const int worstCaseSortBufferSize = ncz_max * numAtomsPerCell * c_sortGridMaxSizeFactor;
1363     if (worstCaseSortBufferSize > gmx::index(gridWork[0].sortBuffer.size()))
1364     {
1365         for (GridWork& work : gridWork)
1366         {
1367             /* Elements not in use should be -1 */
1368             work.sortBuffer.resize(worstCaseSortBufferSize, -1);
1369         }
1370     }
1371
1372     /* Now we know the dimensions we can fill the grid.
1373      * This is the first, unsorted fill. We sort the columns after this.
1374      */
1375     gmx::ArrayRef<int> cells       = gridSetData->cells;
1376     gmx::ArrayRef<int> atomIndices = gridSetData->atomIndices;
1377     for (int i : atomRange)
1378     {
1379         /* At this point nbs->cell contains the local grid x,y indices */
1380         const int cxy                                        = cells[i];
1381         atomIndices[firstAtomInColumn(cxy) + cxy_na_[cxy]++] = i;
1382     }
1383
1384     if (ddZone == 0)
1385     {
1386         /* Set the cell indices for the moved particles */
1387         int n0 = numCellsTotal_ * numAtomsPerCell;
1388         int n1 = numCellsTotal_ * numAtomsPerCell + cxy_na_[numColumns()];
1389         for (int i = n0; i < n1; i++)
1390         {
1391             cells[atomIndices[i]] = i;
1392         }
1393     }
1394
1395     /* Sort the super-cell columns along z into the sub-cells. */
1396 #pragma omp parallel for num_threads(nthread) schedule(static)
1397     for (int thread = 0; thread < nthread; thread++)
1398     {
1399         try
1400         {
1401             gmx::Range<int> columnRange(((thread + 0) * numColumns()) / nthread,
1402                                         ((thread + 1) * numColumns()) / nthread);
1403             if (geometry_.isSimple)
1404             {
1405                 sortColumnsCpuGeometry(gridSetData, ddZone, atinfo, x, nbat, columnRange,
1406                                        gridWork[thread].sortBuffer);
1407             }
1408             else
1409             {
1410                 sortColumnsGpuGeometry(gridSetData, ddZone, atinfo, x, nbat, columnRange,
1411                                        gridWork[thread].sortBuffer);
1412             }
1413         }
1414         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1415     }
1416
1417     if (geometry_.isSimple && nbat->XFormat == nbatX8)
1418     {
1419         combine_bounding_box_pairs(*this, bb_, bbj_);
1420     }
1421
1422     if (!geometry_.isSimple)
1423     {
1424         numClustersTotal_ = 0;
1425         for (int i = 0; i < numCellsTotal_; i++)
1426         {
1427             numClustersTotal_ += numClusters_[i];
1428         }
1429     }
1430
1431     if (debug)
1432     {
1433         if (geometry_.isSimple)
1434         {
1435             print_bbsizes_simple(debug, *this);
1436         }
1437         else
1438         {
1439             fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n", numClustersTotal_,
1440                     atomRange.size() / static_cast<double>(numClustersTotal_));
1441
1442             print_bbsizes_supersub(debug, *this);
1443         }
1444     }
1445 }
1446
1447 } // namespace Nbnxm