src/gromacs/nbnxm/grid.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 /*! \internal \file
  38  *
  39  * \brief
  40  * Implements the Grid class.
  41  *
  42  * \author Berk Hess <hess@kth.se>
  43  * \ingroup module_nbnxm
  44  */
  45
  46 #include "gmxpre.h"
  47
  48 #include "grid.h"
  49
  50 #include <cmath>
  51 #include <cstring>
  52
  53 #include <algorithm>
  54
  55 #include "gromacs/math/utilities.h"
  56 #include "gromacs/math/vec.h"
  57 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  58 #include "gromacs/mdlib/updategroupscog.h"
  59 #include "gromacs/mdtypes/atominfo.h"
  60 #include "gromacs/nbnxm/atomdata.h"
  61 #include "gromacs/simd/simd.h"
  62 #include "gromacs/simd/vector_operations.h"
  63
  64 #include "boundingboxes.h"
  65 #include "gridsetdata.h"
  66 #include "nbnxm_geometry.h"
  67 #include "pairlistparams.h"
  68
  69 namespace Nbnxm
  70 {
  71
  72 Grid::Geometry::Geometry(const PairlistType pairlistType) :
  73     isSimple(pairlistType != PairlistType::HierarchicalNxN),
  74     numAtomsICluster(IClusterSizePerListType[pairlistType]),
  75     numAtomsJCluster(JClusterSizePerListType[pairlistType]),
  76     numAtomsPerCell((isSimple ? 1 : c_gpuNumClusterPerCell) * numAtomsICluster),
  77     numAtomsICluster2Log(get_2log(numAtomsICluster))
  78 {
  79 }
  80
  81 Grid::Grid(const PairlistType pairlistType, const bool& haveFep) :
  82     geometry_(pairlistType), haveFep_(haveFep)
  83 {
  84 }
  85
  86 /*! \brief Returns the atom density (> 0) of a rectangular grid */
  87 static real gridAtomDensity(int numAtoms, const rvec lowerCorner, const rvec upperCorner)
  88 {
  89     rvec size;
  90
  91     if (numAtoms == 0)
  92     {
  93         /* To avoid zero density we use a minimum of 1 atom */
  94         numAtoms = 1;
  95     }
  96
  97     rvec_sub(upperCorner, lowerCorner, size);
  98
  99     return static_cast<real>(numAtoms) / (size[XX] * size[YY] * size[ZZ]);
 100 }
 101
 102 // Get approximate dimensions of each cell. Returns the length along X and Y.
 103 static std::array<real, DIM - 1> getTargetCellLength(const Grid::Geometry& geometry, const real atomDensity)
 104 {
 105     if (geometry.isSimple)
 106     {
 107         /* To minimize the zero interactions, we should make
 108          * the largest of the i/j cell cubic.
 109          */
 110         int numAtomsInCell = std::max(geometry.numAtomsICluster, geometry.numAtomsJCluster);
 111
 112         /* Approximately cubic cells */
 113         real tlen = std::cbrt(numAtomsInCell / atomDensity);
 114         return { tlen, tlen };
 115     }
 116     else
 117     {
 118         /* Approximately cubic sub cells */
 119         real tlen = std::cbrt(geometry.numAtomsICluster / atomDensity);
 120         return { tlen * c_gpuNumClusterPerCellX, tlen * c_gpuNumClusterPerCellY };
 121     }
 122 }
 123
 124 static int getMaxNumCells(const Grid::Geometry& geometry, const int numAtoms, const int numColumns)
 125 {
 126     if (geometry.numAtomsJCluster <= geometry.numAtomsICluster)
 127     {
 128         return numAtoms / geometry.numAtomsPerCell + numColumns;
 129     }
 130     else
 131     {
 132         return numAtoms / geometry.numAtomsPerCell
 133                + numColumns * geometry.numAtomsJCluster / geometry.numAtomsICluster;
 134     }
 135 }
 136
 137 void Grid::setDimensions(const int          ddZone,
 138                          const int          numAtoms,
 139                          gmx::RVec          lowerCorner,
 140                          gmx::RVec          upperCorner,
 141                          real               atomDensity,
 142                          const real         maxAtomGroupRadius,
 143                          const bool         haveFep,
 144                          gmx::PinningPolicy pinningPolicy)
 145 {
 146     /* We allow passing lowerCorner=upperCorner, in which case we need to
 147      * create a finite sized bounding box to avoid division by zero.
 148      * We use a minimum size such that the volume fits in float with some
 149      * margin for computing and using the atom number density.
 150      */
 151     constexpr real c_minimumGridSize = 1e-10;
 152     for (int d = 0; d < DIM; d++)
 153     {
 154         GMX_ASSERT(upperCorner[d] >= lowerCorner[d],
 155                    "Upper corner should be larger than the lower corner");
 156         if (upperCorner[d] - lowerCorner[d] < c_minimumGridSize)
 157         {
 158             /* Ensure we apply a correction to the bounding box */
 159             real correction =
 160                     std::max(std::abs(lowerCorner[d]) * GMX_REAL_EPS, 0.5_real * c_minimumGridSize);
 161             lowerCorner[d] -= correction;
 162             upperCorner[d] += correction;
 163         }
 164     }
 165
 166     /* For the home zone we compute the density when not set (=-1) or when =0 */
 167     if (ddZone == 0 && atomDensity <= 0)
 168     {
 169         atomDensity = gridAtomDensity(numAtoms, lowerCorner, upperCorner);
 170     }
 171
 172     dimensions_.atomDensity        = atomDensity;
 173     dimensions_.maxAtomGroupRadius = maxAtomGroupRadius;
 174
 175     rvec size;
 176     rvec_sub(upperCorner, lowerCorner, size);
 177
 178     if (numAtoms > geometry_.numAtomsPerCell)
 179     {
 180         GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
 181
 182         /* target cell length */
 183         const std::array<real, DIM - 1> tlen = getTargetCellLength(geometry_, atomDensity);
 184
 185         /* We round ncx and ncy down, because we get less cell pairs
 186          * in the pairlist when the fixed cell dimensions (x,y) are
 187          * larger than the variable one (z) than the other way around.
 188          */
 189         dimensions_.numCells[XX] = std::max(1, static_cast<int>(size[XX] / tlen[XX]));
 190         dimensions_.numCells[YY] = std::max(1, static_cast<int>(size[YY] / tlen[YY]));
 191     }
 192     else
 193     {
 194         dimensions_.numCells[XX] = 1;
 195         dimensions_.numCells[YY] = 1;
 196     }
 197
 198     for (int d = 0; d < DIM - 1; d++)
 199     {
 200         dimensions_.cellSize[d]    = size[d] / dimensions_.numCells[d];
 201         dimensions_.invCellSize[d] = 1 / dimensions_.cellSize[d];
 202     }
 203
 204     if (ddZone > 0)
 205     {
 206         /* This is a non-home zone, add an extra row of cells
 207          * for particles communicated for bonded interactions.
 208          * These can be beyond the cut-off. It doesn't matter where
 209          * they end up on the grid, but for performance it's better
 210          * if they don't end up in cells that can be within cut-off range.
 211          */
 212         dimensions_.numCells[XX]++;
 213         dimensions_.numCells[YY]++;
 214     }
 215
 216     /* We need one additional cell entry for particles moved by DD */
 217     cxy_na_.resize(numColumns() + 1);
 218     cxy_ind_.resize(numColumns() + 2);
 219     changePinningPolicy(&cxy_na_, pinningPolicy);
 220     changePinningPolicy(&cxy_ind_, pinningPolicy);
 221
 222     /* Worst case scenario of 1 atom in each last cell */
 223     const int maxNumCells = getMaxNumCells(geometry_, numAtoms, numColumns());
 224
 225     if (!geometry_.isSimple)
 226     {
 227         numClusters_.resize(maxNumCells);
 228     }
 229     bbcz_.resize(maxNumCells);
 230
 231     /* This resize also zeros the contents, this avoid possible
 232      * floating exceptions in SIMD with the unused bb elements.
 233      */
 234     if (geometry_.isSimple)
 235     {
 236         bb_.resize(maxNumCells);
 237     }
 238     else
 239     {
 240 #if NBNXN_BBXXXX
 241         pbb_.resize(packedBoundingBoxesIndex(maxNumCells * c_gpuNumClusterPerCell));
 242 #else
 243         bb_.resize(maxNumCells * c_gpuNumClusterPerCell);
 244 #endif
 245     }
 246
 247     if (geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 248     {
 249         bbj_ = bb_;
 250     }
 251     else
 252     {
 253         GMX_ASSERT(geometry_.isSimple, "Only CPU lists should have different i/j cluster sizes");
 254
 255         bbjStorage_.resize(maxNumCells * geometry_.numAtomsICluster / geometry_.numAtomsJCluster);
 256         bbj_ = bbjStorage_;
 257     }
 258
 259     flags_.resize(maxNumCells);
 260     if (haveFep)
 261     {
 262         fep_.resize(maxNumCells * geometry_.numAtomsPerCell / geometry_.numAtomsICluster);
 263     }
 264
 265     copy_rvec(lowerCorner, dimensions_.lowerCorner);
 266     copy_rvec(upperCorner, dimensions_.upperCorner);
 267     copy_rvec(size, dimensions_.gridSize);
 268 }
 269
 270 /* We need to sort paricles in grid columns on z-coordinate.
 271  * As particle are very often distributed homogeneously, we use a sorting
 272  * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 273  * by a factor, cast to an int and try to store in that hole. If the hole
 274  * is full, we move this or another particle. A second pass is needed to make
 275  * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 276  * 4 is the optimal value for homogeneous particle distribution and allows
 277  * for an O(#particles) sort up till distributions were all particles are
 278  * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 279  * as it can be expensive to detect imhomogeneous particle distributions.
 280  */
 281 /*! \brief Ratio of grid cells to atoms */
 282 static constexpr int c_sortGridRatio = 4;
 283 /*! \brief Maximum ratio of holes used, in the worst case all particles
 284  * end up in the last hole and we need num. atoms extra holes at the end.
 285  */
 286 static constexpr int c_sortGridMaxSizeFactor = c_sortGridRatio + 1;
 287
 288 /*! \brief Sorts particle index a on coordinates x along dim.
 289  *
 290  * Backwards tells if we want decreasing iso increasing coordinates.
 291  * h0 is the minimum of the coordinate range.
 292  * invh is the 1/length of the sorting range.
 293  * n_per_h (>=n) is the expected average number of particles per 1/invh
 294  * sort is the sorting work array.
 295  * sort should have a size of at least n_per_h*c_sortGridRatio + n,
 296  * or easier, allocate at least n*c_sortGridMaxSizeFactor elements.
 297  */
 298 static void sort_atoms(int                            dim,
 299                        gmx_bool                       Backwards,
 300                        int gmx_unused                 dd_zone,
 301                        bool gmx_unused                relevantAtomsAreWithinGridBounds,
 302                        int*                           a,
 303                        int                            n,
 304                        gmx::ArrayRef<const gmx::RVec> x,
 305                        real                           h0,
 306                        real                           invh,
 307                        int                            n_per_h,
 308                        gmx::ArrayRef<int>             sort)
 309 {
 310     if (n <= 1)
 311     {
 312         /* Nothing to do */
 313         return;
 314     }
 315
 316     GMX_ASSERT(n <= n_per_h, "We require n <= n_per_h");
 317
 318     /* Transform the inverse range height into the inverse hole height */
 319     invh *= n_per_h * c_sortGridRatio;
 320
 321     /* Set nsort to the maximum possible number of holes used.
 322      * In worst case all n elements end up in the last bin.
 323      */
 324     int nsort = n_per_h * c_sortGridRatio + n;
 325
 326     /* Determine the index range used, so we can limit it for the second pass */
 327     int zi_min = INT_MAX;
 328     int zi_max = -1;
 329
 330     /* Sort the particles using a simple index sort */
 331     for (int i = 0; i < n; i++)
 332     {
 333         /* The cast takes care of float-point rounding effects below zero.
 334          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 335          * times the box height out of the box.
 336          */
 337         int zi = static_cast<int>((x[a[i]][dim] - h0) * invh);
 338
 339 #ifndef NDEBUG
 340         /* As we can have rounding effect, we use > iso >= here */
 341         if (relevantAtomsAreWithinGridBounds && (zi < 0 || (dd_zone == 0 && zi > n_per_h * c_sortGridRatio)))
 342         {
 343             gmx_fatal(FARGS,
 344                       "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
 345                       a[i],
 346                       'x' + dim,
 347                       x[a[i]][dim],
 348                       h0,
 349                       invh,
 350                       zi,
 351                       n_per_h,
 352                       c_sortGridRatio);
 353         }
 354 #endif
 355         if (zi < 0)
 356         {
 357             zi = 0;
 358         }
 359
 360         /* In a non-local domain, particles communicated for bonded interactions
 361          * can be far beyond the grid size, which is set by the non-bonded
 362          * cut-off distance. We sort such particles into the last cell.
 363          */
 364         if (zi > n_per_h * c_sortGridRatio)
 365         {
 366             zi = n_per_h * c_sortGridRatio;
 367         }
 368
 369         /* Ideally this particle should go in sort cell zi,
 370          * but that might already be in use,
 371          * in that case find the first empty cell higher up
 372          */
 373         if (sort[zi] < 0)
 374         {
 375             sort[zi] = a[i];
 376             zi_min   = std::min(zi_min, zi);
 377             zi_max   = std::max(zi_max, zi);
 378         }
 379         else
 380         {
 381             /* We have multiple atoms in the same sorting slot.
 382              * Sort on real z for minimal bounding box size.
 383              * There is an extra check for identical z to ensure
 384              * well-defined output order, independent of input order
 385              * to ensure binary reproducibility after restarts.
 386              */
 387             while (sort[zi] >= 0
 388                    && (x[a[i]][dim] > x[sort[zi]][dim]
 389                        || (x[a[i]][dim] == x[sort[zi]][dim] && a[i] > sort[zi])))
 390             {
 391                 zi++;
 392             }
 393
 394             if (sort[zi] >= 0)
 395             {
 396                 /* Shift all elements by one slot until we find an empty slot */
 397                 int cp  = sort[zi];
 398                 int zim = zi + 1;
 399                 while (sort[zim] >= 0)
 400                 {
 401                     int tmp   = sort[zim];
 402                     sort[zim] = cp;
 403                     cp        = tmp;
 404                     zim++;
 405                 }
 406                 sort[zim] = cp;
 407                 zi_max    = std::max(zi_max, zim);
 408             }
 409             sort[zi] = a[i];
 410             zi_max   = std::max(zi_max, zi);
 411         }
 412     }
 413
 414     int c = 0;
 415     if (!Backwards)
 416     {
 417         for (int zi = 0; zi < nsort; zi++)
 418         {
 419             if (sort[zi] >= 0)
 420             {
 421                 a[c++]   = sort[zi];
 422                 sort[zi] = -1;
 423             }
 424         }
 425     }
 426     else
 427     {
 428         for (int zi = zi_max; zi >= zi_min; zi--)
 429         {
 430             if (sort[zi] >= 0)
 431             {
 432                 a[c++]   = sort[zi];
 433                 sort[zi] = -1;
 434             }
 435         }
 436     }
 437     if (c < n)
 438     {
 439         gmx_incons("Lost particles while sorting");
 440     }
 441 }
 442
 443 #if GMX_DOUBLE
 444 //! Returns double up to one least significant float bit smaller than x
 445 static double R2F_D(const float x)
 446 {
 447     return static_cast<float>(x >= 0 ? (1 - GMX_FLOAT_EPS) * x : (1 + GMX_FLOAT_EPS) * x);
 448 }
 449 //! Returns double up to one least significant float bit larger than x
 450 static double R2F_U(const float x)
 451 {
 452     return static_cast<float>(x >= 0 ? (1 + GMX_FLOAT_EPS) * x : (1 - GMX_FLOAT_EPS) * x);
 453 }
 454 #else
 455 //! Returns x
 456 static float R2F_D(const float x)
 457 {
 458     return x;
 459 }
 460 //! Returns x
 461 static float R2F_U(const float x)
 462 {
 463     return x;
 464 }
 465 #endif
 466
 467 //! Computes the bounding box for na coordinates in order x,y,z, bb order xyz0
 468 static void calc_bounding_box(int na, int stride, const real* x, BoundingBox* bb)
 469 {
 470     int  i  = 0;
 471     real xl = x[i + XX];
 472     real xh = x[i + XX];
 473     real yl = x[i + YY];
 474     real yh = x[i + YY];
 475     real zl = x[i + ZZ];
 476     real zh = x[i + ZZ];
 477     i += stride;
 478     for (int j = 1; j < na; j++)
 479     {
 480         xl = std::min(xl, x[i + XX]);
 481         xh = std::max(xh, x[i + XX]);
 482         yl = std::min(yl, x[i + YY]);
 483         yh = std::max(yh, x[i + YY]);
 484         zl = std::min(zl, x[i + ZZ]);
 485         zh = std::max(zh, x[i + ZZ]);
 486         i += stride;
 487     }
 488     /* Note: possible double to float conversion here */
 489     bb->lower.x = R2F_D(xl);
 490     bb->lower.y = R2F_D(yl);
 491     bb->lower.z = R2F_D(zl);
 492     bb->upper.x = R2F_U(xh);
 493     bb->upper.y = R2F_U(yh);
 494     bb->upper.z = R2F_U(zh);
 495 }
 496
 497 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 498 static void calc_bounding_box_x_x4(int na, const real* x, BoundingBox* bb)
 499 {
 500     real xl = x[XX * c_packX4];
 501     real xh = x[XX * c_packX4];
 502     real yl = x[YY * c_packX4];
 503     real yh = x[YY * c_packX4];
 504     real zl = x[ZZ * c_packX4];
 505     real zh = x[ZZ * c_packX4];
 506     for (int j = 1; j < na; j++)
 507     {
 508         xl = std::min(xl, x[j + XX * c_packX4]);
 509         xh = std::max(xh, x[j + XX * c_packX4]);
 510         yl = std::min(yl, x[j + YY * c_packX4]);
 511         yh = std::max(yh, x[j + YY * c_packX4]);
 512         zl = std::min(zl, x[j + ZZ * c_packX4]);
 513         zh = std::max(zh, x[j + ZZ * c_packX4]);
 514     }
 515     /* Note: possible double to float conversion here */
 516     bb->lower.x = R2F_D(xl);
 517     bb->lower.y = R2F_D(yl);
 518     bb->lower.z = R2F_D(zl);
 519     bb->upper.x = R2F_U(xh);
 520     bb->upper.y = R2F_U(yh);
 521     bb->upper.z = R2F_U(zh);
 522 }
 523
 524 /*! \brief Computes the bounding box for na coordinates, bb order xyz0 */
 525 static void calc_bounding_box_x_x8(int na, const real* x, BoundingBox* bb)
 526 {
 527     real xl = x[XX * c_packX8];
 528     real xh = x[XX * c_packX8];
 529     real yl = x[YY * c_packX8];
 530     real yh = x[YY * c_packX8];
 531     real zl = x[ZZ * c_packX8];
 532     real zh = x[ZZ * c_packX8];
 533     for (int j = 1; j < na; j++)
 534     {
 535         xl = std::min(xl, x[j + XX * c_packX8]);
 536         xh = std::max(xh, x[j + XX * c_packX8]);
 537         yl = std::min(yl, x[j + YY * c_packX8]);
 538         yh = std::max(yh, x[j + YY * c_packX8]);
 539         zl = std::min(zl, x[j + ZZ * c_packX8]);
 540         zh = std::max(zh, x[j + ZZ * c_packX8]);
 541     }
 542     /* Note: possible double to float conversion here */
 543     bb->lower.x = R2F_D(xl);
 544     bb->lower.y = R2F_D(yl);
 545     bb->lower.z = R2F_D(zl);
 546     bb->upper.x = R2F_U(xh);
 547     bb->upper.y = R2F_U(yh);
 548     bb->upper.z = R2F_U(zh);
 549 }
 550
 551 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 552 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real* x, BoundingBox* bb, BoundingBox* bbj)
 553 {
 554     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 555     using namespace gmx;
 556
 557     calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
 558
 559     if (na > 2)
 560     {
 561         calc_bounding_box_x_x4(std::min(na - 2, 2), x + (c_packX4 >> 1), bbj + 1);
 562     }
 563     else
 564     {
 565         /* Set the "empty" bounding box to the same as the first one,
 566          * so we don't need to treat special cases in the rest of the code.
 567          */
 568 #if NBNXN_SEARCH_BB_SIMD4
 569         store4(bbj[1].lower.ptr(), load4(bbj[0].lower.ptr()));
 570         store4(bbj[1].upper.ptr(), load4(bbj[0].upper.ptr()));
 571 #else
 572         bbj[1] = bbj[0];
 573 #endif
 574     }
 575
 576 #if NBNXN_SEARCH_BB_SIMD4
 577     store4(bb->lower.ptr(), min(load4(bbj[0].lower.ptr()), load4(bbj[1].lower.ptr())));
 578     store4(bb->upper.ptr(), max(load4(bbj[0].upper.ptr()), load4(bbj[1].upper.ptr())));
 579 #else
 580     {
 581         bb->lower = BoundingBox::Corner::min(bbj[0].lower, bbj[1].lower);
 582         bb->upper = BoundingBox::Corner::max(bbj[0].upper, bbj[1].upper);
 583     }
 584 #endif
 585 }
 586
 587 #if NBNXN_BBXXXX
 588
 589 /*! \brief Computes the bounding box for na coordinates in order xyz, bb order xxxxyyyyzzzz */
 590 static void calc_bounding_box_xxxx(int na, int stride, const real* x, float* bb)
 591 {
 592     int  i  = 0;
 593     real xl = x[i + XX];
 594     real xh = x[i + XX];
 595     real yl = x[i + YY];
 596     real yh = x[i + YY];
 597     real zl = x[i + ZZ];
 598     real zh = x[i + ZZ];
 599     i += stride;
 600     for (int j = 1; j < na; j++)
 601     {
 602         xl = std::min(xl, x[i + XX]);
 603         xh = std::max(xh, x[i + XX]);
 604         yl = std::min(yl, x[i + YY]);
 605         yh = std::max(yh, x[i + YY]);
 606         zl = std::min(zl, x[i + ZZ]);
 607         zh = std::max(zh, x[i + ZZ]);
 608         i += stride;
 609     }
 610     /* Note: possible double to float conversion here */
 611     bb[0 * c_packedBoundingBoxesDimSize] = R2F_D(xl);
 612     bb[1 * c_packedBoundingBoxesDimSize] = R2F_D(yl);
 613     bb[2 * c_packedBoundingBoxesDimSize] = R2F_D(zl);
 614     bb[3 * c_packedBoundingBoxesDimSize] = R2F_U(xh);
 615     bb[4 * c_packedBoundingBoxesDimSize] = R2F_U(yh);
 616     bb[5 * c_packedBoundingBoxesDimSize] = R2F_U(zh);
 617 }
 618
 619 #endif /* NBNXN_BBXXXX */
 620
 621 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 622
 623 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xyz0 */
 624 static void calc_bounding_box_simd4(int na, const float* x, BoundingBox* bb)
 625 {
 626     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 627     using namespace gmx;
 628
 629     static_assert(sizeof(BoundingBox::Corner) == GMX_SIMD4_WIDTH * sizeof(float),
 630                   "The Corner struct should hold exactly 4 floats");
 631
 632     Simd4Float bb_0_S = load4(x);
 633     Simd4Float bb_1_S = bb_0_S;
 634
 635     for (int i = 1; i < na; i++)
 636     {
 637         Simd4Float x_S = load4(x + i * GMX_SIMD4_WIDTH);
 638         bb_0_S         = min(bb_0_S, x_S);
 639         bb_1_S         = max(bb_1_S, x_S);
 640     }
 641
 642     store4(bb->lower.ptr(), bb_0_S);
 643     store4(bb->upper.ptr(), bb_1_S);
 644 }
 645
 646 #    if NBNXN_BBXXXX
 647
 648 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xxxxyyyyzzzz */
 649 static void calc_bounding_box_xxxx_simd4(int na, const float* x, BoundingBox* bb_work_aligned, real* bb)
 650 {
 651     calc_bounding_box_simd4(na, x, bb_work_aligned);
 652
 653     bb[0 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.x;
 654     bb[1 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.y;
 655     bb[2 * c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.z;
 656     bb[3 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.x;
 657     bb[4 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.y;
 658     bb[5 * c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.z;
 659 }
 660
 661 #    endif /* NBNXN_BBXXXX */
 662
 663 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
 664
 665
 666 /*! \brief Combines pairs of consecutive bounding boxes */
 667 static void combine_bounding_box_pairs(const Grid&                      grid,
 668                                        gmx::ArrayRef<const BoundingBox> bb,
 669                                        gmx::ArrayRef<BoundingBox>       bbj)
 670 {
 671     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 672     using namespace gmx;
 673
 674     for (int i = 0; i < grid.numColumns(); i++)
 675     {
 676         /* Starting bb in a column is expected to be 2-aligned */
 677         const int sc2 = grid.firstCellInColumn(i) >> 1;
 678         /* For odd numbers skip the last bb here */
 679         const int nc2 = (grid.numAtomsInColumn(i) + 3) >> (2 + 1);
 680         for (int c2 = sc2; c2 < sc2 + nc2; c2++)
 681         {
 682 #if NBNXN_SEARCH_BB_SIMD4
 683             Simd4Float min_S, max_S;
 684
 685             min_S = min(load4(bb[c2 * 2 + 0].lower.ptr()), load4(bb[c2 * 2 + 1].lower.ptr()));
 686             max_S = max(load4(bb[c2 * 2 + 0].upper.ptr()), load4(bb[c2 * 2 + 1].upper.ptr()));
 687             store4(bbj[c2].lower.ptr(), min_S);
 688             store4(bbj[c2].upper.ptr(), max_S);
 689 #else
 690             bbj[c2].lower = BoundingBox::Corner::min(bb[c2 * 2 + 0].lower, bb[c2 * 2 + 1].lower);
 691             bbj[c2].upper = BoundingBox::Corner::max(bb[c2 * 2 + 0].upper, bb[c2 * 2 + 1].upper);
 692 #endif
 693         }
 694         if (((grid.numAtomsInColumn(i) + 3) >> 2) & 1)
 695         {
 696             /* The bb count in this column is odd: duplicate the last bb */
 697             int c2        = sc2 + nc2;
 698             bbj[c2].lower = bb[c2 * 2].lower;
 699             bbj[c2].upper = bb[c2 * 2].upper;
 700         }
 701     }
 702 }
 703
 704
 705 /*! \brief Prints the average bb size, used for debug output */
 706 static void print_bbsizes_simple(FILE* fp, const Grid& grid)
 707 {
 708     dvec ba = { 0 };
 709     for (int c = 0; c < grid.numCells(); c++)
 710     {
 711         const BoundingBox& bb = grid.iBoundingBoxes()[c];
 712         ba[XX] += bb.upper.x - bb.lower.x;
 713         ba[YY] += bb.upper.y - bb.lower.y;
 714         ba[ZZ] += bb.upper.z - bb.lower.z;
 715     }
 716     if (grid.numCells() > 0)
 717     {
 718         dsvmul(1.0 / grid.numCells(), ba, ba);
 719     }
 720
 721     const Grid::Dimensions& dims = grid.dimensions();
 722     real                    avgCellSizeZ =
 723             (dims.atomDensity > 0 ? grid.geometry().numAtomsICluster
 724                                             / (dims.atomDensity * dims.cellSize[XX] * dims.cellSize[YY])
 725                                   : 0.0);
 726
 727     fprintf(fp,
 728             "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 729             dims.cellSize[XX],
 730             dims.cellSize[YY],
 731             avgCellSizeZ,
 732             ba[XX],
 733             ba[YY],
 734             ba[ZZ],
 735             ba[XX] * dims.invCellSize[XX],
 736             ba[YY] * dims.invCellSize[YY],
 737             dims.atomDensity > 0 ? ba[ZZ] / avgCellSizeZ : 0.0);
 738 }
 739
 740 /*! \brief Prints the average bb size, used for debug output */
 741 static void print_bbsizes_supersub(FILE* fp, const Grid& grid)
 742 {
 743     dvec ba;
 744
 745     clear_dvec(ba);
 746     int ns = 0;
 747     for (int c = 0; c < grid.numCells(); c++)
 748     {
 749 #if NBNXN_BBXXXX
 750         for (int s = 0; s < grid.numClustersPerCell()[c]; s += c_packedBoundingBoxesDimSize)
 751         {
 752             int  cs_w          = (c * c_gpuNumClusterPerCell + s) / c_packedBoundingBoxesDimSize;
 753             auto boundingBoxes = grid.packedBoundingBoxes().subArray(
 754                     cs_w * c_packedBoundingBoxesSize, c_packedBoundingBoxesSize);
 755             for (int i = 0; i < c_packedBoundingBoxesDimSize; i++)
 756             {
 757                 for (int d = 0; d < DIM; d++)
 758                 {
 759                     ba[d] += boundingBoxes[(DIM + d) * c_packedBoundingBoxesDimSize + i]
 760                              - boundingBoxes[(0 + d) * c_packedBoundingBoxesDimSize + i];
 761                 }
 762             }
 763         }
 764 #else
 765         for (int s = 0; s < grid.numClustersPerCell()[c]; s++)
 766         {
 767             const BoundingBox& bb = grid.iBoundingBoxes()[c * c_gpuNumClusterPerCell + s];
 768             ba[XX] += bb.upper.x - bb.lower.x;
 769             ba[YY] += bb.upper.y - bb.lower.y;
 770             ba[ZZ] += bb.upper.z - bb.lower.z;
 771         }
 772 #endif
 773         ns += grid.numClustersPerCell()[c];
 774     }
 775     dsvmul(1.0 / ns, ba, ba);
 776
 777     const Grid::Dimensions& dims = grid.dimensions();
 778     const real              avgClusterSizeZ =
 779             (dims.atomDensity > 0 ? grid.geometry().numAtomsPerCell
 780                                             / (dims.atomDensity * dims.cellSize[XX]
 781                                                * dims.cellSize[YY] * c_gpuNumClusterPerCellZ)
 782                                   : 0.0);
 783
 784     fprintf(fp,
 785             "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 786             dims.cellSize[XX] / c_gpuNumClusterPerCellX,
 787             dims.cellSize[YY] / c_gpuNumClusterPerCellY,
 788             avgClusterSizeZ,
 789             ba[XX],
 790             ba[YY],
 791             ba[ZZ],
 792             ba[XX] * c_gpuNumClusterPerCellX * dims.invCellSize[XX],
 793             ba[YY] * c_gpuNumClusterPerCellY * dims.invCellSize[YY],
 794             dims.atomDensity > 0 ? ba[ZZ] / avgClusterSizeZ : 0.0);
 795 }
 796
 797 /*!\brief Set non-bonded interaction flags for the current cluster.
 798  *
 799  * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 800  */
 801 static void sort_cluster_on_flag(int                      numAtomsInCluster,
 802                                  int                      atomStart,
 803                                  int                      atomEnd,
 804                                  gmx::ArrayRef<const int> atomInfo,
 805                                  gmx::ArrayRef<int>       order,
 806                                  int*                     flags)
 807 {
 808     constexpr int c_maxNumAtomsInCluster = 8;
 809     int           sort1[c_maxNumAtomsInCluster];
 810     int           sort2[c_maxNumAtomsInCluster];
 811
 812     GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster,
 813                "Need to increase c_maxNumAtomsInCluster to support larger clusters");
 814
 815     *flags = 0;
 816
 817     int subc = 0;
 818     for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
 819     {
 820         /* Make lists for this (sub-)cell on atoms with and without LJ */
 821         int  n1       = 0;
 822         int  n2       = 0;
 823         bool haveQ    = false;
 824         int  a_lj_max = -1;
 825         for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
 826         {
 827             haveQ = haveQ || bool(atomInfo[order[a]] & gmx::sc_atomInfo_HasCharge);
 828
 829             if (atomInfo[order[a]] & (gmx::sc_atomInfo_HasVdw))
 830             {
 831                 sort1[n1++] = order[a];
 832                 a_lj_max    = a;
 833             }
 834             else
 835             {
 836                 sort2[n2++] = order[a];
 837             }
 838         }
 839
 840         /* If we don't have atoms with LJ, there's nothing to sort */
 841         if (n1 > 0)
 842         {
 843             *flags |= NBNXN_CI_DO_LJ(subc);
 844
 845             if (2 * n1 <= numAtomsInCluster)
 846             {
 847                 /* Only sort when strictly necessary. Ordering particles
 848                  * Ordering particles can lead to less accurate summation
 849                  * due to rounding, both for LJ and Coulomb interactions.
 850                  */
 851                 if (2 * (a_lj_max - s) >= numAtomsInCluster)
 852                 {
 853                     for (int i = 0; i < n1; i++)
 854                     {
 855                         order[atomStart + i] = sort1[i];
 856                     }
 857                     for (int j = 0; j < n2; j++)
 858                     {
 859                         order[atomStart + n1 + j] = sort2[j];
 860                     }
 861                 }
 862
 863                 *flags |= NBNXN_CI_HALF_LJ(subc);
 864             }
 865         }
 866         if (haveQ)
 867         {
 868             *flags |= NBNXN_CI_DO_COUL(subc);
 869         }
 870         subc++;
 871     }
 872 }
 873
 874 /*! \brief Fill a pair search cell with atoms.
 875  *
 876  * Potentially sorts atoms and sets the interaction flags.
 877  */
 878 void Grid::fillCell(GridSetData*                   gridSetData,
 879                     nbnxn_atomdata_t*              nbat,
 880                     int                            atomStart,
 881                     int                            atomEnd,
 882                     gmx::ArrayRef<const int>       atomInfo,
 883                     gmx::ArrayRef<const gmx::RVec> x,
 884                     BoundingBox gmx_unused* bb_work_aligned)
 885 {
 886     const int numAtoms = atomEnd - atomStart;
 887
 888     const gmx::ArrayRef<int>& cells       = gridSetData->cells;
 889     const gmx::ArrayRef<int>& atomIndices = gridSetData->atomIndices;
 890
 891     if (geometry_.isSimple)
 892     {
 893         /* Note that non-local grids are already sorted.
 894          * Then sort_cluster_on_flag will only set the flags and the sorting
 895          * will not affect the atom order.
 896          */
 897         sort_cluster_on_flag(geometry_.numAtomsICluster,
 898                              atomStart,
 899                              atomEnd,
 900                              atomInfo,
 901                              atomIndices,
 902                              flags_.data() + atomToCluster(atomStart) - cellOffset_);
 903     }
 904
 905     if (haveFep_)
 906     {
 907         /* Set the fep flag for perturbed atoms in this (sub-)cell */
 908
 909         /* The grid-local cluster/(sub-)cell index */
 910         int cell = atomToCluster(atomStart)
 911                    - cellOffset_ * (geometry_.isSimple ? 1 : c_gpuNumClusterPerCell);
 912         fep_[cell] = 0;
 913         for (int at = atomStart; at < atomEnd; at++)
 914         {
 915             if (atomIndices[at] >= 0 && (atomInfo[atomIndices[at]] & gmx::sc_atomInfo_FreeEnergyPerturbation))
 916             {
 917                 fep_[cell] |= (1 << (at - atomStart));
 918             }
 919         }
 920     }
 921
 922     /* Now we have sorted the atoms, set the cell indices */
 923     for (int at = atomStart; at < atomEnd; at++)
 924     {
 925         cells[atomIndices[at]] = at;
 926     }
 927
 928     copy_rvec_to_nbat_real(atomIndices.data() + atomStart,
 929                            numAtoms,
 930                            geometry_.numAtomsICluster,
 931                            as_rvec_array(x.data()),
 932                            nbat->XFormat,
 933                            nbat->x().data(),
 934                            atomStart);
 935
 936     if (nbat->XFormat == nbatX4)
 937     {
 938         /* Store the bounding boxes as xyz.xyz. */
 939         size_t       offset = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsICluster);
 940         BoundingBox* bb_ptr = bb_.data() + offset;
 941
 942 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
 943         if (2 * geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 944         {
 945             calc_bounding_box_x_x4_halves(numAtoms,
 946                                           nbat->x().data() + atom_to_x_index<c_packX4>(atomStart),
 947                                           bb_ptr,
 948                                           bbj_.data() + offset * 2);
 949         }
 950         else
 951 #endif
 952         {
 953             calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
 954         }
 955     }
 956     else if (nbat->XFormat == nbatX8)
 957     {
 958         /* Store the bounding boxes as xyz.xyz. */
 959         size_t       offset = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsICluster);
 960         BoundingBox* bb_ptr = bb_.data() + offset;
 961
 962         calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
 963     }
 964 #if NBNXN_BBXXXX
 965     else if (!geometry_.isSimple)
 966     {
 967         /* Store the bounding boxes in a format convenient
 968          * for SIMD4 calculations: xxxxyyyyzzzz...
 969          */
 970         const int clusterIndex = ((atomStart - cellOffset_ * geometry_.numAtomsPerCell)
 971                                   >> geometry_.numAtomsICluster2Log);
 972         float*    pbb_ptr      = pbb_.data() + packedBoundingBoxesIndex(clusterIndex)
 973                          + (clusterIndex & (c_packedBoundingBoxesDimSize - 1));
 974
 975 #    if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 976         if (nbat->XFormat == nbatXYZQ)
 977         {
 978             GMX_ASSERT(bb_work_aligned != nullptr, "Must have valid aligned work structure");
 979             calc_bounding_box_xxxx_simd4(
 980                     numAtoms, nbat->x().data() + atomStart * nbat->xstride, bb_work_aligned, pbb_ptr);
 981         }
 982         else
 983 #    endif
 984         {
 985             calc_bounding_box_xxxx(
 986                     numAtoms, nbat->xstride, nbat->x().data() + atomStart * nbat->xstride, pbb_ptr);
 987         }
 988         if (gmx_debug_at)
 989         {
 990             fprintf(debug,
 991                     "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 992                     atomToCluster(atomStart),
 993                     pbb_ptr[0 * c_packedBoundingBoxesDimSize],
 994                     pbb_ptr[3 * c_packedBoundingBoxesDimSize],
 995                     pbb_ptr[1 * c_packedBoundingBoxesDimSize],
 996                     pbb_ptr[4 * c_packedBoundingBoxesDimSize],
 997                     pbb_ptr[2 * c_packedBoundingBoxesDimSize],
 998                     pbb_ptr[5 * c_packedBoundingBoxesDimSize]);
 999         }
1000     }
1001 #endif
1002     else
1003     {
1004         /* Store the bounding boxes as xyz.xyz. */
1005         BoundingBox* bb_ptr =
1006                 bb_.data() + atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsPerCell);
1007
1008         calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart * nbat->xstride, bb_ptr);
1009
1010         if (gmx_debug_at)
1011         {
1012             int bbo = atomToCluster(atomStart - cellOffset_ * geometry_.numAtomsPerCell);
1013             fprintf(debug,
1014                     "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1015                     atomToCluster(atomStart),
1016                     bb_[bbo].lower.x,
1017                     bb_[bbo].lower.y,
1018                     bb_[bbo].lower.z,
1019                     bb_[bbo].upper.x,
1020                     bb_[bbo].upper.y,
1021                     bb_[bbo].upper.z);
1022         }
1023     }
1024 }
1025
1026 void Grid::sortColumnsCpuGeometry(GridSetData*                   gridSetData,
1027                                   int                            dd_zone,
1028                                   gmx::ArrayRef<const int>       atomInfo,
1029                                   gmx::ArrayRef<const gmx::RVec> x,
1030                                   nbnxn_atomdata_t*              nbat,
1031                                   const gmx::Range<int>          columnRange,
1032                                   gmx::ArrayRef<int>             sort_work)
1033 {
1034     if (debug)
1035     {
1036         fprintf(debug,
1037                 "cell_offset %d sorting columns %d - %d\n",
1038                 cellOffset_,
1039                 *columnRange.begin(),
1040                 *columnRange.end());
1041     }
1042
1043     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1044
1045     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1046
1047     /* Sort the atoms within each x,y column in 3 dimensions */
1048     for (int cxy : columnRange)
1049     {
1050         const int numAtoms   = numAtomsInColumn(cxy);
1051         const int numCellsZ  = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1052         const int atomOffset = firstAtomInColumn(cxy);
1053
1054         /* Sort the atoms within each x,y column on z coordinate */
1055         sort_atoms(ZZ,
1056                    FALSE,
1057                    dd_zone,
1058                    relevantAtomsAreWithinGridBounds,
1059                    gridSetData->atomIndices.data() + atomOffset,
1060                    numAtoms,
1061                    x,
1062                    dimensions_.lowerCorner[ZZ],
1063                    1.0 / dimensions_.gridSize[ZZ],
1064                    numCellsZ * numAtomsPerCell,
1065                    sort_work);
1066
1067         /* Fill the ncz cells in this column */
1068         const int firstCell  = firstCellInColumn(cxy);
1069         int       cellFilled = firstCell;
1070         for (int cellZ = 0; cellZ < numCellsZ; cellZ++)
1071         {
1072             const int cell = firstCell + cellZ;
1073
1074             const int atomOffsetCell       = atomOffset + cellZ * numAtomsPerCell;
1075             const int numAtomsLeftInColumn = std::max(numAtoms - (atomOffsetCell - atomOffset), 0);
1076             const int numAtomsCell         = std::min(numAtomsPerCell, numAtomsLeftInColumn);
1077
1078             fillCell(gridSetData, nbat, atomOffsetCell, atomOffsetCell + numAtomsCell, atomInfo, x, nullptr);
1079
1080             /* This copy to bbcz is not really necessary.
1081              * But it allows to use the same grid search code
1082              * for the simple and supersub cell setups.
1083              */
1084             if (numAtomsCell > 0)
1085             {
1086                 cellFilled = cell;
1087             }
1088             bbcz_[cell].lower = bb_[cellFilled].lower.z;
1089             bbcz_[cell].upper = bb_[cellFilled].upper.z;
1090         }
1091
1092         /* Set the unused atom indices to -1 */
1093         for (int ind = numAtoms; ind < numCellsZ * numAtomsPerCell; ind++)
1094         {
1095             gridSetData->atomIndices[atomOffset + ind] = -1;
1096         }
1097     }
1098 }
1099
1100 /* Spatially sort the atoms within one grid column */
1101 void Grid::sortColumnsGpuGeometry(GridSetData*                   gridSetData,
1102                                   int                            dd_zone,
1103                                   gmx::ArrayRef<const int>       atomInfo,
1104                                   gmx::ArrayRef<const gmx::RVec> x,
1105                                   nbnxn_atomdata_t*              nbat,
1106                                   const gmx::Range<int>          columnRange,
1107                                   gmx::ArrayRef<int>             sort_work)
1108 {
1109     BoundingBox bb_work_array[2];
1110     auto*       bb_work_aligned = reinterpret_cast<BoundingBox*>(
1111             (reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1112
1113     if (debug)
1114     {
1115         fprintf(debug,
1116                 "cell_offset %d sorting columns %d - %d\n",
1117                 cellOffset_,
1118                 *columnRange.begin(),
1119                 *columnRange.end());
1120     }
1121
1122     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1123
1124     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1125
1126     const int subdiv_x = geometry_.numAtomsICluster;
1127     const int subdiv_y = c_gpuNumClusterPerCellX * subdiv_x;
1128     const int subdiv_z = c_gpuNumClusterPerCellY * subdiv_y;
1129
1130     /* Extract the atom index array that will be filled here */
1131     const gmx::ArrayRef<int>& atomIndices = gridSetData->atomIndices;
1132
1133     /* Sort the atoms within each x,y column in 3 dimensions.
1134      * Loop over all columns on the x/y grid.
1135      */
1136     for (int cxy : columnRange)
1137     {
1138         const int gridX = cxy / dimensions_.numCells[YY];
1139         const int gridY = cxy - gridX * dimensions_.numCells[YY];
1140
1141         const int numAtomsInColumn = cxy_na_[cxy];
1142         const int numCellsInColumn = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1143         const int atomOffset       = firstAtomInColumn(cxy);
1144
1145         /* Sort the atoms within each x,y column on z coordinate */
1146         sort_atoms(ZZ,
1147                    FALSE,
1148                    dd_zone,
1149                    relevantAtomsAreWithinGridBounds,
1150                    atomIndices.data() + atomOffset,
1151                    numAtomsInColumn,
1152                    x,
1153                    dimensions_.lowerCorner[ZZ],
1154                    1.0 / dimensions_.gridSize[ZZ],
1155                    numCellsInColumn * numAtomsPerCell,
1156                    sort_work);
1157
1158         /* This loop goes over the cells and clusters along z at once */
1159         for (int sub_z = 0; sub_z < numCellsInColumn * c_gpuNumClusterPerCellZ; sub_z++)
1160         {
1161             const int atomOffsetZ = atomOffset + sub_z * subdiv_z;
1162             const int numAtomsZ = std::min(subdiv_z, numAtomsInColumn - (atomOffsetZ - atomOffset));
1163             int       cz        = -1;
1164             /* We have already sorted on z */
1165
1166             if (sub_z % c_gpuNumClusterPerCellZ == 0)
1167             {
1168                 cz             = sub_z / c_gpuNumClusterPerCellZ;
1169                 const int cell = cxy_ind_[cxy] + cz;
1170
1171                 /* The number of atoms in this cell/super-cluster */
1172                 const int numAtoms =
1173                         std::min(numAtomsPerCell, numAtomsInColumn - (atomOffsetZ - atomOffset));
1174
1175                 numClusters_[cell] = std::min(
1176                         c_gpuNumClusterPerCell,
1177                         (numAtoms + geometry_.numAtomsICluster - 1) / geometry_.numAtomsICluster);
1178
1179                 /* Store the z-boundaries of the bounding box of the cell */
1180                 bbcz_[cell].lower = x[atomIndices[atomOffsetZ]][ZZ];
1181                 bbcz_[cell].upper = x[atomIndices[atomOffsetZ + numAtoms - 1]][ZZ];
1182             }
1183
1184             if (c_gpuNumClusterPerCellY > 1)
1185             {
1186                 /* Sort the atoms along y */
1187                 sort_atoms(YY,
1188                            (sub_z & 1) != 0,
1189                            dd_zone,
1190                            relevantAtomsAreWithinGridBounds,
1191                            atomIndices.data() + atomOffsetZ,
1192                            numAtomsZ,
1193                            x,
1194                            dimensions_.lowerCorner[YY] + gridY * dimensions_.cellSize[YY],
1195                            dimensions_.invCellSize[YY],
1196                            subdiv_z,
1197                            sort_work);
1198             }
1199
1200             for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1201             {
1202                 const int atomOffsetY = atomOffsetZ + sub_y * subdiv_y;
1203                 const int numAtomsY = std::min(subdiv_y, numAtomsInColumn - (atomOffsetY - atomOffset));
1204
1205                 if (c_gpuNumClusterPerCellX > 1)
1206                 {
1207                     /* Sort the atoms along x */
1208                     sort_atoms(XX,
1209                                ((cz * c_gpuNumClusterPerCellY + sub_y) & 1) != 0,
1210                                dd_zone,
1211                                relevantAtomsAreWithinGridBounds,
1212                                atomIndices.data() + atomOffsetY,
1213                                numAtomsY,
1214                                x,
1215                                dimensions_.lowerCorner[XX] + gridX * dimensions_.cellSize[XX],
1216                                dimensions_.invCellSize[XX],
1217                                subdiv_y,
1218                                sort_work);
1219                 }
1220
1221                 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1222                 {
1223                     const int atomOffsetX = atomOffsetY + sub_x * subdiv_x;
1224                     const int numAtomsX =
1225                             std::min(subdiv_x, numAtomsInColumn - (atomOffsetX - atomOffset));
1226
1227                     fillCell(gridSetData, nbat, atomOffsetX, atomOffsetX + numAtomsX, atomInfo, x, bb_work_aligned);
1228                 }
1229             }
1230         }
1231
1232         /* Set the unused atom indices to -1 */
1233         for (int ind = numAtomsInColumn; ind < numCellsInColumn * numAtomsPerCell; ind++)
1234         {
1235             atomIndices[atomOffset + ind] = -1;
1236         }
1237     }
1238 }
1239
1240 /*! \brief Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
1241 static void setCellAndAtomCount(gmx::ArrayRef<int> cell, int cellIndex, gmx::ArrayRef<int> cxy_na, int atomIndex)
1242 {
1243     cell[atomIndex] = cellIndex;
1244     cxy_na[cellIndex] += 1;
1245 }
1246
1247 void Grid::calcColumnIndices(const Grid::Dimensions&        gridDims,
1248                              const gmx::UpdateGroupsCog*    updateGroupsCog,
1249                              const gmx::Range<int>          atomRange,
1250                              gmx::ArrayRef<const gmx::RVec> x,
1251                              const int                      dd_zone,
1252                              const int*                     move,
1253                              const int                      thread,
1254                              const int                      nthread,
1255                              gmx::ArrayRef<int>             cell,
1256                              gmx::ArrayRef<int>             cxy_na)
1257 {
1258     const int numColumns = gridDims.numCells[XX] * gridDims.numCells[YY];
1259
1260     /* We add one extra cell for particles which moved during DD */
1261     for (int i = 0; i < numColumns; i++)
1262     {
1263         cxy_na[i] = 0;
1264     }
1265
1266     int taskAtomStart = *atomRange.begin() + static_cast<int>((thread + 0) * atomRange.size()) / nthread;
1267     int taskAtomEnd = *atomRange.begin() + static_cast<int>((thread + 1) * atomRange.size()) / nthread;
1268
1269     if (dd_zone == 0)
1270     {
1271         /* Home zone */
1272         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1273         {
1274             if (move == nullptr || move[i] >= 0)
1275             {
1276
1277                 const gmx::RVec& coord = (updateGroupsCog ? updateGroupsCog->cogForAtom(i) : x[i]);
1278
1279                 /* We need to be careful with rounding,
1280                  * particles might be a few bits outside the local zone.
1281                  * The int cast takes care of the lower bound,
1282                  * we will explicitly take care of the upper bound.
1283                  */
1284                 int cx = static_cast<int>((coord[XX] - gridDims.lowerCorner[XX])
1285                                           * gridDims.invCellSize[XX]);
1286                 int cy = static_cast<int>((coord[YY] - gridDims.lowerCorner[YY])
1287                                           * gridDims.invCellSize[YY]);
1288
1289 #ifndef NDEBUG
1290                 if (cx < 0 || cx > gridDims.numCells[XX] || cy < 0 || cy > gridDims.numCells[YY])
1291                 {
1292                     gmx_fatal(FARGS,
1293                               "grid cell cx %d cy %d out of range (max %d %d)\n"
1294                               "atom %f %f %f, grid->c0 %f %f",
1295                               cx,
1296                               cy,
1297                               gridDims.numCells[XX],
1298                               gridDims.numCells[YY],
1299                               x[i][XX],
1300                               x[i][YY],
1301                               x[i][ZZ],
1302                               gridDims.lowerCorner[XX],
1303                               gridDims.lowerCorner[YY]);
1304                 }
1305 #endif
1306                 /* Take care of potential rounding issues */
1307                 cx = std::min(cx, gridDims.numCells[XX] - 1);
1308                 cy = std::min(cy, gridDims.numCells[YY] - 1);
1309
1310                 /* For the moment cell will contain only the, grid local,
1311                  * x and y indices, not z.
1312                  */
1313                 setCellAndAtomCount(cell, cx * gridDims.numCells[YY] + cy, cxy_na, i);
1314             }
1315             else
1316             {
1317                 /* Put this moved particle after the end of the grid,
1318                  * so we can process it later without using conditionals.
1319                  */
1320                 setCellAndAtomCount(cell, numColumns, cxy_na, i);
1321             }
1322         }
1323     }
1324     else
1325     {
1326         /* Non-home zone */
1327         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1328         {
1329             int cx = static_cast<int>((x[i][XX] - gridDims.lowerCorner[XX]) * gridDims.invCellSize[XX]);
1330             int cy = static_cast<int>((x[i][YY] - gridDims.lowerCorner[YY]) * gridDims.invCellSize[YY]);
1331
1332             /* For non-home zones there could be particles outside
1333              * the non-bonded cut-off range, which have been communicated
1334              * for bonded interactions only. For the result it doesn't
1335              * matter where these end up on the grid. For performance
1336              * we put them in an extra row at the border.
1337              */
1338             cx = std::max(cx, 0);
1339             cx = std::min(cx, gridDims.numCells[XX] - 1);
1340             cy = std::max(cy, 0);
1341             cy = std::min(cy, gridDims.numCells[YY] - 1);
1342
1343             /* For the moment cell will contain only the, grid local,
1344              * x and y indices, not z.
1345              */
1346             setCellAndAtomCount(cell, cx * gridDims.numCells[YY] + cy, cxy_na, i);
1347         }
1348     }
1349 }
1350
1351 /*! \brief Resizes grid and atom data which depend on the number of cells */
1352 static void resizeForNumberOfCells(const int         numNbnxnAtoms,
1353                                    const int         numAtomsMoved,
1354                                    GridSetData*      gridSetData,
1355                                    nbnxn_atomdata_t* nbat)
1356 {
1357     /* Note: gridSetData->cellIndices was already resized before */
1358
1359     /* To avoid conditionals we store the moved particles at the end of a,
1360      * make sure we have enough space.
1361      */
1362     gridSetData->atomIndices.resize(numNbnxnAtoms + numAtomsMoved);
1363
1364     /* Make space in nbat for storing the atom coordinates */
1365     nbat->resizeCoordinateBuffer(numNbnxnAtoms);
1366 }
1367
1368 void Grid::setCellIndices(int                            ddZone,
1369                           int                            cellOffset,
1370                           GridSetData*                   gridSetData,
1371                           gmx::ArrayRef<GridWork>        gridWork,
1372                           const gmx::Range<int>          atomRange,
1373                           gmx::ArrayRef<const int>       atomInfo,
1374                           gmx::ArrayRef<const gmx::RVec> x,
1375                           const int                      numAtomsMoved,
1376                           nbnxn_atomdata_t*              nbat)
1377 {
1378     cellOffset_ = cellOffset;
1379
1380     srcAtomBegin_ = *atomRange.begin();
1381     srcAtomEnd_   = *atomRange.end();
1382
1383     const int nthread = gmx_omp_nthreads_get(ModuleMultiThread::Pairsearch);
1384
1385     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1386
1387     /* Make the cell index as a function of x and y */
1388     int ncz_max = 0;
1389     int ncz     = 0;
1390     cxy_ind_[0] = 0;
1391     for (int i = 0; i < numColumns() + 1; i++)
1392     {
1393         /* We set ncz_max at the beginning of the loop iso at the end
1394          * to skip i=grid->ncx*grid->numCells[YY] which are moved particles
1395          * that do not need to be ordered on the grid.
1396          */
1397         if (ncz > ncz_max)
1398         {
1399             ncz_max = ncz;
1400         }
1401         int cxy_na_i = gridWork[0].numAtomsPerColumn[i];
1402         for (int thread = 1; thread < nthread; thread++)
1403         {
1404             cxy_na_i += gridWork[thread].numAtomsPerColumn[i];
1405         }
1406         ncz = (cxy_na_i + numAtomsPerCell - 1) / numAtomsPerCell;
1407         if (nbat->XFormat == nbatX8)
1408         {
1409             /* Make the number of cell a multiple of 2 */
1410             ncz = (ncz + 1) & ~1;
1411         }
1412         cxy_ind_[i + 1] = cxy_ind_[i] + ncz;
1413         /* Clear cxy_na_, so we can reuse the array below */
1414         cxy_na_[i] = 0;
1415     }
1416     numCellsTotal_     = cxy_ind_[numColumns()] - cxy_ind_[0];
1417     numCellsColumnMax_ = ncz_max;
1418
1419     /* Resize grid and atom data which depend on the number of cells */
1420     resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, gridSetData, nbat);
1421
1422     if (debug)
1423     {
1424         fprintf(debug,
1425                 "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1426                 numAtomsPerCell,
1427                 geometry_.numAtomsICluster,
1428                 numCellsTotal_,
1429                 dimensions_.numCells[XX],
1430                 dimensions_.numCells[YY],
1431                 numCellsTotal_ / (static_cast<double>(numColumns())),
1432                 ncz_max);
1433         if (gmx_debug_at)
1434         {
1435             int i = 0;
1436             for (int cy = 0; cy < dimensions_.numCells[YY]; cy++)
1437             {
1438                 for (int cx = 0; cx < dimensions_.numCells[XX]; cx++)
1439                 {
1440                     fprintf(debug, " %2d", cxy_ind_[i + 1] - cxy_ind_[i]);
1441                     i++;
1442                 }
1443                 fprintf(debug, "\n");
1444             }
1445         }
1446     }
1447
1448     /* Make sure the work array for sorting is large enough */
1449     const int worstCaseSortBufferSize = ncz_max * numAtomsPerCell * c_sortGridMaxSizeFactor;
1450     if (worstCaseSortBufferSize > gmx::index(gridWork[0].sortBuffer.size()))
1451     {
1452         for (GridWork& work : gridWork)
1453         {
1454             /* Elements not in use should be -1 */
1455             work.sortBuffer.resize(worstCaseSortBufferSize, -1);
1456         }
1457     }
1458
1459     /* Now we know the dimensions we can fill the grid.
1460      * This is the first, unsorted fill. We sort the columns after this.
1461      */
1462     gmx::ArrayRef<int> cells       = gridSetData->cells;
1463     gmx::ArrayRef<int> atomIndices = gridSetData->atomIndices;
1464     for (int i : atomRange)
1465     {
1466         /* At this point nbs->cell contains the local grid x,y indices */
1467         const int cxy                                        = cells[i];
1468         atomIndices[firstAtomInColumn(cxy) + cxy_na_[cxy]++] = i;
1469     }
1470
1471     if (ddZone == 0)
1472     {
1473         /* Set the cell indices for the moved particles */
1474         int n0 = numCellsTotal_ * numAtomsPerCell;
1475         int n1 = numCellsTotal_ * numAtomsPerCell + cxy_na_[numColumns()];
1476         for (int i = n0; i < n1; i++)
1477         {
1478             cells[atomIndices[i]] = i;
1479         }
1480     }
1481
1482     /* Sort the super-cell columns along z into the sub-cells. */
1483 #pragma omp parallel for num_threads(nthread) schedule(static)
1484     for (int thread = 0; thread < nthread; thread++)
1485     {
1486         try
1487         {
1488             gmx::Range<int> columnRange(((thread + 0) * numColumns()) / nthread,
1489                                         ((thread + 1) * numColumns()) / nthread);
1490             if (geometry_.isSimple)
1491             {
1492                 sortColumnsCpuGeometry(
1493                         gridSetData, ddZone, atomInfo, x, nbat, columnRange, gridWork[thread].sortBuffer);
1494             }
1495             else
1496             {
1497                 sortColumnsGpuGeometry(
1498                         gridSetData, ddZone, atomInfo, x, nbat, columnRange, gridWork[thread].sortBuffer);
1499             }
1500         }
1501         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1502     }
1503
1504     if (geometry_.isSimple && nbat->XFormat == nbatX8)
1505     {
1506         combine_bounding_box_pairs(*this, bb_, bbj_);
1507     }
1508
1509     if (!geometry_.isSimple)
1510     {
1511         numClustersTotal_ = 0;
1512         for (int i = 0; i < numCellsTotal_; i++)
1513         {
1514             numClustersTotal_ += numClusters_[i];
1515         }
1516     }
1517
1518     if (debug)
1519     {
1520         if (geometry_.isSimple)
1521         {
1522             print_bbsizes_simple(debug, *this);
1523         }
1524         else
1525         {
1526             fprintf(debug,
1527                     "ns non-zero sub-cells: %d average atoms %.2f\n",
1528                     numClustersTotal_,
1529                     atomRange.size() / static_cast<double>(numClustersTotal_));
1530
1531             print_bbsizes_supersub(debug, *this);
1532         }
1533     }
1534 }
1535
1536 } // namespace Nbnxm