src/gromacs/nbnxm/grid.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief
  39  * Implements the Grid class.
  40  *
  41  * \author Berk Hess <hess@kth.se>
  42  * \ingroup module_nbnxm
  43  */
  44
  45 #include "gmxpre.h"
  46
  47 #include "grid.h"
  48
  49 #include <cmath>
  50 #include <cstring>
  51
  52 #include <algorithm>
  53
  54 #include "gromacs/domdec/domdec_struct.h"
  55 #include "gromacs/math/utilities.h"
  56 #include "gromacs/math/vec.h"
  57 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  58 #include "gromacs/mdlib/updategroupscog.h"
  59 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
  60 #include "gromacs/nbnxm/atomdata.h"
  61 #include "gromacs/nbnxm/nbnxm.h"
  62 #include "gromacs/nbnxm/nbnxm_geometry.h"
  63 #include "gromacs/nbnxm/pairlist.h"
  64 #include "gromacs/nbnxm/pairlistset.h"
  65 #include "gromacs/simd/simd.h"
  66 #include "gromacs/simd/vector_operations.h"
  67 #include "gromacs/utility/exceptions.h"
  68 #include "gromacs/utility/smalloc.h"
  69
  70 #include "internal.h"
  71
  72 struct gmx_domdec_zones_t;
  73
  74 namespace Nbnxm
  75 {
  76
  77 Grid::Geometry::Geometry(const PairlistType pairlistType) :
  78     isSimple(pairlistType != PairlistType::Hierarchical8x8),
  79     numAtomsICluster(IClusterSizePerListType[pairlistType]),
  80     numAtomsJCluster(JClusterSizePerListType[pairlistType]),
  81     numAtomsPerCell((isSimple ? 1 : c_gpuNumClusterPerCell)*numAtomsICluster),
  82     numAtomsICluster2Log(get_2log(numAtomsICluster))
  83 {
  84 }
  85
  86 Grid::Grid(const PairlistType pairlistType) :
  87     geometry_(pairlistType)
  88 {
  89 }
  90
  91 /*! \brief Returns the atom density (> 0) of a rectangular grid */
  92 static real gridAtomDensity(int        numAtoms,
  93                             const rvec lowerCorner,
  94                             const rvec upperCorner)
  95 {
  96     rvec size;
  97
  98     if (numAtoms == 0)
  99     {
 100         /* To avoid zero density we use a minimum of 1 atom */
 101         numAtoms = 1;
 102     }
 103
 104     rvec_sub(upperCorner, lowerCorner, size);
 105
 106     return numAtoms/(size[XX]*size[YY]*size[ZZ]);
 107 }
 108
 109 void Grid::setDimensions(const nbnxn_search *nbs,
 110                          const int           ddZone,
 111                          const int           numAtoms,
 112                          const rvec          lowerCorner,
 113                          const rvec          upperCorner,
 114                          real                atomDensity,
 115                          const real          maxAtomGroupRadius)
 116 {
 117     /* For the home zone we compute the density when not set (=-1) or when =0 */
 118     if (ddZone == 0 && atomDensity <= 0)
 119     {
 120         atomDensity = gridAtomDensity(numAtoms, lowerCorner, upperCorner);
 121     }
 122
 123     dimensions_.atomDensity        = atomDensity;
 124     dimensions_.maxAtomGroupRadius = maxAtomGroupRadius;
 125
 126     rvec size;
 127     rvec_sub(upperCorner, lowerCorner, size);
 128
 129     if (numAtoms > geometry_.numAtomsPerCell)
 130     {
 131         GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
 132
 133         /* target cell length */
 134         real tlen_x;
 135         real tlen_y;
 136         if (geometry_.isSimple)
 137         {
 138             /* To minimize the zero interactions, we should make
 139              * the largest of the i/j cell cubic.
 140              */
 141             int numAtomsInCell = std::max(geometry_.numAtomsICluster,
 142                                           geometry_.numAtomsJCluster);
 143
 144             /* Approximately cubic cells */
 145             real tlen = std::cbrt(numAtomsInCell/atomDensity);
 146             tlen_x    = tlen;
 147             tlen_y    = tlen;
 148         }
 149         else
 150         {
 151             /* Approximately cubic sub cells */
 152             real tlen = std::cbrt(geometry_.numAtomsICluster/atomDensity);
 153             tlen_x    = tlen*c_gpuNumClusterPerCellX;
 154             tlen_y    = tlen*c_gpuNumClusterPerCellY;
 155         }
 156         /* We round ncx and ncy down, because we get less cell pairs
 157          * in the nbsist when the fixed cell dimensions (x,y) are
 158          * larger than the variable one (z) than the other way around.
 159          */
 160         dimensions_.numCells[XX] = std::max(1, static_cast<int>(size[XX]/tlen_x));
 161         dimensions_.numCells[YY] = std::max(1, static_cast<int>(size[YY]/tlen_y));
 162     }
 163     else
 164     {
 165         dimensions_.numCells[XX] = 1;
 166         dimensions_.numCells[YY] = 1;
 167     }
 168
 169     for (int d = 0; d < DIM - 1; d++)
 170     {
 171         dimensions_.cellSize[d]    = size[d]/dimensions_.numCells[d];
 172         dimensions_.invCellSize[d] = 1/dimensions_.cellSize[d];
 173     }
 174
 175     if (ddZone > 0)
 176     {
 177         /* This is a non-home zone, add an extra row of cells
 178          * for particles communicated for bonded interactions.
 179          * These can be beyond the cut-off. It doesn't matter where
 180          * they end up on the grid, but for performance it's better
 181          * if they don't end up in cells that can be within cut-off range.
 182          */
 183         dimensions_.numCells[XX]++;
 184         dimensions_.numCells[YY]++;
 185     }
 186
 187     /* We need one additional cell entry for particles moved by DD */
 188     cxy_na_.resize(numColumns() + 1);
 189     cxy_ind_.resize(numColumns() + 2);
 190
 191     for (nbnxn_search_work_t &work : nbs->work)
 192     {
 193         work.cxy_na.resize(numColumns() + 1);
 194     }
 195
 196     /* Worst case scenario of 1 atom in each last cell */
 197     int maxNumCells;
 198     if (geometry_.numAtomsJCluster <= geometry_.numAtomsICluster)
 199     {
 200         maxNumCells = numAtoms/geometry_.numAtomsPerCell + numColumns();
 201     }
 202     else
 203     {
 204         maxNumCells = numAtoms/geometry_.numAtomsPerCell + numColumns()*geometry_.numAtomsJCluster/geometry_.numAtomsICluster;
 205     }
 206
 207     if (!geometry_.isSimple)
 208     {
 209         numClusters_.resize(maxNumCells);
 210     }
 211     bbcz_.resize(maxNumCells*NNBSBB_D);
 212
 213     /* This resize also zeros the contents, this avoid possible
 214      * floating exceptions in SIMD with the unused bb elements.
 215      */
 216     if (geometry_.isSimple)
 217     {
 218         bb_.resize(maxNumCells);
 219     }
 220     else
 221     {
 222 #if NBNXN_BBXXXX
 223         pbb_.resize(maxNumCells*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX);
 224 #else
 225         bb_.resize(maxNumCells*c_gpuNumClusterPerCell);
 226 #endif
 227     }
 228
 229     if (geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 230     {
 231         bbj_ = bb_;
 232     }
 233     else
 234     {
 235         GMX_ASSERT(geometry_.isSimple, "Only CPU lists should have different i/j cluster sizes");
 236
 237         bbjStorage_.resize(maxNumCells*geometry_.numAtomsICluster/geometry_.numAtomsJCluster);
 238         bbj_ = bbjStorage_;
 239     }
 240
 241     flags_.resize(maxNumCells);
 242     if (nbs->bFEP)
 243     {
 244         fep_.resize(maxNumCells*geometry_.numAtomsPerCell/geometry_.numAtomsICluster);
 245     }
 246
 247     copy_rvec(lowerCorner, dimensions_.lowerCorner);
 248     copy_rvec(upperCorner, dimensions_.upperCorner);
 249     copy_rvec(size,        dimensions_.gridSize);
 250 }
 251
 252 /* We need to sort paricles in grid columns on z-coordinate.
 253  * As particle are very often distributed homogeneously, we use a sorting
 254  * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 255  * by a factor, cast to an int and try to store in that hole. If the hole
 256  * is full, we move this or another particle. A second pass is needed to make
 257  * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 258  * 4 is the optimal value for homogeneous particle distribution and allows
 259  * for an O(#particles) sort up till distributions were all particles are
 260  * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 261  * as it can be expensive to detect imhomogeneous particle distributions.
 262  */
 263 /*! \brief Ratio of grid cells to atoms */
 264 static constexpr int c_sortGridRatio         = 4;
 265 /*! \brief Maximum ratio of holes used, in the worst case all particles
 266  * end up in the last hole and we need num. atoms extra holes at the end.
 267  */
 268 static constexpr int c_sortGridMaxSizeFactor = c_sortGridRatio + 1;
 269
 270 /*! \brief Sorts particle index a on coordinates x along dim.
 271  *
 272  * Backwards tells if we want decreasing iso increasing coordinates.
 273  * h0 is the minimum of the coordinate range.
 274  * invh is the 1/length of the sorting range.
 275  * n_per_h (>=n) is the expected average number of particles per 1/invh
 276  * sort is the sorting work array.
 277  * sort should have a size of at least n_per_h*c_sortGridRatio + n,
 278  * or easier, allocate at least n*c_sortGridMaxSizeFactor elements.
 279  */
 280 static void sort_atoms(int dim, gmx_bool Backwards,
 281                        int gmx_unused dd_zone,
 282                        bool gmx_unused relevantAtomsAreWithinGridBounds,
 283                        int *a, int n,
 284                        gmx::ArrayRef<const gmx::RVec> x,
 285                        real h0, real invh, int n_per_h,
 286                        gmx::ArrayRef<int> sort)
 287 {
 288     if (n <= 1)
 289     {
 290         /* Nothing to do */
 291         return;
 292     }
 293
 294     GMX_ASSERT(n <= n_per_h, "We require n <= n_per_h");
 295
 296     /* Transform the inverse range height into the inverse hole height */
 297     invh *= n_per_h*c_sortGridRatio;
 298
 299     /* Set nsort to the maximum possible number of holes used.
 300      * In worst case all n elements end up in the last bin.
 301      */
 302     int nsort = n_per_h*c_sortGridRatio + n;
 303
 304     /* Determine the index range used, so we can limit it for the second pass */
 305     int zi_min = INT_MAX;
 306     int zi_max = -1;
 307
 308     /* Sort the particles using a simple index sort */
 309     for (int i = 0; i < n; i++)
 310     {
 311         /* The cast takes care of float-point rounding effects below zero.
 312          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 313          * times the box height out of the box.
 314          */
 315         int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
 316
 317 #ifndef NDEBUG
 318         /* As we can have rounding effect, we use > iso >= here */
 319         if (relevantAtomsAreWithinGridBounds &&
 320             (zi < 0 || (dd_zone == 0 && zi > n_per_h*c_sortGridRatio)))
 321         {
 322             gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
 323                       a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
 324                       n_per_h, c_sortGridRatio);
 325         }
 326 #endif
 327         if (zi < 0)
 328         {
 329             zi = 0;
 330         }
 331
 332         /* In a non-local domain, particles communcated for bonded interactions
 333          * can be far beyond the grid size, which is set by the non-bonded
 334          * cut-off distance. We sort such particles into the last cell.
 335          */
 336         if (zi > n_per_h*c_sortGridRatio)
 337         {
 338             zi = n_per_h*c_sortGridRatio;
 339         }
 340
 341         /* Ideally this particle should go in sort cell zi,
 342          * but that might already be in use,
 343          * in that case find the first empty cell higher up
 344          */
 345         if (sort[zi] < 0)
 346         {
 347             sort[zi] = a[i];
 348             zi_min   = std::min(zi_min, zi);
 349             zi_max   = std::max(zi_max, zi);
 350         }
 351         else
 352         {
 353             /* We have multiple atoms in the same sorting slot.
 354              * Sort on real z for minimal bounding box size.
 355              * There is an extra check for identical z to ensure
 356              * well-defined output order, independent of input order
 357              * to ensure binary reproducibility after restarts.
 358              */
 359             while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 360                                       (x[a[i]][dim] == x[sort[zi]][dim] &&
 361                                        a[i] > sort[zi])))
 362             {
 363                 zi++;
 364             }
 365
 366             if (sort[zi] >= 0)
 367             {
 368                 /* Shift all elements by one slot until we find an empty slot */
 369                 int cp  = sort[zi];
 370                 int zim = zi + 1;
 371                 while (sort[zim] >= 0)
 372                 {
 373                     int tmp   = sort[zim];
 374                     sort[zim] = cp;
 375                     cp        = tmp;
 376                     zim++;
 377                 }
 378                 sort[zim] = cp;
 379                 zi_max    = std::max(zi_max, zim);
 380             }
 381             sort[zi] = a[i];
 382             zi_max   = std::max(zi_max, zi);
 383         }
 384     }
 385
 386     int c = 0;
 387     if (!Backwards)
 388     {
 389         for (int zi = 0; zi < nsort; zi++)
 390         {
 391             if (sort[zi] >= 0)
 392             {
 393                 a[c++]   = sort[zi];
 394                 sort[zi] = -1;
 395             }
 396         }
 397     }
 398     else
 399     {
 400         for (int zi = zi_max; zi >= zi_min; zi--)
 401         {
 402             if (sort[zi] >= 0)
 403             {
 404                 a[c++]   = sort[zi];
 405                 sort[zi] = -1;
 406             }
 407         }
 408     }
 409     if (c < n)
 410     {
 411         gmx_incons("Lost particles while sorting");
 412     }
 413 }
 414
 415 #if GMX_DOUBLE
 416 //! Returns double up to one least significant float bit smaller than x
 417 static double R2F_D(const float x)
 418 {
 419     return static_cast<float>(x >= 0 ? (1 - GMX_FLOAT_EPS)*x : (1 + GMX_FLOAT_EPS)*x);
 420 }
 421 //! Returns double up to one least significant float bit larger than x
 422 static double R2F_U(const float x)
 423 {
 424     return static_cast<float>(x >= 0 ? (1 + GMX_FLOAT_EPS)*x : (1 - GMX_FLOAT_EPS)*x);
 425 }
 426 #else
 427 //! Returns x
 428 static float R2F_D(const float x)
 429 {
 430     return x;
 431 }
 432 //! Returns x
 433 static float R2F_U(const float x)
 434 {
 435     return x;
 436 }
 437 #endif
 438
 439 //! Computes the bounding box for na coordinates in order x,y,z, bb order xyz0
 440 static void calc_bounding_box(int na, int stride, const real *x,
 441                               BoundingBox *bb)
 442 {
 443     int  i;
 444     real xl, xh, yl, yh, zl, zh;
 445
 446     i  = 0;
 447     xl = x[i+XX];
 448     xh = x[i+XX];
 449     yl = x[i+YY];
 450     yh = x[i+YY];
 451     zl = x[i+ZZ];
 452     zh = x[i+ZZ];
 453     i += stride;
 454     for (int j = 1; j < na; j++)
 455     {
 456         xl = std::min(xl, x[i+XX]);
 457         xh = std::max(xh, x[i+XX]);
 458         yl = std::min(yl, x[i+YY]);
 459         yh = std::max(yh, x[i+YY]);
 460         zl = std::min(zl, x[i+ZZ]);
 461         zh = std::max(zh, x[i+ZZ]);
 462         i += stride;
 463     }
 464     /* Note: possible double to float conversion here */
 465     bb->lower[BB_X] = R2F_D(xl);
 466     bb->lower[BB_Y] = R2F_D(yl);
 467     bb->lower[BB_Z] = R2F_D(zl);
 468     bb->upper[BB_X] = R2F_U(xh);
 469     bb->upper[BB_Y] = R2F_U(yh);
 470     bb->upper[BB_Z] = R2F_U(zh);
 471 }
 472
 473 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 474 static void calc_bounding_box_x_x4(int na, const real *x,
 475                                    BoundingBox *bb)
 476 {
 477     real xl, xh, yl, yh, zl, zh;
 478
 479     xl = x[XX*c_packX4];
 480     xh = x[XX*c_packX4];
 481     yl = x[YY*c_packX4];
 482     yh = x[YY*c_packX4];
 483     zl = x[ZZ*c_packX4];
 484     zh = x[ZZ*c_packX4];
 485     for (int j = 1; j < na; j++)
 486     {
 487         xl = std::min(xl, x[j+XX*c_packX4]);
 488         xh = std::max(xh, x[j+XX*c_packX4]);
 489         yl = std::min(yl, x[j+YY*c_packX4]);
 490         yh = std::max(yh, x[j+YY*c_packX4]);
 491         zl = std::min(zl, x[j+ZZ*c_packX4]);
 492         zh = std::max(zh, x[j+ZZ*c_packX4]);
 493     }
 494     /* Note: possible double to float conversion here */
 495     bb->lower[BB_X] = R2F_D(xl);
 496     bb->lower[BB_Y] = R2F_D(yl);
 497     bb->lower[BB_Z] = R2F_D(zl);
 498     bb->upper[BB_X] = R2F_U(xh);
 499     bb->upper[BB_Y] = R2F_U(yh);
 500     bb->upper[BB_Z] = R2F_U(zh);
 501 }
 502
 503 /*! \brief Computes the bounding box for na coordinates, bb order xyz0 */
 504 static void calc_bounding_box_x_x8(int na, const real *x,
 505                                    BoundingBox *bb)
 506 {
 507     real xl, xh, yl, yh, zl, zh;
 508
 509     xl = x[XX*c_packX8];
 510     xh = x[XX*c_packX8];
 511     yl = x[YY*c_packX8];
 512     yh = x[YY*c_packX8];
 513     zl = x[ZZ*c_packX8];
 514     zh = x[ZZ*c_packX8];
 515     for (int j = 1; j < na; j++)
 516     {
 517         xl = std::min(xl, x[j+XX*c_packX8]);
 518         xh = std::max(xh, x[j+XX*c_packX8]);
 519         yl = std::min(yl, x[j+YY*c_packX8]);
 520         yh = std::max(yh, x[j+YY*c_packX8]);
 521         zl = std::min(zl, x[j+ZZ*c_packX8]);
 522         zh = std::max(zh, x[j+ZZ*c_packX8]);
 523     }
 524     /* Note: possible double to float conversion here */
 525     bb->lower[BB_X] = R2F_D(xl);
 526     bb->lower[BB_Y] = R2F_D(yl);
 527     bb->lower[BB_Z] = R2F_D(zl);
 528     bb->upper[BB_X] = R2F_U(xh);
 529     bb->upper[BB_Y] = R2F_U(yh);
 530     bb->upper[BB_Z] = R2F_U(zh);
 531 }
 532
 533 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 534 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
 535                                                      BoundingBox *bb,
 536                                                      BoundingBox *bbj)
 537 {
 538     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 539     using namespace gmx;
 540
 541     calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
 542
 543     if (na > 2)
 544     {
 545         calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
 546     }
 547     else
 548     {
 549         /* Set the "empty" bounding box to the same as the first one,
 550          * so we don't need to treat special cases in the rest of the code.
 551          */
 552 #if NBNXN_SEARCH_BB_SIMD4
 553         store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
 554         store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
 555 #else
 556         bbj[1] = bbj[0];
 557 #endif
 558     }
 559
 560 #if NBNXN_SEARCH_BB_SIMD4
 561     store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
 562     store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
 563 #else
 564     {
 565         int i;
 566
 567         for (i = 0; i < NNBSBB_C; i++)
 568         {
 569             bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
 570             bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
 571         }
 572     }
 573 #endif
 574 }
 575
 576 #if NBNXN_SEARCH_BB_SIMD4
 577
 578 /*! \brief Computes the bounding box for na coordinates in order xyz, bb order xxxxyyyyzzzz */
 579 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
 580 {
 581     int  i;
 582     real xl, xh, yl, yh, zl, zh;
 583
 584     i  = 0;
 585     xl = x[i+XX];
 586     xh = x[i+XX];
 587     yl = x[i+YY];
 588     yh = x[i+YY];
 589     zl = x[i+ZZ];
 590     zh = x[i+ZZ];
 591     i += stride;
 592     for (int j = 1; j < na; j++)
 593     {
 594         xl = std::min(xl, x[i+XX]);
 595         xh = std::max(xh, x[i+XX]);
 596         yl = std::min(yl, x[i+YY]);
 597         yh = std::max(yh, x[i+YY]);
 598         zl = std::min(zl, x[i+ZZ]);
 599         zh = std::max(zh, x[i+ZZ]);
 600         i += stride;
 601     }
 602     /* Note: possible double to float conversion here */
 603     bb[0*STRIDE_PBB] = R2F_D(xl);
 604     bb[1*STRIDE_PBB] = R2F_D(yl);
 605     bb[2*STRIDE_PBB] = R2F_D(zl);
 606     bb[3*STRIDE_PBB] = R2F_U(xh);
 607     bb[4*STRIDE_PBB] = R2F_U(yh);
 608     bb[5*STRIDE_PBB] = R2F_U(zh);
 609 }
 610
 611 #endif /* NBNXN_SEARCH_BB_SIMD4 */
 612
 613 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 614
 615 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xyz0 */
 616 static void calc_bounding_box_simd4(int na, const float *x,
 617                                     BoundingBox *bb)
 618 {
 619     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 620     using namespace gmx;
 621
 622     Simd4Float bb_0_S, bb_1_S;
 623     Simd4Float x_S;
 624
 625     bb_0_S = load4(x);
 626     bb_1_S = bb_0_S;
 627
 628     for (int i = 1; i < na; i++)
 629     {
 630         x_S    = load4(x+i*NNBSBB_C);
 631         bb_0_S = min(bb_0_S, x_S);
 632         bb_1_S = max(bb_1_S, x_S);
 633     }
 634
 635     store4(&bb->lower[0], bb_0_S);
 636     store4(&bb->upper[0], bb_1_S);
 637 }
 638
 639 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xxxxyyyyzzzz */
 640 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
 641                                          BoundingBox *bb_work_aligned,
 642                                          real *bb)
 643 {
 644     calc_bounding_box_simd4(na, x, bb_work_aligned);
 645
 646     bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
 647     bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
 648     bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
 649     bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
 650     bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
 651     bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
 652 }
 653
 654 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
 655
 656
 657 /*! \brief Combines pairs of consecutive bounding boxes */
 658 static void combine_bounding_box_pairs(const Grid                       &grid,
 659                                        gmx::ArrayRef<const BoundingBox>  bb,
 660                                        gmx::ArrayRef<BoundingBox>        bbj)
 661 {
 662     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 663     using namespace gmx;
 664
 665     for (int i = 0; i < grid.numColumns(); i++)
 666     {
 667         /* Starting bb in a column is expected to be 2-aligned */
 668         const int sc2 = grid.firstCellInColumn(i) >> 1;
 669         /* For odd numbers skip the last bb here */
 670         const int nc2 = (grid.numAtomsInColumn(i) + 3) >> (2 + 1);
 671         int       c2;
 672         for (c2 = sc2; c2 < sc2 + nc2; c2++)
 673         {
 674 #if NBNXN_SEARCH_BB_SIMD4
 675             Simd4Float min_S, max_S;
 676
 677             min_S = min(load4(&bb[c2*2+0].lower[0]),
 678                         load4(&bb[c2*2+1].lower[0]));
 679             max_S = max(load4(&bb[c2*2+0].upper[0]),
 680                         load4(&bb[c2*2+1].upper[0]));
 681             store4(&bbj[c2].lower[0], min_S);
 682             store4(&bbj[c2].upper[0], max_S);
 683 #else
 684             for (int j = 0; j < NNBSBB_C; j++)
 685             {
 686                 bbj[c2].lower[j] = std::min(bb[c2*2 + 0].lower[j],
 687                                             bb[c2*2 + 1].lower[j]);
 688                 bbj[c2].upper[j] = std::max(bb[c2*2 + 0].upper[j],
 689                                             bb[c2*2 + 1].upper[j]);
 690             }
 691 #endif
 692         }
 693         if (((grid.numAtomsInColumn(i) + 3) >> 2) & 1)
 694         {
 695             /* The bb count in this column is odd: duplicate the last bb */
 696             for (int j = 0; j < NNBSBB_C; j++)
 697             {
 698                 bbj[c2].lower[j] = bb[c2*2].lower[j];
 699                 bbj[c2].upper[j] = bb[c2*2].upper[j];
 700             }
 701         }
 702     }
 703 }
 704
 705
 706 /*! \brief Prints the average bb size, used for debug output */
 707 static void print_bbsizes_simple(FILE       *fp,
 708                                  const Grid &grid)
 709 {
 710     dvec ba;
 711
 712     clear_dvec(ba);
 713     for (int c = 0; c < grid.numCells(); c++)
 714     {
 715         for (int d = 0; d < DIM; d++)
 716         {
 717             ba[d] += grid.iBoundingBoxes()[c].upper[d] - grid.iBoundingBoxes()[c].lower[d];
 718         }
 719     }
 720     dsvmul(1.0/grid.numCells(), ba, ba);
 721
 722     const Grid::Dimensions &dims         = grid.dimensions();
 723     real                    avgCellSizeZ = (dims.atomDensity > 0 ? grid.geometry().numAtomsICluster/(dims.atomDensity*dims.cellSize[XX]*dims.cellSize[YY]) : 0.0);
 724
 725     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 726             dims.cellSize[XX],
 727             dims.cellSize[YY],
 728             avgCellSizeZ,
 729             ba[XX], ba[YY], ba[ZZ],
 730             ba[XX]*dims.invCellSize[XX],
 731             ba[YY]*dims.invCellSize[YY],
 732             dims.atomDensity > 0 ? ba[ZZ]/avgCellSizeZ : 0.0);
 733 }
 734
 735 /*! \brief Prints the average bb size, used for debug output */
 736 static void print_bbsizes_supersub(FILE       *fp,
 737                                    const Grid &grid)
 738 {
 739     int  ns;
 740     dvec ba;
 741
 742     clear_dvec(ba);
 743     ns = 0;
 744     for (int c = 0; c < grid.numCells(); c++)
 745     {
 746 #if NBNXN_BBXXXX
 747         for (int s = 0; s < grid.numClustersPerCell()[c]; s += STRIDE_PBB)
 748         {
 749             int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
 750             for (int i = 0; i < STRIDE_PBB; i++)
 751             {
 752                 for (int d = 0; d < DIM; d++)
 753                 {
 754                     ba[d] +=
 755                         grid.packedBoundingBoxes()[cs_w*NNBSBB_XXXX + (DIM + d)*STRIDE_PBB + i] -
 756                         grid.packedBoundingBoxes()[cs_w*NNBSBB_XXXX +        d *STRIDE_PBB + i];
 757                 }
 758             }
 759         }
 760 #else
 761         for (int s = 0; s < grid.numClustersPerCell()[c]; s++)
 762         {
 763             int cs = c*c_gpuNumClusterPerCell + s;
 764             for (int d = 0; d < DIM; d++)
 765             {
 766                 ba[d] += grid.iBoundingBoxes()[cs].upper[d] - grid.iBoundingBoxes()[cs].lower[d];
 767             }
 768         }
 769 #endif
 770         ns += grid.numClustersPerCell()[c];
 771     }
 772     dsvmul(1.0/ns, ba, ba);
 773
 774     const Grid::Dimensions &dims            = grid.dimensions();
 775     const real              avgClusterSizeZ =
 776         (dims.atomDensity > 0 ? grid.geometry().numAtomsPerCell/(dims.atomDensity*dims.cellSize[XX]*dims.cellSize[YY]*c_gpuNumClusterPerCellZ) : 0.0);
 777
 778     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 779             dims.cellSize[XX]/c_gpuNumClusterPerCellX,
 780             dims.cellSize[YY]/c_gpuNumClusterPerCellY,
 781             avgClusterSizeZ,
 782             ba[XX], ba[YY], ba[ZZ],
 783             ba[XX]*c_gpuNumClusterPerCellX*dims.invCellSize[XX],
 784             ba[YY]*c_gpuNumClusterPerCellY*dims.invCellSize[YY],
 785             dims.atomDensity > 0 ? ba[ZZ]/avgClusterSizeZ : 0.0);
 786 }
 787
 788 /*!\brief Set non-bonded interaction flags for the current cluster.
 789  *
 790  * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 791  */
 792 static void sort_cluster_on_flag(int                 numAtomsInCluster,
 793                                  int                 atomStart,
 794                                  int                 atomEnd,
 795                                  const int          *atinfo,
 796                                  gmx::ArrayRef<int>  order,
 797                                  int                *flags)
 798 {
 799     constexpr int c_maxNumAtomsInCluster = 8;
 800     int           sort1[c_maxNumAtomsInCluster];
 801     int           sort2[c_maxNumAtomsInCluster];
 802
 803     GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");
 804
 805     *flags = 0;
 806
 807     int subc = 0;
 808     for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
 809     {
 810         /* Make lists for this (sub-)cell on atoms with and without LJ */
 811         int      n1         = 0;
 812         int      n2         = 0;
 813         gmx_bool haveQ      = FALSE;
 814         int      a_lj_max   = -1;
 815         for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
 816         {
 817             haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 818
 819             if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 820             {
 821                 sort1[n1++] = order[a];
 822                 a_lj_max    = a;
 823             }
 824             else
 825             {
 826                 sort2[n2++] = order[a];
 827             }
 828         }
 829
 830         /* If we don't have atoms with LJ, there's nothing to sort */
 831         if (n1 > 0)
 832         {
 833             *flags |= NBNXN_CI_DO_LJ(subc);
 834
 835             if (2*n1 <= numAtomsInCluster)
 836             {
 837                 /* Only sort when strictly necessary. Ordering particles
 838                  * Ordering particles can lead to less accurate summation
 839                  * due to rounding, both for LJ and Coulomb interactions.
 840                  */
 841                 if (2*(a_lj_max - s) >= numAtomsInCluster)
 842                 {
 843                     for (int i = 0; i < n1; i++)
 844                     {
 845                         order[atomStart + i]      = sort1[i];
 846                     }
 847                     for (int j = 0; j < n2; j++)
 848                     {
 849                         order[atomStart + n1 + j] = sort2[j];
 850                     }
 851                 }
 852
 853                 *flags |= NBNXN_CI_HALF_LJ(subc);
 854             }
 855         }
 856         if (haveQ)
 857         {
 858             *flags |= NBNXN_CI_DO_COUL(subc);
 859         }
 860         subc++;
 861     }
 862 }
 863
 864 /*! \brief Fill a pair search cell with atoms.
 865  *
 866  * Potentially sorts atoms and sets the interaction flags.
 867  */
 868 void Grid::fillCell(nbnxn_search                   *nbs,
 869                     nbnxn_atomdata_t               *nbat,
 870                     int                             atomStart,
 871                     int                             atomEnd,
 872                     const int                      *atinfo,
 873                     gmx::ArrayRef<const gmx::RVec>  x,
 874                     BoundingBox gmx_unused         *bb_work_aligned)
 875 {
 876     const int numAtoms = atomEnd - atomStart;
 877
 878     if (geometry_.isSimple)
 879     {
 880         /* Note that non-local grids are already sorted.
 881          * Then sort_cluster_on_flag will only set the flags and the sorting
 882          * will not affect the atom order.
 883          */
 884         sort_cluster_on_flag(geometry_.numAtomsICluster, atomStart, atomEnd, atinfo, nbs->a,
 885                              flags_.data() + atomToCluster(atomStart) - cellOffset_);
 886     }
 887
 888     if (nbs->bFEP)
 889     {
 890         /* Set the fep flag for perturbed atoms in this (sub-)cell */
 891
 892         /* The grid-local cluster/(sub-)cell index */
 893         int cell = atomToCluster(atomStart) - cellOffset_*(geometry_.isSimple ? 1 : c_gpuNumClusterPerCell);
 894         fep_[cell] = 0;
 895         for (int at = atomStart; at < atomEnd; at++)
 896         {
 897             if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
 898             {
 899                 fep_[cell] |= (1 << (at - atomStart));
 900             }
 901         }
 902     }
 903
 904     /* Now we have sorted the atoms, set the cell indices */
 905     for (int at = atomStart; at < atomEnd; at++)
 906     {
 907         nbs->cell[nbs->a[at]] = at;
 908     }
 909
 910     copy_rvec_to_nbat_real(nbs->a.data() + atomStart, numAtoms, geometry_.numAtomsICluster,
 911                            as_rvec_array(x.data()),
 912                            nbat->XFormat, nbat->x().data(), atomStart);
 913
 914     if (nbat->XFormat == nbatX4)
 915     {
 916         /* Store the bounding boxes as xyz.xyz. */
 917         size_t       offset = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsICluster);
 918         BoundingBox *bb_ptr = bb_.data() + offset;
 919
 920 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
 921         if (2*geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 922         {
 923             calc_bounding_box_x_x4_halves(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
 924                                           bbj_.data() + offset*2);
 925         }
 926         else
 927 #endif
 928         {
 929             calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
 930         }
 931     }
 932     else if (nbat->XFormat == nbatX8)
 933     {
 934         /* Store the bounding boxes as xyz.xyz. */
 935         size_t       offset = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsICluster);
 936         BoundingBox *bb_ptr = bb_.data() + offset;
 937
 938         calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
 939     }
 940 #if NBNXN_BBXXXX
 941     else if (!geometry_.isSimple)
 942     {
 943         /* Store the bounding boxes in a format convenient
 944          * for SIMD4 calculations: xxxxyyyyzzzz...
 945          */
 946         float *pbb_ptr =
 947             pbb_.data() +
 948             ((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> (geometry_.numAtomsICluster2Log + STRIDE_PBB_2LOG))*NNBSBB_XXXX +
 949             (((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> geometry_.numAtomsICluster2Log) & (STRIDE_PBB - 1));
 950
 951 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 952         if (nbat->XFormat == nbatXYZQ)
 953         {
 954             calc_bounding_box_xxxx_simd4(numAtoms, nbat->x().data() + atomStart*nbat->xstride,
 955                                          bb_work_aligned, pbb_ptr);
 956         }
 957         else
 958 #endif
 959         {
 960             calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
 961                                    pbb_ptr);
 962         }
 963         if (gmx_debug_at)
 964         {
 965             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 966                     atomToCluster(atomStart),
 967                     pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
 968                     pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
 969                     pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
 970         }
 971     }
 972 #endif
 973     else
 974     {
 975         /* Store the bounding boxes as xyz.xyz. */
 976         BoundingBox *bb_ptr = bb_.data() + atomToCluster(atomStart - cellOffset_*geometry_.numAtomsPerCell);
 977
 978         calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
 979                           bb_ptr);
 980
 981         if (gmx_debug_at)
 982         {
 983             int bbo = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsPerCell);
 984             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 985                     atomToCluster(atomStart),
 986                     bb_[bbo].lower[BB_X],
 987                     bb_[bbo].lower[BB_Y],
 988                     bb_[bbo].lower[BB_Z],
 989                     bb_[bbo].upper[BB_X],
 990                     bb_[bbo].upper[BB_Y],
 991                     bb_[bbo].upper[BB_Z]);
 992         }
 993     }
 994 }
 995
 996 void Grid::sortColumnsCpuGeometry(nbnxn_search *nbs,
 997                                   int dd_zone,
 998                                   int atomStart, int atomEnd,
 999                                   const int *atinfo,
1000                                   gmx::ArrayRef<const gmx::RVec> x,
1001                                   nbnxn_atomdata_t *nbat,
1002                                   int cxy_start, int cxy_end,
1003                                   gmx::ArrayRef<int> sort_work)
1004 {
1005     if (debug)
1006     {
1007         fprintf(debug, "cell_offset %d sorting columns %d - %d, atoms %d - %d\n",
1008                 cellOffset_, cxy_start, cxy_end, atomStart, atomEnd);
1009     }
1010
1011     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1012
1013     const int  numAtomsPerCell = geometry_.numAtomsPerCell;
1014
1015     /* Sort the atoms within each x,y column in 3 dimensions */
1016     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1017     {
1018         const int numAtoms   = numAtomsInColumn(cxy);
1019         const int numCellsZ  = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1020         const int atomOffset = firstAtomInColumn(cxy);
1021
1022         /* Sort the atoms within each x,y column on z coordinate */
1023         sort_atoms(ZZ, FALSE, dd_zone,
1024                    relevantAtomsAreWithinGridBounds,
1025                    nbs->a.data() + atomOffset, numAtoms, x,
1026                    dimensions_.lowerCorner[ZZ],
1027                    1.0/dimensions_.gridSize[ZZ], numCellsZ*numAtomsPerCell,
1028                    sort_work);
1029
1030         /* Fill the ncz cells in this column */
1031         const int firstCell  = firstCellInColumn(cxy);
1032         int       cellFilled = firstCell;
1033         for (int cellZ = 0; cellZ < numCellsZ; cellZ++)
1034         {
1035             const int cell           = firstCell + cellZ;
1036
1037             const int atomOffsetCell = atomOffset + cellZ*numAtomsPerCell;
1038             const int numAtomsCell   = std::min(numAtomsPerCell, numAtoms - (atomOffsetCell - atomOffset));
1039
1040             fillCell(nbs, nbat,
1041                      atomOffsetCell, atomOffsetCell + numAtomsCell, atinfo, x,
1042                      nullptr);
1043
1044             /* This copy to bbcz is not really necessary.
1045              * But it allows to use the same grid search code
1046              * for the simple and supersub cell setups.
1047              */
1048             if (numAtomsCell > 0)
1049             {
1050                 cellFilled = cell;
1051             }
1052             bbcz_[cell*NNBSBB_D    ] = bb_[cellFilled].lower[BB_Z];
1053             bbcz_[cell*NNBSBB_D + 1] = bb_[cellFilled].upper[BB_Z];
1054         }
1055
1056         /* Set the unused atom indices to -1 */
1057         for (int ind = numAtoms; ind < numCellsZ*numAtomsPerCell; ind++)
1058         {
1059             nbs->a[atomOffset + ind] = -1;
1060         }
1061     }
1062 }
1063
1064 /* Spatially sort the atoms within one grid column */
1065 void Grid::sortColumnsGpuGeometry(nbnxn_search *nbs,
1066                                   int dd_zone,
1067                                   int atomStart, int atomEnd,
1068                                   const int *atinfo,
1069                                   gmx::ArrayRef<const gmx::RVec> x,
1070                                   nbnxn_atomdata_t *nbat,
1071                                   int cxy_start, int cxy_end,
1072                                   gmx::ArrayRef<int> sort_work)
1073 {
1074     BoundingBox  bb_work_array[2];
1075     BoundingBox *bb_work_aligned = reinterpret_cast<BoundingBox *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1076
1077     if (debug)
1078     {
1079         fprintf(debug, "cell_offset %d sorting columns %d - %d, atoms %d - %d\n",
1080                 cellOffset_, cxy_start, cxy_end, atomStart, atomEnd);
1081     }
1082
1083     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1084
1085     const int  numAtomsPerCell = geometry_.numAtomsPerCell;
1086
1087     const int  subdiv_x = geometry_.numAtomsICluster;
1088     const int  subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1089     const int  subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
1090
1091     /* Sort the atoms within each x,y column in 3 dimensions.
1092      * Loop over all columns on the x/y grid.
1093      */
1094     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1095     {
1096         const int gridX            = cxy/dimensions_.numCells[YY];
1097         const int gridY            = cxy - gridX*dimensions_.numCells[YY];
1098
1099         const int numAtomsInColumn = cxy_na_[cxy];
1100         const int numCellsInColumn = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1101         const int atomOffset       = firstAtomInColumn(cxy);
1102
1103         /* Sort the atoms within each x,y column on z coordinate */
1104         sort_atoms(ZZ, FALSE, dd_zone,
1105                    relevantAtomsAreWithinGridBounds,
1106                    nbs->a.data() + atomOffset, numAtomsInColumn, x,
1107                    dimensions_.lowerCorner[ZZ],
1108                    1.0/dimensions_.gridSize[ZZ], numCellsInColumn*numAtomsPerCell,
1109                    sort_work);
1110
1111         /* This loop goes over the cells and clusters along z at once */
1112         for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
1113         {
1114             const int atomOffsetZ = atomOffset + sub_z*subdiv_z;
1115             const int numAtomsZ   = std::min(subdiv_z, numAtomsInColumn - (atomOffsetZ - atomOffset));
1116             int       cz          = -1;
1117             /* We have already sorted on z */
1118
1119             if (sub_z % c_gpuNumClusterPerCellZ == 0)
1120             {
1121                 cz = sub_z/c_gpuNumClusterPerCellZ;
1122                 const int cell = cxy_ind_[cxy] + cz;
1123
1124                 /* The number of atoms in this cell/super-cluster */
1125                 const int numAtoms = std::min(numAtomsPerCell, numAtomsInColumn - (atomOffsetZ - atomOffset));
1126
1127                 numClusters_[cell] = std::min(c_gpuNumClusterPerCell,
1128                                               (numAtoms + geometry_.numAtomsICluster - 1)/ geometry_.numAtomsICluster);
1129
1130                 /* Store the z-boundaries of the bounding box of the cell */
1131                 bbcz_[cell*NNBSBB_D  ] = x[nbs->a[atomOffsetZ]][ZZ];
1132                 bbcz_[cell*NNBSBB_D+1] = x[nbs->a[atomOffsetZ + numAtoms - 1]][ZZ];
1133             }
1134
1135             if (c_gpuNumClusterPerCellY > 1)
1136             {
1137                 /* Sort the atoms along y */
1138                 sort_atoms(YY, (sub_z & 1) != 0, dd_zone,
1139                            relevantAtomsAreWithinGridBounds,
1140                            nbs->a.data() + atomOffsetZ, numAtomsZ, x,
1141                            dimensions_.lowerCorner[YY] + gridY*dimensions_.cellSize[YY],
1142                            dimensions_.invCellSize[YY], subdiv_z,
1143                            sort_work);
1144             }
1145
1146             for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1147             {
1148                 const int atomOffsetY = atomOffsetZ + sub_y*subdiv_y;
1149                 const int numAtomsY   = std::min(subdiv_y, numAtomsInColumn - (atomOffsetY - atomOffset));
1150
1151                 if (c_gpuNumClusterPerCellX > 1)
1152                 {
1153                     /* Sort the atoms along x */
1154                     sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1) != 0, dd_zone,
1155                                relevantAtomsAreWithinGridBounds,
1156                                nbs->a.data() + atomOffsetY, numAtomsY, x,
1157                                dimensions_.lowerCorner[XX] + gridX*dimensions_.cellSize[XX],
1158                                dimensions_.invCellSize[XX], subdiv_y,
1159                                sort_work);
1160                 }
1161
1162                 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1163                 {
1164                     const int atomOffsetX = atomOffsetY + sub_x*subdiv_x;
1165                     const int numAtomsX   = std::min(subdiv_x, numAtomsInColumn - (atomOffsetX - atomOffset));
1166
1167                     fillCell(nbs, nbat,
1168                              atomOffsetX, atomOffsetX + numAtomsX, atinfo, x,
1169                              bb_work_aligned);
1170                 }
1171             }
1172         }
1173
1174         /* Set the unused atom indices to -1 */
1175         for (int ind = numAtomsInColumn; ind < numCellsInColumn*numAtomsPerCell; ind++)
1176         {
1177             nbs->a[atomOffset + ind] = -1;
1178         }
1179     }
1180 }
1181
1182 /*! \brief Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
1183 static void setCellAndAtomCount(gmx::ArrayRef<int>  cell,
1184                                 int                 cellIndex,
1185                                 gmx::ArrayRef<int>  cxy_na,
1186                                 int                 atomIndex)
1187 {
1188     cell[atomIndex]    = cellIndex;
1189     cxy_na[cellIndex] += 1;
1190 }
1191
1192 /*! \brief Determine in which grid column atoms should go */
1193 static void calc_column_indices(const Grid::Dimensions &gridDims,
1194                                 const gmx::UpdateGroupsCog *updateGroupsCog,
1195                                 int atomStart, int atomEnd,
1196                                 gmx::ArrayRef<const gmx::RVec> x,
1197                                 int dd_zone, const int *move,
1198                                 int thread, int nthread,
1199                                 gmx::ArrayRef<int> cell,
1200                                 gmx::ArrayRef<int> cxy_na)
1201 {
1202     const int numColumns = gridDims.numCells[XX]*gridDims.numCells[YY];
1203
1204     /* We add one extra cell for particles which moved during DD */
1205     for (int i = 0; i < numColumns; i++)
1206     {
1207         cxy_na[i] = 0;
1208     }
1209
1210     int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
1211     int taskAtomEnd   = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
1212
1213     if (dd_zone == 0)
1214     {
1215         /* Home zone */
1216         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1217         {
1218             if (move == nullptr || move[i] >= 0)
1219             {
1220
1221                 const gmx::RVec &coord = (updateGroupsCog ? updateGroupsCog->cogForAtom(i) : x[i]);
1222
1223                 /* We need to be careful with rounding,
1224                  * particles might be a few bits outside the local zone.
1225                  * The int cast takes care of the lower bound,
1226                  * we will explicitly take care of the upper bound.
1227                  */
1228                 int cx = static_cast<int>((coord[XX] - gridDims.lowerCorner[XX])*gridDims.invCellSize[XX]);
1229                 int cy = static_cast<int>((coord[YY] - gridDims.lowerCorner[YY])*gridDims.invCellSize[YY]);
1230
1231 #ifndef NDEBUG
1232                 if (cx < 0 || cx > gridDims.numCells[XX] ||
1233                     cy < 0 || cy > gridDims.numCells[YY])
1234                 {
1235                     gmx_fatal(FARGS,
1236                               "grid cell cx %d cy %d out of range (max %d %d)\n"
1237                               "atom %f %f %f, grid->c0 %f %f",
1238                               cx, cy,
1239                               gridDims.numCells[XX],
1240                               gridDims.numCells[YY],
1241                               x[i][XX], x[i][YY], x[i][ZZ],
1242                               gridDims.lowerCorner[XX],
1243                               gridDims.lowerCorner[YY]);
1244                 }
1245 #endif
1246                 /* Take care of potential rouding issues */
1247                 cx = std::min(cx, gridDims.numCells[XX] - 1);
1248                 cy = std::min(cy, gridDims.numCells[YY] - 1);
1249
1250                 /* For the moment cell will contain only the, grid local,
1251                  * x and y indices, not z.
1252                  */
1253                 setCellAndAtomCount(cell, cx*gridDims.numCells[YY] + cy, cxy_na, i);
1254             }
1255             else
1256             {
1257                 /* Put this moved particle after the end of the grid,
1258                  * so we can process it later without using conditionals.
1259                  */
1260                 setCellAndAtomCount(cell, numColumns, cxy_na, i);
1261             }
1262         }
1263     }
1264     else
1265     {
1266         /* Non-home zone */
1267         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1268         {
1269             int cx = static_cast<int>((x[i][XX] - gridDims.lowerCorner[XX])*gridDims.invCellSize[XX]);
1270             int cy = static_cast<int>((x[i][YY] - gridDims.lowerCorner[YY])*gridDims.invCellSize[YY]);
1271
1272             /* For non-home zones there could be particles outside
1273              * the non-bonded cut-off range, which have been communicated
1274              * for bonded interactions only. For the result it doesn't
1275              * matter where these end up on the grid. For performance
1276              * we put them in an extra row at the border.
1277              */
1278             cx = std::max(cx, 0);
1279             cx = std::min(cx, gridDims.numCells[XX] - 1);
1280             cy = std::max(cy, 0);
1281             cy = std::min(cy, gridDims.numCells[YY] - 1);
1282
1283             /* For the moment cell will contain only the, grid local,
1284              * x and y indices, not z.
1285              */
1286             setCellAndAtomCount(cell, cx*gridDims.numCells[YY] + cy, cxy_na, i);
1287         }
1288     }
1289 }
1290
1291 /*! \brief Resizes grid and atom data which depend on the number of cells */
1292 static void resizeForNumberOfCells(const int           numNbnxnAtoms,
1293                                    const int           numAtomsMoved,
1294                                    nbnxn_search       *nbs,
1295                                    nbnxn_atomdata_t   *nbat)
1296 {
1297     /* Note: nbs->cell was already resized before */
1298
1299     /* To avoid conditionals we store the moved particles at the end of a,
1300      * make sure we have enough space.
1301      */
1302     nbs->a.resize(numNbnxnAtoms + numAtomsMoved);
1303
1304     /* Make space in nbat for storing the atom coordinates */
1305     nbat->resizeCoordinateBuffer(numNbnxnAtoms);
1306 }
1307
1308 void Grid::calcCellIndices(nbnxn_search                   *nbs,
1309                            int                             ddZone,
1310                            int                             cellOffset,
1311                            const gmx::UpdateGroupsCog     *updateGroupsCog,
1312                            int                             atomStart,
1313                            int                             atomEnd,
1314                            const int                      *atinfo,
1315                            gmx::ArrayRef<const gmx::RVec>  x,
1316                            int                             numAtomsMoved,
1317                            const int                      *move,
1318                            nbnxn_atomdata_t               *nbat)
1319 {
1320     cellOffset_ = cellOffset;
1321
1322     /* First compute all grid/column indices and store them in nbs->cell */
1323     nbs->cell.resize(atomEnd);
1324
1325     const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1326
1327 #pragma omp parallel for num_threads(nthread) schedule(static)
1328     for (int thread = 0; thread < nthread; thread++)
1329     {
1330         try
1331         {
1332             calc_column_indices(dimensions_,
1333                                 updateGroupsCog,
1334                                 atomStart, atomEnd, x,
1335                                 ddZone, move, thread, nthread,
1336                                 nbs->cell, nbs->work[thread].cxy_na);
1337         }
1338         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1339     }
1340
1341     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1342
1343     /* Make the cell index as a function of x and y */
1344     int ncz_max      = 0;
1345     int ncz          = 0;
1346     cxy_ind_[0]      = 0;
1347     for (int i = 0; i < numColumns() + 1; i++)
1348     {
1349         /* We set ncz_max at the beginning of the loop iso at the end
1350          * to skip i=grid->ncx*grid->numCells[YY] which are moved particles
1351          * that do not need to be ordered on the grid.
1352          */
1353         if (ncz > ncz_max)
1354         {
1355             ncz_max = ncz;
1356         }
1357         int cxy_na_i = nbs->work[0].cxy_na[i];
1358         for (int thread = 1; thread < nthread; thread++)
1359         {
1360             cxy_na_i += nbs->work[thread].cxy_na[i];
1361         }
1362         ncz = (cxy_na_i + numAtomsPerCell - 1)/numAtomsPerCell;
1363         if (nbat->XFormat == nbatX8)
1364         {
1365             /* Make the number of cell a multiple of 2 */
1366             ncz = (ncz + 1) & ~1;
1367         }
1368         cxy_ind_[i+1] = cxy_ind_[i] + ncz;
1369         /* Clear cxy_na_, so we can reuse the array below */
1370         cxy_na_[i] = 0;
1371     }
1372     numCellsTotal_ = cxy_ind_[numColumns()] - cxy_ind_[0];
1373
1374     resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, nbs, nbat);
1375
1376     if (debug)
1377     {
1378         fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1379                 numAtomsPerCell, geometry_.numAtomsICluster, numCellsTotal_,
1380                 dimensions_.numCells[XX], dimensions_.numCells[YY],
1381                 numCellsTotal_/(static_cast<double>(numColumns())),
1382                 ncz_max);
1383         if (gmx_debug_at)
1384         {
1385             int i = 0;
1386             for (int cy = 0; cy < dimensions_.numCells[YY]; cy++)
1387             {
1388                 for (int cx = 0; cx < dimensions_.numCells[XX]; cx++)
1389                 {
1390                     fprintf(debug, " %2d", cxy_ind_[i + 1] - cxy_ind_[i]);
1391                     i++;
1392                 }
1393                 fprintf(debug, "\n");
1394             }
1395         }
1396     }
1397
1398     /* Make sure the work array for sorting is large enough */
1399     const int worstCaseSortBufferSize = ncz_max*numAtomsPerCell*c_sortGridMaxSizeFactor;
1400     if (worstCaseSortBufferSize > gmx::index(nbs->work[0].sortBuffer.size()))
1401     {
1402         for (nbnxn_search_work_t &work : nbs->work)
1403         {
1404             /* Elements not in use should be -1 */
1405             work.sortBuffer.resize(worstCaseSortBufferSize, -1);
1406         }
1407     }
1408
1409     /* Now we know the dimensions we can fill the grid.
1410      * This is the first, unsorted fill. We sort the columns after this.
1411      */
1412     for (int i = atomStart; i < atomEnd; i++)
1413     {
1414         /* At this point nbs->cell contains the local grid x,y indices */
1415         int cxy = nbs->cell[i];
1416         nbs->a[firstAtomInColumn(cxy) + cxy_na_[cxy]++] = i;
1417     }
1418
1419     if (ddZone == 0)
1420     {
1421         /* Set the cell indices for the moved particles */
1422         int n0 = numCellsTotal_*numAtomsPerCell;
1423         int n1 = numCellsTotal_*numAtomsPerCell + cxy_na_[numColumns()];
1424         if (ddZone == 0)
1425         {
1426             for (int i = n0; i < n1; i++)
1427             {
1428                 nbs->cell[nbs->a[i]] = i;
1429             }
1430         }
1431     }
1432
1433     /* Sort the super-cell columns along z into the sub-cells. */
1434 #pragma omp parallel for num_threads(nthread) schedule(static)
1435     for (int thread = 0; thread < nthread; thread++)
1436     {
1437         try
1438         {
1439             int columnStart = ((thread + 0)*numColumns())/nthread;
1440             int columnEnd   = ((thread + 1)*numColumns())/nthread;
1441             if (geometry_.isSimple)
1442             {
1443                 sortColumnsCpuGeometry(nbs, ddZone, atomStart, atomEnd, atinfo, x, nbat,
1444                                        columnStart, columnEnd,
1445                                        nbs->work[thread].sortBuffer);
1446             }
1447             else
1448             {
1449                 sortColumnsGpuGeometry(nbs, ddZone, atomStart, atomEnd, atinfo, x, nbat,
1450                                        columnStart, columnEnd,
1451                                        nbs->work[thread].sortBuffer);
1452             }
1453         }
1454         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1455     }
1456
1457     if (geometry_.isSimple && nbat->XFormat == nbatX8)
1458     {
1459         combine_bounding_box_pairs(*this, bb_, bbj_);
1460     }
1461
1462     if (!geometry_.isSimple)
1463     {
1464         numClustersTotal_ = 0;
1465         for (int i = 0; i < numCellsTotal_; i++)
1466         {
1467             numClustersTotal_ += numClusters_[i];
1468         }
1469     }
1470
1471     if (debug)
1472     {
1473         if (geometry_.isSimple)
1474         {
1475             print_bbsizes_simple(debug, *this);
1476         }
1477         else
1478         {
1479             fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1480                     numClustersTotal_,
1481                     (atomEnd - atomStart)/static_cast<double>(numClustersTotal_));
1482
1483             print_bbsizes_supersub(debug, *this);
1484         }
1485     }
1486 }
1487
1488 } // namespace Nbnxm
1489
1490 // TODO: Move the functions below to nbnxn.cpp
1491
1492 /* Sets up a grid and puts the atoms on the grid.
1493  * This function only operates on one domain of the domain decompostion.
1494  * Note that without domain decomposition there is only one domain.
1495  */
1496 void nbnxn_put_on_grid(nonbonded_verlet_t             *nbv,
1497                        const matrix                    box,
1498                        int                             ddZone,
1499                        const rvec                      lowerCorner,
1500                        const rvec                      upperCorner,
1501                        const gmx::UpdateGroupsCog     *updateGroupsCog,
1502                        int                             atomStart,
1503                        int                             atomEnd,
1504                        real                            atomDensity,
1505                        const int                      *atinfo,
1506                        gmx::ArrayRef<const gmx::RVec>  x,
1507                        int                             numAtomsMoved,
1508                        const int                      *move)
1509 {
1510     nbnxn_search *nbs  = nbv->nbs.get();
1511     Nbnxm::Grid  &grid = nbs->grid[ddZone];
1512
1513     nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1514
1515     int cellOffset;
1516     if (ddZone == 0)
1517     {
1518         cellOffset = 0;
1519     }
1520     else
1521     {
1522         const Nbnxm::Grid &previousGrid = nbs->grid[ddZone - 1];
1523         cellOffset = previousGrid.atomIndexEnd()/previousGrid.geometry().numAtomsPerCell;
1524     }
1525
1526     const int n = atomEnd - atomStart;
1527
1528     real      maxAtomGroupRadius;
1529     if (ddZone == 0)
1530     {
1531         copy_mat(box, nbs->box);
1532
1533         nbs->natoms_local    = atomEnd - numAtomsMoved;
1534         /* We assume that nbnxn_put_on_grid is called first
1535          * for the local atoms (ddZone=0).
1536          */
1537         nbs->natoms_nonlocal = atomEnd - numAtomsMoved;
1538
1539         maxAtomGroupRadius = (updateGroupsCog ? updateGroupsCog->maxUpdateGroupRadius() : 0);
1540
1541         if (debug)
1542         {
1543             fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
1544                     nbs->natoms_local, atomDensity);
1545         }
1546     }
1547     else
1548     {
1549         const Nbnxm::Grid::Dimensions &dimsGrid0 = nbs->grid[0].dimensions();
1550         atomDensity        = dimsGrid0.atomDensity;
1551         maxAtomGroupRadius = dimsGrid0.maxAtomGroupRadius;
1552
1553         nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, atomEnd);
1554     }
1555
1556     /* We always use the home zone (grid[0]) for setting the cell size,
1557      * since determining densities for non-local zones is difficult.
1558      */
1559     grid.setDimensions(nbs,
1560                        ddZone, n - numAtomsMoved,
1561                        lowerCorner, upperCorner,
1562                        atomDensity,
1563                        maxAtomGroupRadius);
1564
1565     nbnxn_atomdata_t *nbat = nbv->nbat.get();
1566
1567     grid.calcCellIndices(nbs, ddZone, cellOffset, updateGroupsCog, atomStart, atomEnd, atinfo, x, numAtomsMoved, move, nbat);
1568
1569     if (ddZone == 0)
1570     {
1571         nbat->natoms_local = nbat->numAtoms();
1572     }
1573     if (ddZone == gmx::ssize(nbs->grid) - 1)
1574     {
1575         /* We are done setting up all grids, we can resize the force buffers */
1576         nbat->resizeForceBuffers();
1577     }
1578
1579     nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1580 }
1581
1582 /* Calls nbnxn_put_on_grid for all non-local domains */
1583 void nbnxn_put_on_grid_nonlocal(nonbonded_verlet_t              *nbv,
1584                                 const struct gmx_domdec_zones_t *zones,
1585                                 const int                       *atinfo,
1586                                 gmx::ArrayRef<const gmx::RVec>   x)
1587 {
1588     for (int zone = 1; zone < zones->n; zone++)
1589     {
1590         rvec c0, c1;
1591         for (int d = 0; d < DIM; d++)
1592         {
1593             c0[d] = zones->size[zone].bb_x0[d];
1594             c1[d] = zones->size[zone].bb_x1[d];
1595         }
1596
1597         nbnxn_put_on_grid(nbv, nullptr,
1598                           zone, c0, c1,
1599                           nullptr,
1600                           zones->cg_range[zone],
1601                           zones->cg_range[zone+1],
1602                           -1,
1603                           atinfo,
1604                           x,
1605                           0, nullptr);
1606     }
1607 }
1608
1609 void nbnxn_get_ncells(const nbnxn_search *nbs, int *ncx, int *ncy)
1610 {
1611     *ncx = nbs->grid[0].dimensions().numCells[XX];
1612     *ncy = nbs->grid[0].dimensions().numCells[YY];
1613 }
1614
1615 gmx::ArrayRef<const int> nbnxn_get_atomorder(const nbnxn_search *nbs)
1616 {
1617     /* Return the atom order for the home cell (index 0) */
1618     const Nbnxm::Grid &grid       = nbs->grid[0];
1619
1620     const int          numIndices = grid.atomIndexEnd() - grid.firstAtomInColumn(0);
1621
1622     return gmx::constArrayRefFromArray(nbs->a.data(), numIndices);
1623 }
1624
1625 void nbnxn_set_atomorder(nbnxn_search *nbs)
1626 {
1627     /* Set the atom order for the home cell (index 0) */
1628     const Nbnxm::Grid &grid = nbs->grid[0];
1629
1630     int                atomIndex = 0;
1631     for (int cxy = 0; cxy < grid.numColumns(); cxy++)
1632     {
1633         const int numAtoms  = grid.numAtomsInColumn(cxy);
1634         int       cellIndex = grid.firstCellInColumn(cxy)*grid.geometry().numAtomsPerCell;
1635         for (int i = 0; i < numAtoms; i++)
1636         {
1637             nbs->a[cellIndex]    = atomIndex;
1638             nbs->cell[atomIndex] = cellIndex;
1639             atomIndex++;
1640             cellIndex++;
1641         }
1642     }
1643 }
1644
1645 gmx::ArrayRef<const int> nbnxn_get_gridindices(const nbnxn_search *nbs)
1646 {
1647     return nbs->cell;
1648 }