src/gromacs/nbnxm/grid.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief
  39  * Implements the Grid class.
  40  *
  41  * \author Berk Hess <hess@kth.se>
  42  * \ingroup module_nbnxm
  43  */
  44
  45 #include "gmxpre.h"
  46
  47 #include "grid.h"
  48
  49 #include <cmath>
  50 #include <cstring>
  51
  52 #include <algorithm>
  53
  54 #include "gromacs/math/utilities.h"
  55 #include "gromacs/math/vec.h"
  56 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  57 #include "gromacs/mdlib/updategroupscog.h"
  58 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
  59 #include "gromacs/nbnxm/atomdata.h"
  60 #include "gromacs/nbnxm/nbnxm_geometry.h"
  61 #include "gromacs/simd/simd.h"
  62 #include "gromacs/simd/vector_operations.h"
  63
  64 #include "boundingboxes.h"
  65 #include "gridsetdata.h"
  66 #include "pairlistparams.h"
  67
  68 namespace Nbnxm
  69 {
  70
  71 Grid::Geometry::Geometry(const PairlistType pairlistType) :
  72     isSimple(pairlistType != PairlistType::HierarchicalNxN),
  73     numAtomsICluster(IClusterSizePerListType[pairlistType]),
  74     numAtomsJCluster(JClusterSizePerListType[pairlistType]),
  75     numAtomsPerCell((isSimple ? 1 : c_gpuNumClusterPerCell)*numAtomsICluster),
  76     numAtomsICluster2Log(get_2log(numAtomsICluster))
  77 {
  78 }
  79
  80 Grid::Grid(const PairlistType  pairlistType,
  81            const bool         &haveFep) :
  82     geometry_(pairlistType),
  83     haveFep_(haveFep)
  84 {
  85 }
  86
  87 /*! \brief Returns the atom density (> 0) of a rectangular grid */
  88 static real gridAtomDensity(int        numAtoms,
  89                             const rvec lowerCorner,
  90                             const rvec upperCorner)
  91 {
  92     rvec size;
  93
  94     if (numAtoms == 0)
  95     {
  96         /* To avoid zero density we use a minimum of 1 atom */
  97         numAtoms = 1;
  98     }
  99
 100     rvec_sub(upperCorner, lowerCorner, size);
 101
 102     return static_cast<real>(numAtoms)/(size[XX]*size[YY]*size[ZZ]);
 103 }
 104
 105 void Grid::setDimensions(const int            ddZone,
 106                          const int            numAtoms,
 107                          gmx::RVec            lowerCorner,
 108                          gmx::RVec            upperCorner,
 109                          real                 atomDensity,
 110                          const real           maxAtomGroupRadius,
 111                          const bool           haveFep,
 112                          gmx::PinningPolicy   pinningPolicy)
 113 {
 114     /* We allow passing lowerCorner=upperCorner, in which case we need to
 115      * create a finite sized bounding box to avoid division by zero.
 116      * We use a minimum size such that the volume fits in float with some
 117      * margin for computing and using the atom number density.
 118      */
 119     constexpr real c_minimumGridSize = 1e-10;
 120     for (int d = 0; d < DIM; d++)
 121     {
 122         GMX_ASSERT(upperCorner[d] >= lowerCorner[d], "Upper corner should be larger than the lower corner");
 123         if (upperCorner[d] - lowerCorner[d] < c_minimumGridSize)
 124         {
 125             /* Ensure we apply a correction to the bounding box */
 126             real correction = std::max(std::abs(lowerCorner[d])*GMX_REAL_EPS,
 127                                        0.5_real*c_minimumGridSize);
 128             lowerCorner[d] -= correction;
 129             upperCorner[d] += correction;
 130         }
 131     }
 132
 133     /* For the home zone we compute the density when not set (=-1) or when =0 */
 134     if (ddZone == 0 && atomDensity <= 0)
 135     {
 136         atomDensity = gridAtomDensity(numAtoms, lowerCorner, upperCorner);
 137     }
 138
 139     dimensions_.atomDensity        = atomDensity;
 140     dimensions_.maxAtomGroupRadius = maxAtomGroupRadius;
 141
 142     rvec size;
 143     rvec_sub(upperCorner, lowerCorner, size);
 144
 145     if (numAtoms > geometry_.numAtomsPerCell)
 146     {
 147         GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
 148
 149         /* target cell length */
 150         real tlen_x;
 151         real tlen_y;
 152         if (geometry_.isSimple)
 153         {
 154             /* To minimize the zero interactions, we should make
 155              * the largest of the i/j cell cubic.
 156              */
 157             int numAtomsInCell = std::max(geometry_.numAtomsICluster,
 158                                           geometry_.numAtomsJCluster);
 159
 160             /* Approximately cubic cells */
 161             real tlen = std::cbrt(numAtomsInCell/atomDensity);
 162             tlen_x    = tlen;
 163             tlen_y    = tlen;
 164         }
 165         else
 166         {
 167             /* Approximately cubic sub cells */
 168             real tlen = std::cbrt(geometry_.numAtomsICluster/atomDensity);
 169             tlen_x    = tlen*c_gpuNumClusterPerCellX;
 170             tlen_y    = tlen*c_gpuNumClusterPerCellY;
 171         }
 172         /* We round ncx and ncy down, because we get less cell pairs
 173          * in the pairlist when the fixed cell dimensions (x,y) are
 174          * larger than the variable one (z) than the other way around.
 175          */
 176         dimensions_.numCells[XX] = std::max(1, static_cast<int>(size[XX]/tlen_x));
 177         dimensions_.numCells[YY] = std::max(1, static_cast<int>(size[YY]/tlen_y));
 178     }
 179     else
 180     {
 181         dimensions_.numCells[XX] = 1;
 182         dimensions_.numCells[YY] = 1;
 183     }
 184
 185     for (int d = 0; d < DIM - 1; d++)
 186     {
 187         dimensions_.cellSize[d]    = size[d]/dimensions_.numCells[d];
 188         dimensions_.invCellSize[d] = 1/dimensions_.cellSize[d];
 189     }
 190
 191     if (ddZone > 0)
 192     {
 193         /* This is a non-home zone, add an extra row of cells
 194          * for particles communicated for bonded interactions.
 195          * These can be beyond the cut-off. It doesn't matter where
 196          * they end up on the grid, but for performance it's better
 197          * if they don't end up in cells that can be within cut-off range.
 198          */
 199         dimensions_.numCells[XX]++;
 200         dimensions_.numCells[YY]++;
 201     }
 202
 203     /* We need one additional cell entry for particles moved by DD */
 204     cxy_na_.resize(numColumns() + 1);
 205     cxy_ind_.resize(numColumns() + 2);
 206     changePinningPolicy(&cxy_na_, pinningPolicy);
 207     changePinningPolicy(&cxy_ind_, pinningPolicy);
 208
 209     /* Worst case scenario of 1 atom in each last cell */
 210     int maxNumCells;
 211     if (geometry_.numAtomsJCluster <= geometry_.numAtomsICluster)
 212     {
 213         maxNumCells = numAtoms/geometry_.numAtomsPerCell + numColumns();
 214     }
 215     else
 216     {
 217         maxNumCells = numAtoms/geometry_.numAtomsPerCell + numColumns()*geometry_.numAtomsJCluster/geometry_.numAtomsICluster;
 218     }
 219
 220     if (!geometry_.isSimple)
 221     {
 222         numClusters_.resize(maxNumCells);
 223     }
 224     bbcz_.resize(maxNumCells);
 225
 226     /* This resize also zeros the contents, this avoid possible
 227      * floating exceptions in SIMD with the unused bb elements.
 228      */
 229     if (geometry_.isSimple)
 230     {
 231         bb_.resize(maxNumCells);
 232     }
 233     else
 234     {
 235 #if NBNXN_BBXXXX
 236         pbb_.resize(packedBoundingBoxesIndex(maxNumCells*c_gpuNumClusterPerCell));
 237 #else
 238         bb_.resize(maxNumCells*c_gpuNumClusterPerCell);
 239 #endif
 240     }
 241
 242     if (geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 243     {
 244         bbj_ = bb_;
 245     }
 246     else
 247     {
 248         GMX_ASSERT(geometry_.isSimple, "Only CPU lists should have different i/j cluster sizes");
 249
 250         bbjStorage_.resize(maxNumCells*geometry_.numAtomsICluster/geometry_.numAtomsJCluster);
 251         bbj_ = bbjStorage_;
 252     }
 253
 254     flags_.resize(maxNumCells);
 255     if (haveFep)
 256     {
 257         fep_.resize(maxNumCells*geometry_.numAtomsPerCell/geometry_.numAtomsICluster);
 258     }
 259
 260     copy_rvec(lowerCorner, dimensions_.lowerCorner);
 261     copy_rvec(upperCorner, dimensions_.upperCorner);
 262     copy_rvec(size,        dimensions_.gridSize);
 263 }
 264
 265 /* We need to sort paricles in grid columns on z-coordinate.
 266  * As particle are very often distributed homogeneously, we use a sorting
 267  * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 268  * by a factor, cast to an int and try to store in that hole. If the hole
 269  * is full, we move this or another particle. A second pass is needed to make
 270  * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 271  * 4 is the optimal value for homogeneous particle distribution and allows
 272  * for an O(#particles) sort up till distributions were all particles are
 273  * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 274  * as it can be expensive to detect imhomogeneous particle distributions.
 275  */
 276 /*! \brief Ratio of grid cells to atoms */
 277 static constexpr int c_sortGridRatio         = 4;
 278 /*! \brief Maximum ratio of holes used, in the worst case all particles
 279  * end up in the last hole and we need num. atoms extra holes at the end.
 280  */
 281 static constexpr int c_sortGridMaxSizeFactor = c_sortGridRatio + 1;
 282
 283 /*! \brief Sorts particle index a on coordinates x along dim.
 284  *
 285  * Backwards tells if we want decreasing iso increasing coordinates.
 286  * h0 is the minimum of the coordinate range.
 287  * invh is the 1/length of the sorting range.
 288  * n_per_h (>=n) is the expected average number of particles per 1/invh
 289  * sort is the sorting work array.
 290  * sort should have a size of at least n_per_h*c_sortGridRatio + n,
 291  * or easier, allocate at least n*c_sortGridMaxSizeFactor elements.
 292  */
 293 static void sort_atoms(int dim, gmx_bool Backwards,
 294                        int gmx_unused dd_zone,
 295                        bool gmx_unused relevantAtomsAreWithinGridBounds,
 296                        int *a, int n,
 297                        gmx::ArrayRef<const gmx::RVec> x,
 298                        real h0, real invh, int n_per_h,
 299                        gmx::ArrayRef<int> sort)
 300 {
 301     if (n <= 1)
 302     {
 303         /* Nothing to do */
 304         return;
 305     }
 306
 307     GMX_ASSERT(n <= n_per_h, "We require n <= n_per_h");
 308
 309     /* Transform the inverse range height into the inverse hole height */
 310     invh *= n_per_h*c_sortGridRatio;
 311
 312     /* Set nsort to the maximum possible number of holes used.
 313      * In worst case all n elements end up in the last bin.
 314      */
 315     int nsort = n_per_h*c_sortGridRatio + n;
 316
 317     /* Determine the index range used, so we can limit it for the second pass */
 318     int zi_min = INT_MAX;
 319     int zi_max = -1;
 320
 321     /* Sort the particles using a simple index sort */
 322     for (int i = 0; i < n; i++)
 323     {
 324         /* The cast takes care of float-point rounding effects below zero.
 325          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 326          * times the box height out of the box.
 327          */
 328         int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
 329
 330 #ifndef NDEBUG
 331         /* As we can have rounding effect, we use > iso >= here */
 332         if (relevantAtomsAreWithinGridBounds &&
 333             (zi < 0 || (dd_zone == 0 && zi > n_per_h*c_sortGridRatio)))
 334         {
 335             gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
 336                       a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
 337                       n_per_h, c_sortGridRatio);
 338         }
 339 #endif
 340         if (zi < 0)
 341         {
 342             zi = 0;
 343         }
 344
 345         /* In a non-local domain, particles communcated for bonded interactions
 346          * can be far beyond the grid size, which is set by the non-bonded
 347          * cut-off distance. We sort such particles into the last cell.
 348          */
 349         if (zi > n_per_h*c_sortGridRatio)
 350         {
 351             zi = n_per_h*c_sortGridRatio;
 352         }
 353
 354         /* Ideally this particle should go in sort cell zi,
 355          * but that might already be in use,
 356          * in that case find the first empty cell higher up
 357          */
 358         if (sort[zi] < 0)
 359         {
 360             sort[zi] = a[i];
 361             zi_min   = std::min(zi_min, zi);
 362             zi_max   = std::max(zi_max, zi);
 363         }
 364         else
 365         {
 366             /* We have multiple atoms in the same sorting slot.
 367              * Sort on real z for minimal bounding box size.
 368              * There is an extra check for identical z to ensure
 369              * well-defined output order, independent of input order
 370              * to ensure binary reproducibility after restarts.
 371              */
 372             while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 373                                       (x[a[i]][dim] == x[sort[zi]][dim] &&
 374                                        a[i] > sort[zi])))
 375             {
 376                 zi++;
 377             }
 378
 379             if (sort[zi] >= 0)
 380             {
 381                 /* Shift all elements by one slot until we find an empty slot */
 382                 int cp  = sort[zi];
 383                 int zim = zi + 1;
 384                 while (sort[zim] >= 0)
 385                 {
 386                     int tmp   = sort[zim];
 387                     sort[zim] = cp;
 388                     cp        = tmp;
 389                     zim++;
 390                 }
 391                 sort[zim] = cp;
 392                 zi_max    = std::max(zi_max, zim);
 393             }
 394             sort[zi] = a[i];
 395             zi_max   = std::max(zi_max, zi);
 396         }
 397     }
 398
 399     int c = 0;
 400     if (!Backwards)
 401     {
 402         for (int zi = 0; zi < nsort; zi++)
 403         {
 404             if (sort[zi] >= 0)
 405             {
 406                 a[c++]   = sort[zi];
 407                 sort[zi] = -1;
 408             }
 409         }
 410     }
 411     else
 412     {
 413         for (int zi = zi_max; zi >= zi_min; zi--)
 414         {
 415             if (sort[zi] >= 0)
 416             {
 417                 a[c++]   = sort[zi];
 418                 sort[zi] = -1;
 419             }
 420         }
 421     }
 422     if (c < n)
 423     {
 424         gmx_incons("Lost particles while sorting");
 425     }
 426 }
 427
 428 #if GMX_DOUBLE
 429 //! Returns double up to one least significant float bit smaller than x
 430 static double R2F_D(const float x)
 431 {
 432     return static_cast<float>(x >= 0 ? (1 - GMX_FLOAT_EPS)*x : (1 + GMX_FLOAT_EPS)*x);
 433 }
 434 //! Returns double up to one least significant float bit larger than x
 435 static double R2F_U(const float x)
 436 {
 437     return static_cast<float>(x >= 0 ? (1 + GMX_FLOAT_EPS)*x : (1 - GMX_FLOAT_EPS)*x);
 438 }
 439 #else
 440 //! Returns x
 441 static float R2F_D(const float x)
 442 {
 443     return x;
 444 }
 445 //! Returns x
 446 static float R2F_U(const float x)
 447 {
 448     return x;
 449 }
 450 #endif
 451
 452 //! Computes the bounding box for na coordinates in order x,y,z, bb order xyz0
 453 static void calc_bounding_box(int na, int stride, const real *x,
 454                               BoundingBox *bb)
 455 {
 456     int  i;
 457     real xl, xh, yl, yh, zl, zh;
 458
 459     i  = 0;
 460     xl = x[i+XX];
 461     xh = x[i+XX];
 462     yl = x[i+YY];
 463     yh = x[i+YY];
 464     zl = x[i+ZZ];
 465     zh = x[i+ZZ];
 466     i += stride;
 467     for (int j = 1; j < na; j++)
 468     {
 469         xl = std::min(xl, x[i+XX]);
 470         xh = std::max(xh, x[i+XX]);
 471         yl = std::min(yl, x[i+YY]);
 472         yh = std::max(yh, x[i+YY]);
 473         zl = std::min(zl, x[i+ZZ]);
 474         zh = std::max(zh, x[i+ZZ]);
 475         i += stride;
 476     }
 477     /* Note: possible double to float conversion here */
 478     bb->lower.x = R2F_D(xl);
 479     bb->lower.y = R2F_D(yl);
 480     bb->lower.z = R2F_D(zl);
 481     bb->upper.x = R2F_U(xh);
 482     bb->upper.y = R2F_U(yh);
 483     bb->upper.z = R2F_U(zh);
 484 }
 485
 486 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 487 static void calc_bounding_box_x_x4(int na, const real *x,
 488                                    BoundingBox *bb)
 489 {
 490     real xl, xh, yl, yh, zl, zh;
 491
 492     xl = x[XX*c_packX4];
 493     xh = x[XX*c_packX4];
 494     yl = x[YY*c_packX4];
 495     yh = x[YY*c_packX4];
 496     zl = x[ZZ*c_packX4];
 497     zh = x[ZZ*c_packX4];
 498     for (int j = 1; j < na; j++)
 499     {
 500         xl = std::min(xl, x[j+XX*c_packX4]);
 501         xh = std::max(xh, x[j+XX*c_packX4]);
 502         yl = std::min(yl, x[j+YY*c_packX4]);
 503         yh = std::max(yh, x[j+YY*c_packX4]);
 504         zl = std::min(zl, x[j+ZZ*c_packX4]);
 505         zh = std::max(zh, x[j+ZZ*c_packX4]);
 506     }
 507     /* Note: possible double to float conversion here */
 508     bb->lower.x = R2F_D(xl);
 509     bb->lower.y = R2F_D(yl);
 510     bb->lower.z = R2F_D(zl);
 511     bb->upper.x = R2F_U(xh);
 512     bb->upper.y = R2F_U(yh);
 513     bb->upper.z = R2F_U(zh);
 514 }
 515
 516 /*! \brief Computes the bounding box for na coordinates, bb order xyz0 */
 517 static void calc_bounding_box_x_x8(int na, const real *x,
 518                                    BoundingBox *bb)
 519 {
 520     real xl, xh, yl, yh, zl, zh;
 521
 522     xl = x[XX*c_packX8];
 523     xh = x[XX*c_packX8];
 524     yl = x[YY*c_packX8];
 525     yh = x[YY*c_packX8];
 526     zl = x[ZZ*c_packX8];
 527     zh = x[ZZ*c_packX8];
 528     for (int j = 1; j < na; j++)
 529     {
 530         xl = std::min(xl, x[j+XX*c_packX8]);
 531         xh = std::max(xh, x[j+XX*c_packX8]);
 532         yl = std::min(yl, x[j+YY*c_packX8]);
 533         yh = std::max(yh, x[j+YY*c_packX8]);
 534         zl = std::min(zl, x[j+ZZ*c_packX8]);
 535         zh = std::max(zh, x[j+ZZ*c_packX8]);
 536     }
 537     /* Note: possible double to float conversion here */
 538     bb->lower.x = R2F_D(xl);
 539     bb->lower.y = R2F_D(yl);
 540     bb->lower.z = R2F_D(zl);
 541     bb->upper.x = R2F_U(xh);
 542     bb->upper.y = R2F_U(yh);
 543     bb->upper.z = R2F_U(zh);
 544 }
 545
 546 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 547 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
 548                                                      BoundingBox *bb,
 549                                                      BoundingBox *bbj)
 550 {
 551     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 552     using namespace gmx;
 553
 554     calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
 555
 556     if (na > 2)
 557     {
 558         calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
 559     }
 560     else
 561     {
 562         /* Set the "empty" bounding box to the same as the first one,
 563          * so we don't need to treat special cases in the rest of the code.
 564          */
 565 #if NBNXN_SEARCH_BB_SIMD4
 566         store4(bbj[1].lower.ptr(), load4(bbj[0].lower.ptr()));
 567         store4(bbj[1].upper.ptr(), load4(bbj[0].upper.ptr()));
 568 #else
 569         bbj[1] = bbj[0];
 570 #endif
 571     }
 572
 573 #if NBNXN_SEARCH_BB_SIMD4
 574     store4(bb->lower.ptr(),
 575            min(load4(bbj[0].lower.ptr()), load4(bbj[1].lower.ptr())));
 576     store4(bb->upper.ptr(),
 577            max(load4(bbj[0].upper.ptr()), load4(bbj[1].upper.ptr())));
 578 #else
 579     {
 580         bb->lower = BoundingBox::Corner::min(bbj[0].lower,
 581                                              bbj[1].lower);
 582         bb->upper = BoundingBox::Corner::max(bbj[0].upper,
 583                                              bbj[1].upper);
 584     }
 585 #endif
 586 }
 587
 588 #if NBNXN_BBXXXX
 589
 590 /*! \brief Computes the bounding box for na coordinates in order xyz, bb order xxxxyyyyzzzz */
 591 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
 592 {
 593     int  i;
 594     real xl, xh, yl, yh, zl, zh;
 595
 596     i  = 0;
 597     xl = x[i+XX];
 598     xh = x[i+XX];
 599     yl = x[i+YY];
 600     yh = x[i+YY];
 601     zl = x[i+ZZ];
 602     zh = x[i+ZZ];
 603     i += stride;
 604     for (int j = 1; j < na; j++)
 605     {
 606         xl = std::min(xl, x[i+XX]);
 607         xh = std::max(xh, x[i+XX]);
 608         yl = std::min(yl, x[i+YY]);
 609         yh = std::max(yh, x[i+YY]);
 610         zl = std::min(zl, x[i+ZZ]);
 611         zh = std::max(zh, x[i+ZZ]);
 612         i += stride;
 613     }
 614     /* Note: possible double to float conversion here */
 615     bb[0*c_packedBoundingBoxesDimSize] = R2F_D(xl);
 616     bb[1*c_packedBoundingBoxesDimSize] = R2F_D(yl);
 617     bb[2*c_packedBoundingBoxesDimSize] = R2F_D(zl);
 618     bb[3*c_packedBoundingBoxesDimSize] = R2F_U(xh);
 619     bb[4*c_packedBoundingBoxesDimSize] = R2F_U(yh);
 620     bb[5*c_packedBoundingBoxesDimSize] = R2F_U(zh);
 621 }
 622
 623 #endif /* NBNXN_BBXXXX */
 624
 625 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 626
 627 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xyz0 */
 628 static void calc_bounding_box_simd4(int na, const float *x,
 629                                     BoundingBox *bb)
 630 {
 631     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 632     using namespace gmx;
 633
 634     static_assert(sizeof(BoundingBox::Corner) == GMX_SIMD4_WIDTH*sizeof(float),
 635                   "The Corner struct should hold exactly 4 floats");
 636
 637     Simd4Float bb_0_S = load4(x);
 638     Simd4Float bb_1_S = bb_0_S;
 639
 640     for (int i = 1; i < na; i++)
 641     {
 642         Simd4Float x_S = load4(x + i*GMX_SIMD4_WIDTH);
 643         bb_0_S         = min(bb_0_S, x_S);
 644         bb_1_S         = max(bb_1_S, x_S);
 645     }
 646
 647     store4(bb->lower.ptr(), bb_0_S);
 648     store4(bb->upper.ptr(), bb_1_S);
 649 }
 650
 651 #if NBNXN_BBXXXX
 652
 653 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xxxxyyyyzzzz */
 654 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
 655                                          BoundingBox *bb_work_aligned,
 656                                          real *bb)
 657 {
 658     calc_bounding_box_simd4(na, x, bb_work_aligned);
 659
 660     bb[0*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.x;
 661     bb[1*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.y;
 662     bb[2*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.z;
 663     bb[3*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.x;
 664     bb[4*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.y;
 665     bb[5*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.z;
 666 }
 667
 668 #endif /* NBNXN_BBXXXX */
 669
 670 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
 671
 672
 673 /*! \brief Combines pairs of consecutive bounding boxes */
 674 static void combine_bounding_box_pairs(const Grid                       &grid,
 675                                        gmx::ArrayRef<const BoundingBox>  bb,
 676                                        gmx::ArrayRef<BoundingBox>        bbj)
 677 {
 678     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 679     using namespace gmx;
 680
 681     for (int i = 0; i < grid.numColumns(); i++)
 682     {
 683         /* Starting bb in a column is expected to be 2-aligned */
 684         const int sc2 = grid.firstCellInColumn(i) >> 1;
 685         /* For odd numbers skip the last bb here */
 686         const int nc2 = (grid.numAtomsInColumn(i) + 3) >> (2 + 1);
 687         int       c2;
 688         for (c2 = sc2; c2 < sc2 + nc2; c2++)
 689         {
 690 #if NBNXN_SEARCH_BB_SIMD4
 691             Simd4Float min_S, max_S;
 692
 693             min_S = min(load4(bb[c2*2+0].lower.ptr()),
 694                         load4(bb[c2*2+1].lower.ptr()));
 695             max_S = max(load4(bb[c2*2+0].upper.ptr()),
 696                         load4(bb[c2*2+1].upper.ptr()));
 697             store4(bbj[c2].lower.ptr(), min_S);
 698             store4(bbj[c2].upper.ptr(), max_S);
 699 #else
 700             bbj[c2].lower = BoundingBox::Corner::min(bb[c2*2 + 0].lower,
 701                                                      bb[c2*2 + 1].lower);
 702             bbj[c2].upper = BoundingBox::Corner::max(bb[c2*2 + 0].upper,
 703                                                      bb[c2*2 + 1].upper);
 704 #endif
 705         }
 706         if (((grid.numAtomsInColumn(i) + 3) >> 2) & 1)
 707         {
 708             /* The bb count in this column is odd: duplicate the last bb */
 709             bbj[c2].lower = bb[c2*2].lower;
 710             bbj[c2].upper = bb[c2*2].upper;
 711         }
 712     }
 713 }
 714
 715
 716 /*! \brief Prints the average bb size, used for debug output */
 717 static void print_bbsizes_simple(FILE       *fp,
 718                                  const Grid &grid)
 719 {
 720     dvec ba = { 0 };
 721     for (int c = 0; c < grid.numCells(); c++)
 722     {
 723         const BoundingBox &bb = grid.iBoundingBoxes()[c];
 724         ba[XX] += bb.upper.x - bb.lower.x;
 725         ba[YY] += bb.upper.y - bb.lower.y;
 726         ba[ZZ] += bb.upper.z - bb.lower.z;
 727     }
 728     dsvmul(1.0/grid.numCells(), ba, ba);
 729
 730     const Grid::Dimensions &dims         = grid.dimensions();
 731     real                    avgCellSizeZ = (dims.atomDensity > 0 ? grid.geometry().numAtomsICluster/(dims.atomDensity*dims.cellSize[XX]*dims.cellSize[YY]) : 0.0);
 732
 733     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 734             dims.cellSize[XX],
 735             dims.cellSize[YY],
 736             avgCellSizeZ,
 737             ba[XX], ba[YY], ba[ZZ],
 738             ba[XX]*dims.invCellSize[XX],
 739             ba[YY]*dims.invCellSize[YY],
 740             dims.atomDensity > 0 ? ba[ZZ]/avgCellSizeZ : 0.0);
 741 }
 742
 743 /*! \brief Prints the average bb size, used for debug output */
 744 static void print_bbsizes_supersub(FILE       *fp,
 745                                    const Grid &grid)
 746 {
 747     int  ns;
 748     dvec ba;
 749
 750     clear_dvec(ba);
 751     ns = 0;
 752     for (int c = 0; c < grid.numCells(); c++)
 753     {
 754 #if NBNXN_BBXXXX
 755         for (int s = 0; s < grid.numClustersPerCell()[c]; s += c_packedBoundingBoxesDimSize)
 756         {
 757             int  cs_w          = (c*c_gpuNumClusterPerCell + s)/c_packedBoundingBoxesDimSize;
 758             auto boundingBoxes = grid.packedBoundingBoxes().subArray(cs_w*c_packedBoundingBoxesSize, c_packedBoundingBoxesSize);
 759             for (int i = 0; i < c_packedBoundingBoxesDimSize; i++)
 760             {
 761                 for (int d = 0; d < DIM; d++)
 762                 {
 763                     ba[d] +=
 764                         boundingBoxes[(DIM + d)*c_packedBoundingBoxesDimSize + i] -
 765                         boundingBoxes[(0   + d)*c_packedBoundingBoxesDimSize + i];
 766                 }
 767             }
 768         }
 769 #else
 770         for (int s = 0; s < grid.numClustersPerCell()[c]; s++)
 771         {
 772             const BoundingBox &bb = grid.iBoundingBoxes()[c*c_gpuNumClusterPerCell + s];
 773             ba[XX] += bb.upper.x - bb.lower.x;
 774             ba[YY] += bb.upper.y - bb.lower.y;
 775             ba[ZZ] += bb.upper.z - bb.lower.z;
 776         }
 777 #endif
 778         ns += grid.numClustersPerCell()[c];
 779     }
 780     dsvmul(1.0/ns, ba, ba);
 781
 782     const Grid::Dimensions &dims            = grid.dimensions();
 783     const real              avgClusterSizeZ =
 784         (dims.atomDensity > 0 ? grid.geometry().numAtomsPerCell/(dims.atomDensity*dims.cellSize[XX]*dims.cellSize[YY]*c_gpuNumClusterPerCellZ) : 0.0);
 785
 786     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 787             dims.cellSize[XX]/c_gpuNumClusterPerCellX,
 788             dims.cellSize[YY]/c_gpuNumClusterPerCellY,
 789             avgClusterSizeZ,
 790             ba[XX], ba[YY], ba[ZZ],
 791             ba[XX]*c_gpuNumClusterPerCellX*dims.invCellSize[XX],
 792             ba[YY]*c_gpuNumClusterPerCellY*dims.invCellSize[YY],
 793             dims.atomDensity > 0 ? ba[ZZ]/avgClusterSizeZ : 0.0);
 794 }
 795
 796 /*!\brief Set non-bonded interaction flags for the current cluster.
 797  *
 798  * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 799  */
 800 static void sort_cluster_on_flag(int                 numAtomsInCluster,
 801                                  int                 atomStart,
 802                                  int                 atomEnd,
 803                                  const int          *atinfo,
 804                                  gmx::ArrayRef<int>  order,
 805                                  int                *flags)
 806 {
 807     constexpr int c_maxNumAtomsInCluster = 8;
 808     int           sort1[c_maxNumAtomsInCluster];
 809     int           sort2[c_maxNumAtomsInCluster];
 810
 811     GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");
 812
 813     *flags = 0;
 814
 815     int subc = 0;
 816     for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
 817     {
 818         /* Make lists for this (sub-)cell on atoms with and without LJ */
 819         int      n1         = 0;
 820         int      n2         = 0;
 821         gmx_bool haveQ      = FALSE;
 822         int      a_lj_max   = -1;
 823         for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
 824         {
 825             haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 826
 827             if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 828             {
 829                 sort1[n1++] = order[a];
 830                 a_lj_max    = a;
 831             }
 832             else
 833             {
 834                 sort2[n2++] = order[a];
 835             }
 836         }
 837
 838         /* If we don't have atoms with LJ, there's nothing to sort */
 839         if (n1 > 0)
 840         {
 841             *flags |= NBNXN_CI_DO_LJ(subc);
 842
 843             if (2*n1 <= numAtomsInCluster)
 844             {
 845                 /* Only sort when strictly necessary. Ordering particles
 846                  * Ordering particles can lead to less accurate summation
 847                  * due to rounding, both for LJ and Coulomb interactions.
 848                  */
 849                 if (2*(a_lj_max - s) >= numAtomsInCluster)
 850                 {
 851                     for (int i = 0; i < n1; i++)
 852                     {
 853                         order[atomStart + i]      = sort1[i];
 854                     }
 855                     for (int j = 0; j < n2; j++)
 856                     {
 857                         order[atomStart + n1 + j] = sort2[j];
 858                     }
 859                 }
 860
 861                 *flags |= NBNXN_CI_HALF_LJ(subc);
 862             }
 863         }
 864         if (haveQ)
 865         {
 866             *flags |= NBNXN_CI_DO_COUL(subc);
 867         }
 868         subc++;
 869     }
 870 }
 871
 872 /*! \brief Fill a pair search cell with atoms.
 873  *
 874  * Potentially sorts atoms and sets the interaction flags.
 875  */
 876 void Grid::fillCell(GridSetData                    *gridSetData,
 877                     nbnxn_atomdata_t               *nbat,
 878                     int                             atomStart,
 879                     int                             atomEnd,
 880                     const int                      *atinfo,
 881                     gmx::ArrayRef<const gmx::RVec>  x,
 882                     BoundingBox gmx_unused         *bb_work_aligned)
 883 {
 884     const int                 numAtoms = atomEnd - atomStart;
 885
 886     const gmx::ArrayRef<int> &cells       = gridSetData->cells;
 887     const gmx::ArrayRef<int> &atomIndices = gridSetData->atomIndices;
 888
 889     if (geometry_.isSimple)
 890     {
 891         /* Note that non-local grids are already sorted.
 892          * Then sort_cluster_on_flag will only set the flags and the sorting
 893          * will not affect the atom order.
 894          */
 895         sort_cluster_on_flag(geometry_.numAtomsICluster, atomStart, atomEnd, atinfo, atomIndices,
 896                              flags_.data() + atomToCluster(atomStart) - cellOffset_);
 897     }
 898
 899     if (haveFep_)
 900     {
 901         /* Set the fep flag for perturbed atoms in this (sub-)cell */
 902
 903         /* The grid-local cluster/(sub-)cell index */
 904         int cell = atomToCluster(atomStart) - cellOffset_*(geometry_.isSimple ? 1 : c_gpuNumClusterPerCell);
 905         fep_[cell] = 0;
 906         for (int at = atomStart; at < atomEnd; at++)
 907         {
 908             if (atomIndices[at] >= 0 && GET_CGINFO_FEP(atinfo[atomIndices[at]]))
 909             {
 910                 fep_[cell] |= (1 << (at - atomStart));
 911             }
 912         }
 913     }
 914
 915     /* Now we have sorted the atoms, set the cell indices */
 916     for (int at = atomStart; at < atomEnd; at++)
 917     {
 918         cells[atomIndices[at]] = at;
 919     }
 920
 921     copy_rvec_to_nbat_real(atomIndices.data() + atomStart, numAtoms, geometry_.numAtomsICluster,
 922                            as_rvec_array(x.data()),
 923                            nbat->XFormat, nbat->x().data(), atomStart);
 924
 925     if (nbat->XFormat == nbatX4)
 926     {
 927         /* Store the bounding boxes as xyz.xyz. */
 928         size_t       offset = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsICluster);
 929         BoundingBox *bb_ptr = bb_.data() + offset;
 930
 931 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
 932         if (2*geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 933         {
 934             calc_bounding_box_x_x4_halves(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
 935                                           bbj_.data() + offset*2);
 936         }
 937         else
 938 #endif
 939         {
 940             calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
 941         }
 942     }
 943     else if (nbat->XFormat == nbatX8)
 944     {
 945         /* Store the bounding boxes as xyz.xyz. */
 946         size_t       offset = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsICluster);
 947         BoundingBox *bb_ptr = bb_.data() + offset;
 948
 949         calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
 950     }
 951 #if NBNXN_BBXXXX
 952     else if (!geometry_.isSimple)
 953     {
 954         /* Store the bounding boxes in a format convenient
 955          * for SIMD4 calculations: xxxxyyyyzzzz...
 956          */
 957         const int clusterIndex    = ((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> geometry_.numAtomsICluster2Log);
 958         float    *pbb_ptr         =
 959             pbb_.data() +
 960             packedBoundingBoxesIndex(clusterIndex) +
 961             (clusterIndex & (c_packedBoundingBoxesDimSize - 1));
 962
 963 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 964         if (nbat->XFormat == nbatXYZQ)
 965         {
 966             GMX_ASSERT(bb_work_aligned != nullptr, "Must have valid aligned work structure");
 967             calc_bounding_box_xxxx_simd4(numAtoms, nbat->x().data() + atomStart*nbat->xstride,
 968                                          bb_work_aligned, pbb_ptr);
 969         }
 970         else
 971 #endif
 972         {
 973             calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
 974                                    pbb_ptr);
 975         }
 976         if (gmx_debug_at)
 977         {
 978             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 979                     atomToCluster(atomStart),
 980                     pbb_ptr[0*c_packedBoundingBoxesDimSize], pbb_ptr[3*c_packedBoundingBoxesDimSize],
 981                     pbb_ptr[1*c_packedBoundingBoxesDimSize], pbb_ptr[4*c_packedBoundingBoxesDimSize],
 982                     pbb_ptr[2*c_packedBoundingBoxesDimSize], pbb_ptr[5*c_packedBoundingBoxesDimSize]);
 983         }
 984     }
 985 #endif
 986     else
 987     {
 988         /* Store the bounding boxes as xyz.xyz. */
 989         BoundingBox *bb_ptr = bb_.data() + atomToCluster(atomStart - cellOffset_*geometry_.numAtomsPerCell);
 990
 991         calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
 992                           bb_ptr);
 993
 994         if (gmx_debug_at)
 995         {
 996             int bbo = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsPerCell);
 997             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 998                     atomToCluster(atomStart),
 999                     bb_[bbo].lower.x,
1000                     bb_[bbo].lower.y,
1001                     bb_[bbo].lower.z,
1002                     bb_[bbo].upper.x,
1003                     bb_[bbo].upper.y,
1004                     bb_[bbo].upper.z);
1005         }
1006     }
1007 }
1008
1009 void Grid::sortColumnsCpuGeometry(GridSetData *gridSetData,
1010                                   int dd_zone,
1011                                   int atomStart, int atomEnd,
1012                                   const int *atinfo,
1013                                   gmx::ArrayRef<const gmx::RVec> x,
1014                                   nbnxn_atomdata_t *nbat,
1015                                   int cxy_start, int cxy_end,
1016                                   gmx::ArrayRef<int> sort_work)
1017 {
1018     if (debug)
1019     {
1020         fprintf(debug, "cell_offset %d sorting columns %d - %d, atoms %d - %d\n",
1021                 cellOffset_, cxy_start, cxy_end, atomStart, atomEnd);
1022     }
1023
1024     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1025
1026     const int  numAtomsPerCell = geometry_.numAtomsPerCell;
1027
1028     /* Sort the atoms within each x,y column in 3 dimensions */
1029     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1030     {
1031         const int numAtoms   = numAtomsInColumn(cxy);
1032         const int numCellsZ  = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1033         const int atomOffset = firstAtomInColumn(cxy);
1034
1035         /* Sort the atoms within each x,y column on z coordinate */
1036         sort_atoms(ZZ, FALSE, dd_zone,
1037                    relevantAtomsAreWithinGridBounds,
1038                    gridSetData->atomIndices.data() + atomOffset, numAtoms, x,
1039                    dimensions_.lowerCorner[ZZ],
1040                    1.0/dimensions_.gridSize[ZZ], numCellsZ*numAtomsPerCell,
1041                    sort_work);
1042
1043         /* Fill the ncz cells in this column */
1044         const int firstCell  = firstCellInColumn(cxy);
1045         int       cellFilled = firstCell;
1046         for (int cellZ = 0; cellZ < numCellsZ; cellZ++)
1047         {
1048             const int cell           = firstCell + cellZ;
1049
1050             const int atomOffsetCell = atomOffset + cellZ*numAtomsPerCell;
1051             const int numAtomsCell   = std::min(numAtomsPerCell, numAtoms - (atomOffsetCell - atomOffset));
1052
1053             fillCell(gridSetData, nbat,
1054                      atomOffsetCell, atomOffsetCell + numAtomsCell, atinfo, x,
1055                      nullptr);
1056
1057             /* This copy to bbcz is not really necessary.
1058              * But it allows to use the same grid search code
1059              * for the simple and supersub cell setups.
1060              */
1061             if (numAtomsCell > 0)
1062             {
1063                 cellFilled = cell;
1064             }
1065             bbcz_[cell].lower = bb_[cellFilled].lower.z;
1066             bbcz_[cell].upper = bb_[cellFilled].upper.z;
1067         }
1068
1069         /* Set the unused atom indices to -1 */
1070         for (int ind = numAtoms; ind < numCellsZ*numAtomsPerCell; ind++)
1071         {
1072             gridSetData->atomIndices[atomOffset + ind] = -1;
1073         }
1074     }
1075 }
1076
1077 /* Spatially sort the atoms within one grid column */
1078 void Grid::sortColumnsGpuGeometry(GridSetData *gridSetData,
1079                                   int dd_zone,
1080                                   int atomStart, int atomEnd,
1081                                   const int *atinfo,
1082                                   gmx::ArrayRef<const gmx::RVec> x,
1083                                   nbnxn_atomdata_t *nbat,
1084                                   int cxy_start, int cxy_end,
1085                                   gmx::ArrayRef<int> sort_work)
1086 {
1087     BoundingBox  bb_work_array[2];
1088     BoundingBox *bb_work_aligned = reinterpret_cast<BoundingBox *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1089
1090     if (debug)
1091     {
1092         fprintf(debug, "cell_offset %d sorting columns %d - %d, atoms %d - %d\n",
1093                 cellOffset_, cxy_start, cxy_end, atomStart, atomEnd);
1094     }
1095
1096     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1097
1098     const int  numAtomsPerCell = geometry_.numAtomsPerCell;
1099
1100     const int  subdiv_x = geometry_.numAtomsICluster;
1101     const int  subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1102     const int  subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
1103
1104     /* Extract the atom index array that will be filled here */
1105     const gmx::ArrayRef<int> &atomIndices = gridSetData->atomIndices;
1106
1107     /* Sort the atoms within each x,y column in 3 dimensions.
1108      * Loop over all columns on the x/y grid.
1109      */
1110     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1111     {
1112         const int gridX            = cxy/dimensions_.numCells[YY];
1113         const int gridY            = cxy - gridX*dimensions_.numCells[YY];
1114
1115         const int numAtomsInColumn = cxy_na_[cxy];
1116         const int numCellsInColumn = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1117         const int atomOffset       = firstAtomInColumn(cxy);
1118
1119         /* Sort the atoms within each x,y column on z coordinate */
1120         sort_atoms(ZZ, FALSE, dd_zone,
1121                    relevantAtomsAreWithinGridBounds,
1122                    atomIndices.data() + atomOffset, numAtomsInColumn, x,
1123                    dimensions_.lowerCorner[ZZ],
1124                    1.0/dimensions_.gridSize[ZZ], numCellsInColumn*numAtomsPerCell,
1125                    sort_work);
1126
1127         /* This loop goes over the cells and clusters along z at once */
1128         for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
1129         {
1130             const int atomOffsetZ = atomOffset + sub_z*subdiv_z;
1131             const int numAtomsZ   = std::min(subdiv_z, numAtomsInColumn - (atomOffsetZ - atomOffset));
1132             int       cz          = -1;
1133             /* We have already sorted on z */
1134
1135             if (sub_z % c_gpuNumClusterPerCellZ == 0)
1136             {
1137                 cz = sub_z/c_gpuNumClusterPerCellZ;
1138                 const int cell = cxy_ind_[cxy] + cz;
1139
1140                 /* The number of atoms in this cell/super-cluster */
1141                 const int numAtoms = std::min(numAtomsPerCell, numAtomsInColumn - (atomOffsetZ - atomOffset));
1142
1143                 numClusters_[cell] = std::min(c_gpuNumClusterPerCell,
1144                                               (numAtoms + geometry_.numAtomsICluster - 1)/ geometry_.numAtomsICluster);
1145
1146                 /* Store the z-boundaries of the bounding box of the cell */
1147                 bbcz_[cell].lower = x[atomIndices[atomOffsetZ]][ZZ];
1148                 bbcz_[cell].upper = x[atomIndices[atomOffsetZ + numAtoms - 1]][ZZ];
1149             }
1150
1151             if (c_gpuNumClusterPerCellY > 1)
1152             {
1153                 /* Sort the atoms along y */
1154                 sort_atoms(YY, (sub_z & 1) != 0, dd_zone,
1155                            relevantAtomsAreWithinGridBounds,
1156                            atomIndices.data() + atomOffsetZ, numAtomsZ, x,
1157                            dimensions_.lowerCorner[YY] + gridY*dimensions_.cellSize[YY],
1158                            dimensions_.invCellSize[YY], subdiv_z,
1159                            sort_work);
1160             }
1161
1162             for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1163             {
1164                 const int atomOffsetY = atomOffsetZ + sub_y*subdiv_y;
1165                 const int numAtomsY   = std::min(subdiv_y, numAtomsInColumn - (atomOffsetY - atomOffset));
1166
1167                 if (c_gpuNumClusterPerCellX > 1)
1168                 {
1169                     /* Sort the atoms along x */
1170                     sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1) != 0, dd_zone,
1171                                relevantAtomsAreWithinGridBounds,
1172                                atomIndices.data() + atomOffsetY, numAtomsY, x,
1173                                dimensions_.lowerCorner[XX] + gridX*dimensions_.cellSize[XX],
1174                                dimensions_.invCellSize[XX], subdiv_y,
1175                                sort_work);
1176                 }
1177
1178                 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1179                 {
1180                     const int atomOffsetX = atomOffsetY + sub_x*subdiv_x;
1181                     const int numAtomsX   = std::min(subdiv_x, numAtomsInColumn - (atomOffsetX - atomOffset));
1182
1183                     fillCell(gridSetData, nbat,
1184                              atomOffsetX, atomOffsetX + numAtomsX, atinfo, x,
1185                              bb_work_aligned);
1186                 }
1187             }
1188         }
1189
1190         /* Set the unused atom indices to -1 */
1191         for (int ind = numAtomsInColumn; ind < numCellsInColumn*numAtomsPerCell; ind++)
1192         {
1193             atomIndices[atomOffset + ind] = -1;
1194         }
1195     }
1196 }
1197
1198 /*! \brief Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
1199 static void setCellAndAtomCount(gmx::ArrayRef<int>  cell,
1200                                 int                 cellIndex,
1201                                 gmx::ArrayRef<int>  cxy_na,
1202                                 int                 atomIndex)
1203 {
1204     cell[atomIndex]    = cellIndex;
1205     cxy_na[cellIndex] += 1;
1206 }
1207
1208 void Grid::calcColumnIndices(const Grid::Dimensions         &gridDims,
1209                              const gmx::UpdateGroupsCog     *updateGroupsCog,
1210                              const int                       atomStart,
1211                              const int                       atomEnd,
1212                              gmx::ArrayRef<const gmx::RVec>  x,
1213                              const int                       dd_zone,
1214                              const int                      *move,
1215                              const int                       thread,
1216                              const int                       nthread,
1217                              gmx::ArrayRef<int>              cell,
1218                              gmx::ArrayRef<int>              cxy_na)
1219 {
1220     const int numColumns = gridDims.numCells[XX]*gridDims.numCells[YY];
1221
1222     /* We add one extra cell for particles which moved during DD */
1223     for (int i = 0; i < numColumns; i++)
1224     {
1225         cxy_na[i] = 0;
1226     }
1227
1228     int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
1229     int taskAtomEnd   = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
1230
1231     if (dd_zone == 0)
1232     {
1233         /* Home zone */
1234         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1235         {
1236             if (move == nullptr || move[i] >= 0)
1237             {
1238
1239                 const gmx::RVec &coord = (updateGroupsCog ? updateGroupsCog->cogForAtom(i) : x[i]);
1240
1241                 /* We need to be careful with rounding,
1242                  * particles might be a few bits outside the local zone.
1243                  * The int cast takes care of the lower bound,
1244                  * we will explicitly take care of the upper bound.
1245                  */
1246                 int cx = static_cast<int>((coord[XX] - gridDims.lowerCorner[XX])*gridDims.invCellSize[XX]);
1247                 int cy = static_cast<int>((coord[YY] - gridDims.lowerCorner[YY])*gridDims.invCellSize[YY]);
1248
1249 #ifndef NDEBUG
1250                 if (cx < 0 || cx > gridDims.numCells[XX] ||
1251                     cy < 0 || cy > gridDims.numCells[YY])
1252                 {
1253                     gmx_fatal(FARGS,
1254                               "grid cell cx %d cy %d out of range (max %d %d)\n"
1255                               "atom %f %f %f, grid->c0 %f %f",
1256                               cx, cy,
1257                               gridDims.numCells[XX],
1258                               gridDims.numCells[YY],
1259                               x[i][XX], x[i][YY], x[i][ZZ],
1260                               gridDims.lowerCorner[XX],
1261                               gridDims.lowerCorner[YY]);
1262                 }
1263 #endif
1264                 /* Take care of potential rouding issues */
1265                 cx = std::min(cx, gridDims.numCells[XX] - 1);
1266                 cy = std::min(cy, gridDims.numCells[YY] - 1);
1267
1268                 /* For the moment cell will contain only the, grid local,
1269                  * x and y indices, not z.
1270                  */
1271                 setCellAndAtomCount(cell, cx*gridDims.numCells[YY] + cy, cxy_na, i);
1272             }
1273             else
1274             {
1275                 /* Put this moved particle after the end of the grid,
1276                  * so we can process it later without using conditionals.
1277                  */
1278                 setCellAndAtomCount(cell, numColumns, cxy_na, i);
1279             }
1280         }
1281     }
1282     else
1283     {
1284         /* Non-home zone */
1285         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1286         {
1287             int cx = static_cast<int>((x[i][XX] - gridDims.lowerCorner[XX])*gridDims.invCellSize[XX]);
1288             int cy = static_cast<int>((x[i][YY] - gridDims.lowerCorner[YY])*gridDims.invCellSize[YY]);
1289
1290             /* For non-home zones there could be particles outside
1291              * the non-bonded cut-off range, which have been communicated
1292              * for bonded interactions only. For the result it doesn't
1293              * matter where these end up on the grid. For performance
1294              * we put them in an extra row at the border.
1295              */
1296             cx = std::max(cx, 0);
1297             cx = std::min(cx, gridDims.numCells[XX] - 1);
1298             cy = std::max(cy, 0);
1299             cy = std::min(cy, gridDims.numCells[YY] - 1);
1300
1301             /* For the moment cell will contain only the, grid local,
1302              * x and y indices, not z.
1303              */
1304             setCellAndAtomCount(cell, cx*gridDims.numCells[YY] + cy, cxy_na, i);
1305         }
1306     }
1307 }
1308
1309 /*! \brief Resizes grid and atom data which depend on the number of cells */
1310 static void resizeForNumberOfCells(const int           numNbnxnAtoms,
1311                                    const int           numAtomsMoved,
1312                                    GridSetData        *gridSetData,
1313                                    nbnxn_atomdata_t   *nbat)
1314 {
1315     /* Note: gridSetData->cellIndices was already resized before */
1316
1317     /* To avoid conditionals we store the moved particles at the end of a,
1318      * make sure we have enough space.
1319      */
1320     gridSetData->atomIndices.resize(numNbnxnAtoms + numAtomsMoved);
1321
1322     /* Make space in nbat for storing the atom coordinates */
1323     nbat->resizeCoordinateBuffer(numNbnxnAtoms);
1324 }
1325
1326 void Grid::setCellIndices(int                             ddZone,
1327                           int                             cellOffset,
1328                           GridSetData                    *gridSetData,
1329                           gmx::ArrayRef<GridWork>         gridWork,
1330                           int                             atomStart,
1331                           int                             atomEnd,
1332                           const int                      *atinfo,
1333                           gmx::ArrayRef<const gmx::RVec>  x,
1334                           const int                       numAtomsMoved,
1335                           nbnxn_atomdata_t               *nbat)
1336 {
1337     cellOffset_   = cellOffset;
1338
1339     srcAtomBegin_ = atomStart;
1340     srcAtomEnd_   = atomEnd;
1341
1342     const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1343
1344     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1345
1346     /* Make the cell index as a function of x and y */
1347     int ncz_max      = 0;
1348     int ncz          = 0;
1349     cxy_ind_[0]      = 0;
1350     for (int i = 0; i < numColumns() + 1; i++)
1351     {
1352         /* We set ncz_max at the beginning of the loop iso at the end
1353          * to skip i=grid->ncx*grid->numCells[YY] which are moved particles
1354          * that do not need to be ordered on the grid.
1355          */
1356         if (ncz > ncz_max)
1357         {
1358             ncz_max = ncz;
1359         }
1360         int cxy_na_i = gridWork[0].numAtomsPerColumn[i];
1361         for (int thread = 1; thread < nthread; thread++)
1362         {
1363             cxy_na_i += gridWork[thread].numAtomsPerColumn[i];
1364         }
1365         ncz = (cxy_na_i + numAtomsPerCell - 1)/numAtomsPerCell;
1366         if (nbat->XFormat == nbatX8)
1367         {
1368             /* Make the number of cell a multiple of 2 */
1369             ncz = (ncz + 1) & ~1;
1370         }
1371         cxy_ind_[i+1] = cxy_ind_[i] + ncz;
1372         /* Clear cxy_na_, so we can reuse the array below */
1373         cxy_na_[i] = 0;
1374     }
1375     numCellsTotal_     = cxy_ind_[numColumns()] - cxy_ind_[0];
1376     numCellsColumnMax_ = ncz_max;
1377
1378     /* Resize grid and atom data which depend on the number of cells */
1379     resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, gridSetData, nbat);
1380
1381     if (debug)
1382     {
1383         fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1384                 numAtomsPerCell, geometry_.numAtomsICluster, numCellsTotal_,
1385                 dimensions_.numCells[XX], dimensions_.numCells[YY],
1386                 numCellsTotal_/(static_cast<double>(numColumns())),
1387                 ncz_max);
1388         if (gmx_debug_at)
1389         {
1390             int i = 0;
1391             for (int cy = 0; cy < dimensions_.numCells[YY]; cy++)
1392             {
1393                 for (int cx = 0; cx < dimensions_.numCells[XX]; cx++)
1394                 {
1395                     fprintf(debug, " %2d", cxy_ind_[i + 1] - cxy_ind_[i]);
1396                     i++;
1397                 }
1398                 fprintf(debug, "\n");
1399             }
1400         }
1401     }
1402
1403     /* Make sure the work array for sorting is large enough */
1404     const int worstCaseSortBufferSize = ncz_max*numAtomsPerCell*c_sortGridMaxSizeFactor;
1405     if (worstCaseSortBufferSize > gmx::index(gridWork[0].sortBuffer.size()))
1406     {
1407         for (GridWork &work : gridWork)
1408         {
1409             /* Elements not in use should be -1 */
1410             work.sortBuffer.resize(worstCaseSortBufferSize, -1);
1411         }
1412     }
1413
1414     /* Now we know the dimensions we can fill the grid.
1415      * This is the first, unsorted fill. We sort the columns after this.
1416      */
1417     gmx::ArrayRef<int> cells       = gridSetData->cells;
1418     gmx::ArrayRef<int> atomIndices = gridSetData->atomIndices;
1419     for (int i = atomStart; i < atomEnd; i++)
1420     {
1421         /* At this point nbs->cell contains the local grid x,y indices */
1422         const int cxy = cells[i];
1423         atomIndices[firstAtomInColumn(cxy) + cxy_na_[cxy]++] = i;
1424     }
1425
1426     if (ddZone == 0)
1427     {
1428         /* Set the cell indices for the moved particles */
1429         int n0 = numCellsTotal_*numAtomsPerCell;
1430         int n1 = numCellsTotal_*numAtomsPerCell + cxy_na_[numColumns()];
1431         for (int i = n0; i < n1; i++)
1432         {
1433             cells[atomIndices[i]] = i;
1434         }
1435     }
1436
1437     /* Sort the super-cell columns along z into the sub-cells. */
1438 #pragma omp parallel for num_threads(nthread) schedule(static)
1439     for (int thread = 0; thread < nthread; thread++)
1440     {
1441         try
1442         {
1443             int columnStart = ((thread + 0)*numColumns())/nthread;
1444             int columnEnd   = ((thread + 1)*numColumns())/nthread;
1445             if (geometry_.isSimple)
1446             {
1447                 sortColumnsCpuGeometry(gridSetData, ddZone,
1448                                        atomStart, atomEnd, atinfo, x, nbat,
1449                                        columnStart, columnEnd,
1450                                        gridWork[thread].sortBuffer);
1451             }
1452             else
1453             {
1454                 sortColumnsGpuGeometry(gridSetData, ddZone,
1455                                        atomStart, atomEnd, atinfo, x, nbat,
1456                                        columnStart, columnEnd,
1457                                        gridWork[thread].sortBuffer);
1458             }
1459         }
1460         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1461     }
1462
1463     if (geometry_.isSimple && nbat->XFormat == nbatX8)
1464     {
1465         combine_bounding_box_pairs(*this, bb_, bbj_);
1466     }
1467
1468     if (!geometry_.isSimple)
1469     {
1470         numClustersTotal_ = 0;
1471         for (int i = 0; i < numCellsTotal_; i++)
1472         {
1473             numClustersTotal_ += numClusters_[i];
1474         }
1475     }
1476
1477     if (debug)
1478     {
1479         if (geometry_.isSimple)
1480         {
1481             print_bbsizes_simple(debug, *this);
1482         }
1483         else
1484         {
1485             fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1486                     numClustersTotal_,
1487                     (atomEnd - atomStart)/static_cast<double>(numClustersTotal_));
1488
1489             print_bbsizes_supersub(debug, *this);
1490         }
1491     }
1492 }
1493
1494 } // namespace Nbnxm