/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include <climits>
#include <cmath>

#include <algorithm>

#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/updategroupscog.h"
#include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/nbnxm_geometry.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/vector_operations.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/smalloc.h"

struct gmx_domdec_zones_t;
/* Returns the atom density of a rectangular grid zone */
static real grid_atom_density(int        numAtoms,
                              const rvec lowerCorner,
                              const rvec upperCorner)
{
    rvec size;

    if (numAtoms == 0)
    {
        /* To avoid zero density we use a minimum of 1 atom */
        numAtoms = 1;
    }

    rvec_sub(upperCorner, lowerCorner, size);

    return numAtoms/(size[XX]*size[YY]*size[ZZ]);
}
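/* Worked example for the density above (illustrative, assumed numbers):
 * 3000 atoms in a zone of 5 nm x 5 nm x 4 nm give a density of
 * 3000/100 = 30 atoms nm^-3.
 */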
/* Sets the grid dimensions and cell sizes in the x/y plane */
static void set_grid_size_xy(const nbnxn_search *nbs,
                             nbnxn_grid_t       *grid,
                             int                 ddZone,
                             int                 numAtoms,
                             const rvec          lowerCorner,
                             const rvec          upperCorner,
                             real                atomDensity)
{
    rvec size;
    real tlen, tlen_x, tlen_y;

    rvec_sub(upperCorner, lowerCorner, size);

    if (numAtoms > grid->na_sc)
    {
        GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
        /* target cell length */
        if (grid->bSimple)
        {
            /* To minimize the zero interactions, we should make
             * the largest of the i/j cells cubic.
             */
            int numAtomsInCell = std::max(grid->na_c, grid->na_cj);

            /* Approximately cubic cells */
            tlen   = std::cbrt(numAtomsInCell/atomDensity);
            tlen_x = tlen;
            tlen_y = tlen;
        }
        else
        {
            /* Approximately cubic sub cells */
            tlen   = std::cbrt(grid->na_c/atomDensity);
            tlen_x = tlen*c_gpuNumClusterPerCellX;
            tlen_y = tlen*c_gpuNumClusterPerCellY;
        }
        /* We round ncx and ncy down, because we get fewer cell pairs
         * in the pair list when the fixed cell dimensions (x,y) are
         * larger than the variable one (z) than the other way around.
         */
        grid->numCells[XX] = std::max(1, static_cast<int>(size[XX]/tlen_x));
        grid->numCells[YY] = std::max(1, static_cast<int>(size[YY]/tlen_y));
    }
    else
    {
        grid->numCells[XX] = 1;
        grid->numCells[YY] = 1;
    }
    for (int d = 0; d < DIM - 1; d++)
    {
        grid->cellSize[d]    = size[d]/grid->numCells[d];
        grid->invCellSize[d] = 1/grid->cellSize[d];
    }
    if (ddZone > 0)
    {
        /* This is a non-home zone, add an extra row of cells
         * for particles communicated for bonded interactions.
         * These can be beyond the cut-off. It doesn't matter where
         * they end up on the grid, but for performance it's better
         * if they don't end up in cells that can be within cut-off range.
         */
        grid->numCells[XX]++;
        grid->numCells[YY]++;
    }
    /* We need one additional cell entry for particles moved by DD */
    int numCellsXY = grid->numCells[XX]*grid->numCells[YY];
    grid->cxy_na.resize(numCellsXY + 1);
    grid->cxy_ind.resize(numCellsXY + 2);

    for (nbnxn_search_work_t &work : nbs->work)
    {
        work.cxy_na.resize(numCellsXY + 1);
    }
    /* Worst case scenario of 1 atom in each last cell */
    int maxNumCells;
    if (grid->na_cj <= grid->na_c)
    {
        maxNumCells = numAtoms/grid->na_sc + numCellsXY;
    }
    else
    {
        maxNumCells = numAtoms/grid->na_sc + numCellsXY*grid->na_cj/grid->na_c;
    }
    grid->nsubc.resize(maxNumCells);
    grid->bbcz.resize(maxNumCells*NNBSBB_D);

    /* This resize also zeroes the contents, which avoids possible
     * floating-point exceptions in SIMD with the unused bb elements.
     */
    if (grid->bSimple)
    {
        grid->bb.resize(maxNumCells);
    }
    else
    {
#if NBNXN_BBXXXX
        grid->pbb.resize(maxNumCells*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX);
#else
        grid->bb.resize(maxNumCells*c_gpuNumClusterPerCell);
#endif
    }
    if (grid->bSimple)
    {
        if (grid->na_cj == grid->na_c)
        {
            grid->bbj = grid->bb;
        }
        else
        {
            grid->bbjStorage.resize(maxNumCells*grid->na_c/grid->na_cj);
            grid->bbj = grid->bbjStorage;
        }
    }
    grid->flags.resize(maxNumCells);
    if (nbs->bFEP)
    {
        grid->fep.resize(maxNumCells*grid->na_sc/grid->na_c);
    }

    copy_rvec(lowerCorner, grid->c0);
    copy_rvec(upperCorner, grid->c1);
    copy_rvec(size,        grid->size);
}
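/* Worked example for the sizing above (illustrative, assumed numbers):
 * with 4 atoms per cell (the larger of na_c and na_cj) and an atom density
 * of 100 nm^-3, the target cell length is cbrt(4/100) ~= 0.342 nm; a box
 * edge of 3.4 nm then gets numCells = max(1, (int)(3.4/0.342)) = 9 cells.
 */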
/* We need to sort particles in grid columns on z-coordinate.
 * As particles are very often distributed homogeneously, we use a sorting
 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 * by a factor, cast to an int and try to store in that hole. If the hole
 * is full, we move this or another particle. A second pass is needed to make
 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 * 4 is the optimal value for homogeneous particle distribution and allows
 * for an O(#particles) sort up to distributions where all particles are
 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 * as it can be expensive to detect inhomogeneous particle distributions.
 * SGSF is the maximum ratio of holes used, in the worst case all particles
 * end up in the last hole and we need #particles extra holes at the end.
 */
#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
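/* Minimal sketch (hypothetical helper, not used by the code below): the
 * hole index that sort_atoms() assigns to a coordinate. With
 * SORT_GRID_OVERSIZE = 4 there are on average 4 holes per expected
 * particle, which keeps collisions rare for homogeneous distributions.
 */
gmx_unused static inline int exampleHoleIndex(real coordinate, real h0, real invh, int n_per_h)
{
    /* Scale the inverse range height to the inverse hole height */
    real invHoleHeight = invh*n_per_h*SORT_GRID_OVERSIZE;

    /* The cast rounds toward zero, as in sort_atoms() */
    return static_cast<int>((coordinate - h0)*invHoleHeight);
}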
/* Sort particle index a on coordinates x along dim.
 * Backwards tells if we want decreasing instead of increasing coordinates.
 * h0 is the minimum of the coordinate range.
 * invh is 1/length of the sorting range.
 * n_per_h (>=n) is the expected average number of particles per 1/invh.
 * sort is the sorting work array.
 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 * or easier, allocate at least n*SGSF elements.
 */
static void sort_atoms(int dim, gmx_bool Backwards,
                       int gmx_unused dd_zone,
                       bool gmx_unused relevantAtomsAreWithinGridBounds,
                       int *a, int n,
                       gmx::ArrayRef<const gmx::RVec> x,
                       real h0, real invh, int n_per_h,
                       gmx::ArrayRef<int> sort)
{
    if (n <= 1)
    {
        /* Nothing to do */
        return;
    }
    GMX_ASSERT(n <= n_per_h, "We require n <= n_per_h");

    /* Transform the inverse range height into the inverse hole height */
    invh *= n_per_h*SORT_GRID_OVERSIZE;

    /* Set nsort to the maximum possible number of holes used.
     * In the worst case all n elements end up in the last bin.
     */
    int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
    /* Determine the index range used, so we can limit it for the second pass */
    int zi_min = INT_MAX;
    int zi_max = -1;
    /* Sort the particles using a simple index sort */
    for (int i = 0; i < n; i++)
    {
        /* The cast takes care of floating-point rounding effects below zero.
         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
         * times the box height out of the box.
         */
        int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);

#ifndef NDEBUG
        /* As we can have rounding effects, we use > instead of >= here */
        if (relevantAtomsAreWithinGridBounds &&
            (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE)))
        {
            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
                      n_per_h, SORT_GRID_OVERSIZE);
        }
#endif
        if (zi < 0)
        {
            zi = 0;
        }

        /* In a non-local domain, particles communicated for bonded interactions
         * can be far beyond the grid size, which is set by the non-bonded
         * cut-off distance. We sort such particles into the last cell.
         */
        if (zi > n_per_h*SORT_GRID_OVERSIZE)
        {
            zi = n_per_h*SORT_GRID_OVERSIZE;
        }
        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use,
         * in which case we find the first empty cell higher up.
         */
        if (sort[zi] < 0)
        {
            sort[zi] = a[i];
            zi_min   = std::min(zi_min, zi);
            zi_max   = std::max(zi_max, zi);
        }
        else
        {
            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to ensure
             * well-defined output order, independent of input order,
             * to ensure binary reproducibility after restarts.
             */
            while (sort[zi] >= 0 && (x[a[i]][dim] > x[sort[zi]][dim] ||
                                     (x[a[i]][dim] == x[sort[zi]][dim] &&
                                      a[i] > sort[zi])))
            {
                zi++;
            }
            if (sort[zi] >= 0)
            {
                /* Shift all elements by one slot until we find an empty slot */
                int cp  = sort[zi];
                int zim = zi + 1;
                while (sort[zim] >= 0)
                {
                    int tmp   = sort[zim];
                    sort[zim] = cp;
                    cp        = tmp;
                    zim++;
                }
                sort[zim] = cp;
                zi_max    = std::max(zi_max, zim);
            }
            sort[zi] = a[i];
            zi_max   = std::max(zi_max, zi);
        }
    }
    /* Second pass: copy the sorted atom indices back into a, in increasing
     * or decreasing order depending on Backwards, and reset the holes.
     */
    int cp = 0;
    if (!Backwards)
    {
        for (int zi = 0; zi < nsort; zi++)
        {
            if (sort[zi] >= 0)
            {
                a[cp++]  = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    else
    {
        for (int zi = zi_max; zi >= zi_min; zi--)
        {
            if (sort[zi] >= 0)
            {
                a[cp++]  = sort[zi];
                sort[zi] = -1;
            }
        }
    }
    if (cp != n)
    {
        gmx_incons("Lost particles while sorting");
    }
}
/* Round a (possibly double-precision) value down/up to float, conservatively,
 * so that bounding boxes built from the results always contain the value.
 */
#define R2F_D(x) ((float)((x) >= 0 ? ((1 - GMX_FLOAT_EPS)*(x)) : ((1 + GMX_FLOAT_EPS)*(x))))
#define R2F_U(x) ((float)((x) >= 0 ? ((1 + GMX_FLOAT_EPS)*(x)) : ((1 - GMX_FLOAT_EPS)*(x))))
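/* Minimal sketch (hypothetical, for illustration only): using the macros
 * above to build a float interval guaranteed to contain a real-precision
 * coordinate, e.g. when converting double-precision input to float boxes.
 */
gmx_unused static void exampleConservativeInterval(real x, float *lower, float *upper)
{
    *lower = R2F_D(x); /* never above x */
    *upper = R2F_U(x); /* never below x */
}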
/* Coordinate order x,y,z, bb order xyz0 */
static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
{
    int  i;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
{
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX*c_packX4];
    xh = x[XX*c_packX4];
    yl = x[YY*c_packX4];
    yh = x[YY*c_packX4];
    zl = x[ZZ*c_packX4];
    zh = x[ZZ*c_packX4];
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[j+XX*c_packX4]);
        xh = std::max(xh, x[j+XX*c_packX4]);
        yl = std::min(yl, x[j+YY*c_packX4]);
        yh = std::max(yh, x[j+YY*c_packX4]);
        zl = std::min(zl, x[j+ZZ*c_packX4]);
        zh = std::max(zh, x[j+ZZ*c_packX4]);
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
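/* Layout note with a worked example (illustrative): with c_packX4 = 4 the
 * coordinates above are stored packed as x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3
 * per group of 4 atoms, so the y-coordinate of atom j within a pack sits at
 * x[j + YY*c_packX4] and each min/max loop strides one dimension at a time.
 */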
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
{
    real xl, xh, yl, yh, zl, zh;

    xl = x[XX*c_packX8];
    xh = x[XX*c_packX8];
    yl = x[YY*c_packX8];
    yh = x[YY*c_packX8];
    zl = x[ZZ*c_packX8];
    zh = x[ZZ*c_packX8];
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[j+XX*c_packX8]);
        xh = std::max(xh, x[j+XX*c_packX8]);
        yl = std::min(yl, x[j+YY*c_packX8]);
        yh = std::max(yh, x[j+YY*c_packX8]);
        zl = std::min(zl, x[j+ZZ*c_packX8]);
        zh = std::max(zh, x[j+ZZ*c_packX8]);
    }
    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
}
/* Packed coordinates, bb order xyz0 */
gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
                                                     nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    calc_bounding_box_x_x4(std::min(na, 2), x, bbj);

    if (na > 2)
    {
        calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
    }
    else
    {
        /* Set the "empty" bounding box to the same as the first one,
         * so we don't need to treat special cases in the rest of the code.
         */
#if NBNXN_SEARCH_BB_SIMD4
        store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
        store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
#else
        bbj[1] = bbj[0];
#endif
    }

#if NBNXN_SEARCH_BB_SIMD4
    store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
    store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
#else
    for (int i = 0; i < NNBSBB_C; i++)
    {
        bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
        bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
    }
#endif
}
#if NBNXN_SEARCH_BB_SIMD4

/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
{
    int  i;
    real xl, xh, yl, yh, zl, zh;

    i  = 0;
    xl = x[i+XX];
    xh = x[i+XX];
    yl = x[i+YY];
    yh = x[i+YY];
    zl = x[i+ZZ];
    zh = x[i+ZZ];
    i += stride;
    for (int j = 1; j < na; j++)
    {
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);
        i += stride;
    }
    /* Note: possible double to float conversion here */
    bb[0*STRIDE_PBB] = R2F_D(xl);
    bb[1*STRIDE_PBB] = R2F_D(yl);
    bb[2*STRIDE_PBB] = R2F_D(zl);
    bb[3*STRIDE_PBB] = R2F_U(xh);
    bb[4*STRIDE_PBB] = R2F_U(yh);
    bb[5*STRIDE_PBB] = R2F_U(zh);
}

#endif /* NBNXN_SEARCH_BB_SIMD4 */
#if NBNXN_SEARCH_SIMD4_FLOAT_X_BB

/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    Simd4Float bb_0_S, bb_1_S;
    Simd4Float x_S;

    bb_0_S = load4(x);
    bb_1_S = bb_0_S;

    for (int i = 1; i < na; i++)
    {
        x_S    = load4(x+i*NNBSBB_C);
        bb_0_S = min(bb_0_S, x_S);
        bb_1_S = max(bb_1_S, x_S);
    }

    store4(&bb->lower[0], bb_0_S);
    store4(&bb->upper[0], bb_1_S);
}

/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_simd4(int na, const float *x,
                                         nbnxn_bb_t *bb_work_aligned,
                                         real *bb)
{
    calc_bounding_box_simd4(na, x, bb_work_aligned);

    bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
    bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
    bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
    bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
    bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
    bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
}

#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t                    *grid,
                                       gmx::ArrayRef<const nbnxn_bb_t>  bb)
{
    // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
    using namespace gmx;

    for (int i = 0; i < grid->numCells[XX]*grid->numCells[YY]; i++)
    {
        /* Starting bb in a column is expected to be 2-aligned */
        int sc2 = grid->cxy_ind[i]>>1;
        /* For odd numbers skip the last bb here */
        int nc2 = (grid->cxy_na[i]+3)>>(2+1);
        int c2;
        for (c2 = sc2; c2 < sc2+nc2; c2++)
        {
#if NBNXN_SEARCH_BB_SIMD4
            Simd4Float min_S, max_S;

            min_S = min(load4(&bb[c2*2+0].lower[0]),
                        load4(&bb[c2*2+1].lower[0]));
            max_S = max(load4(&bb[c2*2+0].upper[0]),
                        load4(&bb[c2*2+1].upper[0]));
            store4(&grid->bbj[c2].lower[0], min_S);
            store4(&grid->bbj[c2].upper[0], max_S);
#else
            for (int j = 0; j < NNBSBB_C; j++)
            {
                grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
                                                  bb[c2*2+1].lower[j]);
                grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
                                                  bb[c2*2+1].upper[j]);
            }
#endif
        }
        if (((grid->cxy_na[i]+3)>>2) & 1)
        {
            /* The bb count in this column is odd: duplicate the last bb */
            for (int j = 0; j < NNBSBB_C; j++)
            {
                grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
                grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
            }
        }
    }
}
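/* Worked example for the pairing above (illustrative, assumed numbers): a
 * column starting at cell index cxy_ind[i] = 6 with cxy_na[i] = 20 atoms
 * and 4 atoms per box gives sc2 = 6>>1 = 3 and nc2 = (20+3)>>3 = 2, so
 * bbj[3] and bbj[4] are each built from two consecutive i-cluster boxes;
 * (20+3)>>2 = 5 is odd, so the fifth box is duplicated into bbj[5].
 */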
/* Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE               *fp,
                                 const nbnxn_grid_t *grid)
{
    dvec ba;

    clear_dvec(ba);
    for (int c = 0; c < grid->nc; c++)
    {
        for (int d = 0; d < DIM; d++)
        {
            ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
        }
    }
    dsvmul(1.0/grid->nc, ba, ba);

    real avgCellSizeZ = (grid->atom_density > 0 ? grid->na_c/(grid->atom_density*grid->cellSize[XX]*grid->cellSize[YY]) : 0.0);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->cellSize[XX],
            grid->cellSize[YY],
            avgCellSizeZ,
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]*grid->invCellSize[XX],
            ba[YY]*grid->invCellSize[YY],
            grid->atom_density > 0 ? ba[ZZ]/avgCellSizeZ : 0.0);
}
/* Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE               *fp,
                                   const nbnxn_grid_t *grid)
{
    int  ns;
    dvec ba;

    clear_dvec(ba);
    ns = 0;
    for (int c = 0; c < grid->nc; c++)
    {
#if NBNXN_BBXXXX
        for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
        {
            int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
            for (int i = 0; i < STRIDE_PBB; i++)
            {
                for (int d = 0; d < DIM; d++)
                {
                    ba[d] +=
                        grid->pbb[cs_w*NNBSBB_XXXX + (DIM+d)*STRIDE_PBB + i] -
                        grid->pbb[cs_w*NNBSBB_XXXX +      d *STRIDE_PBB + i];
                }
            }
        }
#else
        for (int s = 0; s < grid->nsubc[c]; s++)
        {
            int cs = c*c_gpuNumClusterPerCell + s;
            for (int d = 0; d < DIM; d++)
            {
                ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
            }
        }
#endif
        ns += grid->nsubc[c];
    }
    dsvmul(1.0/ns, ba, ba);

    real avgClusterSizeZ = (grid->atom_density > 0 ? grid->na_sc/(grid->atom_density*grid->cellSize[XX]*grid->cellSize[YY]*c_gpuNumClusterPerCellZ) : 0.0);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->cellSize[XX]/c_gpuNumClusterPerCellX,
            grid->cellSize[YY]/c_gpuNumClusterPerCellY,
            avgClusterSizeZ,
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]*c_gpuNumClusterPerCellX*grid->invCellSize[XX],
            ba[YY]*c_gpuNumClusterPerCellY*grid->invCellSize[YY],
            grid->atom_density > 0 ? ba[ZZ]/avgClusterSizeZ : 0.0);
}
/* Set non-bonded interaction flags for the current cluster.
 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 */
static void sort_cluster_on_flag(int                 numAtomsInCluster,
                                 int                 atomStart,
                                 int                 atomEnd,
                                 const int          *atinfo,
                                 gmx::ArrayRef<int>  order,
                                 int                *flags)
{
    constexpr int c_maxNumAtomsInCluster = 8;
    int           sort1[c_maxNumAtomsInCluster];
    int           sort2[c_maxNumAtomsInCluster];

    GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");

    *flags = 0;

    int subc = 0;
    for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
    {
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        int      n1       = 0;
        int      n2       = 0;
        gmx_bool haveQ    = FALSE;
        int      a_lj_max = -1;
        for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
        {
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
            {
                sort1[n1++] = order[a];
                a_lj_max    = a;
            }
            else
            {
                sort2[n2++] = order[a];
            }
        }

        /* If we don't have atoms with LJ, there's nothing to sort */
        if (n1 > 0)
        {
            *flags |= NBNXN_CI_DO_LJ(subc);

            if (2*n1 <= numAtomsInCluster)
            {
                /* Only sort when strictly necessary. Ordering particles
                 * can lead to less accurate summation due to rounding,
                 * both for LJ and Coulomb interactions.
                 */
                if (2*(a_lj_max - s) >= numAtomsInCluster)
                {
                    for (int i = 0; i < n1; i++)
                    {
                        order[atomStart + i] = sort1[i];
                    }
                    for (int j = 0; j < n2; j++)
                    {
                        order[atomStart + n1 + j] = sort2[j];
                    }
                }

                *flags |= NBNXN_CI_HALF_LJ(subc);
            }
        }
        if (haveQ)
        {
            *flags |= NBNXN_CI_DO_COUL(subc);
        }
        subc++;
    }
}
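/* Minimal sketch (hypothetical helper, not used elsewhere): querying the
 * per-cluster flags set above; NBNXN_CI_DO_LJ is the same flag macro used
 * in sort_cluster_on_flag().
 */
gmx_unused static bool exampleClusterDoesLJ(int flags, int subc)
{
    /* True when cluster subc contains at least one atom with LJ parameters */
    return (flags & NBNXN_CI_DO_LJ(subc)) != 0;
}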
/* Fill a pair search cell with atoms.
 * Potentially sorts atoms and sets the interaction flags.
 */
static void fill_cell(nbnxn_search                  *nbs,
                      nbnxn_grid_t                  *grid,
                      nbnxn_atomdata_t              *nbat,
                      int                            atomStart,
                      int                            atomEnd,
                      const int                     *atinfo,
                      gmx::ArrayRef<const gmx::RVec> x,
                      nbnxn_bb_t gmx_unused         *bb_work_aligned)
{
    const int numAtoms = atomEnd - atomStart;

    if (grid->bSimple)
    {
        /* Note that non-local grids are already sorted.
         * Then sort_cluster_on_flag will only set the flags and the sorting
         * will not affect the atom order.
         */
        sort_cluster_on_flag(grid->na_c, atomStart, atomEnd, atinfo, nbs->a,
                             grid->flags.data() + (atomStart >> grid->na_c_2log) - grid->cell0);
    }

    if (nbs->bFEP)
    {
        /* Set the fep flag for perturbed atoms in this (sub-)cell */

        /* The grid-local cluster/(sub-)cell index */
        int cell = (atomStart >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
        grid->fep[cell] = 0;
        for (int at = atomStart; at < atomEnd; at++)
        {
            if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
            {
                grid->fep[cell] |= (1 << (at - atomStart));
            }
        }
    }

    /* Now we have sorted the atoms, set the cell indices */
    for (int at = atomStart; at < atomEnd; at++)
    {
        nbs->cell[nbs->a[at]] = at;
    }

    copy_rvec_to_nbat_real(nbs->a.data() + atomStart, numAtoms, grid->na_c,
                           as_rvec_array(x.data()),
                           nbat->XFormat, nbat->x().data(), atomStart);

    if (nbat->XFormat == nbatX4)
    {
        /* Store the bounding boxes as xyz.xyz. */
        size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;

#if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
        if (2*grid->na_cj == grid->na_c)
        {
            calc_bounding_box_x_x4_halves(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
                                          grid->bbj.data() + offset*2);
        }
        else
#endif
        {
            calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
        }
    }
    else if (nbat->XFormat == nbatX8)
    {
        /* Store the bounding boxes as xyz.xyz. */
        size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;

        calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
    }
#if NBNXN_BBXXXX
    else if (!grid->bSimple)
    {
        /* Store the bounding boxes in a format convenient
         * for SIMD4 calculations: xxxxyyyyzzzz...
         */
        float *pbb_ptr =
            grid->pbb.data() +
            ((atomStart - grid->cell0*grid->na_sc) >> (grid->na_c_2log + STRIDE_PBB_2LOG))*NNBSBB_XXXX +
            (((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log) & (STRIDE_PBB - 1));

#if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
        if (nbat->XFormat == nbatXYZQ)
        {
            calc_bounding_box_xxxx_simd4(numAtoms, nbat->x().data() + atomStart*nbat->xstride,
                                         bb_work_aligned, pbb_ptr);
        }
        else
#endif
        {
            calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
                                   pbb_ptr);
        }
        if (gmx_debug_at)
        {
            fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    atomStart >> grid->na_c_2log,
                    pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
                    pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
                    pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
        }
    }
#endif
    else
    {
        /* Store the bounding boxes as xyz.xyz. */
        nbnxn_bb_t *bb_ptr = grid->bb.data() + ((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log);

        calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
                          bb_ptr);

        if (gmx_debug_at)
        {
            int bbo = (atomStart - grid->cell0*grid->na_sc)/grid->na_c;
            fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    atomStart >> grid->na_c_2log,
                    grid->bb[bbo].lower[BB_X],
                    grid->bb[bbo].lower[BB_Y],
                    grid->bb[bbo].lower[BB_Z],
                    grid->bb[bbo].upper[BB_X],
                    grid->bb[bbo].upper[BB_Y],
                    grid->bb[bbo].upper[BB_Z]);
        }
    }
}
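/* Worked example for the index arithmetic above (illustrative, assumed
 * numbers): on a simple grid with na_c = 4 (na_c_2log = 2) and cell0 = 0,
 * atoms 40..43 form cluster 40>>2 = 10; bit (at - atomStart) in
 * grid->fep[10] then marks which of those 4 atoms are perturbed.
 */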
/* Spatially sort the atoms within one grid column */
static void sort_columns_simple(nbnxn_search *nbs,
                                int dd_zone,
                                nbnxn_grid_t *grid,
                                int atomStart, int atomEnd,
                                const int *atinfo,
                                gmx::ArrayRef<const gmx::RVec> x,
                                nbnxn_atomdata_t *nbat,
                                int cxy_start, int cxy_end,
                                gmx::ArrayRef<int> sort_work)
{
    if (debug)
    {
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
    }

    const bool relevantAtomsAreWithinGridBounds = (grid->maxAtomGroupRadius == 0);

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
    {
        int na  = grid->cxy_na[cxy];
        int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   relevantAtomsAreWithinGridBounds,
                   nbs->a.data() + ash, na, x,
                   grid->c0[ZZ],
                   1.0/grid->size[ZZ], ncz*grid->na_sc,
                   sort_work);

        /* Fill the ncz cells in this column */
        int cfilled = grid->cxy_ind[cxy];
        for (int cz = 0; cz < ncz; cz++)
        {
            int c     = grid->cxy_ind[cxy] + cz;

            int ash_c = ash + cz*grid->na_sc;
            int na_c  = std::min(grid->na_sc, na-(ash_c-ash));

            fill_cell(nbs, grid, nbat,
                      ash_c, ash_c+na_c, atinfo, x,
                      nullptr);

            /* This copy to bbcz is not really necessary.
             * But it allows us to use the same grid search code
             * for the simple and supersub cell setups.
             */
            if (na_c > 0)
            {
                cfilled = c;
            }
            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
        }

        /* Set the unused atom indices to -1 */
        for (int ind = na; ind < ncz*grid->na_sc; ind++)
        {
            nbs->a[ash+ind] = -1;
        }
    }
}
/* Spatially sort the atoms within one grid column */
static void sort_columns_supersub(nbnxn_search *nbs,
                                  int dd_zone,
                                  nbnxn_grid_t *grid,
                                  int atomStart, int atomEnd,
                                  const int *atinfo,
                                  gmx::ArrayRef<const gmx::RVec> x,
                                  nbnxn_atomdata_t *nbat,
                                  int cxy_start, int cxy_end,
                                  gmx::ArrayRef<int> sort_work)
{
    nbnxn_bb_t  bb_work_array[2];
    nbnxn_bb_t *bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));

    if (debug)
    {
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
    }

    const bool relevantAtomsAreWithinGridBounds = (grid->maxAtomGroupRadius == 0);

    int subdiv_x = grid->na_c;
    int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
    int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;

    /* Sort the atoms within each x,y column in 3 dimensions.
     * Loop over all columns on the x/y grid.
     */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
    {
        int gridX = cxy/grid->numCells[YY];
        int gridY = cxy - gridX*grid->numCells[YY];

        int numAtomsInColumn = grid->cxy_na[cxy];
        int numCellsInColumn = grid->cxy_ind[cxy + 1] - grid->cxy_ind[cxy];
        int ash              = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   relevantAtomsAreWithinGridBounds,
                   nbs->a.data() + ash, numAtomsInColumn, x,
                   grid->c0[ZZ],
                   1.0/grid->size[ZZ], numCellsInColumn*grid->na_sc,
                   sort_work);

        /* This loop goes over the cells and clusters along z at once */
        for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
        {
            int ash_z = ash + sub_z*subdiv_z;
            int na_z  = std::min(subdiv_z, numAtomsInColumn - (ash_z - ash));
            int cz    = -1;
            /* We have already sorted on z */
            if (sub_z % c_gpuNumClusterPerCellZ == 0)
            {
                cz = sub_z/c_gpuNumClusterPerCellZ;
                int cell = grid->cxy_ind[cxy] + cz;

                /* The number of atoms in this cell/super-cluster */
                int numAtoms = std::min(grid->na_sc, numAtomsInColumn - (ash_z - ash));

                grid->nsubc[cell] = std::min(c_gpuNumClusterPerCell,
                                             (numAtoms + grid->na_c - 1)/grid->na_c);

                /* Store the z-boundaries of the bounding box of the cell */
                grid->bbcz[cell*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
                grid->bbcz[cell*NNBSBB_D+1] = x[nbs->a[ash_z + numAtoms - 1]][ZZ];
            }

            if (c_gpuNumClusterPerCellY > 1)
            {
                /* Sort the atoms along y */
                sort_atoms(YY, (sub_z & 1) != 0, dd_zone,
                           relevantAtomsAreWithinGridBounds,
                           nbs->a.data() + ash_z, na_z, x,
                           grid->c0[YY] + gridY*grid->cellSize[YY],
                           grid->invCellSize[YY], subdiv_z,
                           sort_work);
            }

            for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
            {
                int ash_y = ash_z + sub_y*subdiv_y;
                int na_y  = std::min(subdiv_y, numAtomsInColumn - (ash_y - ash));

                if (c_gpuNumClusterPerCellX > 1)
                {
                    /* Sort the atoms along x */
                    sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1) != 0, dd_zone,
                               relevantAtomsAreWithinGridBounds,
                               nbs->a.data() + ash_y, na_y, x,
                               grid->c0[XX] + gridX*grid->cellSize[XX],
                               grid->invCellSize[XX], subdiv_y,
                               sort_work);
                }

                for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
                {
                    int ash_x = ash_y + sub_x*subdiv_x;
                    int na_x  = std::min(subdiv_x, numAtomsInColumn - (ash_x - ash));

                    fill_cell(nbs, grid, nbat,
                              ash_x, ash_x + na_x, atinfo, x,
                              bb_work_aligned);
                }
            }
        }

        /* Set the unused atom indices to -1 */
        for (int ind = numAtomsInColumn; ind < numCellsInColumn*grid->na_sc; ind++)
        {
            nbs->a[ash + ind] = -1;
        }
    }
}
/* Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
static void setCellAndAtomCount(gmx::ArrayRef<int>  cell,
                                int                 cellIndex,
                                gmx::ArrayRef<int>  cxy_na,
                                int                 atomIndex)
{
    cell[atomIndex]    = cellIndex;
    cxy_na[cellIndex] += 1;
}
/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
                                const gmx::UpdateGroupsCog *updateGroupsCog,
                                int atomStart, int atomEnd,
                                gmx::ArrayRef<const gmx::RVec> x,
                                int dd_zone, const int *move,
                                int thread, int nthread,
                                gmx::ArrayRef<int> cell,
                                gmx::ArrayRef<int> cxy_na)
{
    /* We add one extra cell for particles which moved during DD */
    for (int i = 0; i < grid->numCells[XX]*grid->numCells[YY] + 1; i++)
    {
        cxy_na[i] = 0;
    }

    int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
    int taskAtomEnd   = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;

    if (dd_zone == 0)
    {
        /* Home zone */
        for (int i = taskAtomStart; i < taskAtomEnd; i++)
        {
            if (move == nullptr || move[i] >= 0)
            {
                const gmx::RVec &coord = (updateGroupsCog ? updateGroupsCog->cogForAtom(i) : x[i]);

                /* We need to be careful with rounding,
                 * particles might be a few bits outside the local zone.
                 * The int cast takes care of the lower bound,
                 * we will explicitly take care of the upper bound.
                 */
                int cx = static_cast<int>((coord[XX] - grid->c0[XX])*grid->invCellSize[XX]);
                int cy = static_cast<int>((coord[YY] - grid->c0[YY])*grid->invCellSize[YY]);

#ifndef NDEBUG
                if (cx < 0 || cx > grid->numCells[XX] ||
                    cy < 0 || cy > grid->numCells[YY])
                {
                    gmx_fatal(FARGS,
                              "grid cell cx %d cy %d out of range (max %d %d)\n"
                              "atom %f %f %f, grid->c0 %f %f",
                              cx, cy, grid->numCells[XX], grid->numCells[YY],
                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
                }
#endif
                /* Take care of potential rounding issues */
                cx = std::min(cx, grid->numCells[XX] - 1);
                cy = std::min(cy, grid->numCells[YY] - 1);

                /* For the moment cell will contain only the grid-local
                 * x and y indices, not z.
                 */
                setCellAndAtomCount(cell, cx*grid->numCells[YY] + cy, cxy_na, i);
            }
            else
            {
                /* Put this moved particle after the end of the grid,
                 * so we can process it later without using conditionals.
                 */
                setCellAndAtomCount(cell, grid->numCells[XX]*grid->numCells[YY], cxy_na, i);
            }
        }
    }
    else
    {
        /* Non-home zone */
        for (int i = taskAtomStart; i < taskAtomEnd; i++)
        {
            int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->invCellSize[XX]);
            int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->invCellSize[YY]);

            /* For non-home zones there could be particles outside
             * the non-bonded cut-off range, which have been communicated
             * for bonded interactions only. For the result it doesn't
             * matter where these end up on the grid. For performance
             * we put them in an extra row at the border.
             */
            cx = std::max(cx, 0);
            cx = std::min(cx, grid->numCells[XX] - 1);
            cy = std::max(cy, 0);
            cy = std::min(cy, grid->numCells[YY] - 1);

            /* For the moment cell will contain only the grid-local
             * x and y indices, not z.
             */
            setCellAndAtomCount(cell, cx*grid->numCells[YY] + cy, cxy_na, i);
        }
    }
}
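/* Worked example for the mapping above (illustrative, assumed numbers):
 * with grid->c0 = {0, 0}, invCellSize = {2, 2} (0.5 nm cells) and
 * numCells = {10, 8}, an atom at x = 1.26 nm, y = 3.9 nm gets
 * cx = (int)(1.26*2) = 2 and cy = (int)(3.9*2) = 7, so it goes in column
 * cx*8 + cy = 23; column 80 (= 10*8) is the extra slot for moved particles.
 */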
/* Resizes grid and atom data which depend on the number of cells */
static void resizeForNumberOfCells(const nbnxn_grid_t &grid,
                                   int                 numAtomsMoved,
                                   nbnxn_search       *nbs,
                                   nbnxn_atomdata_t   *nbat)
{
    int numNbnxnAtoms = (grid.cell0 + grid.nc)*grid.na_sc;

    /* Note: nbs->cell was already resized before */

    /* To avoid conditionals we store the moved particles at the end of a,
     * make sure we have enough space.
     */
    nbs->a.resize(numNbnxnAtoms + numAtomsMoved);

    /* Make space in nbat for storing the atom coordinates */
    nbat->resizeCoordinateBuffer(numNbnxnAtoms);
}
/* Determine in which grid cells the atoms should go */
static void
calc_cell_indices(nbnxn_search                   *nbs,
                  int                             ddZone,
                  nbnxn_grid_t                   *grid,
                  const gmx::UpdateGroupsCog     *updateGroupsCog,
                  int                             atomStart,
                  int                             atomEnd,
                  const int                      *atinfo,
                  gmx::ArrayRef<const gmx::RVec>  x,
                  int                             numAtomsMoved,
                  const int                      *move,
                  nbnxn_atomdata_t               *nbat)
{
    /* First compute all grid/column indices and store them in nbs->cell */
    nbs->cell.resize(atomEnd);

    const int nthread = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            calc_column_indices(grid, updateGroupsCog,
                                atomStart, atomEnd, x,
                                ddZone, move, thread, nthread,
                                nbs->cell, nbs->work[thread].cxy_na);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    /* Make the cell index as a function of x and y */
    int ncz_max      = 0;
    int ncz          = 0;
    grid->cxy_ind[0] = 0;
    for (int i = 0; i < grid->numCells[XX]*grid->numCells[YY] + 1; i++)
    {
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i = grid->numCells[XX]*grid->numCells[YY], which holds
         * the moved particles that do not need to be ordered on the grid.
         */
        if (ncz > ncz_max)
        {
            ncz_max = ncz;
        }
        int cxy_na_i = nbs->work[0].cxy_na[i];
        for (int thread = 1; thread < nthread; thread++)
        {
            cxy_na_i += nbs->work[thread].cxy_na[i];
        }
        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
        if (nbat->XFormat == nbatX8)
        {
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;
        }
        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
        /* Clear cxy_na, so we can reuse the array below */
        grid->cxy_na[i] = 0;
    }
    grid->nc = grid->cxy_ind[grid->numCells[XX]*grid->numCells[YY]] - grid->cxy_ind[0];

    resizeForNumberOfCells(*grid, numAtomsMoved, nbs, nbat);

    if (debug)
    {
        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                grid->na_sc, grid->na_c, grid->nc,
                grid->numCells[XX], grid->numCells[YY], grid->nc/(static_cast<double>(grid->numCells[XX]*grid->numCells[YY])),
                ncz_max);
        if (gmx_debug_at)
        {
            int i = 0;
            for (int cy = 0; cy < grid->numCells[YY]; cy++)
            {
                for (int cx = 0; cx < grid->numCells[XX]; cx++)
                {
                    fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
                    i++;
                }
                fprintf(debug, "\n");
            }
        }
    }

    /* Make sure the work array for sorting is large enough */
    if (ncz_max*grid->na_sc*SGSF > gmx::index(nbs->work[0].sortBuffer.size()))
    {
        for (nbnxn_search_work_t &work : nbs->work)
        {
            /* Elements not in use should be -1 */
            work.sortBuffer.resize(ncz_max*grid->na_sc*SGSF, -1);
        }
    }

    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    for (int i = atomStart; i < atomEnd; i++)
    {
        /* At this point nbs->cell contains the local grid x,y indices */
        int cxy = nbs->cell[i];
        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
    }

    if (ddZone == 0)
    {
        /* Set the cell indices for the moved particles */
        int n0 = grid->nc*grid->na_sc;
        int n1 = grid->nc*grid->na_sc + grid->cxy_na[grid->numCells[XX]*grid->numCells[YY]];

        for (int i = n0; i < n1; i++)
        {
            nbs->cell[nbs->a[i]] = i;
        }
    }

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
    {
        try
        {
            int columnStart = ((thread + 0)*grid->numCells[XX]*grid->numCells[YY])/nthread;
            int columnEnd   = ((thread + 1)*grid->numCells[XX]*grid->numCells[YY])/nthread;
            if (grid->bSimple)
            {
                sort_columns_simple(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
                                    columnStart, columnEnd,
                                    nbs->work[thread].sortBuffer);
            }
            else
            {
                sort_columns_supersub(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
                                      columnStart, columnEnd,
                                      nbs->work[thread].sortBuffer);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (grid->bSimple && nbat->XFormat == nbatX8)
    {
        combine_bounding_box_pairs(grid, grid->bb);
    }

    if (!grid->bSimple)
    {
        grid->nsubc_tot = 0;
        for (int i = 0; i < grid->nc; i++)
        {
            grid->nsubc_tot += grid->nsubc[i];
        }
    }

    if (debug)
    {
        if (grid->bSimple)
        {
            print_bbsizes_simple(debug, grid);
        }
        else
        {
            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
                    grid->nsubc_tot, (atomEnd - atomStart)/static_cast<double>(grid->nsubc_tot));

            print_bbsizes_supersub(debug, grid);
        }
    }
}
/* Sets up a grid and puts the atoms on the grid.
 * This function only operates on one domain of the domain decomposition.
 * Note that without domain decomposition there is only one domain.
 */
void nbnxn_put_on_grid(nonbonded_verlet_t             *nbv,
                       const matrix                    box,
                       int                             ddZone,
                       const rvec                      lowerCorner,
                       const rvec                      upperCorner,
                       const gmx::UpdateGroupsCog     *updateGroupsCog,
                       int                             atomStart,
                       int                             atomEnd,
                       real                            atomDensity,
                       const int                      *atinfo,
                       gmx::ArrayRef<const gmx::RVec>  x,
                       int                             numAtomsMoved,
                       const int                      *move)
{
    nbnxn_search *nbs  = nbv->nbs.get();
    nbnxn_grid_t *grid = &nbs->grid[ddZone];

    nbs_cycle_start(&nbs->cc[enbsCCgrid]);

    grid->bSimple = nbv->pairlistIsSimple();

    grid->na_c      = IClusterSizePerListType[nbv->pairlistSets().params().pairlistType];
    grid->na_cj     = JClusterSizePerListType[nbv->pairlistSets().params().pairlistType];
    grid->na_sc     = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
    grid->na_c_2log = get_2log(grid->na_c);

    if (ddZone == 0)
    {
        grid->cell0 = 0;
    }
    else
    {
        grid->cell0 =
            (nbs->grid[ddZone - 1].cell0 + nbs->grid[ddZone - 1].nc)*
            nbs->grid[ddZone - 1].na_sc/grid->na_sc;
    }

    const int n = atomEnd - atomStart;

    if (ddZone == 0)
    {
        copy_mat(box, nbs->box);

        /* Avoid zero density */
        if (atomDensity > 0)
        {
            grid->atom_density = atomDensity;
        }
        else
        {
            grid->atom_density = grid_atom_density(n - numAtomsMoved, lowerCorner, upperCorner);
        }

        nbs->natoms_local = atomEnd - numAtomsMoved;
        /* We assume that nbnxn_put_on_grid is called first
         * for the local atoms (ddZone=0).
         */
        nbs->natoms_nonlocal = atomEnd - numAtomsMoved;

        /* When not using atom groups, all atoms should be within the grid bounds */
        grid->maxAtomGroupRadius = (updateGroupsCog ? updateGroupsCog->maxUpdateGroupRadius() : 0);
        /* For the non-local grids the situation is the same as for the local */
        for (size_t g = 1; g < nbs->grid.size(); g++)
        {
            nbs->grid[g].maxAtomGroupRadius = grid->maxAtomGroupRadius;
        }

        if (debug)
        {
            fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
                    nbs->natoms_local, grid->atom_density);
        }
    }
    else
    {
        nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, atomEnd);
    }

    /* We always use the home zone (grid[0]) for setting the cell size,
     * since determining densities for non-local zones is difficult.
     */
    set_grid_size_xy(nbs, grid,
                     ddZone, n - numAtomsMoved,
                     lowerCorner, upperCorner,
                     nbs->grid[0].atom_density);

    nbnxn_atomdata_t *nbat = nbv->nbat.get();

    calc_cell_indices(nbs, ddZone, grid, updateGroupsCog, atomStart, atomEnd, atinfo, x, numAtomsMoved, move, nbat);

    if (ddZone == 0)
    {
        nbat->natoms_local = nbat->numAtoms();
    }
    if (ddZone == gmx::ssize(nbs->grid) - 1)
    {
        /* We are done setting up all grids, we can resize the force buffers */
        nbat->resizeForceBuffers();
    }

    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
}
/* Calls nbnxn_put_on_grid for all non-local domains */
void nbnxn_put_on_grid_nonlocal(nonbonded_verlet_t              *nbv,
                                const struct gmx_domdec_zones_t *zones,
                                const int                       *atinfo,
                                gmx::ArrayRef<const gmx::RVec>   x)
{
    for (int zone = 1; zone < zones->n; zone++)
    {
        rvec c0, c1;
        for (int d = 0; d < DIM; d++)
        {
            c0[d] = zones->size[zone].bb_x0[d];
            c1[d] = zones->size[zone].bb_x1[d];
        }

        nbnxn_put_on_grid(nbv, nullptr,
                          zone, c0, c1,
                          nullptr,
                          zones->cg_range[zone],
                          zones->cg_range[zone+1],
                          -1,
                          atinfo,
                          x,
                          0, nullptr);
    }
}
void nbnxn_get_ncells(const nbnxn_search *nbs, int *ncx, int *ncy)
{
    *ncx = nbs->grid[0].numCells[XX];
    *ncy = nbs->grid[0].numCells[YY];
}

gmx::ArrayRef<const int> nbnxn_get_atomorder(const nbnxn_search *nbs)
{
    /* Return the atom order for the home cell (index 0) */
    const nbnxn_grid_t &grid = nbs->grid[0];

    int numIndices = grid.cxy_ind[grid.numCells[XX]*grid.numCells[YY]]*grid.na_sc;

    return gmx::constArrayRefFromArray(nbs->a.data(), numIndices);
}
void nbnxn_set_atomorder(nbnxn_search *nbs)
{
    /* Set the atom order for the home cell (index 0) */
    nbnxn_grid_t *grid = &nbs->grid[0];

    int atomIndex = 0;
    for (int cx = 0; cx < grid->numCells[XX]; cx++)
    {
        for (int cy = 0; cy < grid->numCells[YY]; cy++)
        {
            int cxy = cx*grid->numCells[YY] + cy;
            int j   = grid->cxy_ind[cxy]*grid->na_sc;
            for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)
            {
                nbs->a[j]            = atomIndex;
                nbs->cell[atomIndex] = j;
                atomIndex++;
                j++;
            }
        }
    }
}
gmx::ArrayRef<const int> nbnxn_get_gridindices(const nbnxn_search *nbs)
{
    return nbs->cell;
}