src/gromacs/nbnxm/grid.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include <cmath>
  39 #include <cstring>
  40
  41 #include <algorithm>
  42
  43 #include "gromacs/domdec/domdec_struct.h"
  44 #include "gromacs/math/utilities.h"
  45 #include "gromacs/math/vec.h"
  46 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  47 #include "gromacs/mdlib/updategroupscog.h"
  48 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
  49 #include "gromacs/nbnxm/atomdata.h"
  50 #include "gromacs/nbnxm/nbnxm.h"
  51 #include "gromacs/nbnxm/nbnxm_geometry.h"
  52 #include "gromacs/simd/simd.h"
  53 #include "gromacs/simd/vector_operations.h"
  54 #include "gromacs/utility/exceptions.h"
  55 #include "gromacs/utility/smalloc.h"
  56
  57 #include "internal.h"
  58
  59 struct gmx_domdec_zones_t;
  60
  61 static real grid_atom_density(int        numAtoms,
  62                               const rvec lowerCorner,
  63                               const rvec upperCorner)
  64 {
  65     rvec size;
  66
  67     if (numAtoms == 0)
  68     {
  69         /* To avoid zero density we use a minimum of 1 atom */
  70         numAtoms = 1;
  71     }
  72
  73     rvec_sub(upperCorner, lowerCorner, size);
  74
  75     return numAtoms/(size[XX]*size[YY]*size[ZZ]);
  76 }
  77
  78 static void set_grid_size_xy(const nbnxn_search   *nbs,
  79                              nbnxn_grid_t         *grid,
  80                              int                   ddZone,
  81                              int                   numAtoms,
  82                              const rvec            lowerCorner,
  83                              const rvec            upperCorner,
  84                              real                  atomDensity)
  85 {
  86     rvec size;
  87     real tlen, tlen_x, tlen_y;
  88
  89     rvec_sub(upperCorner, lowerCorner, size);
  90
  91     if (numAtoms > grid->na_sc)
  92     {
  93         GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
  94
  95         /* target cell length */
  96         if (grid->bSimple)
  97         {
  98             /* To minimize the zero interactions, we should make
  99              * the largest of the i/j cell cubic.
 100              */
 101             int numAtomsInCell = std::max(grid->na_c, grid->na_cj);
 102
 103             /* Approximately cubic cells */
 104             tlen   = std::cbrt(numAtomsInCell/atomDensity);
 105             tlen_x = tlen;
 106             tlen_y = tlen;
 107         }
 108         else
 109         {
 110             /* Approximately cubic sub cells */
 111             tlen   = std::cbrt(grid->na_c/atomDensity);
 112             tlen_x = tlen*c_gpuNumClusterPerCellX;
 113             tlen_y = tlen*c_gpuNumClusterPerCellY;
 114         }
 115         /* We round ncx and ncy down, because we get less cell pairs
 116          * in the nbsist when the fixed cell dimensions (x,y) are
 117          * larger than the variable one (z) than the other way around.
 118          */
 119         grid->numCells[XX] = std::max(1, static_cast<int>(size[XX]/tlen_x));
 120         grid->numCells[YY] = std::max(1, static_cast<int>(size[YY]/tlen_y));
 121     }
 122     else
 123     {
 124         grid->numCells[XX] = 1;
 125         grid->numCells[YY] = 1;
 126     }
 127
 128     for (int d = 0; d < DIM - 1; d++)
 129     {
 130         grid->cellSize[d]    = size[d]/grid->numCells[d];
 131         grid->invCellSize[d] = 1/grid->cellSize[d];
 132     }
 133
 134     if (ddZone > 0)
 135     {
 136         /* This is a non-home zone, add an extra row of cells
 137          * for particles communicated for bonded interactions.
 138          * These can be beyond the cut-off. It doesn't matter where
 139          * they end up on the grid, but for performance it's better
 140          * if they don't end up in cells that can be within cut-off range.
 141          */
 142         grid->numCells[XX]++;
 143         grid->numCells[YY]++;
 144     }
 145
 146     /* We need one additional cell entry for particles moved by DD */
 147     int numCellsXY = grid->numCells[XX]*grid->numCells[YY];
 148     grid->cxy_na.resize(numCellsXY + 1);
 149     grid->cxy_ind.resize(numCellsXY + 2);
 150
 151     for (nbnxn_search_work_t &work : nbs->work)
 152     {
 153         work.cxy_na.resize(numCellsXY + 1);
 154     }
 155
 156     /* Worst case scenario of 1 atom in each last cell */
 157     int maxNumCells;
 158     if (grid->na_cj <= grid->na_c)
 159     {
 160         maxNumCells = numAtoms/grid->na_sc + numCellsXY;
 161     }
 162     else
 163     {
 164         maxNumCells = numAtoms/grid->na_sc + numCellsXY*grid->na_cj/grid->na_c;
 165     }
 166
 167     grid->nsubc.resize(maxNumCells);
 168     grid->bbcz.resize(maxNumCells*NNBSBB_D);
 169
 170     /* This resize also zeros the contents, this avoid possible
 171      * floating exceptions in SIMD with the unused bb elements.
 172      */
 173     if (grid->bSimple)
 174     {
 175         grid->bb.resize(maxNumCells);
 176     }
 177     else
 178     {
 179 #if NBNXN_BBXXXX
 180         grid->pbb.resize(maxNumCells*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX);
 181 #else
 182         grid->bb.resize(maxNumCells*c_gpuNumClusterPerCell);
 183 #endif
 184     }
 185
 186     if (grid->bSimple)
 187     {
 188         if (grid->na_cj == grid->na_c)
 189         {
 190             grid->bbj = grid->bb;
 191         }
 192         else
 193         {
 194             grid->bbjStorage.resize(maxNumCells*grid->na_c/grid->na_cj);
 195             grid->bbj = grid->bbjStorage;
 196         }
 197     }
 198
 199     grid->flags.resize(maxNumCells);
 200     if (nbs->bFEP)
 201     {
 202         grid->fep.resize(maxNumCells*grid->na_sc/grid->na_c);
 203     }
 204
 205     copy_rvec(lowerCorner, grid->c0);
 206     copy_rvec(upperCorner, grid->c1);
 207     copy_rvec(size,        grid->size);
 208 }
 209
 210 /* We need to sort paricles in grid columns on z-coordinate.
 211  * As particle are very often distributed homogeneously, we a sorting
 212  * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 213  * by a factor, cast to an int and try to store in that hole. If the hole
 214  * is full, we move this or another particle. A second pass is needed to make
 215  * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 216  * 4 is the optimal value for homogeneous particle distribution and allows
 217  * for an O(#particles) sort up till distributions were all particles are
 218  * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 219  * as it can be expensive to detect imhomogeneous particle distributions.
 220  * SGSF is the maximum ratio of holes used, in the worst case all particles
 221  * end up in the last hole and we need #particles extra holes at the end.
 222  */
 223 #define SORT_GRID_OVERSIZE 4
 224 #define SGSF (SORT_GRID_OVERSIZE + 1)
 225
 226 /* Sort particle index a on coordinates x along dim.
 227  * Backwards tells if we want decreasing iso increasing coordinates.
 228  * h0 is the minimum of the coordinate range.
 229  * invh is the 1/length of the sorting range.
 230  * n_per_h (>=n) is the expected average number of particles per 1/invh
 231  * sort is the sorting work array.
 232  * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 233  * or easier, allocate at least n*SGSF elements.
 234  */
 235 static void sort_atoms(int dim, gmx_bool Backwards,
 236                        int gmx_unused dd_zone,
 237                        bool gmx_unused relevantAtomsAreWithinGridBounds,
 238                        int *a, int n,
 239                        gmx::ArrayRef<const gmx::RVec> x,
 240                        real h0, real invh, int n_per_h,
 241                        gmx::ArrayRef<int> sort)
 242 {
 243     if (n <= 1)
 244     {
 245         /* Nothing to do */
 246         return;
 247     }
 248
 249     GMX_ASSERT(n <= n_per_h, "We require n <= n_per_h");
 250
 251     /* Transform the inverse range height into the inverse hole height */
 252     invh *= n_per_h*SORT_GRID_OVERSIZE;
 253
 254     /* Set nsort to the maximum possible number of holes used.
 255      * In worst case all n elements end up in the last bin.
 256      */
 257     int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
 258
 259     /* Determine the index range used, so we can limit it for the second pass */
 260     int zi_min = INT_MAX;
 261     int zi_max = -1;
 262
 263     /* Sort the particles using a simple index sort */
 264     for (int i = 0; i < n; i++)
 265     {
 266         /* The cast takes care of float-point rounding effects below zero.
 267          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 268          * times the box height out of the box.
 269          */
 270         int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
 271
 272 #ifndef NDEBUG
 273         /* As we can have rounding effect, we use > iso >= here */
 274         if (relevantAtomsAreWithinGridBounds &&
 275             (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE)))
 276         {
 277             gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
 278                       a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
 279                       n_per_h, SORT_GRID_OVERSIZE);
 280         }
 281 #endif
 282         if (zi < 0)
 283         {
 284             zi = 0;
 285         }
 286
 287         /* In a non-local domain, particles communcated for bonded interactions
 288          * can be far beyond the grid size, which is set by the non-bonded
 289          * cut-off distance. We sort such particles into the last cell.
 290          */
 291         if (zi > n_per_h*SORT_GRID_OVERSIZE)
 292         {
 293             zi = n_per_h*SORT_GRID_OVERSIZE;
 294         }
 295
 296         /* Ideally this particle should go in sort cell zi,
 297          * but that might already be in use,
 298          * in that case find the first empty cell higher up
 299          */
 300         if (sort[zi] < 0)
 301         {
 302             sort[zi] = a[i];
 303             zi_min   = std::min(zi_min, zi);
 304             zi_max   = std::max(zi_max, zi);
 305         }
 306         else
 307         {
 308             /* We have multiple atoms in the same sorting slot.
 309              * Sort on real z for minimal bounding box size.
 310              * There is an extra check for identical z to ensure
 311              * well-defined output order, independent of input order
 312              * to ensure binary reproducibility after restarts.
 313              */
 314             while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 315                                       (x[a[i]][dim] == x[sort[zi]][dim] &&
 316                                        a[i] > sort[zi])))
 317             {
 318                 zi++;
 319             }
 320
 321             if (sort[zi] >= 0)
 322             {
 323                 /* Shift all elements by one slot until we find an empty slot */
 324                 int cp  = sort[zi];
 325                 int zim = zi + 1;
 326                 while (sort[zim] >= 0)
 327                 {
 328                     int tmp   = sort[zim];
 329                     sort[zim] = cp;
 330                     cp        = tmp;
 331                     zim++;
 332                 }
 333                 sort[zim] = cp;
 334                 zi_max    = std::max(zi_max, zim);
 335             }
 336             sort[zi] = a[i];
 337             zi_max   = std::max(zi_max, zi);
 338         }
 339     }
 340
 341     int c = 0;
 342     if (!Backwards)
 343     {
 344         for (int zi = 0; zi < nsort; zi++)
 345         {
 346             if (sort[zi] >= 0)
 347             {
 348                 a[c++]   = sort[zi];
 349                 sort[zi] = -1;
 350             }
 351         }
 352     }
 353     else
 354     {
 355         for (int zi = zi_max; zi >= zi_min; zi--)
 356         {
 357             if (sort[zi] >= 0)
 358             {
 359                 a[c++]   = sort[zi];
 360                 sort[zi] = -1;
 361             }
 362         }
 363     }
 364     if (c < n)
 365     {
 366         gmx_incons("Lost particles while sorting");
 367     }
 368 }
 369
 370 #if GMX_DOUBLE
 371 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
 372 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
 373 #else
 374 #define R2F_D(x) (x)
 375 #define R2F_U(x) (x)
 376 #endif
 377
 378 /* Coordinate order x,y,z, bb order xyz0 */
 379 static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
 380 {
 381     int  i;
 382     real xl, xh, yl, yh, zl, zh;
 383
 384     i  = 0;
 385     xl = x[i+XX];
 386     xh = x[i+XX];
 387     yl = x[i+YY];
 388     yh = x[i+YY];
 389     zl = x[i+ZZ];
 390     zh = x[i+ZZ];
 391     i += stride;
 392     for (int j = 1; j < na; j++)
 393     {
 394         xl = std::min(xl, x[i+XX]);
 395         xh = std::max(xh, x[i+XX]);
 396         yl = std::min(yl, x[i+YY]);
 397         yh = std::max(yh, x[i+YY]);
 398         zl = std::min(zl, x[i+ZZ]);
 399         zh = std::max(zh, x[i+ZZ]);
 400         i += stride;
 401     }
 402     /* Note: possible double to float conversion here */
 403     bb->lower[BB_X] = R2F_D(xl);
 404     bb->lower[BB_Y] = R2F_D(yl);
 405     bb->lower[BB_Z] = R2F_D(zl);
 406     bb->upper[BB_X] = R2F_U(xh);
 407     bb->upper[BB_Y] = R2F_U(yh);
 408     bb->upper[BB_Z] = R2F_U(zh);
 409 }
 410
 411 /* Packed coordinates, bb order xyz0 */
 412 static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
 413 {
 414     real xl, xh, yl, yh, zl, zh;
 415
 416     xl = x[XX*c_packX4];
 417     xh = x[XX*c_packX4];
 418     yl = x[YY*c_packX4];
 419     yh = x[YY*c_packX4];
 420     zl = x[ZZ*c_packX4];
 421     zh = x[ZZ*c_packX4];
 422     for (int j = 1; j < na; j++)
 423     {
 424         xl = std::min(xl, x[j+XX*c_packX4]);
 425         xh = std::max(xh, x[j+XX*c_packX4]);
 426         yl = std::min(yl, x[j+YY*c_packX4]);
 427         yh = std::max(yh, x[j+YY*c_packX4]);
 428         zl = std::min(zl, x[j+ZZ*c_packX4]);
 429         zh = std::max(zh, x[j+ZZ*c_packX4]);
 430     }
 431     /* Note: possible double to float conversion here */
 432     bb->lower[BB_X] = R2F_D(xl);
 433     bb->lower[BB_Y] = R2F_D(yl);
 434     bb->lower[BB_Z] = R2F_D(zl);
 435     bb->upper[BB_X] = R2F_U(xh);
 436     bb->upper[BB_Y] = R2F_U(yh);
 437     bb->upper[BB_Z] = R2F_U(zh);
 438 }
 439
 440 /* Packed coordinates, bb order xyz0 */
 441 static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
 442 {
 443     real xl, xh, yl, yh, zl, zh;
 444
 445     xl = x[XX*c_packX8];
 446     xh = x[XX*c_packX8];
 447     yl = x[YY*c_packX8];
 448     yh = x[YY*c_packX8];
 449     zl = x[ZZ*c_packX8];
 450     zh = x[ZZ*c_packX8];
 451     for (int j = 1; j < na; j++)
 452     {
 453         xl = std::min(xl, x[j+XX*c_packX8]);
 454         xh = std::max(xh, x[j+XX*c_packX8]);
 455         yl = std::min(yl, x[j+YY*c_packX8]);
 456         yh = std::max(yh, x[j+YY*c_packX8]);
 457         zl = std::min(zl, x[j+ZZ*c_packX8]);
 458         zh = std::max(zh, x[j+ZZ*c_packX8]);
 459     }
 460     /* Note: possible double to float conversion here */
 461     bb->lower[BB_X] = R2F_D(xl);
 462     bb->lower[BB_Y] = R2F_D(yl);
 463     bb->lower[BB_Z] = R2F_D(zl);
 464     bb->upper[BB_X] = R2F_U(xh);
 465     bb->upper[BB_Y] = R2F_U(yh);
 466     bb->upper[BB_Z] = R2F_U(zh);
 467 }
 468
 469 /* Packed coordinates, bb order xyz0 */
 470 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
 471                                                      nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
 472 {
 473     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 474     using namespace gmx;
 475
 476     calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
 477
 478     if (na > 2)
 479     {
 480         calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
 481     }
 482     else
 483     {
 484         /* Set the "empty" bounding box to the same as the first one,
 485          * so we don't need to treat special cases in the rest of the code.
 486          */
 487 #if NBNXN_SEARCH_BB_SIMD4
 488         store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
 489         store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
 490 #else
 491         bbj[1] = bbj[0];
 492 #endif
 493     }
 494
 495 #if NBNXN_SEARCH_BB_SIMD4
 496     store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
 497     store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
 498 #else
 499     {
 500         int i;
 501
 502         for (i = 0; i < NNBSBB_C; i++)
 503         {
 504             bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
 505             bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
 506         }
 507     }
 508 #endif
 509 }
 510
 511 #if NBNXN_SEARCH_BB_SIMD4
 512
 513 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
 514 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
 515 {
 516     int  i;
 517     real xl, xh, yl, yh, zl, zh;
 518
 519     i  = 0;
 520     xl = x[i+XX];
 521     xh = x[i+XX];
 522     yl = x[i+YY];
 523     yh = x[i+YY];
 524     zl = x[i+ZZ];
 525     zh = x[i+ZZ];
 526     i += stride;
 527     for (int j = 1; j < na; j++)
 528     {
 529         xl = std::min(xl, x[i+XX]);
 530         xh = std::max(xh, x[i+XX]);
 531         yl = std::min(yl, x[i+YY]);
 532         yh = std::max(yh, x[i+YY]);
 533         zl = std::min(zl, x[i+ZZ]);
 534         zh = std::max(zh, x[i+ZZ]);
 535         i += stride;
 536     }
 537     /* Note: possible double to float conversion here */
 538     bb[0*STRIDE_PBB] = R2F_D(xl);
 539     bb[1*STRIDE_PBB] = R2F_D(yl);
 540     bb[2*STRIDE_PBB] = R2F_D(zl);
 541     bb[3*STRIDE_PBB] = R2F_U(xh);
 542     bb[4*STRIDE_PBB] = R2F_U(yh);
 543     bb[5*STRIDE_PBB] = R2F_U(zh);
 544 }
 545
 546 #endif /* NBNXN_SEARCH_BB_SIMD4 */
 547
 548 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 549
 550 /* Coordinate order xyz?, bb order xyz0 */
 551 static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
 552 {
 553     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 554     using namespace gmx;
 555
 556     Simd4Float bb_0_S, bb_1_S;
 557     Simd4Float x_S;
 558
 559     bb_0_S = load4(x);
 560     bb_1_S = bb_0_S;
 561
 562     for (int i = 1; i < na; i++)
 563     {
 564         x_S    = load4(x+i*NNBSBB_C);
 565         bb_0_S = min(bb_0_S, x_S);
 566         bb_1_S = max(bb_1_S, x_S);
 567     }
 568
 569     store4(&bb->lower[0], bb_0_S);
 570     store4(&bb->upper[0], bb_1_S);
 571 }
 572
 573 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
 574 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
 575                                          nbnxn_bb_t *bb_work_aligned,
 576                                          real *bb)
 577 {
 578     calc_bounding_box_simd4(na, x, bb_work_aligned);
 579
 580     bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
 581     bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
 582     bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
 583     bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
 584     bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
 585     bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
 586 }
 587
 588 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
 589
 590
 591 /* Combines pairs of consecutive bounding boxes */
 592 static void combine_bounding_box_pairs(nbnxn_grid_t                    *grid,
 593                                        gmx::ArrayRef<const nbnxn_bb_t>  bb)
 594 {
 595     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 596     using namespace gmx;
 597
 598     for (int i = 0; i < grid->numCells[XX]*grid->numCells[YY]; i++)
 599     {
 600         /* Starting bb in a column is expected to be 2-aligned */
 601         int sc2 = grid->cxy_ind[i]>>1;
 602         /* For odd numbers skip the last bb here */
 603         int nc2 = (grid->cxy_na[i]+3)>>(2+1);
 604         int c2;
 605         for (c2 = sc2; c2 < sc2+nc2; c2++)
 606         {
 607 #if NBNXN_SEARCH_BB_SIMD4
 608             Simd4Float min_S, max_S;
 609
 610             min_S = min(load4(&bb[c2*2+0].lower[0]),
 611                         load4(&bb[c2*2+1].lower[0]));
 612             max_S = max(load4(&bb[c2*2+0].upper[0]),
 613                         load4(&bb[c2*2+1].upper[0]));
 614             store4(&grid->bbj[c2].lower[0], min_S);
 615             store4(&grid->bbj[c2].upper[0], max_S);
 616 #else
 617             for (int j = 0; j < NNBSBB_C; j++)
 618             {
 619                 grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
 620                                                   bb[c2*2+1].lower[j]);
 621                 grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
 622                                                   bb[c2*2+1].upper[j]);
 623             }
 624 #endif
 625         }
 626         if (((grid->cxy_na[i]+3)>>2) & 1)
 627         {
 628             /* The bb count in this column is odd: duplicate the last bb */
 629             for (int j = 0; j < NNBSBB_C; j++)
 630             {
 631                 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
 632                 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
 633             }
 634         }
 635     }
 636 }
 637
 638
 639 /* Prints the average bb size, used for debug output */
 640 static void print_bbsizes_simple(FILE                *fp,
 641                                  const nbnxn_grid_t  *grid)
 642 {
 643     dvec ba;
 644
 645     clear_dvec(ba);
 646     for (int c = 0; c < grid->nc; c++)
 647     {
 648         for (int d = 0; d < DIM; d++)
 649         {
 650             ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
 651         }
 652     }
 653     dsvmul(1.0/grid->nc, ba, ba);
 654
 655     real avgCellSizeZ = (grid->atom_density > 0 ? grid->na_c/(grid->atom_density*grid->cellSize[XX]*grid->cellSize[YY]) : 0.0);
 656
 657     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 658             grid->cellSize[XX],
 659             grid->cellSize[YY],
 660             avgCellSizeZ,
 661             ba[XX], ba[YY], ba[ZZ],
 662             ba[XX]*grid->invCellSize[XX],
 663             ba[YY]*grid->invCellSize[YY],
 664             grid->atom_density > 0 ? ba[ZZ]/avgCellSizeZ : 0.0);
 665 }
 666
 667 /* Prints the average bb size, used for debug output */
 668 static void print_bbsizes_supersub(FILE                *fp,
 669                                    const nbnxn_grid_t  *grid)
 670 {
 671     int  ns;
 672     dvec ba;
 673
 674     clear_dvec(ba);
 675     ns = 0;
 676     for (int c = 0; c < grid->nc; c++)
 677     {
 678 #if NBNXN_BBXXXX
 679         for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
 680         {
 681             int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
 682             for (int i = 0; i < STRIDE_PBB; i++)
 683             {
 684                 for (int d = 0; d < DIM; d++)
 685                 {
 686                     ba[d] +=
 687                         grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
 688                         grid->pbb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
 689                 }
 690             }
 691         }
 692 #else
 693         for (int s = 0; s < grid->nsubc[c]; s++)
 694         {
 695             int cs = c*c_gpuNumClusterPerCell + s;
 696             for (int d = 0; d < DIM; d++)
 697             {
 698                 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
 699             }
 700         }
 701 #endif
 702         ns += grid->nsubc[c];
 703     }
 704     dsvmul(1.0/ns, ba, ba);
 705
 706     real avgClusterSizeZ = (grid->atom_density > 0 ? grid->na_sc/(grid->atom_density*grid->cellSize[XX]*grid->cellSize[YY]*c_gpuNumClusterPerCellZ) : 0.0);
 707
 708     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 709             grid->cellSize[XX]/c_gpuNumClusterPerCellX,
 710             grid->cellSize[YY]/c_gpuNumClusterPerCellY,
 711             avgClusterSizeZ,
 712             ba[XX], ba[YY], ba[ZZ],
 713             ba[XX]*c_gpuNumClusterPerCellX*grid->invCellSize[XX],
 714             ba[YY]*c_gpuNumClusterPerCellY*grid->invCellSize[YY],
 715             grid->atom_density > 0 ? ba[ZZ]/avgClusterSizeZ : 0.0);
 716 }
 717
 718 /* Set non-bonded interaction flags for the current cluster.
 719  * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 720  */
 721 static void sort_cluster_on_flag(int                 numAtomsInCluster,
 722                                  int                 atomStart,
 723                                  int                 atomEnd,
 724                                  const int          *atinfo,
 725                                  gmx::ArrayRef<int>  order,
 726                                  int                *flags)
 727 {
 728     constexpr int c_maxNumAtomsInCluster = 8;
 729     int           sort1[c_maxNumAtomsInCluster];
 730     int           sort2[c_maxNumAtomsInCluster];
 731
 732     GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");
 733
 734     *flags = 0;
 735
 736     int subc = 0;
 737     for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
 738     {
 739         /* Make lists for this (sub-)cell on atoms with and without LJ */
 740         int      n1         = 0;
 741         int      n2         = 0;
 742         gmx_bool haveQ      = FALSE;
 743         int      a_lj_max   = -1;
 744         for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
 745         {
 746             haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 747
 748             if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 749             {
 750                 sort1[n1++] = order[a];
 751                 a_lj_max    = a;
 752             }
 753             else
 754             {
 755                 sort2[n2++] = order[a];
 756             }
 757         }
 758
 759         /* If we don't have atoms with LJ, there's nothing to sort */
 760         if (n1 > 0)
 761         {
 762             *flags |= NBNXN_CI_DO_LJ(subc);
 763
 764             if (2*n1 <= numAtomsInCluster)
 765             {
 766                 /* Only sort when strictly necessary. Ordering particles
 767                  * Ordering particles can lead to less accurate summation
 768                  * due to rounding, both for LJ and Coulomb interactions.
 769                  */
 770                 if (2*(a_lj_max - s) >= numAtomsInCluster)
 771                 {
 772                     for (int i = 0; i < n1; i++)
 773                     {
 774                         order[atomStart + i]      = sort1[i];
 775                     }
 776                     for (int j = 0; j < n2; j++)
 777                     {
 778                         order[atomStart + n1 + j] = sort2[j];
 779                     }
 780                 }
 781
 782                 *flags |= NBNXN_CI_HALF_LJ(subc);
 783             }
 784         }
 785         if (haveQ)
 786         {
 787             *flags |= NBNXN_CI_DO_COUL(subc);
 788         }
 789         subc++;
 790     }
 791 }
 792
 793 /* Fill a pair search cell with atoms.
 794  * Potentially sorts atoms and sets the interaction flags.
 795  */
 796 static void fill_cell(nbnxn_search                  *nbs,
 797                       nbnxn_grid_t                  *grid,
 798                       nbnxn_atomdata_t              *nbat,
 799                       int                            atomStart,
 800                       int                            atomEnd,
 801                       const int                     *atinfo,
 802                       gmx::ArrayRef<const gmx::RVec> x,
 803                       nbnxn_bb_t gmx_unused         *bb_work_aligned)
 804 {
 805     const int numAtoms = atomEnd - atomStart;
 806
 807     if (grid->bSimple)
 808     {
 809         /* Note that non-local grids are already sorted.
 810          * Then sort_cluster_on_flag will only set the flags and the sorting
 811          * will not affect the atom order.
 812          */
 813         sort_cluster_on_flag(grid->na_c, atomStart, atomEnd, atinfo, nbs->a,
 814                              grid->flags.data() + (atomStart >> grid->na_c_2log) - grid->cell0);
 815     }
 816
 817     if (nbs->bFEP)
 818     {
 819         /* Set the fep flag for perturbed atoms in this (sub-)cell */
 820
 821         /* The grid-local cluster/(sub-)cell index */
 822         int cell = (atomStart >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
 823         grid->fep[cell] = 0;
 824         for (int at = atomStart; at < atomEnd; at++)
 825         {
 826             if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
 827             {
 828                 grid->fep[cell] |= (1 << (at - atomStart));
 829             }
 830         }
 831     }
 832
 833     /* Now we have sorted the atoms, set the cell indices */
 834     for (int at = atomStart; at < atomEnd; at++)
 835     {
 836         nbs->cell[nbs->a[at]] = at;
 837     }
 838
 839     copy_rvec_to_nbat_real(nbs->a.data() + atomStart, numAtoms, grid->na_c,
 840                            as_rvec_array(x.data()),
 841                            nbat->XFormat, nbat->x().data(), atomStart);
 842
 843     if (nbat->XFormat == nbatX4)
 844     {
 845         /* Store the bounding boxes as xyz.xyz. */
 846         size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
 847         nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;
 848
 849 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
 850         if (2*grid->na_cj == grid->na_c)
 851         {
 852             calc_bounding_box_x_x4_halves(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
 853                                           grid->bbj.data() + offset*2);
 854         }
 855         else
 856 #endif
 857         {
 858             calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
 859         }
 860     }
 861     else if (nbat->XFormat == nbatX8)
 862     {
 863         /* Store the bounding boxes as xyz.xyz. */
 864         size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
 865         nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;
 866
 867         calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
 868     }
 869 #if NBNXN_BBXXXX
 870     else if (!grid->bSimple)
 871     {
 872         /* Store the bounding boxes in a format convenient
 873          * for SIMD4 calculations: xxxxyyyyzzzz...
 874          */
 875         float *pbb_ptr =
 876             grid->pbb.data() +
 877             ((atomStart - grid->cell0*grid->na_sc) >> (grid->na_c_2log + STRIDE_PBB_2LOG))*NNBSBB_XXXX +
 878             (((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log) & (STRIDE_PBB - 1));
 879
 880 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 881         if (nbat->XFormat == nbatXYZQ)
 882         {
 883             calc_bounding_box_xxxx_simd4(numAtoms, nbat->x().data() + atomStart*nbat->xstride,
 884                                          bb_work_aligned, pbb_ptr);
 885         }
 886         else
 887 #endif
 888         {
 889             calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
 890                                    pbb_ptr);
 891         }
 892         if (gmx_debug_at)
 893         {
 894             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 895                     atomStart >> grid->na_c_2log,
 896                     pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
 897                     pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
 898                     pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
 899         }
 900     }
 901 #endif
 902     else
 903     {
 904         /* Store the bounding boxes as xyz.xyz. */
 905         nbnxn_bb_t *bb_ptr = grid->bb.data() + ((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log);
 906
 907         calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
 908                           bb_ptr);
 909
 910         if (gmx_debug_at)
 911         {
 912             int bbo = (atomStart - grid->cell0*grid->na_sc)/grid->na_c;
 913             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 914                     atomStart >> grid->na_c_2log,
 915                     grid->bb[bbo].lower[BB_X],
 916                     grid->bb[bbo].lower[BB_Y],
 917                     grid->bb[bbo].lower[BB_Z],
 918                     grid->bb[bbo].upper[BB_X],
 919                     grid->bb[bbo].upper[BB_Y],
 920                     grid->bb[bbo].upper[BB_Z]);
 921         }
 922     }
 923 }
 924
 925 /* Spatially sort the atoms within one grid column */
 926 static void sort_columns_simple(nbnxn_search *nbs,
 927                                 int dd_zone,
 928                                 nbnxn_grid_t *grid,
 929                                 int atomStart, int atomEnd,
 930                                 const int *atinfo,
 931                                 gmx::ArrayRef<const gmx::RVec> x,
 932                                 nbnxn_atomdata_t *nbat,
 933                                 int cxy_start, int cxy_end,
 934                                 gmx::ArrayRef<int> sort_work)
 935 {
 936     if (debug)
 937     {
 938         fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 939                 grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
 940     }
 941
 942     const bool relevantAtomsAreWithinGridBounds = (grid->maxAtomGroupRadius == 0);
 943
 944     /* Sort the atoms within each x,y column in 3 dimensions */
 945     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
 946     {
 947         int na  = grid->cxy_na[cxy];
 948         int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 949         int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 950
 951         /* Sort the atoms within each x,y column on z coordinate */
 952         sort_atoms(ZZ, FALSE, dd_zone,
 953                    relevantAtomsAreWithinGridBounds,
 954                    nbs->a.data() + ash, na, x,
 955                    grid->c0[ZZ],
 956                    1.0/grid->size[ZZ], ncz*grid->na_sc,
 957                    sort_work);
 958
 959         /* Fill the ncz cells in this column */
 960         int cfilled = grid->cxy_ind[cxy];
 961         for (int cz = 0; cz < ncz; cz++)
 962         {
 963             int c     = grid->cxy_ind[cxy] + cz;
 964
 965             int ash_c = ash + cz*grid->na_sc;
 966             int na_c  = std::min(grid->na_sc, na-(ash_c-ash));
 967
 968             fill_cell(nbs, grid, nbat,
 969                       ash_c, ash_c+na_c, atinfo, x,
 970                       nullptr);
 971
 972             /* This copy to bbcz is not really necessary.
 973              * But it allows to use the same grid search code
 974              * for the simple and supersub cell setups.
 975              */
 976             if (na_c > 0)
 977             {
 978                 cfilled = c;
 979             }
 980             grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
 981             grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
 982         }
 983
 984         /* Set the unused atom indices to -1 */
 985         for (int ind = na; ind < ncz*grid->na_sc; ind++)
 986         {
 987             nbs->a[ash+ind] = -1;
 988         }
 989     }
 990 }
 991
 992 /* Spatially sort the atoms within one grid column */
 993 static void sort_columns_supersub(nbnxn_search *nbs,
 994                                   int dd_zone,
 995                                   nbnxn_grid_t *grid,
 996                                   int atomStart, int atomEnd,
 997                                   const int *atinfo,
 998                                   gmx::ArrayRef<const gmx::RVec> x,
 999                                   nbnxn_atomdata_t *nbat,
1000                                   int cxy_start, int cxy_end,
1001                                   gmx::ArrayRef<int> sort_work)
1002 {
1003     nbnxn_bb_t  bb_work_array[2];
1004     nbnxn_bb_t *bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1005
1006     if (debug)
1007     {
1008         fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1009                 grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
1010     }
1011
1012     const bool relevantAtomsAreWithinGridBounds = (grid->maxAtomGroupRadius == 0);
1013
1014     int        subdiv_x = grid->na_c;
1015     int        subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1016     int        subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
1017
1018     /* Sort the atoms within each x,y column in 3 dimensions.
1019      * Loop over all columns on the x/y grid.
1020      */
1021     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1022     {
1023         int gridX            = cxy/grid->numCells[YY];
1024         int gridY            = cxy - gridX*grid->numCells[YY];
1025
1026         int numAtomsInColumn = grid->cxy_na[cxy];
1027         int numCellsInColumn = grid->cxy_ind[cxy + 1] - grid->cxy_ind[cxy];
1028         int ash              = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1029
1030         /* Sort the atoms within each x,y column on z coordinate */
1031         sort_atoms(ZZ, FALSE, dd_zone,
1032                    relevantAtomsAreWithinGridBounds,
1033                    nbs->a.data() + ash, numAtomsInColumn, x,
1034                    grid->c0[ZZ],
1035                    1.0/grid->size[ZZ], numCellsInColumn*grid->na_sc,
1036                    sort_work);
1037
1038         /* This loop goes over the cells and clusters along z at once */
1039         for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
1040         {
1041             int ash_z = ash + sub_z*subdiv_z;
1042             int na_z  = std::min(subdiv_z, numAtomsInColumn - (ash_z - ash));
1043             int cz    = -1;
1044             /* We have already sorted on z */
1045
1046             if (sub_z % c_gpuNumClusterPerCellZ == 0)
1047             {
1048                 cz = sub_z/c_gpuNumClusterPerCellZ;
1049                 int cell = grid->cxy_ind[cxy] + cz;
1050
1051                 /* The number of atoms in this cell/super-cluster */
1052                 int numAtoms = std::min(grid->na_sc, numAtomsInColumn - (ash_z - ash));
1053
1054                 grid->nsubc[cell] = std::min(c_gpuNumClusterPerCell,
1055                                              (numAtoms + grid->na_c - 1)/grid->na_c);
1056
1057                 /* Store the z-boundaries of the bounding box of the cell */
1058                 grid->bbcz[cell*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
1059                 grid->bbcz[cell*NNBSBB_D+1] = x[nbs->a[ash_z + numAtoms - 1]][ZZ];
1060             }
1061
1062             if (c_gpuNumClusterPerCellY > 1)
1063             {
1064                 /* Sort the atoms along y */
1065                 sort_atoms(YY, (sub_z & 1) != 0, dd_zone,
1066                            relevantAtomsAreWithinGridBounds,
1067                            nbs->a.data() + ash_z, na_z, x,
1068                            grid->c0[YY] + gridY*grid->cellSize[YY],
1069                            grid->invCellSize[YY], subdiv_z,
1070                            sort_work);
1071             }
1072
1073             for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1074             {
1075                 int ash_y = ash_z + sub_y*subdiv_y;
1076                 int na_y  = std::min(subdiv_y, numAtomsInColumn - (ash_y - ash));
1077
1078                 if (c_gpuNumClusterPerCellX > 1)
1079                 {
1080                     /* Sort the atoms along x */
1081                     sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1) != 0, dd_zone,
1082                                relevantAtomsAreWithinGridBounds,
1083                                nbs->a.data() + ash_y, na_y, x,
1084                                grid->c0[XX] + gridX*grid->cellSize[XX],
1085                                grid->invCellSize[XX], subdiv_y,
1086                                sort_work);
1087                 }
1088
1089                 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1090                 {
1091                     int ash_x = ash_y + sub_x*subdiv_x;
1092                     int na_x  = std::min(subdiv_x, numAtomsInColumn - (ash_x - ash));
1093
1094                     fill_cell(nbs, grid, nbat,
1095                               ash_x, ash_x + na_x, atinfo, x,
1096                               bb_work_aligned);
1097                 }
1098             }
1099         }
1100
1101         /* Set the unused atom indices to -1 */
1102         for (int ind = numAtomsInColumn; ind < numCellsInColumn*grid->na_sc; ind++)
1103         {
1104             nbs->a[ash + ind] = -1;
1105         }
1106     }
1107 }
1108
1109 /* Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
1110 static void setCellAndAtomCount(gmx::ArrayRef<int>  cell,
1111                                 int                 cellIndex,
1112                                 gmx::ArrayRef<int>  cxy_na,
1113                                 int                 atomIndex)
1114 {
1115     cell[atomIndex]    = cellIndex;
1116     cxy_na[cellIndex] += 1;
1117 }
1118
1119 /* Determine in which grid column atoms should go */
1120 static void calc_column_indices(nbnxn_grid_t *grid,
1121                                 const gmx::UpdateGroupsCog *updateGroupsCog,
1122                                 int atomStart, int atomEnd,
1123                                 gmx::ArrayRef<const gmx::RVec> x,
1124                                 int dd_zone, const int *move,
1125                                 int thread, int nthread,
1126                                 gmx::ArrayRef<int> cell,
1127                                 gmx::ArrayRef<int> cxy_na)
1128 {
1129     /* We add one extra cell for particles which moved during DD */
1130     for (int i = 0; i < grid->numCells[XX]*grid->numCells[YY] + 1; i++)
1131     {
1132         cxy_na[i] = 0;
1133     }
1134
1135     int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
1136     int taskAtomEnd   = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
1137     if (dd_zone == 0)
1138     {
1139         /* Home zone */
1140         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1141         {
1142             if (move == nullptr || move[i] >= 0)
1143             {
1144
1145                 const gmx::RVec &coord = (updateGroupsCog ? updateGroupsCog->cogForAtom(i) : x[i]);
1146
1147                 /* We need to be careful with rounding,
1148                  * particles might be a few bits outside the local zone.
1149                  * The int cast takes care of the lower bound,
1150                  * we will explicitly take care of the upper bound.
1151                  */
1152                 int cx = static_cast<int>((coord[XX] - grid->c0[XX])*grid->invCellSize[XX]);
1153                 int cy = static_cast<int>((coord[YY] - grid->c0[YY])*grid->invCellSize[YY]);
1154
1155 #ifndef NDEBUG
1156                 if (cx < 0 || cx > grid->numCells[XX] ||
1157                     cy < 0 || cy > grid->numCells[YY])
1158                 {
1159                     gmx_fatal(FARGS,
1160                               "grid cell cx %d cy %d out of range (max %d %d)\n"
1161                               "atom %f %f %f, grid->c0 %f %f",
1162                               cx, cy, grid->numCells[XX], grid->numCells[YY],
1163                               x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1164                 }
1165 #endif
1166                 /* Take care of potential rouding issues */
1167                 cx = std::min(cx, grid->numCells[XX] - 1);
1168                 cy = std::min(cy, grid->numCells[YY] - 1);
1169
1170                 /* For the moment cell will contain only the, grid local,
1171                  * x and y indices, not z.
1172                  */
1173                 setCellAndAtomCount(cell, cx*grid->numCells[YY] + cy, cxy_na, i);
1174             }
1175             else
1176             {
1177                 /* Put this moved particle after the end of the grid,
1178                  * so we can process it later without using conditionals.
1179                  */
1180                 setCellAndAtomCount(cell, grid->numCells[XX]*grid->numCells[YY], cxy_na, i);
1181             }
1182         }
1183     }
1184     else
1185     {
1186         /* Non-home zone */
1187         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1188         {
1189             int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->invCellSize[XX]);
1190             int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->invCellSize[YY]);
1191
1192             /* For non-home zones there could be particles outside
1193              * the non-bonded cut-off range, which have been communicated
1194              * for bonded interactions only. For the result it doesn't
1195              * matter where these end up on the grid. For performance
1196              * we put them in an extra row at the border.
1197              */
1198             cx = std::max(cx, 0);
1199             cx = std::min(cx, grid->numCells[XX] - 1);
1200             cy = std::max(cy, 0);
1201             cy = std::min(cy, grid->numCells[YY] - 1);
1202
1203             /* For the moment cell will contain only the, grid local,
1204              * x and y indices, not z.
1205              */
1206             setCellAndAtomCount(cell, cx*grid->numCells[YY] + cy, cxy_na, i);
1207         }
1208     }
1209 }
1210
1211 /* Resizes grid and atom data which depend on the number of cells */
1212 static void resizeForNumberOfCells(const nbnxn_grid_t &grid,
1213                                    int                 numAtomsMoved,
1214                                    nbnxn_search       *nbs,
1215                                    nbnxn_atomdata_t   *nbat)
1216 {
1217     int numNbnxnAtoms = (grid.cell0 + grid.nc)*grid.na_sc;
1218
1219     /* Note: nbs->cell was already resized before */
1220
1221     /* To avoid conditionals we store the moved particles at the end of a,
1222      * make sure we have enough space.
1223      */
1224     nbs->a.resize(numNbnxnAtoms + numAtomsMoved);
1225
1226     /* Make space in nbat for storing the atom coordinates */
1227     nbat->resizeCoordinateBuffer(numNbnxnAtoms);
1228 }
1229
1230 /* Determine in which grid cells the atoms should go */
1231 static void
1232 calc_cell_indices(nbnxn_search                   *nbs,
1233                   int                             ddZone,
1234                   nbnxn_grid_t                   *grid,
1235                   const gmx::UpdateGroupsCog     *updateGroupsCog,
1236                   int                             atomStart,
1237                   int                             atomEnd,
1238                   const int                      *atinfo,
1239                   gmx::ArrayRef<const gmx::RVec>  x,
1240                   int                             numAtomsMoved,
1241                   const int                      *move,
1242                   nbnxn_atomdata_t               *nbat)
1243 {
1244     /* First compute all grid/column indices and store them in nbs->cell */
1245     nbs->cell.resize(atomEnd);
1246
1247     const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1248
1249 #pragma omp parallel for num_threads(nthread) schedule(static)
1250     for (int thread = 0; thread < nthread; thread++)
1251     {
1252         try
1253         {
1254             calc_column_indices(grid, updateGroupsCog,
1255                                 atomStart, atomEnd, x,
1256                                 ddZone, move, thread, nthread,
1257                                 nbs->cell, nbs->work[thread].cxy_na);
1258         }
1259         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1260     }
1261
1262     /* Make the cell index as a function of x and y */
1263     int ncz_max      = 0;
1264     int ncz          = 0;
1265     grid->cxy_ind[0] = 0;
1266     for (int i = 0; i < grid->numCells[XX]*grid->numCells[YY] + 1; i++)
1267     {
1268         /* We set ncz_max at the beginning of the loop iso at the end
1269          * to skip i=grid->ncx*grid->numCells[YY] which are moved particles
1270          * that do not need to be ordered on the grid.
1271          */
1272         if (ncz > ncz_max)
1273         {
1274             ncz_max = ncz;
1275         }
1276         int cxy_na_i = nbs->work[0].cxy_na[i];
1277         for (int thread = 1; thread < nthread; thread++)
1278         {
1279             cxy_na_i += nbs->work[thread].cxy_na[i];
1280         }
1281         ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1282         if (nbat->XFormat == nbatX8)
1283         {
1284             /* Make the number of cell a multiple of 2 */
1285             ncz = (ncz + 1) & ~1;
1286         }
1287         grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1288         /* Clear cxy_na, so we can reuse the array below */
1289         grid->cxy_na[i] = 0;
1290     }
1291     grid->nc = grid->cxy_ind[grid->numCells[XX]*grid->numCells[YY]] - grid->cxy_ind[0];
1292
1293     resizeForNumberOfCells(*grid, numAtomsMoved, nbs, nbat);
1294
1295     if (debug)
1296     {
1297         fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1298                 grid->na_sc, grid->na_c, grid->nc,
1299                 grid->numCells[XX], grid->numCells[YY], grid->nc/(static_cast<double>(grid->numCells[XX]*grid->numCells[YY])),
1300                 ncz_max);
1301         if (gmx_debug_at)
1302         {
1303             int i = 0;
1304             for (int cy = 0; cy < grid->numCells[YY]; cy++)
1305             {
1306                 for (int cx = 0; cx < grid->numCells[XX]; cx++)
1307                 {
1308                     fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1309                     i++;
1310                 }
1311                 fprintf(debug, "\n");
1312             }
1313         }
1314     }
1315
1316     /* Make sure the work array for sorting is large enough */
1317     if (ncz_max*grid->na_sc*SGSF > gmx::index(nbs->work[0].sortBuffer.size()))
1318     {
1319         for (nbnxn_search_work_t &work : nbs->work)
1320         {
1321             /* Elements not in use should be -1 */
1322             work.sortBuffer.resize(ncz_max*grid->na_sc*SGSF, -1);
1323         }
1324     }
1325
1326     /* Now we know the dimensions we can fill the grid.
1327      * This is the first, unsorted fill. We sort the columns after this.
1328      */
1329     for (int i = atomStart; i < atomEnd; i++)
1330     {
1331         /* At this point nbs->cell contains the local grid x,y indices */
1332         int cxy = nbs->cell[i];
1333         nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1334     }
1335
1336     if (ddZone == 0)
1337     {
1338         /* Set the cell indices for the moved particles */
1339         int n0 = grid->nc*grid->na_sc;
1340         int n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->numCells[XX]*grid->numCells[YY]];
1341         if (ddZone == 0)
1342         {
1343             for (int i = n0; i < n1; i++)
1344             {
1345                 nbs->cell[nbs->a[i]] = i;
1346             }
1347         }
1348     }
1349
1350     /* Sort the super-cell columns along z into the sub-cells. */
1351 #pragma omp parallel for num_threads(nthread) schedule(static)
1352     for (int thread = 0; thread < nthread; thread++)
1353     {
1354         try
1355         {
1356             int columnStart = ((thread + 0)*grid->numCells[XX]*grid->numCells[YY])/nthread;
1357             int columnEnd   = ((thread + 1)*grid->numCells[XX]*grid->numCells[YY])/nthread;
1358             if (grid->bSimple)
1359             {
1360                 sort_columns_simple(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
1361                                     columnStart, columnEnd,
1362                                     nbs->work[thread].sortBuffer);
1363             }
1364             else
1365             {
1366                 sort_columns_supersub(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
1367                                       columnStart, columnEnd,
1368                                       nbs->work[thread].sortBuffer);
1369             }
1370         }
1371         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1372     }
1373
1374     if (grid->bSimple && nbat->XFormat == nbatX8)
1375     {
1376         combine_bounding_box_pairs(grid, grid->bb);
1377     }
1378
1379     if (!grid->bSimple)
1380     {
1381         grid->nsubc_tot = 0;
1382         for (int i = 0; i < grid->nc; i++)
1383         {
1384             grid->nsubc_tot += grid->nsubc[i];
1385         }
1386     }
1387
1388     if (debug)
1389     {
1390         if (grid->bSimple)
1391         {
1392             print_bbsizes_simple(debug, grid);
1393         }
1394         else
1395         {
1396             fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1397                     grid->nsubc_tot, (atomEnd - atomStart)/static_cast<double>(grid->nsubc_tot));
1398
1399             print_bbsizes_supersub(debug, grid);
1400         }
1401     }
1402 }
1403
1404 /* Sets up a grid and puts the atoms on the grid.
1405  * This function only operates on one domain of the domain decompostion.
1406  * Note that without domain decomposition there is only one domain.
1407  */
1408 void nbnxn_put_on_grid(nbnxn_search_t                  nbs,
1409                        int                             ePBC,
1410                        const matrix                    box,
1411                        int                             ddZone,
1412                        const rvec                      lowerCorner,
1413                        const rvec                      upperCorner,
1414                        const gmx::UpdateGroupsCog     *updateGroupsCog,
1415                        int                             atomStart,
1416                        int                             atomEnd,
1417                        real                            atomDensity,
1418                        const int                      *atinfo,
1419                        gmx::ArrayRef<const gmx::RVec>  x,
1420                        int                             numAtomsMoved,
1421                        const int                      *move,
1422                        int                             nb_kernel_type,
1423                        nbnxn_atomdata_t               *nbat)
1424 {
1425     nbnxn_grid_t *grid = &nbs->grid[ddZone];
1426
1427     nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1428
1429     grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1430
1431     grid->na_c      = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
1432     grid->na_cj     = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
1433     grid->na_sc     = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
1434     grid->na_c_2log = get_2log(grid->na_c);
1435
1436     if (ddZone == 0)
1437     {
1438         grid->cell0 = 0;
1439     }
1440     else
1441     {
1442         grid->cell0 =
1443             (nbs->grid[ddZone - 1].cell0 + nbs->grid[ddZone - 1].nc)*
1444             nbs->grid[ddZone- 1].na_sc/grid->na_sc;
1445     }
1446
1447     const int n = atomEnd - atomStart;
1448
1449     if (ddZone == 0)
1450     {
1451         nbs->ePBC = ePBC;
1452         copy_mat(box, nbs->box);
1453
1454         /* Avoid zero density */
1455         if (atomDensity > 0)
1456         {
1457             grid->atom_density = atomDensity;
1458         }
1459         else
1460         {
1461             grid->atom_density = grid_atom_density(n - numAtomsMoved, lowerCorner, upperCorner);
1462         }
1463
1464         grid->cell0 = 0;
1465
1466         nbs->natoms_local    = atomEnd - numAtomsMoved;
1467         /* We assume that nbnxn_put_on_grid is called first
1468          * for the local atoms (ddZone=0).
1469          */
1470         nbs->natoms_nonlocal = atomEnd - numAtomsMoved;
1471
1472         /* When not using atom groups, all atoms should be within the grid bounds */
1473         grid->maxAtomGroupRadius = (updateGroupsCog ? updateGroupsCog->maxUpdateGroupRadius() : 0);
1474         /* For the non-local grids the situation is the same as for the local */
1475         for (size_t g = 1; g < nbs->grid.size(); g++)
1476         {
1477             nbs->grid[g].maxAtomGroupRadius = grid->maxAtomGroupRadius;
1478         }
1479
1480         if (debug)
1481         {
1482             fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
1483                     nbs->natoms_local, grid->atom_density);
1484         }
1485     }
1486     else
1487     {
1488         nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, atomEnd);
1489     }
1490
1491     /* We always use the home zone (grid[0]) for setting the cell size,
1492      * since determining densities for non-local zones is difficult.
1493      */
1494     set_grid_size_xy(nbs, grid,
1495                      ddZone, n - numAtomsMoved,
1496                      lowerCorner, upperCorner,
1497                      nbs->grid[0].atom_density);
1498
1499     calc_cell_indices(nbs, ddZone, grid, updateGroupsCog, atomStart, atomEnd, atinfo, x, numAtomsMoved, move, nbat);
1500
1501     if (ddZone == 0)
1502     {
1503         nbat->natoms_local = nbat->numAtoms();
1504     }
1505     if (ddZone == gmx::ssize(nbs->grid) - 1)
1506     {
1507         /* We are done setting up all grids, we can resize the force buffers */
1508         nbat->resizeForceBuffers();
1509     }
1510
1511     nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1512 }
1513
1514 /* Calls nbnxn_put_on_grid for all non-local domains */
1515 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t                   nbs,
1516                                 const struct gmx_domdec_zones_t *zones,
1517                                 const int                       *atinfo,
1518                                 gmx::ArrayRef<const gmx::RVec>   x,
1519                                 int                              nb_kernel_type,
1520                                 nbnxn_atomdata_t                *nbat)
1521 {
1522     for (int zone = 1; zone < zones->n; zone++)
1523     {
1524         rvec c0, c1;
1525         for (int d = 0; d < DIM; d++)
1526         {
1527             c0[d] = zones->size[zone].bb_x0[d];
1528             c1[d] = zones->size[zone].bb_x1[d];
1529         }
1530
1531         nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
1532                           zone, c0, c1,
1533                           nullptr,
1534                           zones->cg_range[zone],
1535                           zones->cg_range[zone+1],
1536                           -1,
1537                           atinfo,
1538                           x,
1539                           0, nullptr,
1540                           nb_kernel_type,
1541                           nbat);
1542     }
1543 }
1544
1545 void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1546 {
1547     *ncx = nbs->grid[0].numCells[XX];
1548     *ncy = nbs->grid[0].numCells[YY];
1549 }
1550
1551 gmx::ArrayRef<const int> nbnxn_get_atomorder(const nbnxn_search *nbs)
1552 {
1553     /* Return the atom order for the home cell (index 0) */
1554     const nbnxn_grid_t &grid       = nbs->grid[0];
1555
1556     int                 numIndices = grid.cxy_ind[grid.numCells[XX]*grid.numCells[YY]]*grid.na_sc;
1557
1558     return gmx::constArrayRefFromArray(nbs->a.data(), numIndices);
1559 }
1560
1561 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1562 {
1563     /* Set the atom order for the home cell (index 0) */
1564     nbnxn_grid_t *grid = &nbs->grid[0];
1565
1566     int           ao = 0;
1567     for (int cx = 0; cx < grid->numCells[XX]; cx++)
1568     {
1569         for (int cy = 0; cy < grid->numCells[YY]; cy++)
1570         {
1571             int cxy = cx*grid->numCells[YY] + cy;
1572             int j   = grid->cxy_ind[cxy]*grid->na_sc;
1573             for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)
1574             {
1575                 nbs->a[j]     = ao;
1576                 nbs->cell[ao] = j;
1577                 ao++;
1578                 j++;
1579             }
1580         }
1581     }
1582 }
1583
1584 gmx::ArrayRef<const int> nbnxn_get_gridindices(const nbnxn_search *nbs)
1585 {
1586     return nbs->cell;
1587 }