src/gromacs/mdlib/nbnxn_grid.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "nbnxn_grid.h"
  39
  40 #include <string.h>
  41
  42 #include <cmath>
  43
  44 #include <algorithm>
  45
  46 #include "gromacs/domdec/domdec_struct.h"
  47 #include "gromacs/math/utilities.h"
  48 #include "gromacs/math/vec.h"
  49 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  50 #include "gromacs/mdlib/nb_verlet.h"
  51 #include "gromacs/mdlib/nbnxn_atomdata.h"
  52 #include "gromacs/mdlib/nbnxn_consts.h"
  53 #include "gromacs/mdlib/nbnxn_internal.h"
  54 #include "gromacs/mdlib/nbnxn_search.h"
  55 #include "gromacs/mdlib/nbnxn_util.h"
  56 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
  57 #include "gromacs/simd/simd.h"
  58 #include "gromacs/simd/vector_operations.h"
  59 #include "gromacs/utility/exceptions.h"
  60 #include "gromacs/utility/smalloc.h"
  61
  62 struct gmx_domdec_zones_t;
  63
  64 static real grid_atom_density(int        numAtoms,
  65                               const rvec lowerCorner,
  66                               const rvec upperCorner)
  67 {
  68     rvec size;
  69
  70     if (numAtoms == 0)
  71     {
  72         /* To avoid zero density we use a minimum of 1 atom */
  73         numAtoms = 1;
  74     }
  75
  76     rvec_sub(upperCorner, lowerCorner, size);
  77
  78     return numAtoms/(size[XX]*size[YY]*size[ZZ]);
  79 }
  80
  81 static void set_grid_size_xy(const nbnxn_search   *nbs,
  82                              nbnxn_grid_t         *grid,
  83                              int                   ddZone,
  84                              int                   numAtoms,
  85                              const rvec            lowerCorner,
  86                              const rvec            upperCorner,
  87                              real                  atomDensity)
  88 {
  89     rvec size;
  90     real tlen, tlen_x, tlen_y;
  91
  92     rvec_sub(upperCorner, lowerCorner, size);
  93
  94     if (numAtoms > grid->na_sc)
  95     {
  96         GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
  97
  98         /* target cell length */
  99         if (grid->bSimple)
 100         {
 101             /* To minimize the zero interactions, we should make
 102              * the largest of the i/j cell cubic.
 103              */
 104             int numAtomsInCell = std::max(grid->na_c, grid->na_cj);
 105
 106             /* Approximately cubic cells */
 107             tlen   = std::cbrt(numAtomsInCell/atomDensity);
 108             tlen_x = tlen;
 109             tlen_y = tlen;
 110         }
 111         else
 112         {
 113             /* Approximately cubic sub cells */
 114             tlen   = std::cbrt(grid->na_c/atomDensity);
 115             tlen_x = tlen*c_gpuNumClusterPerCellX;
 116             tlen_y = tlen*c_gpuNumClusterPerCellY;
 117         }
 118         /* We round ncx and ncy down, because we get less cell pairs
 119          * in the nbsist when the fixed cell dimensions (x,y) are
 120          * larger than the variable one (z) than the other way around.
 121          */
 122         grid->numCells[XX] = std::max(1, static_cast<int>(size[XX]/tlen_x));
 123         grid->numCells[YY] = std::max(1, static_cast<int>(size[YY]/tlen_y));
 124     }
 125     else
 126     {
 127         grid->numCells[XX] = 1;
 128         grid->numCells[YY] = 1;
 129     }
 130
 131     for (int d = 0; d < DIM - 1; d++)
 132     {
 133         grid->cellSize[d]    = size[d]/grid->numCells[d];
 134         grid->invCellSize[d] = 1/grid->cellSize[d];
 135     }
 136
 137     if (ddZone > 0)
 138     {
 139         /* This is a non-home zone, add an extra row of cells
 140          * for particles communicated for bonded interactions.
 141          * These can be beyond the cut-off. It doesn't matter where
 142          * they end up on the grid, but for performance it's better
 143          * if they don't end up in cells that can be within cut-off range.
 144          */
 145         grid->numCells[XX]++;
 146         grid->numCells[YY]++;
 147     }
 148
 149     /* We need one additional cell entry for particles moved by DD */
 150     int numCellsXY = grid->numCells[XX]*grid->numCells[YY];
 151     grid->cxy_na.resize(numCellsXY + 1);
 152     grid->cxy_ind.resize(numCellsXY + 2);
 153
 154     for (nbnxn_search_work_t &work : nbs->work)
 155     {
 156         work.cxy_na.resize(numCellsXY + 1);
 157     }
 158
 159     /* Worst case scenario of 1 atom in each last cell */
 160     int maxNumCells;
 161     if (grid->na_cj <= grid->na_c)
 162     {
 163         maxNumCells = numAtoms/grid->na_sc + numCellsXY;
 164     }
 165     else
 166     {
 167         maxNumCells = numAtoms/grid->na_sc + numCellsXY*grid->na_cj/grid->na_c;
 168     }
 169
 170     grid->nsubc.resize(maxNumCells);
 171     grid->bbcz.resize(maxNumCells*NNBSBB_D);
 172
 173     /* This resize also zeros the contents, this avoid possible
 174      * floating exceptions in SIMD with the unused bb elements.
 175      */
 176     if (grid->bSimple)
 177     {
 178         grid->bb.resize(maxNumCells);
 179     }
 180     else
 181     {
 182 #if NBNXN_BBXXXX
 183         grid->pbb.resize(maxNumCells*c_gpuNumClusterPerCell/STRIDE_PBB*NNBSBB_XXXX);
 184 #else
 185         grid->bb.resize(maxNumCells*c_gpuNumClusterPerCell);
 186 #endif
 187     }
 188
 189     if (grid->bSimple)
 190     {
 191         if (grid->na_cj == grid->na_c)
 192         {
 193             grid->bbj = grid->bb;
 194         }
 195         else
 196         {
 197             grid->bbjStorage.resize(maxNumCells*grid->na_c/grid->na_cj);
 198             grid->bbj = grid->bbjStorage;
 199         }
 200     }
 201
 202     grid->flags.resize(maxNumCells);
 203     if (nbs->bFEP)
 204     {
 205         grid->fep.resize(maxNumCells*grid->na_sc/grid->na_c);
 206     }
 207
 208     copy_rvec(lowerCorner, grid->c0);
 209     copy_rvec(upperCorner, grid->c1);
 210     copy_rvec(size,        grid->size);
 211 }
 212
 213 /* We need to sort paricles in grid columns on z-coordinate.
 214  * As particle are very often distributed homogeneously, we a sorting
 215  * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 216  * by a factor, cast to an int and try to store in that hole. If the hole
 217  * is full, we move this or another particle. A second pass is needed to make
 218  * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 219  * 4 is the optimal value for homogeneous particle distribution and allows
 220  * for an O(#particles) sort up till distributions were all particles are
 221  * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 222  * as it can be expensive to detect imhomogeneous particle distributions.
 223  * SGSF is the maximum ratio of holes used, in the worst case all particles
 224  * end up in the last hole and we need #particles extra holes at the end.
 225  */
 226 #define SORT_GRID_OVERSIZE 4
 227 #define SGSF (SORT_GRID_OVERSIZE + 1)
 228
 229 /* Sort particle index a on coordinates x along dim.
 230  * Backwards tells if we want decreasing iso increasing coordinates.
 231  * h0 is the minimum of the coordinate range.
 232  * invh is the 1/length of the sorting range.
 233  * n_per_h (>=n) is the expected average number of particles per 1/invh
 234  * sort is the sorting work array.
 235  * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 236  * or easier, allocate at least n*SGSF elements.
 237  */
 238 static void sort_atoms(int dim, gmx_bool Backwards,
 239                        int gmx_unused dd_zone,
 240                        int *a, int n, const rvec *x,
 241                        real h0, real invh, int n_per_h,
 242                        gmx::ArrayRef<int> sort)
 243 {
 244     if (n <= 1)
 245     {
 246         /* Nothing to do */
 247         return;
 248     }
 249
 250 #ifndef NDEBUG
 251     if (n > n_per_h)
 252     {
 253         gmx_incons("n > n_per_h");
 254     }
 255 #endif
 256
 257     /* Transform the inverse range height into the inverse hole height */
 258     invh *= n_per_h*SORT_GRID_OVERSIZE;
 259
 260     /* Set nsort to the maximum possible number of holes used.
 261      * In worst case all n elements end up in the last bin.
 262      */
 263     int nsort = n_per_h*SORT_GRID_OVERSIZE + n;
 264
 265     /* Determine the index range used, so we can limit it for the second pass */
 266     int zi_min = INT_MAX;
 267     int zi_max = -1;
 268
 269     /* Sort the particles using a simple index sort */
 270     for (int i = 0; i < n; i++)
 271     {
 272         /* The cast takes care of float-point rounding effects below zero.
 273          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 274          * times the box height out of the box.
 275          */
 276         int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
 277
 278 #ifndef NDEBUG
 279         /* As we can have rounding effect, we use > iso >= here */
 280         if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
 281         {
 282             gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
 283                       a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
 284                       n_per_h, SORT_GRID_OVERSIZE);
 285         }
 286 #endif
 287
 288         /* In a non-local domain, particles communcated for bonded interactions
 289          * can be far beyond the grid size, which is set by the non-bonded
 290          * cut-off distance. We sort such particles into the last cell.
 291          */
 292         if (zi > n_per_h*SORT_GRID_OVERSIZE)
 293         {
 294             zi = n_per_h*SORT_GRID_OVERSIZE;
 295         }
 296
 297         /* Ideally this particle should go in sort cell zi,
 298          * but that might already be in use,
 299          * in that case find the first empty cell higher up
 300          */
 301         if (sort[zi] < 0)
 302         {
 303             sort[zi] = a[i];
 304             zi_min   = std::min(zi_min, zi);
 305             zi_max   = std::max(zi_max, zi);
 306         }
 307         else
 308         {
 309             /* We have multiple atoms in the same sorting slot.
 310              * Sort on real z for minimal bounding box size.
 311              * There is an extra check for identical z to ensure
 312              * well-defined output order, independent of input order
 313              * to ensure binary reproducibility after restarts.
 314              */
 315             while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 316                                       (x[a[i]][dim] == x[sort[zi]][dim] &&
 317                                        a[i] > sort[zi])))
 318             {
 319                 zi++;
 320             }
 321
 322             if (sort[zi] >= 0)
 323             {
 324                 /* Shift all elements by one slot until we find an empty slot */
 325                 int cp  = sort[zi];
 326                 int zim = zi + 1;
 327                 while (sort[zim] >= 0)
 328                 {
 329                     int tmp   = sort[zim];
 330                     sort[zim] = cp;
 331                     cp        = tmp;
 332                     zim++;
 333                 }
 334                 sort[zim] = cp;
 335                 zi_max    = std::max(zi_max, zim);
 336             }
 337             sort[zi] = a[i];
 338             zi_max   = std::max(zi_max, zi);
 339         }
 340     }
 341
 342     int c = 0;
 343     if (!Backwards)
 344     {
 345         for (int zi = 0; zi < nsort; zi++)
 346         {
 347             if (sort[zi] >= 0)
 348             {
 349                 a[c++]   = sort[zi];
 350                 sort[zi] = -1;
 351             }
 352         }
 353     }
 354     else
 355     {
 356         for (int zi = zi_max; zi >= zi_min; zi--)
 357         {
 358             if (sort[zi] >= 0)
 359             {
 360                 a[c++]   = sort[zi];
 361                 sort[zi] = -1;
 362             }
 363         }
 364     }
 365     if (c < n)
 366     {
 367         gmx_incons("Lost particles while sorting");
 368     }
 369 }
 370
 371 #if GMX_DOUBLE
 372 #define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
 373 #define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
 374 #else
 375 #define R2F_D(x) (x)
 376 #define R2F_U(x) (x)
 377 #endif
 378
 379 /* Coordinate order x,y,z, bb order xyz0 */
 380 static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
 381 {
 382     int  i;
 383     real xl, xh, yl, yh, zl, zh;
 384
 385     i  = 0;
 386     xl = x[i+XX];
 387     xh = x[i+XX];
 388     yl = x[i+YY];
 389     yh = x[i+YY];
 390     zl = x[i+ZZ];
 391     zh = x[i+ZZ];
 392     i += stride;
 393     for (int j = 1; j < na; j++)
 394     {
 395         xl = std::min(xl, x[i+XX]);
 396         xh = std::max(xh, x[i+XX]);
 397         yl = std::min(yl, x[i+YY]);
 398         yh = std::max(yh, x[i+YY]);
 399         zl = std::min(zl, x[i+ZZ]);
 400         zh = std::max(zh, x[i+ZZ]);
 401         i += stride;
 402     }
 403     /* Note: possible double to float conversion here */
 404     bb->lower[BB_X] = R2F_D(xl);
 405     bb->lower[BB_Y] = R2F_D(yl);
 406     bb->lower[BB_Z] = R2F_D(zl);
 407     bb->upper[BB_X] = R2F_U(xh);
 408     bb->upper[BB_Y] = R2F_U(yh);
 409     bb->upper[BB_Z] = R2F_U(zh);
 410 }
 411
 412 /* Packed coordinates, bb order xyz0 */
 413 static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
 414 {
 415     real xl, xh, yl, yh, zl, zh;
 416
 417     xl = x[XX*c_packX4];
 418     xh = x[XX*c_packX4];
 419     yl = x[YY*c_packX4];
 420     yh = x[YY*c_packX4];
 421     zl = x[ZZ*c_packX4];
 422     zh = x[ZZ*c_packX4];
 423     for (int j = 1; j < na; j++)
 424     {
 425         xl = std::min(xl, x[j+XX*c_packX4]);
 426         xh = std::max(xh, x[j+XX*c_packX4]);
 427         yl = std::min(yl, x[j+YY*c_packX4]);
 428         yh = std::max(yh, x[j+YY*c_packX4]);
 429         zl = std::min(zl, x[j+ZZ*c_packX4]);
 430         zh = std::max(zh, x[j+ZZ*c_packX4]);
 431     }
 432     /* Note: possible double to float conversion here */
 433     bb->lower[BB_X] = R2F_D(xl);
 434     bb->lower[BB_Y] = R2F_D(yl);
 435     bb->lower[BB_Z] = R2F_D(zl);
 436     bb->upper[BB_X] = R2F_U(xh);
 437     bb->upper[BB_Y] = R2F_U(yh);
 438     bb->upper[BB_Z] = R2F_U(zh);
 439 }
 440
 441 /* Packed coordinates, bb order xyz0 */
 442 static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
 443 {
 444     real xl, xh, yl, yh, zl, zh;
 445
 446     xl = x[XX*c_packX8];
 447     xh = x[XX*c_packX8];
 448     yl = x[YY*c_packX8];
 449     yh = x[YY*c_packX8];
 450     zl = x[ZZ*c_packX8];
 451     zh = x[ZZ*c_packX8];
 452     for (int j = 1; j < na; j++)
 453     {
 454         xl = std::min(xl, x[j+XX*c_packX8]);
 455         xh = std::max(xh, x[j+XX*c_packX8]);
 456         yl = std::min(yl, x[j+YY*c_packX8]);
 457         yh = std::max(yh, x[j+YY*c_packX8]);
 458         zl = std::min(zl, x[j+ZZ*c_packX8]);
 459         zh = std::max(zh, x[j+ZZ*c_packX8]);
 460     }
 461     /* Note: possible double to float conversion here */
 462     bb->lower[BB_X] = R2F_D(xl);
 463     bb->lower[BB_Y] = R2F_D(yl);
 464     bb->lower[BB_Z] = R2F_D(zl);
 465     bb->upper[BB_X] = R2F_U(xh);
 466     bb->upper[BB_Y] = R2F_U(yh);
 467     bb->upper[BB_Z] = R2F_U(zh);
 468 }
 469
 470 /* Packed coordinates, bb order xyz0 */
 471 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
 472                                                      nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
 473 {
 474     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 475     using namespace gmx;
 476
 477     calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
 478
 479     if (na > 2)
 480     {
 481         calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
 482     }
 483     else
 484     {
 485         /* Set the "empty" bounding box to the same as the first one,
 486          * so we don't need to treat special cases in the rest of the code.
 487          */
 488 #if NBNXN_SEARCH_BB_SIMD4
 489         store4(&bbj[1].lower[0], load4(&bbj[0].lower[0]));
 490         store4(&bbj[1].upper[0], load4(&bbj[0].upper[0]));
 491 #else
 492         bbj[1] = bbj[0];
 493 #endif
 494     }
 495
 496 #if NBNXN_SEARCH_BB_SIMD4
 497     store4(&bb->lower[0], min(load4(&bbj[0].lower[0]), load4(&bbj[1].lower[0])));
 498     store4(&bb->upper[0], max(load4(&bbj[0].upper[0]), load4(&bbj[1].upper[0])));
 499 #else
 500     {
 501         int i;
 502
 503         for (i = 0; i < NNBSBB_C; i++)
 504         {
 505             bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
 506             bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
 507         }
 508     }
 509 #endif
 510 }
 511
 512 #if NBNXN_SEARCH_BB_SIMD4
 513
 514 /* Coordinate order xyz, bb order xxxxyyyyzzzz */
 515 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
 516 {
 517     int  i;
 518     real xl, xh, yl, yh, zl, zh;
 519
 520     i  = 0;
 521     xl = x[i+XX];
 522     xh = x[i+XX];
 523     yl = x[i+YY];
 524     yh = x[i+YY];
 525     zl = x[i+ZZ];
 526     zh = x[i+ZZ];
 527     i += stride;
 528     for (int j = 1; j < na; j++)
 529     {
 530         xl = std::min(xl, x[i+XX]);
 531         xh = std::max(xh, x[i+XX]);
 532         yl = std::min(yl, x[i+YY]);
 533         yh = std::max(yh, x[i+YY]);
 534         zl = std::min(zl, x[i+ZZ]);
 535         zh = std::max(zh, x[i+ZZ]);
 536         i += stride;
 537     }
 538     /* Note: possible double to float conversion here */
 539     bb[0*STRIDE_PBB] = R2F_D(xl);
 540     bb[1*STRIDE_PBB] = R2F_D(yl);
 541     bb[2*STRIDE_PBB] = R2F_D(zl);
 542     bb[3*STRIDE_PBB] = R2F_U(xh);
 543     bb[4*STRIDE_PBB] = R2F_U(yh);
 544     bb[5*STRIDE_PBB] = R2F_U(zh);
 545 }
 546
 547 #endif /* NBNXN_SEARCH_BB_SIMD4 */
 548
 549 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 550
 551 /* Coordinate order xyz?, bb order xyz0 */
 552 static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
 553 {
 554     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 555     using namespace gmx;
 556
 557     Simd4Float bb_0_S, bb_1_S;
 558     Simd4Float x_S;
 559
 560     bb_0_S = load4(x);
 561     bb_1_S = bb_0_S;
 562
 563     for (int i = 1; i < na; i++)
 564     {
 565         x_S    = load4(x+i*NNBSBB_C);
 566         bb_0_S = min(bb_0_S, x_S);
 567         bb_1_S = max(bb_1_S, x_S);
 568     }
 569
 570     store4(&bb->lower[0], bb_0_S);
 571     store4(&bb->upper[0], bb_1_S);
 572 }
 573
 574 /* Coordinate order xyz?, bb order xxxxyyyyzzzz */
 575 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
 576                                          nbnxn_bb_t *bb_work_aligned,
 577                                          real *bb)
 578 {
 579     calc_bounding_box_simd4(na, x, bb_work_aligned);
 580
 581     bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
 582     bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
 583     bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
 584     bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
 585     bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
 586     bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
 587 }
 588
 589 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
 590
 591
 592 /* Combines pairs of consecutive bounding boxes */
 593 static void combine_bounding_box_pairs(nbnxn_grid_t                    *grid,
 594                                        gmx::ArrayRef<const nbnxn_bb_t>  bb)
 595 {
 596     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 597     using namespace gmx;
 598
 599     for (int i = 0; i < grid->numCells[XX]*grid->numCells[YY]; i++)
 600     {
 601         /* Starting bb in a column is expected to be 2-aligned */
 602         int sc2 = grid->cxy_ind[i]>>1;
 603         /* For odd numbers skip the last bb here */
 604         int nc2 = (grid->cxy_na[i]+3)>>(2+1);
 605         int c2;
 606         for (c2 = sc2; c2 < sc2+nc2; c2++)
 607         {
 608 #if NBNXN_SEARCH_BB_SIMD4
 609             Simd4Float min_S, max_S;
 610
 611             min_S = min(load4(&bb[c2*2+0].lower[0]),
 612                         load4(&bb[c2*2+1].lower[0]));
 613             max_S = max(load4(&bb[c2*2+0].upper[0]),
 614                         load4(&bb[c2*2+1].upper[0]));
 615             store4(&grid->bbj[c2].lower[0], min_S);
 616             store4(&grid->bbj[c2].upper[0], max_S);
 617 #else
 618             for (int j = 0; j < NNBSBB_C; j++)
 619             {
 620                 grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
 621                                                   bb[c2*2+1].lower[j]);
 622                 grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
 623                                                   bb[c2*2+1].upper[j]);
 624             }
 625 #endif
 626         }
 627         if (((grid->cxy_na[i]+3)>>2) & 1)
 628         {
 629             /* The bb count in this column is odd: duplicate the last bb */
 630             for (int j = 0; j < NNBSBB_C; j++)
 631             {
 632                 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
 633                 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
 634             }
 635         }
 636     }
 637 }
 638
 639
 640 /* Prints the average bb size, used for debug output */
 641 static void print_bbsizes_simple(FILE                *fp,
 642                                  const nbnxn_grid_t  *grid)
 643 {
 644     dvec ba;
 645
 646     clear_dvec(ba);
 647     for (int c = 0; c < grid->nc; c++)
 648     {
 649         for (int d = 0; d < DIM; d++)
 650         {
 651             ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
 652         }
 653     }
 654     dsvmul(1.0/grid->nc, ba, ba);
 655
 656     real avgCellSizeZ = (grid->atom_density > 0 ? grid->na_c/(grid->atom_density*grid->cellSize[XX]*grid->cellSize[YY]) : 0.0);
 657
 658     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 659             grid->cellSize[XX],
 660             grid->cellSize[YY],
 661             avgCellSizeZ,
 662             ba[XX], ba[YY], ba[ZZ],
 663             ba[XX]*grid->invCellSize[XX],
 664             ba[YY]*grid->invCellSize[YY],
 665             grid->atom_density > 0 ? ba[ZZ]/avgCellSizeZ : 0.0);
 666 }
 667
 668 /* Prints the average bb size, used for debug output */
 669 static void print_bbsizes_supersub(FILE                *fp,
 670                                    const nbnxn_grid_t  *grid)
 671 {
 672     int  ns;
 673     dvec ba;
 674
 675     clear_dvec(ba);
 676     ns = 0;
 677     for (int c = 0; c < grid->nc; c++)
 678     {
 679 #if NBNXN_BBXXXX
 680         for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
 681         {
 682             int cs_w = (c*c_gpuNumClusterPerCell + s)/STRIDE_PBB;
 683             for (int i = 0; i < STRIDE_PBB; i++)
 684             {
 685                 for (int d = 0; d < DIM; d++)
 686                 {
 687                     ba[d] +=
 688                         grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
 689                         grid->pbb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
 690                 }
 691             }
 692         }
 693 #else
 694         for (int s = 0; s < grid->nsubc[c]; s++)
 695         {
 696             int cs = c*c_gpuNumClusterPerCell + s;
 697             for (int d = 0; d < DIM; d++)
 698             {
 699                 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
 700             }
 701         }
 702 #endif
 703         ns += grid->nsubc[c];
 704     }
 705     dsvmul(1.0/ns, ba, ba);
 706
 707     real avgClusterSizeZ = (grid->atom_density > 0 ? grid->na_sc/(grid->atom_density*grid->cellSize[XX]*grid->cellSize[YY]*c_gpuNumClusterPerCellZ) : 0.0);
 708
 709     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 710             grid->cellSize[XX]/c_gpuNumClusterPerCellX,
 711             grid->cellSize[YY]/c_gpuNumClusterPerCellY,
 712             avgClusterSizeZ,
 713             ba[XX], ba[YY], ba[ZZ],
 714             ba[XX]*c_gpuNumClusterPerCellX*grid->invCellSize[XX],
 715             ba[YY]*c_gpuNumClusterPerCellY*grid->invCellSize[YY],
 716             grid->atom_density > 0 ? ba[ZZ]/avgClusterSizeZ : 0.0);
 717 }
 718
 719 /* Set non-bonded interaction flags for the current cluster.
 720  * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 721  */
 722 static void sort_cluster_on_flag(int                 numAtomsInCluster,
 723                                  int                 atomStart,
 724                                  int                 atomEnd,
 725                                  const int          *atinfo,
 726                                  gmx::ArrayRef<int>  order,
 727                                  int                *flags)
 728 {
 729     constexpr int c_maxNumAtomsInCluster = 8;
 730     int           sort1[c_maxNumAtomsInCluster];
 731     int           sort2[c_maxNumAtomsInCluster];
 732
 733     GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");
 734
 735     *flags = 0;
 736
 737     int subc = 0;
 738     for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
 739     {
 740         /* Make lists for this (sub-)cell on atoms with and without LJ */
 741         int      n1         = 0;
 742         int      n2         = 0;
 743         gmx_bool haveQ      = FALSE;
 744         int      a_lj_max   = -1;
 745         for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
 746         {
 747             haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 748
 749             if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 750             {
 751                 sort1[n1++] = order[a];
 752                 a_lj_max    = a;
 753             }
 754             else
 755             {
 756                 sort2[n2++] = order[a];
 757             }
 758         }
 759
 760         /* If we don't have atoms with LJ, there's nothing to sort */
 761         if (n1 > 0)
 762         {
 763             *flags |= NBNXN_CI_DO_LJ(subc);
 764
 765             if (2*n1 <= numAtomsInCluster)
 766             {
 767                 /* Only sort when strictly necessary. Ordering particles
 768                  * Ordering particles can lead to less accurate summation
 769                  * due to rounding, both for LJ and Coulomb interactions.
 770                  */
 771                 if (2*(a_lj_max - s) >= numAtomsInCluster)
 772                 {
 773                     for (int i = 0; i < n1; i++)
 774                     {
 775                         order[atomStart + i]      = sort1[i];
 776                     }
 777                     for (int j = 0; j < n2; j++)
 778                     {
 779                         order[atomStart + n1 + j] = sort2[j];
 780                     }
 781                 }
 782
 783                 *flags |= NBNXN_CI_HALF_LJ(subc);
 784             }
 785         }
 786         if (haveQ)
 787         {
 788             *flags |= NBNXN_CI_DO_COUL(subc);
 789         }
 790         subc++;
 791     }
 792 }
 793
 794 /* Fill a pair search cell with atoms.
 795  * Potentially sorts atoms and sets the interaction flags.
 796  */
 797 static void fill_cell(nbnxn_search          *nbs,
 798                       nbnxn_grid_t          *grid,
 799                       nbnxn_atomdata_t      *nbat,
 800                       int                    atomStart,
 801                       int                    atomEnd,
 802                       const int             *atinfo,
 803                       const rvec            *x,
 804                       nbnxn_bb_t gmx_unused *bb_work_aligned)
 805 {
 806     const int numAtoms = atomEnd - atomStart;
 807
 808     if (grid->bSimple)
 809     {
 810         /* Note that non-local grids are already sorted.
 811          * Then sort_cluster_on_flag will only set the flags and the sorting
 812          * will not affect the atom order.
 813          */
 814         sort_cluster_on_flag(grid->na_c, atomStart, atomEnd, atinfo, nbs->a,
 815                              grid->flags.data() + (atomStart >> grid->na_c_2log) - grid->cell0);
 816     }
 817
 818     if (nbs->bFEP)
 819     {
 820         /* Set the fep flag for perturbed atoms in this (sub-)cell */
 821
 822         /* The grid-local cluster/(sub-)cell index */
 823         int cell = (atomStart >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : c_gpuNumClusterPerCell);
 824         grid->fep[cell] = 0;
 825         for (int at = atomStart; at < atomEnd; at++)
 826         {
 827             if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
 828             {
 829                 grid->fep[cell] |= (1 << (at - atomStart));
 830             }
 831         }
 832     }
 833
 834     /* Now we have sorted the atoms, set the cell indices */
 835     for (int at = atomStart; at < atomEnd; at++)
 836     {
 837         nbs->cell[nbs->a[at]] = at;
 838     }
 839
 840     copy_rvec_to_nbat_real(nbs->a.data() + atomStart, numAtoms, grid->na_c, x,
 841                            nbat->XFormat, nbat->x, atomStart);
 842
 843     if (nbat->XFormat == nbatX4)
 844     {
 845         /* Store the bounding boxes as xyz.xyz. */
 846         size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
 847         nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;
 848
 849 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
 850         if (2*grid->na_cj == grid->na_c)
 851         {
 852             calc_bounding_box_x_x4_halves(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
 853                                           grid->bbj.data() + offset*2);
 854         }
 855         else
 856 #endif
 857         {
 858             calc_bounding_box_x_x4(numAtoms, nbat->x + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
 859         }
 860     }
 861     else if (nbat->XFormat == nbatX8)
 862     {
 863         /* Store the bounding boxes as xyz.xyz. */
 864         size_t      offset = (atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log;
 865         nbnxn_bb_t *bb_ptr = grid->bb.data() + offset;
 866
 867         calc_bounding_box_x_x8(numAtoms, nbat->x +  atom_to_x_index<c_packX8>(atomStart), bb_ptr);
 868     }
 869 #if NBNXN_BBXXXX
 870     else if (!grid->bSimple)
 871     {
 872         /* Store the bounding boxes in a format convenient
 873          * for SIMD4 calculations: xxxxyyyyzzzz...
 874          */
 875         float *pbb_ptr =
 876             grid->pbb.data() +
 877             ((atomStart - grid->cell0*grid->na_sc) >> (grid->na_c_2log + STRIDE_PBB_2LOG))*NNBSBB_XXXX +
 878             (((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log) & (STRIDE_PBB - 1));
 879
 880 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 881         if (nbat->XFormat == nbatXYZQ)
 882         {
 883             calc_bounding_box_xxxx_simd4(numAtoms, nbat->x + atomStart*nbat->xstride,
 884                                          bb_work_aligned, pbb_ptr);
 885         }
 886         else
 887 #endif
 888         {
 889             calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
 890                                    pbb_ptr);
 891         }
 892         if (gmx_debug_at)
 893         {
 894             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 895                     atomStart >> grid->na_c_2log,
 896                     pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
 897                     pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
 898                     pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
 899         }
 900     }
 901 #endif
 902     else
 903     {
 904         /* Store the bounding boxes as xyz.xyz. */
 905         nbnxn_bb_t *bb_ptr = grid->bb.data() + ((atomStart - grid->cell0*grid->na_sc) >> grid->na_c_2log);
 906
 907         calc_bounding_box(numAtoms, nbat->xstride, nbat->x + atomStart*nbat->xstride,
 908                           bb_ptr);
 909
 910         if (gmx_debug_at)
 911         {
 912             int bbo = (atomStart - grid->cell0*grid->na_sc)/grid->na_c;
 913             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 914                     atomStart >> grid->na_c_2log,
 915                     grid->bb[bbo].lower[BB_X],
 916                     grid->bb[bbo].lower[BB_Y],
 917                     grid->bb[bbo].lower[BB_Z],
 918                     grid->bb[bbo].upper[BB_X],
 919                     grid->bb[bbo].upper[BB_Y],
 920                     grid->bb[bbo].upper[BB_Z]);
 921         }
 922     }
 923 }
 924
 925 /* Spatially sort the atoms within one grid column */
 926 static void sort_columns_simple(nbnxn_search *nbs,
 927                                 int dd_zone,
 928                                 nbnxn_grid_t *grid,
 929                                 int atomStart, int atomEnd,
 930                                 const int *atinfo,
 931                                 const rvec *x,
 932                                 nbnxn_atomdata_t *nbat,
 933                                 int cxy_start, int cxy_end,
 934                                 gmx::ArrayRef<int> sort_work)
 935 {
 936     if (debug)
 937     {
 938         fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 939                 grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
 940     }
 941
 942     /* Sort the atoms within each x,y column in 3 dimensions */
 943     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
 944     {
 945         int na  = grid->cxy_na[cxy];
 946         int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 947         int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 948
 949         /* Sort the atoms within each x,y column on z coordinate */
 950         sort_atoms(ZZ, FALSE, dd_zone,
 951                    nbs->a.data() + ash, na, x,
 952                    grid->c0[ZZ],
 953                    1.0/grid->size[ZZ], ncz*grid->na_sc,
 954                    sort_work);
 955
 956         /* Fill the ncz cells in this column */
 957         int cfilled = grid->cxy_ind[cxy];
 958         for (int cz = 0; cz < ncz; cz++)
 959         {
 960             int c     = grid->cxy_ind[cxy] + cz;
 961
 962             int ash_c = ash + cz*grid->na_sc;
 963             int na_c  = std::min(grid->na_sc, na-(ash_c-ash));
 964
 965             fill_cell(nbs, grid, nbat,
 966                       ash_c, ash_c+na_c, atinfo, x,
 967                       nullptr);
 968
 969             /* This copy to bbcz is not really necessary.
 970              * But it allows to use the same grid search code
 971              * for the simple and supersub cell setups.
 972              */
 973             if (na_c > 0)
 974             {
 975                 cfilled = c;
 976             }
 977             grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
 978             grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
 979         }
 980
 981         /* Set the unused atom indices to -1 */
 982         for (int ind = na; ind < ncz*grid->na_sc; ind++)
 983         {
 984             nbs->a[ash+ind] = -1;
 985         }
 986     }
 987 }
 988
 989 /* Spatially sort the atoms within one grid column */
 990 static void sort_columns_supersub(nbnxn_search *nbs,
 991                                   int dd_zone,
 992                                   nbnxn_grid_t *grid,
 993                                   int atomStart, int atomEnd,
 994                                   const int *atinfo,
 995                                   const rvec *x,
 996                                   nbnxn_atomdata_t *nbat,
 997                                   int cxy_start, int cxy_end,
 998                                   gmx::ArrayRef<int> sort_work)
 999 {
1000     nbnxn_bb_t  bb_work_array[2];
1001     nbnxn_bb_t *bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1002
1003     if (debug)
1004     {
1005         fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1006                 grid->cell0, cxy_start, cxy_end, atomStart, atomEnd);
1007     }
1008
1009     int subdiv_x = grid->na_c;
1010     int subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1011     int subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
1012
1013     /* Sort the atoms within each x,y column in 3 dimensions.
1014      * Loop over all columns on the x/y grid.
1015      */
1016     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1017     {
1018         int gridX            = cxy/grid->numCells[YY];
1019         int gridY            = cxy - gridX*grid->numCells[YY];
1020
1021         int numAtomsInColumn = grid->cxy_na[cxy];
1022         int numCellsInColumn = grid->cxy_ind[cxy + 1] - grid->cxy_ind[cxy];
1023         int ash              = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1024
1025         /* Sort the atoms within each x,y column on z coordinate */
1026         sort_atoms(ZZ, FALSE, dd_zone,
1027                    nbs->a.data() + ash, numAtomsInColumn, x,
1028                    grid->c0[ZZ],
1029                    1.0/grid->size[ZZ], numCellsInColumn*grid->na_sc,
1030                    sort_work);
1031
1032         /* This loop goes over the cells and clusters along z at once */
1033         for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
1034         {
1035             int ash_z = ash + sub_z*subdiv_z;
1036             int na_z  = std::min(subdiv_z, numAtomsInColumn - (ash_z - ash));
1037             int cz    = -1;
1038             /* We have already sorted on z */
1039
1040             if (sub_z % c_gpuNumClusterPerCellZ == 0)
1041             {
1042                 cz = sub_z/c_gpuNumClusterPerCellZ;
1043                 int cell = grid->cxy_ind[cxy] + cz;
1044
1045                 /* The number of atoms in this cell/super-cluster */
1046                 int numAtoms = std::min(grid->na_sc, numAtomsInColumn - (ash_z - ash));
1047
1048                 grid->nsubc[cell] = std::min(c_gpuNumClusterPerCell,
1049                                              (numAtoms + grid->na_c - 1)/grid->na_c);
1050
1051                 /* Store the z-boundaries of the bounding box of the cell */
1052                 grid->bbcz[cell*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
1053                 grid->bbcz[cell*NNBSBB_D+1] = x[nbs->a[ash_z + numAtoms - 1]][ZZ];
1054             }
1055
1056             if (c_gpuNumClusterPerCellY > 1)
1057             {
1058                 /* Sort the atoms along y */
1059                 sort_atoms(YY, (sub_z & 1), dd_zone,
1060                            nbs->a.data() + ash_z, na_z, x,
1061                            grid->c0[YY] + gridY*grid->cellSize[YY],
1062                            grid->invCellSize[YY], subdiv_z,
1063                            sort_work);
1064             }
1065
1066             for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1067             {
1068                 int ash_y = ash_z + sub_y*subdiv_y;
1069                 int na_y  = std::min(subdiv_y, numAtomsInColumn - (ash_y - ash));
1070
1071                 if (c_gpuNumClusterPerCellX > 1)
1072                 {
1073                     /* Sort the atoms along x */
1074                     sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1), dd_zone,
1075                                nbs->a.data() + ash_y, na_y, x,
1076                                grid->c0[XX] + gridX*grid->cellSize[XX],
1077                                grid->invCellSize[XX], subdiv_y,
1078                                sort_work);
1079                 }
1080
1081                 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1082                 {
1083                     int ash_x = ash_y + sub_x*subdiv_x;
1084                     int na_x  = std::min(subdiv_x, numAtomsInColumn - (ash_x - ash));
1085
1086                     fill_cell(nbs, grid, nbat,
1087                               ash_x, ash_x + na_x, atinfo, x,
1088                               bb_work_aligned);
1089                 }
1090             }
1091         }
1092
1093         /* Set the unused atom indices to -1 */
1094         for (int ind = numAtomsInColumn; ind < numCellsInColumn*grid->na_sc; ind++)
1095         {
1096             nbs->a[ash + ind] = -1;
1097         }
1098     }
1099 }
1100
1101 /* Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
1102 static void setCellAndAtomCount(gmx::ArrayRef<int>  cell,
1103                                 int                 cellIndex,
1104                                 gmx::ArrayRef<int>  cxy_na,
1105                                 int                 atomIndex)
1106 {
1107     cell[atomIndex]    = cellIndex;
1108     cxy_na[cellIndex] += 1;
1109 }
1110
1111 /* Determine in which grid column atoms should go */
1112 static void calc_column_indices(nbnxn_grid_t *grid,
1113                                 int atomStart, int atomEnd,
1114                                 const rvec *x,
1115                                 int dd_zone, const int *move,
1116                                 int thread, int nthread,
1117                                 gmx::ArrayRef<int> cell,
1118                                 gmx::ArrayRef<int> cxy_na)
1119 {
1120     /* We add one extra cell for particles which moved during DD */
1121     for (int i = 0; i < grid->numCells[XX]*grid->numCells[YY] + 1; i++)
1122     {
1123         cxy_na[i] = 0;
1124     }
1125
1126     int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
1127     int taskAtomEnd   = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
1128     if (dd_zone == 0)
1129     {
1130         /* Home zone */
1131         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1132         {
1133             if (move == nullptr || move[i] >= 0)
1134             {
1135                 /* We need to be careful with rounding,
1136                  * particles might be a few bits outside the local zone.
1137                  * The int cast takes care of the lower bound,
1138                  * we will explicitly take care of the upper bound.
1139                  */
1140                 int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->invCellSize[XX]);
1141                 int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->invCellSize[YY]);
1142
1143 #ifndef NDEBUG
1144                 if (cx < 0 || cx > grid->numCells[XX] ||
1145                     cy < 0 || cy > grid->numCells[YY])
1146                 {
1147                     gmx_fatal(FARGS,
1148                               "grid cell cx %d cy %d out of range (max %d %d)\n"
1149                               "atom %f %f %f, grid->c0 %f %f",
1150                               cx, cy, grid->numCells[XX], grid->numCells[YY],
1151                               x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1152                 }
1153 #endif
1154                 /* Take care of potential rouding issues */
1155                 cx = std::min(cx, grid->numCells[XX] - 1);
1156                 cy = std::min(cy, grid->numCells[YY] - 1);
1157
1158                 /* For the moment cell will contain only the, grid local,
1159                  * x and y indices, not z.
1160                  */
1161                 setCellAndAtomCount(cell, cx*grid->numCells[YY] + cy, cxy_na, i);
1162             }
1163             else
1164             {
1165                 /* Put this moved particle after the end of the grid,
1166                  * so we can process it later without using conditionals.
1167                  */
1168                 setCellAndAtomCount(cell, grid->numCells[XX]*grid->numCells[YY], cxy_na, i);
1169             }
1170         }
1171     }
1172     else
1173     {
1174         /* Non-home zone */
1175         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1176         {
1177             int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->invCellSize[XX]);
1178             int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->invCellSize[YY]);
1179
1180             /* For non-home zones there could be particles outside
1181              * the non-bonded cut-off range, which have been communicated
1182              * for bonded interactions only. For the result it doesn't
1183              * matter where these end up on the grid. For performance
1184              * we put them in an extra row at the border.
1185              */
1186             cx = std::max(cx, 0);
1187             cx = std::min(cx, grid->numCells[XX] - 1);
1188             cy = std::max(cy, 0);
1189             cy = std::min(cy, grid->numCells[YY] - 1);
1190
1191             /* For the moment cell will contain only the, grid local,
1192              * x and y indices, not z.
1193              */
1194             setCellAndAtomCount(cell, cx*grid->numCells[YY] + cy, cxy_na, i);
1195         }
1196     }
1197 }
1198
1199 /* Resizes grid and atom data which depend on the number of cells */
1200 static void resizeForNumberOfCells(const nbnxn_grid_t &grid,
1201                                    int                 numAtomsMoved,
1202                                    nbnxn_search       *nbs,
1203                                    nbnxn_atomdata_t   *nbat)
1204 {
1205     int numNbnxnAtoms = (grid.cell0 + grid.nc)*grid.na_sc;
1206
1207     /* Note: nbs->cell was already resized before */
1208
1209     /* To avoid conditionals we store the moved particles at the end of a,
1210      * make sure we have enough space.
1211      */
1212     nbs->a.resize(numNbnxnAtoms + numAtomsMoved);
1213
1214     /* We need padding up to a multiple of the buffer flag size: simply add */
1215     if (numNbnxnAtoms + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
1216     {
1217         nbnxn_atomdata_realloc(nbat, numNbnxnAtoms + NBNXN_BUFFERFLAG_SIZE);
1218     }
1219
1220     nbat->natoms = numNbnxnAtoms;
1221 }
1222
1223 /* Determine in which grid cells the atoms should go */
1224 static void calc_cell_indices(nbnxn_search         *nbs,
1225                               int                   ddZone,
1226                               nbnxn_grid_t         *grid,
1227                               int                   atomStart,
1228                               int                   atomEnd,
1229                               const int            *atinfo,
1230                               const rvec           *x,
1231                               int                   numAtomsMoved,
1232                               const int            *move,
1233                               nbnxn_atomdata_t     *nbat)
1234 {
1235     /* First compute all grid/column indices and store them in nbs->cell */
1236     nbs->cell.resize(atomEnd);
1237
1238     const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1239
1240 #pragma omp parallel for num_threads(nthread) schedule(static)
1241     for (int thread = 0; thread < nthread; thread++)
1242     {
1243         try
1244         {
1245             calc_column_indices(grid, atomStart, atomEnd, x, ddZone, move, thread, nthread,
1246                                 nbs->cell, nbs->work[thread].cxy_na);
1247         }
1248         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1249     }
1250
1251     /* Make the cell index as a function of x and y */
1252     int ncz_max      = 0;
1253     int ncz          = 0;
1254     grid->cxy_ind[0] = 0;
1255     for (int i = 0; i < grid->numCells[XX]*grid->numCells[YY] + 1; i++)
1256     {
1257         /* We set ncz_max at the beginning of the loop iso at the end
1258          * to skip i=grid->ncx*grid->numCells[YY] which are moved particles
1259          * that do not need to be ordered on the grid.
1260          */
1261         if (ncz > ncz_max)
1262         {
1263             ncz_max = ncz;
1264         }
1265         int cxy_na_i = nbs->work[0].cxy_na[i];
1266         for (int thread = 1; thread < nthread; thread++)
1267         {
1268             cxy_na_i += nbs->work[thread].cxy_na[i];
1269         }
1270         ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1271         if (nbat->XFormat == nbatX8)
1272         {
1273             /* Make the number of cell a multiple of 2 */
1274             ncz = (ncz + 1) & ~1;
1275         }
1276         grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1277         /* Clear cxy_na, so we can reuse the array below */
1278         grid->cxy_na[i] = 0;
1279     }
1280     grid->nc = grid->cxy_ind[grid->numCells[XX]*grid->numCells[YY]] - grid->cxy_ind[0];
1281
1282     resizeForNumberOfCells(*grid, numAtomsMoved, nbs, nbat);
1283
1284     if (debug)
1285     {
1286         fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1287                 grid->na_sc, grid->na_c, grid->nc,
1288                 grid->numCells[XX], grid->numCells[YY], grid->nc/(static_cast<double>(grid->numCells[XX]*grid->numCells[YY])),
1289                 ncz_max);
1290         if (gmx_debug_at)
1291         {
1292             int i = 0;
1293             for (int cy = 0; cy < grid->numCells[YY]; cy++)
1294             {
1295                 for (int cx = 0; cx < grid->numCells[XX]; cx++)
1296                 {
1297                     fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1298                     i++;
1299                 }
1300                 fprintf(debug, "\n");
1301             }
1302         }
1303     }
1304
1305     /* Make sure the work array for sorting is large enough */
1306     if (ncz_max*grid->na_sc*SGSF > gmx::index(nbs->work[0].sortBuffer.size()))
1307     {
1308         for (nbnxn_search_work_t &work : nbs->work)
1309         {
1310             /* Elements not in use should be -1 */
1311             work.sortBuffer.resize(ncz_max*grid->na_sc*SGSF, -1);
1312         }
1313     }
1314
1315     /* Now we know the dimensions we can fill the grid.
1316      * This is the first, unsorted fill. We sort the columns after this.
1317      */
1318     for (int i = atomStart; i < atomEnd; i++)
1319     {
1320         /* At this point nbs->cell contains the local grid x,y indices */
1321         int cxy = nbs->cell[i];
1322         nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1323     }
1324
1325     if (ddZone == 0)
1326     {
1327         /* Set the cell indices for the moved particles */
1328         int n0 = grid->nc*grid->na_sc;
1329         int n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->numCells[XX]*grid->numCells[YY]];
1330         if (ddZone == 0)
1331         {
1332             for (int i = n0; i < n1; i++)
1333             {
1334                 nbs->cell[nbs->a[i]] = i;
1335             }
1336         }
1337     }
1338
1339     /* Sort the super-cell columns along z into the sub-cells. */
1340 #pragma omp parallel for num_threads(nthread) schedule(static)
1341     for (int thread = 0; thread < nthread; thread++)
1342     {
1343         try
1344         {
1345             int columnStart = ((thread + 0)*grid->numCells[XX]*grid->numCells[YY])/nthread;
1346             int columnEnd   = ((thread + 1)*grid->numCells[XX]*grid->numCells[YY])/nthread;
1347             if (grid->bSimple)
1348             {
1349                 sort_columns_simple(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
1350                                     columnStart, columnEnd,
1351                                     nbs->work[thread].sortBuffer);
1352             }
1353             else
1354             {
1355                 sort_columns_supersub(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, nbat,
1356                                       columnStart, columnEnd,
1357                                       nbs->work[thread].sortBuffer);
1358             }
1359         }
1360         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1361     }
1362
1363     if (grid->bSimple && nbat->XFormat == nbatX8)
1364     {
1365         combine_bounding_box_pairs(grid, grid->bb);
1366     }
1367
1368     if (!grid->bSimple)
1369     {
1370         grid->nsubc_tot = 0;
1371         for (int i = 0; i < grid->nc; i++)
1372         {
1373             grid->nsubc_tot += grid->nsubc[i];
1374         }
1375     }
1376
1377     if (debug)
1378     {
1379         if (grid->bSimple)
1380         {
1381             print_bbsizes_simple(debug, grid);
1382         }
1383         else
1384         {
1385             fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1386                     grid->nsubc_tot, (atomEnd - atomStart)/static_cast<double>(grid->nsubc_tot));
1387
1388             print_bbsizes_supersub(debug, grid);
1389         }
1390     }
1391 }
1392
1393 /* Sets up a grid and puts the atoms on the grid.
1394  * This function only operates on one domain of the domain decompostion.
1395  * Note that without domain decomposition there is only one domain.
1396  */
1397 void nbnxn_put_on_grid(nbnxn_search_t    nbs,
1398                        int               ePBC,
1399                        const matrix      box,
1400                        int               ddZone,
1401                        const rvec        lowerCorner,
1402                        const rvec        upperCorner,
1403                        int               atomStart,
1404                        int               atomEnd,
1405                        real              atomDensity,
1406                        const int        *atinfo,
1407                        const rvec       *x,
1408                        int               numAtomsMoved,
1409                        const int        *move,
1410                        int               nb_kernel_type,
1411                        nbnxn_atomdata_t *nbat)
1412 {
1413     nbnxn_grid_t *grid = &nbs->grid[ddZone];
1414
1415     nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1416
1417     grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1418
1419     grid->na_c      = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
1420     grid->na_cj     = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
1421     grid->na_sc     = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c;
1422     grid->na_c_2log = get_2log(grid->na_c);
1423
1424     nbat->na_c = grid->na_c;
1425
1426     if (ddZone == 0)
1427     {
1428         grid->cell0 = 0;
1429     }
1430     else
1431     {
1432         grid->cell0 =
1433             (nbs->grid[ddZone - 1].cell0 + nbs->grid[ddZone - 1].nc)*
1434             nbs->grid[ddZone- 1].na_sc/grid->na_sc;
1435     }
1436
1437     const int n = atomEnd - atomStart;
1438
1439     if (ddZone == 0)
1440     {
1441         nbs->ePBC = ePBC;
1442         copy_mat(box, nbs->box);
1443
1444         /* Avoid zero density */
1445         if (atomDensity > 0)
1446         {
1447             grid->atom_density = atomDensity;
1448         }
1449         else
1450         {
1451             grid->atom_density = grid_atom_density(n - numAtomsMoved, lowerCorner, upperCorner);
1452         }
1453
1454         grid->cell0 = 0;
1455
1456         nbs->natoms_local    = atomEnd - numAtomsMoved;
1457         /* We assume that nbnxn_put_on_grid is called first
1458          * for the local atoms (ddZone=0).
1459          */
1460         nbs->natoms_nonlocal = atomEnd - numAtomsMoved;
1461
1462         if (debug)
1463         {
1464             fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
1465                     nbs->natoms_local, grid->atom_density);
1466         }
1467     }
1468     else
1469     {
1470         nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, atomEnd);
1471     }
1472
1473     /* We always use the home zone (grid[0]) for setting the cell size,
1474      * since determining densities for non-local zones is difficult.
1475      */
1476     set_grid_size_xy(nbs, grid,
1477                      ddZone, n - numAtomsMoved,
1478                      lowerCorner, upperCorner,
1479                      nbs->grid[0].atom_density);
1480
1481     calc_cell_indices(nbs, ddZone, grid, atomStart, atomEnd, atinfo, x, numAtomsMoved, move, nbat);
1482
1483     if (ddZone == 0)
1484     {
1485         nbat->natoms_local = nbat->natoms;
1486     }
1487
1488     nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1489 }
1490
1491 /* Calls nbnxn_put_on_grid for all non-local domains */
1492 void nbnxn_put_on_grid_nonlocal(nbnxn_search_t                   nbs,
1493                                 const struct gmx_domdec_zones_t *zones,
1494                                 const int                       *atinfo,
1495                                 const rvec                      *x,
1496                                 int                              nb_kernel_type,
1497                                 nbnxn_atomdata_t                *nbat)
1498 {
1499     for (int zone = 1; zone < zones->n; zone++)
1500     {
1501         rvec c0, c1;
1502         for (int d = 0; d < DIM; d++)
1503         {
1504             c0[d] = zones->size[zone].bb_x0[d];
1505             c1[d] = zones->size[zone].bb_x1[d];
1506         }
1507
1508         nbnxn_put_on_grid(nbs, nbs->ePBC, nullptr,
1509                           zone, c0, c1,
1510                           zones->cg_range[zone],
1511                           zones->cg_range[zone+1],
1512                           -1,
1513                           atinfo,
1514                           x,
1515                           0, nullptr,
1516                           nb_kernel_type,
1517                           nbat);
1518     }
1519 }
1520
1521 void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1522 {
1523     *ncx = nbs->grid[0].numCells[XX];
1524     *ncy = nbs->grid[0].numCells[YY];
1525 }
1526
1527 gmx::ArrayRef<const int> nbnxn_get_atomorder(const nbnxn_search *nbs)
1528 {
1529     /* Return the atom order for the home cell (index 0) */
1530     const nbnxn_grid_t &grid       = nbs->grid[0];
1531
1532     int                 numIndices = grid.cxy_ind[grid.numCells[XX]*grid.numCells[YY]]*grid.na_sc;
1533
1534     return gmx::constArrayRefFromArray(nbs->a.data(), numIndices);
1535 }
1536
1537 void nbnxn_set_atomorder(nbnxn_search_t nbs)
1538 {
1539     /* Set the atom order for the home cell (index 0) */
1540     nbnxn_grid_t *grid = &nbs->grid[0];
1541
1542     int           ao = 0;
1543     for (int cx = 0; cx < grid->numCells[XX]; cx++)
1544     {
1545         for (int cy = 0; cy < grid->numCells[YY]; cy++)
1546         {
1547             int cxy = cx*grid->numCells[YY] + cy;
1548             int j   = grid->cxy_ind[cxy]*grid->na_sc;
1549             for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)
1550             {
1551                 nbs->a[j]     = ao;
1552                 nbs->cell[ao] = j;
1553                 ao++;
1554                 j++;
1555             }
1556         }
1557     }
1558 }