src/gromacs/nbnxm/atomdata.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012-2018, The GROMACS development team.
   5  * Copyright (c) 2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 #include "gmxpre.h"
  38
  39 #include "atomdata.h"
  40
  41 #include <cassert>
  42 #include <cmath>
  43 #include <cstdlib>
  44 #include <cstring>
  45
  46 #include <algorithm>
  47
  48 #include "thread_mpi/atomic.h"
  49
  50 #include "gromacs/math/functions.h"
  51 #include "gromacs/math/utilities.h"
  52 #include "gromacs/math/vec.h"
  53 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  54 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
  55 #include "gromacs/mdtypes/mdatom.h"
  56 #include "gromacs/nbnxm/nbnxm.h"
  57 #include "gromacs/pbcutil/ishift.h"
  58 #include "gromacs/simd/simd.h"
  59 #include "gromacs/utility/exceptions.h"
  60 #include "gromacs/utility/fatalerror.h"
  61 #include "gromacs/utility/gmxomp.h"
  62 #include "gromacs/utility/logger.h"
  63 #include "gromacs/utility/smalloc.h"
  64 #include "gromacs/utility/strconvert.h"
  65 #include "gromacs/utility/stringutil.h"
  66
  67 #include "grid.h"
  68 #include "gridset.h"
  69 #include "nbnxm_geometry.h"
  70 #include "nbnxm_gpu.h"
  71 #include "pairlist.h"
  72
  73 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
  74
  75 void nbnxn_atomdata_t::resizeCoordinateBuffer(int numAtoms)
  76 {
  77     numAtoms_ = numAtoms;
  78
  79     x_.resize(numAtoms * xstride);
  80 }
  81
  82 void nbnxn_atomdata_t::resizeForceBuffers()
  83 {
  84     /* Force buffers need padding up to a multiple of the buffer flag size */
  85     const int paddedSize =
  86             (numAtoms() + NBNXN_BUFFERFLAG_SIZE - 1) / NBNXN_BUFFERFLAG_SIZE * NBNXN_BUFFERFLAG_SIZE;
  87
  88     /* Should we let each thread allocate it's own data instead? */
  89     for (nbnxn_atomdata_output_t& outBuffer : out)
  90     {
  91         outBuffer.f.resize(paddedSize * fstride);
  92     }
  93 }
  94
  95 /* Initializes an nbnxn_atomdata_output_t data structure */
  96 nbnxn_atomdata_output_t::nbnxn_atomdata_output_t(Nbnxm::KernelType  kernelType,
  97                                                  int                numEnergyGroups,
  98                                                  int                simdEnergyBufferStride,
  99                                                  gmx::PinningPolicy pinningPolicy) :
 100     f({}, { pinningPolicy }),
 101     fshift({}, { pinningPolicy }),
 102     Vvdw({}, { pinningPolicy }),
 103     Vc({}, { pinningPolicy })
 104 {
 105     fshift.resize(SHIFTS * DIM);
 106     Vvdw.resize(numEnergyGroups * numEnergyGroups);
 107     Vc.resize(numEnergyGroups * numEnergyGroups);
 108
 109     if (Nbnxm::kernelTypeIsSimd(kernelType))
 110     {
 111         int cj_size = Nbnxm::JClusterSizePerKernelType[kernelType];
 112         int numElements =
 113                 numEnergyGroups * numEnergyGroups * simdEnergyBufferStride * (cj_size / 2) * cj_size;
 114         VSvdw.resize(numElements);
 115         VSc.resize(numElements);
 116     }
 117 }
 118
 119 static void copy_int_to_nbat_int(const int* a, int na, int na_round, const int* in, int fill, int* innb)
 120 {
 121     int i, j;
 122
 123     j = 0;
 124     for (i = 0; i < na; i++)
 125     {
 126         innb[j++] = in[a[i]];
 127     }
 128     /* Complete the partially filled last cell with fill */
 129     for (; i < na_round; i++)
 130     {
 131         innb[j++] = fill;
 132     }
 133 }
 134
 135 void copy_rvec_to_nbat_real(const int* a, int na, int na_round, const rvec* x, int nbatFormat, real* xnb, int a0)
 136 {
 137     /* We complete partially filled cells, can only be the last one in each
 138      * column, with coordinates farAway. The actual coordinate value does
 139      * not influence the results, since these filler particles do not interact.
 140      * Clusters with normal atoms + fillers have a bounding box based only
 141      * on the coordinates of the atoms. Clusters with only fillers have as
 142      * the bounding box the coordinates of the first filler. Such clusters
 143      * are not considered as i-entries, but they are considered as j-entries.
 144      * So for performance it is better to have their bounding boxes far away,
 145      * such that filler only clusters don't end up in the pair list.
 146      */
 147     const real farAway = -1000000;
 148
 149     int i, j, c;
 150
 151     switch (nbatFormat)
 152     {
 153         case nbatXYZ:
 154             j = a0 * STRIDE_XYZ;
 155             for (i = 0; i < na; i++)
 156             {
 157                 xnb[j++] = x[a[i]][XX];
 158                 xnb[j++] = x[a[i]][YY];
 159                 xnb[j++] = x[a[i]][ZZ];
 160             }
 161             /* Complete the partially filled last cell with farAway elements */
 162             for (; i < na_round; i++)
 163             {
 164                 xnb[j++] = farAway;
 165                 xnb[j++] = farAway;
 166                 xnb[j++] = farAway;
 167             }
 168             break;
 169         case nbatXYZQ:
 170             j = a0 * STRIDE_XYZQ;
 171             for (i = 0; i < na; i++)
 172             {
 173                 xnb[j++] = x[a[i]][XX];
 174                 xnb[j++] = x[a[i]][YY];
 175                 xnb[j++] = x[a[i]][ZZ];
 176                 j++;
 177             }
 178             /* Complete the partially filled last cell with zeros */
 179             for (; i < na_round; i++)
 180             {
 181                 xnb[j++] = farAway;
 182                 xnb[j++] = farAway;
 183                 xnb[j++] = farAway;
 184                 j++;
 185             }
 186             break;
 187         case nbatX4:
 188             j = atom_to_x_index<c_packX4>(a0);
 189             c = a0 & (c_packX4 - 1);
 190             for (i = 0; i < na; i++)
 191             {
 192                 xnb[j + XX * c_packX4] = x[a[i]][XX];
 193                 xnb[j + YY * c_packX4] = x[a[i]][YY];
 194                 xnb[j + ZZ * c_packX4] = x[a[i]][ZZ];
 195                 j++;
 196                 c++;
 197                 if (c == c_packX4)
 198                 {
 199                     j += (DIM - 1) * c_packX4;
 200                     c = 0;
 201                 }
 202             }
 203             /* Complete the partially filled last cell with zeros */
 204             for (; i < na_round; i++)
 205             {
 206                 xnb[j + XX * c_packX4] = farAway;
 207                 xnb[j + YY * c_packX4] = farAway;
 208                 xnb[j + ZZ * c_packX4] = farAway;
 209                 j++;
 210                 c++;
 211                 if (c == c_packX4)
 212                 {
 213                     j += (DIM - 1) * c_packX4;
 214                     c = 0;
 215                 }
 216             }
 217             break;
 218         case nbatX8:
 219             j = atom_to_x_index<c_packX8>(a0);
 220             c = a0 & (c_packX8 - 1);
 221             for (i = 0; i < na; i++)
 222             {
 223                 xnb[j + XX * c_packX8] = x[a[i]][XX];
 224                 xnb[j + YY * c_packX8] = x[a[i]][YY];
 225                 xnb[j + ZZ * c_packX8] = x[a[i]][ZZ];
 226                 j++;
 227                 c++;
 228                 if (c == c_packX8)
 229                 {
 230                     j += (DIM - 1) * c_packX8;
 231                     c = 0;
 232                 }
 233             }
 234             /* Complete the partially filled last cell with zeros */
 235             for (; i < na_round; i++)
 236             {
 237                 xnb[j + XX * c_packX8] = farAway;
 238                 xnb[j + YY * c_packX8] = farAway;
 239                 xnb[j + ZZ * c_packX8] = farAway;
 240                 j++;
 241                 c++;
 242                 if (c == c_packX8)
 243                 {
 244                     j += (DIM - 1) * c_packX8;
 245                     c = 0;
 246                 }
 247             }
 248             break;
 249         default: gmx_incons("Unsupported nbnxn_atomdata_t format");
 250     }
 251 }
 252
 253 /* Stores the LJ parameter data in a format convenient for different kernels */
 254 static void set_lj_parameter_data(nbnxn_atomdata_t::Params* params, gmx_bool bSIMD)
 255 {
 256     int nt = params->numTypes;
 257
 258     if (bSIMD)
 259     {
 260 #if GMX_SIMD
 261         /* nbfp_aligned stores two parameters using the stride most suitable
 262          * for the present SIMD architecture, as specified by the constant
 263          * c_simdBestPairAlignment from the SIMD header.
 264          * There's a slight inefficiency in allocating and initializing nbfp_aligned
 265          * when it might not be used, but introducing the conditional code is not
 266          * really worth it.
 267          */
 268         params->nbfp_aligned.resize(nt * nt * c_simdBestPairAlignment);
 269
 270         for (int i = 0; i < nt; i++)
 271         {
 272             for (int j = 0; j < nt; j++)
 273             {
 274                 params->nbfp_aligned[(i * nt + j) * c_simdBestPairAlignment + 0] =
 275                         params->nbfp[(i * nt + j) * 2 + 0];
 276                 params->nbfp_aligned[(i * nt + j) * c_simdBestPairAlignment + 1] =
 277                         params->nbfp[(i * nt + j) * 2 + 1];
 278                 if (c_simdBestPairAlignment > 2)
 279                 {
 280                     params->nbfp_aligned[(i * nt + j) * c_simdBestPairAlignment + 2] = 0;
 281                     params->nbfp_aligned[(i * nt + j) * c_simdBestPairAlignment + 3] = 0;
 282                 }
 283             }
 284         }
 285 #endif
 286     }
 287
 288     /* We use combination rule data for SIMD combination rule kernels
 289      * and with LJ-PME kernels. We then only need parameters per atom type,
 290      * not per pair of atom types.
 291      */
 292     params->nbfp_comb.resize(nt * 2);
 293     switch (params->comb_rule)
 294     {
 295         case ljcrGEOM:
 296             params->comb_rule = ljcrGEOM;
 297
 298             for (int i = 0; i < nt; i++)
 299             {
 300                 /* Store the sqrt of the diagonal from the nbfp matrix */
 301                 params->nbfp_comb[i * 2]     = std::sqrt(params->nbfp[(i * nt + i) * 2]);
 302                 params->nbfp_comb[i * 2 + 1] = std::sqrt(params->nbfp[(i * nt + i) * 2 + 1]);
 303             }
 304             break;
 305         case ljcrLB:
 306             for (int i = 0; i < nt; i++)
 307             {
 308                 /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
 309                 const real c6  = params->nbfp[(i * nt + i) * 2];
 310                 const real c12 = params->nbfp[(i * nt + i) * 2 + 1];
 311                 if (c6 > 0 && c12 > 0)
 312                 {
 313                     /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps),
 314                      * so we get 6*C6 and 12*C12 after combining.
 315                      */
 316                     params->nbfp_comb[i * 2]     = 0.5 * gmx::sixthroot(c12 / c6);
 317                     params->nbfp_comb[i * 2 + 1] = std::sqrt(c6 * c6 / c12);
 318                 }
 319                 else
 320                 {
 321                     params->nbfp_comb[i * 2]     = 0;
 322                     params->nbfp_comb[i * 2 + 1] = 0;
 323                 }
 324             }
 325             break;
 326         case ljcrNONE:
 327             /* We always store the full matrix (see code above) */
 328             break;
 329         default: gmx_incons("Unknown combination rule");
 330     }
 331 }
 332
 333 nbnxn_atomdata_t::SimdMasks::SimdMasks()
 334 {
 335 #if GMX_SIMD
 336     constexpr int simd_width = GMX_SIMD_REAL_WIDTH;
 337     /* Set the diagonal cluster pair exclusion mask setup data.
 338      * In the kernel we check 0 < j - i to generate the masks.
 339      * Here we store j - i for generating the mask for the first i,
 340      * we substract 0.5 to avoid rounding issues.
 341      * In the kernel we can subtract 1 to generate the subsequent mask.
 342      */
 343     const int simd_4xn_diag_size = std::max(c_nbnxnCpuIClusterSize, simd_width);
 344     diagonal_4xn_j_minus_i.resize(simd_4xn_diag_size);
 345     for (int j = 0; j < simd_4xn_diag_size; j++)
 346     {
 347         diagonal_4xn_j_minus_i[j] = j - 0.5;
 348     }
 349
 350     diagonal_2xnn_j_minus_i.resize(simd_width);
 351     for (int j = 0; j < simd_width / 2; j++)
 352     {
 353         /* The j-cluster size is half the SIMD width */
 354         diagonal_2xnn_j_minus_i[j] = j - 0.5;
 355         /* The next half of the SIMD width is for i + 1 */
 356         diagonal_2xnn_j_minus_i[simd_width / 2 + j] = j - 1 - 0.5;
 357     }
 358
 359     /* We use up to 32 bits for exclusion masking.
 360      * The same masks are used for the 4xN and 2x(N+N) kernels.
 361      * The masks are read either into integer SIMD registers or into
 362      * real SIMD registers (together with a cast).
 363      * In single precision this means the real and integer SIMD registers
 364      * are of equal size.
 365      */
 366     const int simd_excl_size = c_nbnxnCpuIClusterSize * simd_width;
 367 #    if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
 368     exclusion_filter64.resize(simd_excl_size);
 369 #    else
 370     exclusion_filter.resize(simd_excl_size);
 371 #    endif
 372
 373     for (int j = 0; j < simd_excl_size; j++)
 374     {
 375         /* Set the consecutive bits for masking pair exclusions */
 376 #    if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
 377         exclusion_filter64[j] = (1U << j);
 378 #    else
 379         exclusion_filter[j] = (1U << j);
 380 #    endif
 381     }
 382
 383     if (!GMX_SIMD_HAVE_LOGICAL && !GMX_SIMD_HAVE_INT32_LOGICAL) // NOLINT(misc-redundant-expression)
 384     {
 385         // If the SIMD implementation has no bitwise logical operation support
 386         // whatsoever we cannot use the normal masking. Instead,
 387         // we generate a vector of all 2^4 possible ways an i atom
 388         // interacts with its 4 j atoms. Each array entry contains
 389         // GMX_SIMD_REAL_WIDTH values that are read with a single aligned SIMD load.
 390         // Since there is no logical value representation in this case, we use
 391         // any nonzero value to indicate 'true', while zero mean 'false'.
 392         // This can then be converted to a SIMD boolean internally in the SIMD
 393         // module by comparing to zero.
 394         // Each array entry encodes how this i atom will interact with the 4 j atoms.
 395         // Matching code exists in set_ci_top_excls() to generate indices into this array.
 396         // Those indices are used in the kernels.
 397
 398         const int  simd_excl_size = c_nbnxnCpuIClusterSize * c_nbnxnCpuIClusterSize;
 399         const real simdFalse      = 0.0;
 400         const real simdTrue       = 1.0;
 401
 402         interaction_array.resize(simd_excl_size * GMX_SIMD_REAL_WIDTH);
 403         for (int j = 0; j < simd_excl_size; j++)
 404         {
 405             const int index = j * GMX_SIMD_REAL_WIDTH;
 406             for (int i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 407             {
 408                 interaction_array[index + i] = (j & (1 << i)) ? simdTrue : simdFalse;
 409             }
 410         }
 411     }
 412 #endif
 413 }
 414
 415 nbnxn_atomdata_t::Params::Params(gmx::PinningPolicy pinningPolicy) :
 416     numTypes(0),
 417     nbfp({}, { pinningPolicy }),
 418     nbfp_comb({}, { pinningPolicy }),
 419     type({}, { pinningPolicy }),
 420     lj_comb({}, { pinningPolicy }),
 421     q({}, { pinningPolicy }),
 422     nenergrp(0),
 423     neg_2log(0),
 424     energrp({}, { pinningPolicy })
 425 {
 426 }
 427
 428 nbnxn_atomdata_t::nbnxn_atomdata_t(gmx::PinningPolicy pinningPolicy) :
 429     params_(pinningPolicy),
 430     numAtoms_(0),
 431     natoms_local(0),
 432     shift_vec({}, { pinningPolicy }),
 433     x_({}, { pinningPolicy }),
 434     simdMasks(),
 435     bUseBufferFlags(FALSE),
 436     bUseTreeReduce(FALSE)
 437 {
 438 }
 439
 440 /* Initializes an nbnxn_atomdata_t::Params data structure */
 441 static void nbnxn_atomdata_params_init(const gmx::MDLogger&      mdlog,
 442                                        nbnxn_atomdata_t::Params* params,
 443                                        const Nbnxm::KernelType   kernelType,
 444                                        int                       enbnxninitcombrule,
 445                                        int                       ntype,
 446                                        ArrayRef<const real>      nbfp,
 447                                        int                       n_energygroups)
 448 {
 449     real     c6, c12, tol;
 450     char*    ptr;
 451     gmx_bool simple, bCombGeom, bCombLB, bSIMD;
 452
 453     if (debug)
 454     {
 455         fprintf(debug, "There are %d atom types in the system, adding one for nbnxn_atomdata_t\n", ntype);
 456     }
 457     params->numTypes = ntype + 1;
 458     params->nbfp.resize(params->numTypes * params->numTypes * 2);
 459     params->nbfp_comb.resize(params->numTypes * 2);
 460
 461     /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
 462      * force-field floating point parameters.
 463      */
 464     tol = 1e-5;
 465     ptr = getenv("GMX_LJCOMB_TOL");
 466     if (ptr != nullptr)
 467     {
 468         double dbl;
 469
 470         sscanf(ptr, "%lf", &dbl);
 471         tol = dbl;
 472     }
 473     bCombGeom = TRUE;
 474     bCombLB   = TRUE;
 475
 476     /* Temporarily fill params->nbfp_comb with sigma and epsilon
 477      * to check for the LB rule.
 478      */
 479     for (int i = 0; i < ntype; i++)
 480     {
 481         c6  = nbfp[(i * ntype + i) * 2] / 6.0;
 482         c12 = nbfp[(i * ntype + i) * 2 + 1] / 12.0;
 483         if (c6 > 0 && c12 > 0)
 484         {
 485             params->nbfp_comb[i * 2]     = gmx::sixthroot(c12 / c6);
 486             params->nbfp_comb[i * 2 + 1] = 0.25 * c6 * c6 / c12;
 487         }
 488         else if (c6 == 0 && c12 == 0)
 489         {
 490             params->nbfp_comb[i * 2]     = 0;
 491             params->nbfp_comb[i * 2 + 1] = 0;
 492         }
 493         else
 494         {
 495             /* Can not use LB rule with only dispersion or repulsion */
 496             bCombLB = FALSE;
 497         }
 498     }
 499
 500     for (int i = 0; i < params->numTypes; i++)
 501     {
 502         for (int j = 0; j < params->numTypes; j++)
 503         {
 504             if (i < ntype && j < ntype)
 505             {
 506                 /* fr->nbfp has been updated, so that array too now stores c6/c12 including
 507                  * the 6.0/12.0 prefactors to save 2 flops in the most common case (force-only).
 508                  */
 509                 c6                                               = nbfp[(i * ntype + j) * 2];
 510                 c12                                              = nbfp[(i * ntype + j) * 2 + 1];
 511                 params->nbfp[(i * params->numTypes + j) * 2]     = c6;
 512                 params->nbfp[(i * params->numTypes + j) * 2 + 1] = c12;
 513
 514                 /* Compare 6*C6 and 12*C12 for geometric cobination rule */
 515                 bCombGeom =
 516                         bCombGeom
 517                         && gmx_within_tol(c6 * c6,
 518                                           nbfp[(i * ntype + i) * 2] * nbfp[(j * ntype + j) * 2], tol)
 519                         && gmx_within_tol(c12 * c12,
 520                                           nbfp[(i * ntype + i) * 2 + 1] * nbfp[(j * ntype + j) * 2 + 1],
 521                                           tol);
 522
 523                 /* Compare C6 and C12 for Lorentz-Berthelot combination rule */
 524                 c6 /= 6.0;
 525                 c12 /= 12.0;
 526                 bCombLB =
 527                         bCombLB
 528                         && ((c6 == 0 && c12 == 0
 529                              && (params->nbfp_comb[i * 2 + 1] == 0 || params->nbfp_comb[j * 2 + 1] == 0))
 530                             || (c6 > 0 && c12 > 0
 531                                 && gmx_within_tol(
 532                                            gmx::sixthroot(c12 / c6),
 533                                            0.5 * (params->nbfp_comb[i * 2] + params->nbfp_comb[j * 2]), tol)
 534                                 && gmx_within_tol(0.25 * c6 * c6 / c12,
 535                                                   std::sqrt(params->nbfp_comb[i * 2 + 1]
 536                                                             * params->nbfp_comb[j * 2 + 1]),
 537                                                   tol)));
 538             }
 539             else
 540             {
 541                 /* Add zero parameters for the additional dummy atom type */
 542                 params->nbfp[(i * params->numTypes + j) * 2]     = 0;
 543                 params->nbfp[(i * params->numTypes + j) * 2 + 1] = 0;
 544             }
 545         }
 546     }
 547     if (debug)
 548     {
 549         fprintf(debug, "Combination rules: geometric %s Lorentz-Berthelot %s\n",
 550                 gmx::boolToString(bCombGeom), gmx::boolToString(bCombLB));
 551     }
 552
 553     simple = Nbnxm::kernelTypeUsesSimplePairlist(kernelType);
 554
 555     switch (enbnxninitcombrule)
 556     {
 557         case enbnxninitcombruleDETECT:
 558             /* We prefer the geometic combination rule,
 559              * as that gives a slightly faster kernel than the LB rule.
 560              */
 561             if (bCombGeom)
 562             {
 563                 params->comb_rule = ljcrGEOM;
 564             }
 565             else if (bCombLB)
 566             {
 567                 params->comb_rule = ljcrLB;
 568             }
 569             else
 570             {
 571                 params->comb_rule = ljcrNONE;
 572
 573                 params->nbfp_comb.clear();
 574             }
 575
 576             {
 577                 std::string mesg;
 578                 if (params->comb_rule == ljcrNONE)
 579                 {
 580                     mesg = "Using full Lennard-Jones parameter combination matrix";
 581                 }
 582                 else
 583                 {
 584                     mesg = gmx::formatString(
 585                             "Using %s Lennard-Jones combination rule",
 586                             params->comb_rule == ljcrGEOM ? "geometric" : "Lorentz-Berthelot");
 587                 }
 588                 GMX_LOG(mdlog.info).asParagraph().appendText(mesg);
 589             }
 590             break;
 591         case enbnxninitcombruleGEOM: params->comb_rule = ljcrGEOM; break;
 592         case enbnxninitcombruleLB: params->comb_rule = ljcrLB; break;
 593         case enbnxninitcombruleNONE:
 594             params->comb_rule = ljcrNONE;
 595
 596             params->nbfp_comb.clear();
 597             break;
 598         default: gmx_incons("Unknown enbnxninitcombrule");
 599     }
 600
 601     bSIMD = Nbnxm::kernelTypeIsSimd(kernelType);
 602
 603     set_lj_parameter_data(params, bSIMD);
 604
 605     params->nenergrp = n_energygroups;
 606     if (!simple)
 607     {
 608         // We now check for energy groups already when starting mdrun
 609         GMX_RELEASE_ASSERT(n_energygroups == 1, "GPU kernels do not support energy groups");
 610     }
 611     /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
 612     if (params->nenergrp > 64)
 613     {
 614         gmx_fatal(FARGS, "With NxN kernels not more than 64 energy groups are supported\n");
 615     }
 616     params->neg_2log = 1;
 617     while (params->nenergrp > (1 << params->neg_2log))
 618     {
 619         params->neg_2log++;
 620     }
 621 }
 622
 623 /* Initializes an nbnxn_atomdata_t data structure */
 624 void nbnxn_atomdata_init(const gmx::MDLogger&    mdlog,
 625                          nbnxn_atomdata_t*       nbat,
 626                          const Nbnxm::KernelType kernelType,
 627                          int                     enbnxninitcombrule,
 628                          int                     ntype,
 629                          ArrayRef<const real>    nbfp,
 630                          int                     n_energygroups,
 631                          int                     nout)
 632 {
 633     nbnxn_atomdata_params_init(mdlog, &nbat->paramsDeprecated(), kernelType, enbnxninitcombrule,
 634                                ntype, nbfp, n_energygroups);
 635
 636     const bool simple = Nbnxm::kernelTypeUsesSimplePairlist(kernelType);
 637     const bool bSIMD  = Nbnxm::kernelTypeIsSimd(kernelType);
 638
 639     if (simple)
 640     {
 641         int pack_x;
 642
 643         if (bSIMD)
 644         {
 645             pack_x = std::max(c_nbnxnCpuIClusterSize, Nbnxm::JClusterSizePerKernelType[kernelType]);
 646             switch (pack_x)
 647             {
 648                 case 4: nbat->XFormat = nbatX4; break;
 649                 case 8: nbat->XFormat = nbatX8; break;
 650                 default: gmx_incons("Unsupported packing width");
 651             }
 652         }
 653         else
 654         {
 655             nbat->XFormat = nbatXYZ;
 656         }
 657
 658         nbat->FFormat = nbat->XFormat;
 659     }
 660     else
 661     {
 662         nbat->XFormat = nbatXYZQ;
 663         nbat->FFormat = nbatXYZ;
 664     }
 665
 666     nbat->shift_vec.resize(SHIFTS);
 667
 668     nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
 669     nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
 670
 671     /* Initialize the output data structures */
 672     for (int i = 0; i < nout; i++)
 673     {
 674         const auto& pinningPolicy = nbat->params().type.get_allocator().pinningPolicy();
 675         nbat->out.emplace_back(kernelType, nbat->params().nenergrp, 1 << nbat->params().neg_2log,
 676                                pinningPolicy);
 677     }
 678
 679     nbat->buffer_flags.flag        = nullptr;
 680     nbat->buffer_flags.flag_nalloc = 0;
 681
 682     const int nth = gmx_omp_nthreads_get(emntNonbonded);
 683
 684     const char* ptr = getenv("GMX_USE_TREEREDUCE");
 685     if (ptr != nullptr)
 686     {
 687         nbat->bUseTreeReduce = (strtol(ptr, nullptr, 10) != 0);
 688     }
 689 #if defined __MIC__
 690     else if (nth > 8) /*on the CPU we currently don't benefit even at 32*/
 691     {
 692         nbat->bUseTreeReduce = 1;
 693     }
 694 #endif
 695     else
 696     {
 697         nbat->bUseTreeReduce = false;
 698     }
 699     if (nbat->bUseTreeReduce)
 700     {
 701         GMX_LOG(mdlog.info).asParagraph().appendText("Using tree force reduction");
 702
 703         nbat->syncStep = new tMPI_Atomic[nth];
 704     }
 705 }
 706
 707 template<int packSize>
 708 static void copy_lj_to_nbat_lj_comb(gmx::ArrayRef<const real> ljparam_type, const int* type, int na, real* ljparam_at)
 709 {
 710     /* The LJ params follow the combination rule:
 711      * copy the params for the type array to the atom array.
 712      */
 713     for (int is = 0; is < na; is += packSize)
 714     {
 715         for (int k = 0; k < packSize; k++)
 716         {
 717             int i                             = is + k;
 718             ljparam_at[is * 2 + k]            = ljparam_type[type[i] * 2];
 719             ljparam_at[is * 2 + packSize + k] = ljparam_type[type[i] * 2 + 1];
 720         }
 721     }
 722 }
 723
 724 /* Sets the atom type in nbnxn_atomdata_t */
 725 static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t::Params* params,
 726                                          const Nbnxm::GridSet&     gridSet,
 727                                          const int*                type)
 728 {
 729     params->type.resize(gridSet.numGridAtomsTotal());
 730
 731     for (const Nbnxm::Grid& grid : gridSet.grids())
 732     {
 733         /* Loop over all columns and copy and fill */
 734         for (int i = 0; i < grid.numColumns(); i++)
 735         {
 736             const int numAtoms   = grid.paddedNumAtomsInColumn(i);
 737             const int atomOffset = grid.firstAtomInColumn(i);
 738
 739             copy_int_to_nbat_int(gridSet.atomIndices().data() + atomOffset, grid.numAtomsInColumn(i),
 740                                  numAtoms, type, params->numTypes - 1, params->type.data() + atomOffset);
 741         }
 742     }
 743 }
 744
 745 /* Sets the LJ combination rule parameters in nbnxn_atomdata_t */
 746 static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t::Params* params,
 747                                             const int                 XFormat,
 748                                             const Nbnxm::GridSet&     gridSet)
 749 {
 750     params->lj_comb.resize(gridSet.numGridAtomsTotal() * 2);
 751
 752     if (params->comb_rule != ljcrNONE)
 753     {
 754         for (const Nbnxm::Grid& grid : gridSet.grids())
 755         {
 756             /* Loop over all columns and copy and fill */
 757             for (int i = 0; i < grid.numColumns(); i++)
 758             {
 759                 const int numAtoms   = grid.paddedNumAtomsInColumn(i);
 760                 const int atomOffset = grid.firstAtomInColumn(i);
 761
 762                 if (XFormat == nbatX4)
 763                 {
 764                     copy_lj_to_nbat_lj_comb<c_packX4>(params->nbfp_comb,
 765                                                       params->type.data() + atomOffset, numAtoms,
 766                                                       params->lj_comb.data() + atomOffset * 2);
 767                 }
 768                 else if (XFormat == nbatX8)
 769                 {
 770                     copy_lj_to_nbat_lj_comb<c_packX8>(params->nbfp_comb,
 771                                                       params->type.data() + atomOffset, numAtoms,
 772                                                       params->lj_comb.data() + atomOffset * 2);
 773                 }
 774                 else if (XFormat == nbatXYZQ)
 775                 {
 776                     copy_lj_to_nbat_lj_comb<1>(params->nbfp_comb, params->type.data() + atomOffset,
 777                                                numAtoms, params->lj_comb.data() + atomOffset * 2);
 778                 }
 779             }
 780         }
 781     }
 782 }
 783
 784 /* Sets the charges in nbnxn_atomdata_t *nbat */
 785 static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t* nbat, const Nbnxm::GridSet& gridSet, const real* charge)
 786 {
 787     if (nbat->XFormat != nbatXYZQ)
 788     {
 789         nbat->paramsDeprecated().q.resize(nbat->numAtoms());
 790     }
 791
 792     for (const Nbnxm::Grid& grid : gridSet.grids())
 793     {
 794         /* Loop over all columns and copy and fill */
 795         for (int cxy = 0; cxy < grid.numColumns(); cxy++)
 796         {
 797             const int atomOffset     = grid.firstAtomInColumn(cxy);
 798             const int numAtoms       = grid.numAtomsInColumn(cxy);
 799             const int paddedNumAtoms = grid.paddedNumAtomsInColumn(cxy);
 800
 801             if (nbat->XFormat == nbatXYZQ)
 802             {
 803                 real* q = nbat->x().data() + atomOffset * STRIDE_XYZQ + ZZ + 1;
 804                 int   i;
 805                 for (i = 0; i < numAtoms; i++)
 806                 {
 807                     *q = charge[gridSet.atomIndices()[atomOffset + i]];
 808                     q += STRIDE_XYZQ;
 809                 }
 810                 /* Complete the partially filled last cell with zeros */
 811                 for (; i < paddedNumAtoms; i++)
 812                 {
 813                     *q = 0;
 814                     q += STRIDE_XYZQ;
 815                 }
 816             }
 817             else
 818             {
 819                 real* q = nbat->paramsDeprecated().q.data() + atomOffset;
 820                 int   i;
 821                 for (i = 0; i < numAtoms; i++)
 822                 {
 823                     *q = charge[gridSet.atomIndices()[atomOffset + i]];
 824                     q++;
 825                 }
 826                 /* Complete the partially filled last cell with zeros */
 827                 for (; i < paddedNumAtoms; i++)
 828                 {
 829                     *q = 0;
 830                     q++;
 831                 }
 832             }
 833         }
 834     }
 835 }
 836
 837 /* Set the charges of perturbed atoms in nbnxn_atomdata_t to 0.
 838  * This is to automatically remove the RF/PME self term in the nbnxn kernels.
 839  * Part of the zero interactions are still calculated in the normal kernels.
 840  * All perturbed interactions are calculated in the free energy kernel,
 841  * using the original charge and LJ data, not nbnxn_atomdata_t.
 842  */
 843 static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t* nbat, const Nbnxm::GridSet& gridSet)
 844 {
 845     nbnxn_atomdata_t::Params& params = nbat->paramsDeprecated();
 846     real*                     q;
 847     int                       stride_q;
 848
 849     if (nbat->XFormat == nbatXYZQ)
 850     {
 851         q        = nbat->x().data() + ZZ + 1;
 852         stride_q = STRIDE_XYZQ;
 853     }
 854     else
 855     {
 856         q        = params.q.data();
 857         stride_q = 1;
 858     }
 859
 860     for (const Nbnxm::Grid& grid : gridSet.grids())
 861     {
 862         int nsubc;
 863         if (grid.geometry().isSimple)
 864         {
 865             nsubc = 1;
 866         }
 867         else
 868         {
 869             nsubc = c_gpuNumClusterPerCell;
 870         }
 871
 872         int c_offset = grid.firstAtomInColumn(0);
 873
 874         /* Loop over all columns and copy and fill */
 875         for (int c = 0; c < grid.numCells() * nsubc; c++)
 876         {
 877             /* Does this cluster contain perturbed particles? */
 878             if (grid.clusterIsPerturbed(c))
 879             {
 880                 const int numAtomsPerCluster = grid.geometry().numAtomsICluster;
 881                 for (int i = 0; i < numAtomsPerCluster; i++)
 882                 {
 883                     /* Is this a perturbed particle? */
 884                     if (grid.atomIsPerturbed(c, i))
 885                     {
 886                         int ind = c_offset + c * numAtomsPerCluster + i;
 887                         /* Set atom type and charge to non-interacting */
 888                         params.type[ind]  = params.numTypes - 1;
 889                         q[ind * stride_q] = 0;
 890                     }
 891                 }
 892             }
 893         }
 894     }
 895 }
 896
 897 /* Copies the energy group indices to a reordered and packed array */
 898 static void
 899 copy_egp_to_nbat_egps(const int* a, int na, int na_round, int na_c, int bit_shift, const int* in, int* innb)
 900 {
 901     int i;
 902     int comb;
 903
 904     int j = 0;
 905     for (i = 0; i < na; i += na_c)
 906     {
 907         /* Store na_c energy group numbers into one int */
 908         comb = 0;
 909         for (int sa = 0; sa < na_c; sa++)
 910         {
 911             int at = a[i + sa];
 912             if (at >= 0)
 913             {
 914                 comb |= (GET_CGINFO_GID(in[at]) << (sa * bit_shift));
 915             }
 916         }
 917         innb[j++] = comb;
 918     }
 919     /* Complete the partially filled last cell with fill */
 920     for (; i < na_round; i += na_c)
 921     {
 922         innb[j++] = 0;
 923     }
 924 }
 925
 926 /* Set the energy group indices for atoms in nbnxn_atomdata_t */
 927 static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t::Params* params,
 928                                             const Nbnxm::GridSet&     gridSet,
 929                                             const int*                atinfo)
 930 {
 931     if (params->nenergrp == 1)
 932     {
 933         return;
 934     }
 935
 936     params->energrp.resize(gridSet.numGridAtomsTotal());
 937
 938     for (const Nbnxm::Grid& grid : gridSet.grids())
 939     {
 940         /* Loop over all columns and copy and fill */
 941         for (int i = 0; i < grid.numColumns(); i++)
 942         {
 943             const int numAtoms   = grid.paddedNumAtomsInColumn(i);
 944             const int atomOffset = grid.firstAtomInColumn(i);
 945
 946             copy_egp_to_nbat_egps(gridSet.atomIndices().data() + atomOffset, grid.numAtomsInColumn(i),
 947                                   numAtoms, c_nbnxnCpuIClusterSize, params->neg_2log, atinfo,
 948                                   params->energrp.data() + grid.atomToCluster(atomOffset));
 949         }
 950     }
 951 }
 952
 953 /* Sets all required atom parameter data in nbnxn_atomdata_t */
 954 void nbnxn_atomdata_set(nbnxn_atomdata_t*     nbat,
 955                         const Nbnxm::GridSet& gridSet,
 956                         const t_mdatoms*      mdatoms,
 957                         const int*            atinfo)
 958 {
 959     nbnxn_atomdata_t::Params& params = nbat->paramsDeprecated();
 960
 961     nbnxn_atomdata_set_atomtypes(&params, gridSet, mdatoms->typeA);
 962
 963     nbnxn_atomdata_set_charges(nbat, gridSet, mdatoms->chargeA);
 964
 965     if (gridSet.haveFep())
 966     {
 967         nbnxn_atomdata_mask_fep(nbat, gridSet);
 968     }
 969
 970     /* This must be done after masking types for FEP */
 971     nbnxn_atomdata_set_ljcombparams(&params, nbat->XFormat, gridSet);
 972
 973     nbnxn_atomdata_set_energygroups(&params, gridSet, atinfo);
 974 }
 975
 976 /* Copies the shift vector array to nbnxn_atomdata_t */
 977 void nbnxn_atomdata_copy_shiftvec(gmx_bool bDynamicBox, rvec* shift_vec, nbnxn_atomdata_t* nbat)
 978 {
 979     int i;
 980
 981     nbat->bDynamicBox = bDynamicBox;
 982     for (i = 0; i < SHIFTS; i++)
 983     {
 984         copy_rvec(shift_vec[i], nbat->shift_vec[i]);
 985     }
 986 }
 987
 988 // This is slightly different from nbnxn_get_atom_range(...) at the end of the file
 989 // TODO: Combine if possible
 990 static void getAtomRanges(const Nbnxm::GridSet&   gridSet,
 991                           const gmx::AtomLocality locality,
 992                           int*                    gridBegin,
 993                           int*                    gridEnd)
 994 {
 995     switch (locality)
 996     {
 997         case gmx::AtomLocality::All:
 998             *gridBegin = 0;
 999             *gridEnd   = gridSet.grids().size();
1000             break;
1001         case gmx::AtomLocality::Local:
1002             *gridBegin = 0;
1003             *gridEnd   = 1;
1004             break;
1005         case gmx::AtomLocality::NonLocal:
1006             *gridBegin = 1;
1007             *gridEnd   = gridSet.grids().size();
1008             break;
1009         case gmx::AtomLocality::Count:
1010             GMX_ASSERT(false, "Count is invalid locality specifier");
1011             break;
1012     }
1013 }
1014
1015 /* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
1016 void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet&   gridSet,
1017                                      const gmx::AtomLocality locality,
1018                                      bool                    fillLocal,
1019                                      const rvec*             coordinates,
1020                                      nbnxn_atomdata_t*       nbat)
1021 {
1022
1023     int gridBegin = 0;
1024     int gridEnd   = 0;
1025     getAtomRanges(gridSet, locality, &gridBegin, &gridEnd);
1026
1027     if (fillLocal)
1028     {
1029         nbat->natoms_local = gridSet.grids()[0].atomIndexEnd();
1030     }
1031
1032     const int nth = gmx_omp_nthreads_get(emntPairsearch);
1033 #pragma omp parallel for num_threads(nth) schedule(static)
1034     for (int th = 0; th < nth; th++)
1035     {
1036         try
1037         {
1038             for (int g = gridBegin; g < gridEnd; g++)
1039             {
1040                 const Nbnxm::Grid& grid       = gridSet.grids()[g];
1041                 const int          numCellsXY = grid.numColumns();
1042
1043                 const int cxy0 = (numCellsXY * th + nth - 1) / nth;
1044                 const int cxy1 = (numCellsXY * (th + 1) + nth - 1) / nth;
1045
1046                 for (int cxy = cxy0; cxy < cxy1; cxy++)
1047                 {
1048                     const int na  = grid.numAtomsInColumn(cxy);
1049                     const int ash = grid.firstAtomInColumn(cxy);
1050
1051                     int na_fill;
1052                     if (g == 0 && fillLocal)
1053                     {
1054                         na_fill = grid.paddedNumAtomsInColumn(cxy);
1055                     }
1056                     else
1057                     {
1058                         /* We fill only the real particle locations.
1059                          * We assume the filling entries at the end have been
1060                          * properly set before during pair-list generation.
1061                          */
1062                         na_fill = na;
1063                     }
1064                     copy_rvec_to_nbat_real(gridSet.atomIndices().data() + ash, na, na_fill,
1065                                            coordinates, nbat->XFormat, nbat->x().data(), ash);
1066                 }
1067             }
1068         }
1069         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1070     }
1071 }
1072
1073 /* Copies (and reorders) the coordinates to nbnxn_atomdata_t on the GPU*/
1074 void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
1075                                     const gmx::AtomLocality locality,
1076                                     bool                    fillLocal,
1077                                     NbnxmGpu*               gpu_nbv,
1078                                     DeviceBuffer<float>     d_x,
1079                                     GpuEventSynchronizer*   xReadyOnDevice)
1080 {
1081
1082     int gridBegin = 0;
1083     int gridEnd   = 0;
1084     getAtomRanges(gridSet, locality, &gridBegin, &gridEnd);
1085
1086     for (int g = gridBegin; g < gridEnd; g++)
1087     {
1088         nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g], fillLocal && g == 0, gpu_nbv, d_x, xReadyOnDevice,
1089                               locality, g, gridSet.numColumnsMax());
1090     }
1091 }
1092
1093 static void nbnxn_atomdata_clear_reals(gmx::ArrayRef<real> dest, int i0, int i1)
1094 {
1095     for (int i = i0; i < i1; i++)
1096     {
1097         dest[i] = 0;
1098     }
1099 }
1100
1101 gmx_unused static void nbnxn_atomdata_reduce_reals(real* gmx_restrict dest,
1102                                                    gmx_bool           bDestSet,
1103                                                    const real** gmx_restrict src,
1104                                                    int                       nsrc,
1105                                                    int                       i0,
1106                                                    int                       i1)
1107 {
1108     if (bDestSet)
1109     {
1110         /* The destination buffer contains data, add to it */
1111         for (int i = i0; i < i1; i++)
1112         {
1113             for (int s = 0; s < nsrc; s++)
1114             {
1115                 dest[i] += src[s][i];
1116             }
1117         }
1118     }
1119     else
1120     {
1121         /* The destination buffer is unitialized, set it first */
1122         for (int i = i0; i < i1; i++)
1123         {
1124             dest[i] = src[0][i];
1125             for (int s = 1; s < nsrc; s++)
1126             {
1127                 dest[i] += src[s][i];
1128             }
1129         }
1130     }
1131 }
1132
1133 gmx_unused static void nbnxn_atomdata_reduce_reals_simd(real gmx_unused* gmx_restrict dest,
1134                                                         gmx_bool gmx_unused bDestSet,
1135                                                         const gmx_unused real** gmx_restrict src,
1136                                                         int gmx_unused nsrc,
1137                                                         int gmx_unused i0,
1138                                                         int gmx_unused i1)
1139 {
1140 #if GMX_SIMD
1141     /* The SIMD width here is actually independent of that in the kernels,
1142      * but we use the same width for simplicity (usually optimal anyhow).
1143      */
1144     SimdReal dest_SSE, src_SSE;
1145
1146     if (bDestSet)
1147     {
1148         for (int i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
1149         {
1150             dest_SSE = load<SimdReal>(dest + i);
1151             for (int s = 0; s < nsrc; s++)
1152             {
1153                 src_SSE  = load<SimdReal>(src[s] + i);
1154                 dest_SSE = dest_SSE + src_SSE;
1155             }
1156             store(dest + i, dest_SSE);
1157         }
1158     }
1159     else
1160     {
1161         for (int i = i0; i < i1; i += GMX_SIMD_REAL_WIDTH)
1162         {
1163             dest_SSE = load<SimdReal>(src[0] + i);
1164             for (int s = 1; s < nsrc; s++)
1165             {
1166                 src_SSE  = load<SimdReal>(src[s] + i);
1167                 dest_SSE = dest_SSE + src_SSE;
1168             }
1169             store(dest + i, dest_SSE);
1170         }
1171     }
1172 #endif
1173 }
1174
1175 /* Add part of the force array(s) from nbnxn_atomdata_t to f
1176  *
1177  * Note: Adding restrict to f makes this function 50% slower with gcc 7.3
1178  */
1179 static void nbnxn_atomdata_add_nbat_f_to_f_part(const Nbnxm::GridSet&          gridSet,
1180                                                 const nbnxn_atomdata_t&        nbat,
1181                                                 const nbnxn_atomdata_output_t& out,
1182                                                 const int                      a0,
1183                                                 const int                      a1,
1184                                                 rvec*                          f)
1185 {
1186     gmx::ArrayRef<const int> cell = gridSet.cells();
1187     // Note: Using ArrayRef instead makes this code 25% slower with gcc 7.3
1188     const real* fnb = out.f.data();
1189
1190     /* Loop over all columns and copy and fill */
1191     switch (nbat.FFormat)
1192     {
1193         case nbatXYZ:
1194         case nbatXYZQ:
1195             for (int a = a0; a < a1; a++)
1196             {
1197                 int i = cell[a] * nbat.fstride;
1198
1199                 f[a][XX] += fnb[i];
1200                 f[a][YY] += fnb[i + 1];
1201                 f[a][ZZ] += fnb[i + 2];
1202             }
1203             break;
1204         case nbatX4:
1205             for (int a = a0; a < a1; a++)
1206             {
1207                 int i = atom_to_x_index<c_packX4>(cell[a]);
1208
1209                 f[a][XX] += fnb[i + XX * c_packX4];
1210                 f[a][YY] += fnb[i + YY * c_packX4];
1211                 f[a][ZZ] += fnb[i + ZZ * c_packX4];
1212             }
1213             break;
1214         case nbatX8:
1215             for (int a = a0; a < a1; a++)
1216             {
1217                 int i = atom_to_x_index<c_packX8>(cell[a]);
1218
1219                 f[a][XX] += fnb[i + XX * c_packX8];
1220                 f[a][YY] += fnb[i + YY * c_packX8];
1221                 f[a][ZZ] += fnb[i + ZZ * c_packX8];
1222             }
1223             break;
1224         default: gmx_incons("Unsupported nbnxn_atomdata_t format");
1225     }
1226 }
1227
1228 static inline unsigned char reverse_bits(unsigned char b)
1229 {
1230     /* http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv */
1231     return (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
1232 }
1233
1234 static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbnxn_atomdata_t* nbat, int nth)
1235 {
1236     const nbnxn_buffer_flags_t* flags = &nbat->buffer_flags;
1237
1238     int next_pow2 = 1 << (gmx::log2I(nth - 1) + 1);
1239
1240     const int numOutputBuffers = nbat->out.size();
1241     GMX_ASSERT(numOutputBuffers == nth,
1242                "tree-reduce currently only works for numOutputBuffers==nth");
1243
1244     memset(nbat->syncStep, 0, sizeof(*(nbat->syncStep)) * nth);
1245
1246 #pragma omp parallel num_threads(nth)
1247     {
1248         try
1249         {
1250             int b0, b1, b;
1251             int i0, i1;
1252             int group_size, th;
1253
1254             th = gmx_omp_get_thread_num();
1255
1256             for (group_size = 2; group_size < 2 * next_pow2; group_size *= 2)
1257             {
1258                 int index[2], group_pos, partner_pos, wu;
1259                 int partner_th = th ^ (group_size / 2);
1260
1261                 if (group_size > 2)
1262                 {
1263 #ifdef TMPI_ATOMICS
1264                     /* wait on partner thread - replaces full barrier */
1265                     int sync_th, sync_group_size;
1266
1267                     tMPI_Atomic_memory_barrier(); /* gurantee data is saved before marking work as done */
1268                     tMPI_Atomic_set(&(nbat->syncStep[th]), group_size / 2); /* mark previous step as completed */
1269
1270                     /* find thread to sync with. Equal to partner_th unless nth is not a power of two. */
1271                     for (sync_th = partner_th, sync_group_size = group_size;
1272                          sync_th >= nth && sync_group_size > 2; sync_group_size /= 2)
1273                     {
1274                         sync_th &= ~(sync_group_size / 4);
1275                     }
1276                     if (sync_th < nth) /* otherwise nothing to sync index[1] will be >=nout */
1277                     {
1278                         /* wait on the thread which computed input data in previous step */
1279                         while (tMPI_Atomic_get(static_cast<volatile tMPI_Atomic_t*>(&(nbat->syncStep[sync_th])))
1280                                < group_size / 2)
1281                         {
1282                             gmx_pause();
1283                         }
1284                         /* guarantee that no later load happens before wait loop is finisehd */
1285                         tMPI_Atomic_memory_barrier();
1286                     }
1287 #else /* TMPI_ATOMICS */
1288 #    pragma omp barrier
1289 #endif
1290                 }
1291
1292                 /* Calculate buffers to sum (result goes into first buffer) */
1293                 group_pos = th % group_size;
1294                 index[0]  = th - group_pos;
1295                 index[1]  = index[0] + group_size / 2;
1296
1297                 /* If no second buffer, nothing to do */
1298                 if (index[1] >= numOutputBuffers && group_size > 2)
1299                 {
1300                     continue;
1301                 }
1302
1303 #if NBNXN_BUFFERFLAG_MAX_THREADS > 256
1304 #    error reverse_bits assumes max 256 threads
1305 #endif
1306                 /* Position is permuted so that one of the 2 vectors being added was computed on the same thread in the previous step.
1307                    This improves locality and enables to sync with just a single thread between steps (=the levels in the btree).
1308                    The permutation which allows this corresponds to reversing the bits of the group position.
1309                  */
1310                 group_pos = reverse_bits(group_pos) / (256 / group_size);
1311
1312                 partner_pos = group_pos ^ 1;
1313
1314                 /* loop over two work-units (own and partner) */
1315                 for (wu = 0; wu < 2; wu++)
1316                 {
1317                     if (wu == 1)
1318                     {
1319                         if (partner_th < nth)
1320                         {
1321                             break; /* partner exists we don't have to do his work */
1322                         }
1323                         else
1324                         {
1325                             group_pos = partner_pos;
1326                         }
1327                     }
1328
1329                     /* Calculate the cell-block range for our thread */
1330                     b0 = (flags->nflag * group_pos) / group_size;
1331                     b1 = (flags->nflag * (group_pos + 1)) / group_size;
1332
1333                     for (b = b0; b < b1; b++)
1334                     {
1335                         i0 = b * NBNXN_BUFFERFLAG_SIZE * nbat->fstride;
1336                         i1 = (b + 1) * NBNXN_BUFFERFLAG_SIZE * nbat->fstride;
1337
1338                         if (bitmask_is_set(flags->flag[b], index[1]) || group_size > 2)
1339                         {
1340                             const real* fIndex1 = nbat->out[index[1]].f.data();
1341 #if GMX_SIMD
1342                             nbnxn_atomdata_reduce_reals_simd
1343 #else
1344                             nbnxn_atomdata_reduce_reals
1345 #endif
1346                                     (nbat->out[index[0]].f.data(),
1347                                      bitmask_is_set(flags->flag[b], index[0]) || group_size > 2,
1348                                      &fIndex1, 1, i0, i1);
1349                         }
1350                         else if (!bitmask_is_set(flags->flag[b], index[0]))
1351                         {
1352                             nbnxn_atomdata_clear_reals(nbat->out[index[0]].f, i0, i1);
1353                         }
1354                     }
1355                 }
1356             }
1357         }
1358         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1359     }
1360 }
1361
1362
1363 static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t* nbat, int nth)
1364 {
1365 #pragma omp parallel for num_threads(nth) schedule(static)
1366     for (int th = 0; th < nth; th++)
1367     {
1368         try
1369         {
1370             const nbnxn_buffer_flags_t* flags;
1371             int                         nfptr;
1372             const real*                 fptr[NBNXN_BUFFERFLAG_MAX_THREADS];
1373
1374             flags = &nbat->buffer_flags;
1375
1376             /* Calculate the cell-block range for our thread */
1377             int b0 = (flags->nflag * th) / nth;
1378             int b1 = (flags->nflag * (th + 1)) / nth;
1379
1380             for (int b = b0; b < b1; b++)
1381             {
1382                 int i0 = b * NBNXN_BUFFERFLAG_SIZE * nbat->fstride;
1383                 int i1 = (b + 1) * NBNXN_BUFFERFLAG_SIZE * nbat->fstride;
1384
1385                 nfptr = 0;
1386                 for (gmx::index out = 1; out < gmx::ssize(nbat->out); out++)
1387                 {
1388                     if (bitmask_is_set(flags->flag[b], out))
1389                     {
1390                         fptr[nfptr++] = nbat->out[out].f.data();
1391                     }
1392                 }
1393                 if (nfptr > 0)
1394                 {
1395 #if GMX_SIMD
1396                     nbnxn_atomdata_reduce_reals_simd
1397 #else
1398                     nbnxn_atomdata_reduce_reals
1399 #endif
1400                             (nbat->out[0].f.data(), bitmask_is_set(flags->flag[b], 0), fptr, nfptr, i0, i1);
1401                 }
1402                 else if (!bitmask_is_set(flags->flag[b], 0))
1403                 {
1404                     nbnxn_atomdata_clear_reals(nbat->out[0].f, i0, i1);
1405                 }
1406             }
1407         }
1408         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1409     }
1410 }
1411
1412
1413 /* Add the force array(s) from nbnxn_atomdata_t to f */
1414 void reduceForces(nbnxn_atomdata_t* nbat, const gmx::AtomLocality locality, const Nbnxm::GridSet& gridSet, rvec* f)
1415 {
1416     int a0 = 0;
1417     int na = 0;
1418
1419     nbnxn_get_atom_range(locality, gridSet, &a0, &na);
1420
1421     if (na == 0)
1422     {
1423         /* The are no atoms for this reduction, avoid some overhead */
1424         return;
1425     }
1426
1427     int nth = gmx_omp_nthreads_get(emntNonbonded);
1428
1429     if (nbat->out.size() > 1)
1430     {
1431         if (locality != gmx::AtomLocality::All)
1432         {
1433             gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
1434         }
1435
1436         /* Reduce the force thread output buffers into buffer 0, before adding
1437          * them to the, differently ordered, "real" force buffer.
1438          */
1439         if (nbat->bUseTreeReduce)
1440         {
1441             nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
1442         }
1443         else
1444         {
1445             nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
1446         }
1447     }
1448 #pragma omp parallel for num_threads(nth) schedule(static)
1449     for (int th = 0; th < nth; th++)
1450     {
1451         try
1452         {
1453             nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat, nbat->out[0], a0 + ((th + 0) * na) / nth,
1454                                                 a0 + ((th + 1) * na) / nth, f);
1455         }
1456         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1457     }
1458 }
1459
1460 /* Add the force array(s) from nbnxn_atomdata_t to f */
1461 void reduceForcesGpu(const gmx::AtomLocality                    locality,
1462                      DeviceBuffer<float>                        totalForcesDevice,
1463                      const Nbnxm::GridSet&                      gridSet,
1464                      void*                                      pmeForcesDevice,
1465                      gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
1466                      NbnxmGpu*                                  gpu_nbv,
1467                      bool                                       useGpuFPmeReduction,
1468                      bool                                       accumulateForce)
1469 {
1470     int atomsStart = 0;
1471     int numAtoms   = 0;
1472
1473     nbnxn_get_atom_range(locality, gridSet, &atomsStart, &numAtoms);
1474
1475     if (numAtoms == 0)
1476     {
1477         /* The are no atoms for this reduction, avoid some overhead */
1478         return;
1479     }
1480
1481     Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality, totalForcesDevice, gpu_nbv, pmeForcesDevice, dependencyList,
1482                                      atomsStart, numAtoms, useGpuFPmeReduction, accumulateForce);
1483 }
1484
1485 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t& nbat, gmx::ArrayRef<gmx::RVec> fshift)
1486 {
1487     gmx::ArrayRef<const nbnxn_atomdata_output_t> outputBuffers = nbat.out;
1488
1489     for (int s = 0; s < SHIFTS; s++)
1490     {
1491         rvec sum;
1492         clear_rvec(sum);
1493         for (const nbnxn_atomdata_output_t& out : outputBuffers)
1494         {
1495             sum[XX] += out.fshift[s * DIM + XX];
1496             sum[YY] += out.fshift[s * DIM + YY];
1497             sum[ZZ] += out.fshift[s * DIM + ZZ];
1498         }
1499         fshift[s] += sum;
1500     }
1501 }
1502
1503 void nbnxn_get_atom_range(const gmx::AtomLocality atomLocality,
1504                           const Nbnxm::GridSet&   gridSet,
1505                           int*                    atomStart,
1506                           int*                    nAtoms)
1507 {
1508
1509     switch (atomLocality)
1510     {
1511         case gmx::AtomLocality::All:
1512             *atomStart = 0;
1513             *nAtoms    = gridSet.numRealAtomsTotal();
1514             break;
1515         case gmx::AtomLocality::Local:
1516             *atomStart = 0;
1517             *nAtoms    = gridSet.numRealAtomsLocal();
1518             break;
1519         case gmx::AtomLocality::NonLocal:
1520             *atomStart = gridSet.numRealAtomsLocal();
1521             *nAtoms    = gridSet.numRealAtomsTotal() - gridSet.numRealAtomsLocal();
1522             break;
1523         case gmx::AtomLocality::Count:
1524             GMX_ASSERT(false, "Count is invalid locality specifier");
1525             break;
1526     }
1527 }