/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \file
 *  \brief Define CUDA implementation of nbnxn_gpu.h
 *
 *  \author Szilard Pall <pall.szilard@gmail.com>
 */
#include "gmxpre.h"

#include "nbnxm_cuda.h"

#include "config.h"

#include <cassert>
#include <cstdlib>

#include "gromacs/nbnxm/nbnxm_gpu.h"

#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_common.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/grid.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"

#include "nbnxm_buffer_ops_kernels.cuh"
#include "nbnxm_cuda_types.h"
/***** The kernel declarations/definitions come here *****/

/* Top-level kernel declaration generation: will generate through multiple
 * inclusion the following flavors for all kernel declarations:
 * - force-only output;
 * - force and energy output;
 * - force-only with pair list pruning;
 * - force and energy output with pair list pruning.
 */
#define FUNCTION_DECLARATION_ONLY
/** Force only **/
#include "nbnxm_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
#include "nbnxm_cuda_kernels.cuh"
#undef CALC_ENERGIES

/*** Pair-list pruning kernels ***/
/** Force only **/
#define PRUNE_NBL
#include "nbnxm_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
#include "nbnxm_cuda_kernels.cuh"
#undef CALC_ENERGIES
#undef PRUNE_NBL

/* Prune-only kernels */
#include "nbnxm_cuda_kernel_pruneonly.cuh"
#undef FUNCTION_DECLARATION_ONLY
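
/* Illustrative note (a sketch of the generation pattern above, assuming the
 * usual NB_KERNEL_FUNC_NAME expansion inside nbnxm_cuda_kernels.cuh): the four
 * inclusion passes declare one flavor of each electrostatics/VdW kernel, e.g.
 * for Ewald electrostatics with plain LJ:
 *   nbnxn_kernel_ElecEw_VdwLJ_F_cuda          force only
 *   nbnxn_kernel_ElecEw_VdwLJ_VF_cuda         force + energy
 *   nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda    force + pruning
 *   nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda   force + energy + pruning
 * These are exactly the names stored in the function-pointer tables below.
 */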
/* Now generate the function definitions if we are using a single compilation unit. */
#if GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
#    include "nbnxm_cuda_kernel_F_noprune.cu"
#    include "nbnxm_cuda_kernel_F_prune.cu"
#    include "nbnxm_cuda_kernel_VF_noprune.cu"
#    include "nbnxm_cuda_kernel_VF_prune.cu"
#    include "nbnxm_cuda_kernel_pruneonly.cu"
#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */

namespace Nbnxm
{
//! Number of CUDA threads in a block
// TODO Optimize this through experimentation
constexpr static int c_bufOpsThreadsPerBlock = 128;

/*! Nonbonded kernel function pointer type */
typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, const cu_nbparam_t, const cu_plist_t, bool);

/*********************************/
/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
static inline int calc_nb_kernel_nblock(int nwork_units, const gmx_device_info_t* dinfo)
{
    assert(dinfo);

    /* CUDA does not accept grid dimension of 0 (which can happen e.g. with an
       empty domain) and that case should be handled before this point. */
    assert(nwork_units > 0);

    const int max_grid_x_size = dinfo->prop.maxGridSize[0];

    /* do we exceed the grid x dimension limit? */
    if (nwork_units > max_grid_x_size)
    {
        gmx_fatal(FARGS,
                  "Watch out, the input system is too large to simulate!\n"
                  "The number of nonbonded work units (=number of super-clusters) exceeds the "
                  "maximum grid size in x dimension (%d > %d)!",
                  nwork_units, max_grid_x_size);
    }

    return nwork_units;
}
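
/* Worked example (values assumed for illustration): a system with
 * plist->nsci = 100000 super-clusters launches a 1D grid of 100000 blocks.
 * On devices of compute capability >= 3.0, prop.maxGridSize[0] is 2^31 - 1,
 * so in practice the fatal error above triggers only on very old hardware
 * or pathologically large inputs.
 */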
/* Constant arrays listing all kernel function pointers and enabling selection
   of a kernel in an elegant manner. */

/*! Pointers to the non-bonded kernels organized in 2-dim arrays by:
 *  electrostatics and VDW type.
 *
 *  Note that the row- and column-order of function pointers has to match the
 *  order of corresponding enumerated electrostatics and vdw types, resp.,
 *  defined in nbnxn_cuda_types.h.
 */
/*! Force-only kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] = {
    { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombLB_F_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda },
    { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombLB_F_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombLB_F_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda }
};
/*! Force + energy kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] = {
    { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombLB_VF_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda },
    { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombLB_VF_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombLB_VF_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda }
};
/*! Force + pruning kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] = {
    { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda },
    { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda }
};
/*! Force + energy + pruning kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] = {
    { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda },
    { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda },
    { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda },
    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda },
    { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda },
    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda,
      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda }
};
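
/* Illustrative lookup (enum names assumed from nbnxn_cuda_types.h): the row
 * index is the electrostatics type and the column index the VdW type, so e.g.
 *   nb_kfunc_ener_prune_ptr[eelCuEWALD_TAB][evdwCuCUT]
 * yields nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda, the tabulated-Ewald,
 * plain-LJ, force+energy+prune flavor. select_nbnxn_kernel() below performs
 * exactly this indexing.
 */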
/*! Return a pointer to the kernel version to be executed at the current step. */
static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int  eeltype,
                                                       int  evdwtype,
                                                       bool bDoEne,
                                                       bool bDoPrune,
                                                       const gmx_device_info_t gmx_unused* devInfo)
{
    nbnxn_cu_kfunc_ptr_t res;

    GMX_ASSERT(eeltype < eelCuNR,
               "The electrostatics type requested is not implemented in the CUDA kernels.");
    GMX_ASSERT(evdwtype < evdwCuNR,
               "The VdW type requested is not implemented in the CUDA kernels.");

    /* assert assumptions made by the kernels */
    GMX_ASSERT(c_nbnxnGpuClusterSize * c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit
                       == devInfo->prop.warpSize,
               "The CUDA kernels require the "
               "cluster_size_i*cluster_size_j/nbnxn_gpu_clusterpair_split to match the warp size "
               "of the architecture targeted.");

    if (bDoEne)
    {
        if (bDoPrune)
        {
            res = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
        }
        else
        {
            res = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
        }
    }
    else
    {
        if (bDoPrune)
        {
            res = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
        }
        else
        {
            res = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
        }
    }

    return res;
}
/*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use. */
static inline int calc_shmem_required_nonbonded(const int num_threads_z,
                                                const gmx_device_info_t gmx_unused* dinfo,
                                                const cu_nbparam_t* nbp)
{
    int shmem;

    assert(dinfo);

    /* size of shmem (force-buffers/xq/atom type preloading) */
    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
    /* i-atom x+q in shared memory */
    shmem = c_numClPerSupercl * c_clSize * sizeof(float4);
    /* cj in shared memory, for each warp separately */
    shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);

    if (nbp->vdwtype == evdwCuCUTCOMBGEOM || nbp->vdwtype == evdwCuCUTCOMBLB)
    {
        /* i-atom LJ combination parameters in shared memory */
        shmem += c_numClPerSupercl * c_clSize * sizeof(float2);
    }
    else
    {
        /* i-atom types in shared memory */
        shmem += c_numClPerSupercl * c_clSize * sizeof(int);
    }

    return shmem;
}
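
/* Worked example (cluster-geometry constants assumed at their usual CUDA
 * values: c_numClPerSupercl = 8, c_clSize = 8, c_nbnxnGpuClusterpairSplit = 2,
 * c_nbnxnGpuJgroupSize = 4), for num_threads_z = 1 and a non-combination-rule
 * VdW type:
 *   xq preload:   8 * 8 * sizeof(float4) = 1024 bytes
 *   cj preload:   1 * 2 * 4 * sizeof(int) =  32 bytes
 *   atom types:   8 * 8 * sizeof(int)     = 256 bytes
 * i.e. 1312 bytes of dynamic shared memory per thread block.
 */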
/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
 *
 *  As the point where the local stream tasks can be considered complete happens
 *  at the same call point where the nonlocal stream should be synced with the
 *  local, this function records the event if called with the local stream as
 *  argument and inserts in the GPU stream a wait on the event on the nonlocal.
 */
void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t* nb, const InteractionLocality interactionLocality)
{
    cudaStream_t stream = nb->stream[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
        }
        else
        {
            cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
        }
    }
}
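
/* Note on the idiom above: cudaEventRecord()/cudaStreamWaitEvent() establish a
 * device-side ordering only; neither call blocks the host. A minimal sketch of
 * the same pattern with bare CUDA calls (hypothetical streams/event):
 *   cudaEventRecord(evt, localStream);           // marks current end of local queue
 *   cudaStreamWaitEvent(nonLocalStream, evt, 0); // nonlocal work waits for it
 */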
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal,
               "Only local and non-local xq transfers are supported");

    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);

    int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */

    cu_atomdata_t* adat   = nb->atdat;
    cu_plist_t*    plist  = nb->plist[iloc];
    cu_timers_t*   t      = nb->timers;
    cudaStream_t   stream = nb->stream[iloc];

    bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       to do that has as input the nbnxn coordinates.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        plist->haveFreshList = false;

        return;
    }

    /* calculate the atom data index range based on locality */
    if (atomLocality == AtomLocality::Local)
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
    }
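
    /* Worked example (hypothetical sizes): with adat->natoms_local = 9000 and
     * adat->natoms = 12000 (i.e. 3000 halo atoms appended after the local ones),
     * the Local copy covers xq[0, 9000) and the NonLocal copy xq[9000, 12000).
     */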
    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
    }

    /* HtoD x, q */
    cu_copy_H2D_async(adat->xq + adat_begin,
                      static_cast<const void*>(nbatom->x().data() + adat_begin * 4),
                      adat_len * sizeof(*adat->xq), stream);

    if (bDoTime)
    {
        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}
/*! As we execute nonbonded workload in separate streams, before launching
    the kernel we need to make sure that the following operations have completed:
    - atomdata allocation and related H2D transfers (every nstlist step);
    - pair list H2D transfer (every nstlist step);
    - shift vector H2D transfer (every nstlist step);
    - force (+shift force and energy) output clearing (every step).

    These operations are issued in the local stream at the beginning of the step
    and therefore always complete before the local kernel launch. The non-local
    kernel is launched after the local on the same device/context hence it is
    inherently scheduled after the operations in the local stream (including the
    above "misc_ops") on pre-GK110 devices with single hardware queue, but on later
    devices with multiple hardware queues the dependency needs to be enforced.
    We use the misc_ops_and_local_H2D_done event to record the point where
    the local x+q H2D (and all preceding) tasks are complete and synchronize
    with this event in the non-local stream before launching the non-bonded kernel.
 */
void gpu_launch_kernel(gmx_nbnxn_cuda_t* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc)
{
    cu_atomdata_t* adat   = nb->atdat;
    cu_nbparam_t*  nbp    = nb->nbparam;
    cu_plist_t*    plist  = nb->plist[iloc];
    cu_timers_t*   t      = nb->timers;
    cudaStream_t   stream = nb->stream[iloc];

    bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local kernel if there is no work to do.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local kernel, and later (not in
       this function) the stream wait, local f copyback and the f buffer
       clearing. All these operations, except for the local interaction kernel,
       are needed for the non-local interactions. The skip of the local kernel
       call is taken care of later in this function. */
    if (canSkipNonbondedWork(*nb, iloc))
    {
        plist->haveFreshList = false;

        return;
    }

    if (nbp->useDynamicPruning && plist->haveFreshList)
    {
        /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
           (TODO: ATM that's the way the timing accounting can distinguish between
           separate prune kernel and combined force+prune, maybe we need a better way?).
         */
        gpu_launch_kernel_pruneonly(nb, iloc, 1);
    }

    if (plist->nsci == 0)
    {
        /* Don't launch an empty local kernel (not allowed with CUDA) */
        return;
    }

    /* beginning of timed nonbonded calculation section */
    if (bDoTime)
    {
        t->interaction[iloc].nb_k.openTimingRegion(stream);
    }

    /* Kernel launch config:
     * - The thread block dimensions match the size of i-clusters, j-clusters,
     *   and j-cluster concurrency, in x, y, and z, respectively.
     * - The 1D block-grid contains as many blocks as super-clusters.
     */
    int num_threads_z = 1;
    if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7)
    {
        num_threads_z = 2;
    }
    int nblock = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);

    KernelLaunchConfig config;
    config.blockSize[0]     = c_clSize;
    config.blockSize[1]     = c_clSize;
    config.blockSize[2]     = num_threads_z;
    config.gridSize[0]      = nblock;
    config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp);
    config.stream           = stream;

    if (debug)
    {
        fprintf(debug,
                "Non-bonded GPU launch configuration:\n\tThread block: %zux%zux%zu\n\t"
                "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                "\tShMem: %zu\n",
                config.blockSize[0], config.blockSize[1], config.blockSize[2], config.gridSize[0],
                config.gridSize[1], plist->nsci * c_numClPerSupercl, c_numClPerSupercl, plist->na_c,
                config.sharedMemorySize);
    }

    auto*      timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
    const auto kernel      = select_nbnxn_kernel(
            nbp->eeltype, nbp->vdwtype, stepWork.computeEnergy,
            (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune), nb->dev_info);
    const auto kernelArgs =
            prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &stepWork.computeVirial);
    launchGpuKernel(kernel, config, timingEvent, "k_calc_nb", kernelArgs);
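
    /* For orientation only: modulo the argument marshalling done by
     * prepareGpuKernelArguments()/launchGpuKernel(), the launch above is a
     * sketch of the plain CUDA triple-chevron form (not the actual
     * implementation):
     *   kernel<<<nblock, dim3(c_clSize, c_clSize, num_threads_z),
     *            config.sharedMemorySize, stream>>>(*adat, *nbp, *plist,
     *                                               stepWork.computeVirial);
     * with the structs passed by value, as the nbnxn_cu_kfunc_ptr_t typedef
     * indicates.
     */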
    if (bDoTime)
    {
        t->interaction[iloc].nb_k.closeTimingRegion(stream);
    }

    if (GMX_NATIVE_WINDOWS)
    {
        /* Windows: force flushing WDDM queue */
        cudaStreamQuery(stream);
    }
}
/*! Calculates the amount of shared memory required by the CUDA kernel in use. */
static inline int calc_shmem_required_prune(const int num_threads_z)
{
    int shmem;

    /* i-atom x in shared memory */
    shmem = c_numClPerSupercl * c_clSize * sizeof(float4);
    /* cj in shared memory, for each warp separately */
    shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);

    return shmem;
}
void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t* nb, const InteractionLocality iloc, const int numParts)
{
    cu_atomdata_t* adat   = nb->atdat;
    cu_nbparam_t*  nbp    = nb->nbparam;
    cu_plist_t*    plist  = nb->plist[iloc];
    cu_timers_t*   t      = nb->timers;
    cudaStream_t   stream = nb->stream[iloc];

    bool bDoTime = nb->bDoTime;

    if (plist->haveFreshList)
    {
        GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");

        /* Set rollingPruningNumParts to signal that it is not set */
        plist->rollingPruningNumParts = 0;
        plist->rollingPruningPart     = 0;
    }
    else
    {
        if (plist->rollingPruningNumParts == 0)
        {
            plist->rollingPruningNumParts = numParts;
        }
        else
        {
            GMX_ASSERT(numParts == plist->rollingPruningNumParts,
                       "It is not allowed to change numParts in between list generation steps");
        }
    }

    /* Use a local variable for part and update in plist, so we can return here
     * without duplicating the part increment code.
     */
    int part = plist->rollingPruningPart;

    plist->rollingPruningPart++;
    if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
    {
        plist->rollingPruningPart = 0;
    }

    /* Compute the number of list entries to prune in this pass */
    int numSciInPart = (plist->nsci - part) / numParts;
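
    /* Worked example (hypothetical numbers): with numParts = 2 and
     * plist->nsci = 10000, successive rolling-prune calls use part = 0, 1, 0, 1, ...
     * and each pass sizes its launch for (10000 - part) / 2 = 5000 (or 4999)
     * list entries, so the whole list is revisited once every two prune calls.
     */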
    /* Don't launch the kernel if there is no work to do (not allowed with CUDA) */
    if (numSciInPart <= 0)
    {
        plist->haveFreshList = false;

        return;
    }

    GpuRegionTimer* timer = nullptr;
    if (bDoTime)
    {
        timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
    }

    /* beginning of timed prune calculation section */
    if (bDoTime)
    {
        timer->openTimingRegion(stream);
    }

    /* Kernel launch config:
     * - The thread block dimensions match the size of i-clusters, j-clusters,
     *   and j-cluster concurrency, in x, y, and z, respectively.
     * - The 1D block-grid contains as many blocks as super-clusters.
     */
    int                num_threads_z = c_cudaPruneKernelJ4Concurrency;
    int                nblock        = calc_nb_kernel_nblock(numSciInPart, nb->dev_info);
    KernelLaunchConfig config;
    config.blockSize[0]     = c_clSize;
    config.blockSize[1]     = c_clSize;
    config.blockSize[2]     = num_threads_z;
    config.gridSize[0]      = nblock;
    config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
    config.stream           = stream;

    if (debug)
    {
        fprintf(debug,
                "Pruning GPU kernel launch configuration:\n\tThread block: %zux%zux%zu\n\t"
                "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                "\tShMem: %zu\n",
                config.blockSize[0], config.blockSize[1], config.blockSize[2], config.gridSize[0],
                config.gridSize[1], numSciInPart * c_numClPerSupercl, c_numClPerSupercl,
                plist->na_c, config.sharedMemorySize);
    }

    auto*          timingEvent  = bDoTime ? timer->fetchNextEvent() : nullptr;
    constexpr char kernelName[] = "k_pruneonly";
    const auto     kernel =
            plist->haveFreshList ? nbnxn_kernel_prune_cuda<true> : nbnxn_kernel_prune_cuda<false>;
    const auto kernelArgs = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &numParts, &part);
    launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);

    /* TODO: consider a more elegant way to track which kernel has been called
       (combined or separate 1st pass prune, rolling prune). */
    if (plist->haveFreshList)
    {
        plist->haveFreshList = false;
        /* Mark that pruning has been done */
        nb->timers->interaction[iloc].didPrune = true;
    }
    else
    {
        /* Mark that rolling pruning has been done */
        nb->timers->interaction[iloc].didRollingPrune = true;
    }

    if (bDoTime)
    {
        timer->closeTimingRegion(stream);
    }

    if (GMX_NATIVE_WINDOWS)
    {
        /* Windows: force flushing WDDM queue */
        cudaStreamQuery(stream);
    }
}
void gpu_launch_cpyback(gmx_nbnxn_cuda_t*        nb,
                        nbnxn_atomdata_t*        nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    cudaError_t stat;
    int         adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);

    /* extract the data */
    cu_atomdata_t* adat    = nb->atdat;
    cu_timers_t*   t       = nb->timers;
    bool           bDoTime = nb->bDoTime;
    cudaStream_t   stream  = nb->stream[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        return;
    }

    getGpuAtomRange(adat, atomLocality, &adat_begin, &adat_len);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       kernel has finished. */
    if (iloc == InteractionLocality::Local && nb->bUseTwoStreams)
    {
        stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
        CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
    }

    /* DtoH f
     * Skip if buffer ops / reduction is offloaded to the GPU.
     */
    if (!stepWork.useGpuFBufferOps)
    {
        cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
                          (adat_len) * sizeof(*adat->f), stream);
    }

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we want the non-local data
       back first. */
    if (iloc == InteractionLocality::NonLocal)
    {
        stat = cudaEventRecord(nb->nonlocal_done, stream);
        CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift), stream);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), stream);
            cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), stream);
        }
    }

    if (bDoTime)
    {
        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
    }
}
void cuda_set_cacheconfig()
{
    cudaError_t stat;

    for (int i = 0; i < eelCuNR; i++)
    {
        for (int j = 0; j < evdwCuNR; j++)
        {
            /* Default kernel 32/32 kB Shared/L1 */
            cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferEqual);
            cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
            cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferEqual);
            stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
            CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
        }
    }
}
/* X buffer operations on GPU: performs conversion from rvec to nb format. */
void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
                           bool                      setFillerCoords,
                           gmx_nbnxn_gpu_t*          nb,
                           DeviceBuffer<float>       d_x,
                           GpuEventSynchronizer*     xReadyOnDevice,
                           const Nbnxm::AtomLocality locality,
                           int                       gridId,
                           int                       numColumnsMax)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    cu_atomdata_t* adat = nb->atdat;

    const int                  numColumns      = grid.numColumns();
    const int                  cellOffset      = grid.cellOffset();
    const int                  numAtomsPerCell = grid.numAtomsPerCell();
    Nbnxm::InteractionLocality interactionLoc  = gpuAtomToInteractionLocality(locality);

    cudaStream_t stream = nb->stream[interactionLoc];

    int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
    // avoid empty kernel launch, skip to inserting stream dependency
    if (numAtoms != 0)
    {
        // TODO: This will only work with CUDA
        GMX_ASSERT(d_x, "Need a valid device pointer");

        // ensure that coordinates are ready on the device before launching the kernel
        GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
        xReadyOnDevice->enqueueWaitEvent(stream);

        KernelLaunchConfig config;
        config.blockSize[0] = c_bufOpsThreadsPerBlock;
        config.blockSize[1] = 1;
        config.blockSize[2] = 1;
        config.gridSize[0]  = (grid.numCellsColumnMax() * numAtomsPerCell + c_bufOpsThreadsPerBlock - 1)
                             / c_bufOpsThreadsPerBlock;
        config.gridSize[1] = numColumns;
        config.gridSize[2] = 1;
        GMX_ASSERT(config.gridSize[0] > 0,
                   "Can not have empty grid, early return above avoids this");
        config.sharedMemorySize = 0;
        config.stream           = stream;
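
        /* Launch-geometry example (hypothetical grid): for a search grid with
         * numColumns = 200 columns, numAtomsPerCell = 8 and a tallest column of
         * grid.numCellsColumnMax() = 32 cells, gridSize[0] =
         * (32 * 8 + 127) / 128 = 2 blocks of 128 threads cover the tallest
         * column, and gridSize[1] = 200 replicates that coverage per column.
         */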
        auto       kernelFn      = nbnxn_gpu_x_to_nbat_x_kernel;
        float*     xqPtr         = &(adat->xq->x);
        const int* d_atomIndices = nb->atomIndices;
        const int* d_cxy_na      = &nb->cxy_na[numColumnsMax * gridId];
        const int* d_cxy_ind     = &nb->cxy_ind[numColumnsMax * gridId];
        const auto kernelArgs    = prepareGpuKernelArguments(
                kernelFn, config, &numColumns, &xqPtr, &setFillerCoords, &d_x, &d_atomIndices,
                &d_cxy_na, &d_cxy_ind, &cellOffset, &numAtomsPerCell);
        launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
    }

    // TODO: note that this is not necessary when there are no local atoms, that is:
    // (numAtoms == 0 && interactionLoc == InteractionLocality::Local)
    // but for now we avoid that optimization
    nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
}
/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLocality,
                               DeviceBuffer<float>                        totalForcesDevice,
                               gmx_nbnxn_gpu_t*                           nb,
                               void*                                      pmeForcesDevice,
                               gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
                               int                                        atomStart,
                               int                                        numAtoms,
                               bool                                       useGpuFPmeReduction,
                               bool                                       accumulateForce)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
    GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms");
    GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer");

    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
    cudaStream_t              stream    = nb->stream[iLocality];
    cu_atomdata_t*            adat      = nb->atdat;

    size_t gmx_used_in_debug numDependency =
            static_cast<size_t>(useGpuFPmeReduction) + static_cast<size_t>(accumulateForce);
    GMX_ASSERT(numDependency >= dependencyList.size(),
               "Mismatching number of dependencies and call signature");

    // Enqueue wait on all dependencies passed
    for (auto const synchronizer : dependencyList)
    {
        synchronizer->enqueueWaitEvent(stream);
    }

    KernelLaunchConfig config;
    config.blockSize[0] = c_bufOpsThreadsPerBlock;
    config.blockSize[1] = 1;
    config.blockSize[2] = 1;
    config.gridSize[0]  = ((numAtoms + 1) + c_bufOpsThreadsPerBlock - 1) / c_bufOpsThreadsPerBlock;
    config.gridSize[1]  = 1;
    config.gridSize[2]  = 1;
    config.sharedMemorySize = 0;
    config.stream           = stream;
    auto kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel<true, false>
                                    : nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>;

    if (useGpuFPmeReduction)
    {
        GMX_ASSERT(pmeForcesDevice, "Need a valid pmeForcesDevice pointer");
        kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel<true, true>
                                   : nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>;
    }

    const float3* d_fNB    = adat->f;
    const float3* d_fPme   = (float3*)pmeForcesDevice;
    float3*       d_fTotal = (float3*)totalForcesDevice;
    const int*    d_cell   = nb->cell;

    const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_fNB, &d_fPme, &d_fTotal,
                                                      &d_cell, &atomStart, &numAtoms);

    launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);

    if (atomLocality == AtomLocality::Local)
    {
        GMX_ASSERT(nb->localFReductionDone != nullptr,
                   "localFReductionDone has to be a valid pointer");
        nb->localFReductionDone->markEvent(stream);
    }
}
void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_cuda_t* nb)
{
    nb->xNonLocalCopyD2HDone->waitForEvent();
}

void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_cuda_t* nb)
{
    cudaStream_t localStream    = nb->stream[InteractionLocality::Local];
    cudaStream_t nonLocalStream = nb->stream[InteractionLocality::NonLocal];

    GpuEventSynchronizer event;
    event.markEvent(nonLocalStream);
    event.enqueueWaitEvent(localStream);
}

} // namespace Nbnxm