src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define CUDA implementation of nbnxn_gpu.h
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40 #include "gmxpre.h"
  41
  42 #include "config.h"
  43
  44 #include <assert.h>
  45 #include <stdlib.h>
  46
  47 #include "gromacs/nbnxm/nbnxm_gpu.h"
  48
  49 #if defined(_MSVC)
  50 #include <limits>
  51 #endif
  52
  53
  54 #include "nbnxm_cuda.h"
  55
  56 #include "gromacs/gpu_utils/cudautils.cuh"
  57 #include "gromacs/mdlib/force_flags.h"
  58 #include "gromacs/nbnxm/atomdata.h"
  59 #include "gromacs/nbnxm/gpu_common.h"
  60 #include "gromacs/nbnxm/gpu_common_utils.h"
  61 #include "gromacs/nbnxm/gpu_data_mgmt.h"
  62 #include "gromacs/nbnxm/nbnxm.h"
  63 #include "gromacs/nbnxm/pairlist.h"
  64 #include "gromacs/timing/gpu_timing.h"
  65 #include "gromacs/utility/cstringutil.h"
  66 #include "gromacs/utility/gmxassert.h"
  67
  68 #include "nbnxm_cuda_types.h"
  69
  70 /***** The kernel declarations/definitions come here *****/
  71
  72 /* Top-level kernel declaration generation: will generate through multiple
  73  * inclusion the following flavors for all kernel declarations:
  74  * - force-only output;
  75  * - force and energy output;
  76  * - force-only with pair list pruning;
  77  * - force and energy output with pair list pruning.
  78  */
  79 #define FUNCTION_DECLARATION_ONLY
  80 /** Force only **/
  81 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernels.cuh"
  82 /** Force & energy **/
  83 #define CALC_ENERGIES
  84 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernels.cuh"
  85 #undef CALC_ENERGIES
  86
  87 /*** Pair-list pruning kernels ***/
  88 /** Force only **/
  89 #define PRUNE_NBL
  90 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernels.cuh"
  91 /** Force & energy **/
  92 #define CALC_ENERGIES
  93 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernels.cuh"
  94 #undef CALC_ENERGIES
  95 #undef PRUNE_NBL
  96
  97 /* Prune-only kernels */
  98 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh"
  99 #undef FUNCTION_DECLARATION_ONLY
 100
 101 /* Now generate the function definitions if we are using a single compilation unit. */
 102 #if GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
 103 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernel_F_noprune.cu"
 104 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernel_F_prune.cu"
 105 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernel_VF_noprune.cu"
 106 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernel_VF_prune.cu"
 107 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu"
 108 #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
 109
 110
 111 namespace Nbnxm
 112 {
 113
 114 /*! Nonbonded kernel function pointer type */
 115 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 116                                      const cu_nbparam_t,
 117                                      const cu_plist_t,
 118                                      bool);
 119
 120 /*********************************/
 121
 122 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 123 static inline int calc_nb_kernel_nblock(int nwork_units, const gmx_device_info_t *dinfo)
 124 {
 125     int max_grid_x_size;
 126
 127     assert(dinfo);
 128     /* CUDA does not accept grid dimension of 0 (which can happen e.g. with an
 129        empty domain) and that case should be handled before this point. */
 130     assert(nwork_units > 0);
 131
 132     max_grid_x_size = dinfo->prop.maxGridSize[0];
 133
 134     /* do we exceed the grid x dimension limit? */
 135     if (nwork_units > max_grid_x_size)
 136     {
 137         gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 138                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 139                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 140     }
 141
 142     return nwork_units;
 143 }
 144
 145
 146 /* Constant arrays listing all kernel function pointers and enabling selection
 147    of a kernel in an elegant manner. */
 148
 149 /*! Pointers to the non-bonded kernels organized in 2-dim arrays by:
 150  *  electrostatics and VDW type.
 151  *
 152  *  Note that the row- and column-order of function pointers has to match the
 153  *  order of corresponding enumerated electrostatics and vdw types, resp.,
 154  *  defined in nbnxn_cuda_types.h.
 155  */
 156
 157 /*! Force-only kernel function pointers. */
 158 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] =
 159 {
 160     { nbnxn_kernel_ElecCut_VdwLJ_F_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_F_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_F_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda            },
 161     { nbnxn_kernel_ElecRF_VdwLJ_F_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_F_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_F_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda             },
 162     { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda        },
 163     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda },
 164     { nbnxn_kernel_ElecEw_VdwLJ_F_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_F_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_F_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda             },
 165     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda      }
 166 };
 167
 168 /*! Force + energy kernel function pointers. */
 169 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] =
 170 {
 171     { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda            },
 172     { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda             },
 173     { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda        },
 174     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda },
 175     { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda             },
 176     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda      }
 177 };
 178
 179 /*! Force + pruning kernel function pointers. */
 180 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] =
 181 {
 182     { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda             },
 183     { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda              },
 184     { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda         },
 185     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda  },
 186     { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda              },
 187     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda       }
 188 };
 189
 190 /*! Force + energy + pruning kernel function pointers. */
 191 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] =
 192 {
 193     { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda            },
 194     { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda             },
 195     { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda        },
 196     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda },
 197     { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda             },
 198     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda      }
 199 };
 200
 201 /*! Return a pointer to the kernel version to be executed at the current step. */
 202 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int                                  eeltype,
 203                                                        int                                  evdwtype,
 204                                                        bool                                 bDoEne,
 205                                                        bool                                 bDoPrune,
 206                                                        const gmx_device_info_t gmx_unused  *devInfo)
 207 {
 208     nbnxn_cu_kfunc_ptr_t res;
 209
 210     GMX_ASSERT(eeltype < eelCuNR,
 211                "The electrostatics type requested is not implemented in the CUDA kernels.");
 212     GMX_ASSERT(evdwtype < evdwCuNR,
 213                "The VdW type requested is not implemented in the CUDA kernels.");
 214
 215     /* assert assumptions made by the kernels */
 216     GMX_ASSERT(c_nbnxnGpuClusterSize*c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit == devInfo->prop.warpSize,
 217                "The CUDA kernels require the cluster_size_i*cluster_size_j/nbnxn_gpu_clusterpair_split to match the warp size of the architecture targeted.");
 218
 219     if (bDoEne)
 220     {
 221         if (bDoPrune)
 222         {
 223             res = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
 224         }
 225         else
 226         {
 227             res = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
 228         }
 229     }
 230     else
 231     {
 232         if (bDoPrune)
 233         {
 234             res = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
 235         }
 236         else
 237         {
 238             res = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
 239         }
 240     }
 241
 242     return res;
 243 }
 244
 245 /*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use. */
 246 static inline int calc_shmem_required_nonbonded(const int num_threads_z, const gmx_device_info_t gmx_unused *dinfo, const cu_nbparam_t *nbp)
 247 {
 248     int shmem;
 249
 250     assert(dinfo);
 251
 252     /* size of shmem (force-buffers/xq/atom type preloading) */
 253     /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 254     /* i-atom x+q in shared memory */
 255     shmem  = c_numClPerSupercl * c_clSize * sizeof(float4);
 256     /* cj in shared memory, for each warp separately */
 257     shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
 258
 259     if (nbp->vdwtype == evdwCuCUTCOMBGEOM ||
 260         nbp->vdwtype == evdwCuCUTCOMBLB)
 261     {
 262         /* i-atom LJ combination parameters in shared memory */
 263         shmem += c_numClPerSupercl * c_clSize * sizeof(float2);
 264     }
 265     else
 266     {
 267         /* i-atom types in shared memory */
 268         shmem += c_numClPerSupercl * c_clSize * sizeof(int);
 269     }
 270
 271     return shmem;
 272 }
 273
 274 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 275 void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
 276                         const nbnxn_atomdata_t *nbatom,
 277                         const AtomLocality      atomLocality,
 278                         const bool              haveOtherWork)
 279 {
 280     GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal,
 281                "Only local and non-local xq transfers are supported");
 282
 283     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 284
 285     int                       adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 286
 287     cu_atomdata_t            *adat    = nb->atdat;
 288     cu_plist_t               *plist   = nb->plist[iloc];
 289     cu_timers_t              *t       = nb->timers;
 290     cudaStream_t              stream  = nb->stream[iloc];
 291
 292     bool                      bDoTime     = nb->bDoTime;
 293
 294     /* Don't launch the non-local H2D copy if there is no dependent
 295        work to do: neither non-local nor other (e.g. bonded) work
 296        to do that has as input the nbnxn coordaintes.
 297        Doing the same for the local kernel is more complicated, since the
 298        local part of the force array also depends on the non-local kernel.
 299        So to avoid complicating the code and to reduce the risk of bugs,
 300        we always call the local local x+q copy (and the rest of the local
 301        work in nbnxn_gpu_launch_kernel().
 302      */
 303     if (!haveOtherWork && canSkipWork(*nb, iloc))
 304     {
 305         plist->haveFreshList = false;
 306
 307         return;
 308     }
 309
 310     /* calculate the atom data index range based on locality */
 311     if (atomLocality == AtomLocality::Local)
 312     {
 313         adat_begin  = 0;
 314         adat_len    = adat->natoms_local;
 315     }
 316     else
 317     {
 318         adat_begin  = adat->natoms_local;
 319         adat_len    = adat->natoms - adat->natoms_local;
 320     }
 321
 322     /* beginning of timed HtoD section */
 323     if (bDoTime)
 324     {
 325         t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
 326     }
 327
 328     /* HtoD x, q */
 329     cu_copy_H2D_async(adat->xq + adat_begin,
 330                       static_cast<const void *>(nbatom->x().data() + adat_begin * 4),
 331                       adat_len * sizeof(*adat->xq), stream);
 332
 333     if (bDoTime)
 334     {
 335         t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
 336     }
 337
 338     /* When we get here all misc operations issued in the local stream as well as
 339        the local xq H2D are done,
 340        so we record that in the local stream and wait for it in the nonlocal one.
 341        This wait needs to precede any PP tasks, bonded or nonbonded, that may
 342        compute on interactions between local and nonlocal atoms.
 343      */
 344     if (nb->bUseTwoStreams)
 345     {
 346         if (iloc == InteractionLocality::Local)
 347         {
 348             cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
 349             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
 350         }
 351         else
 352         {
 353             cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
 354             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
 355         }
 356     }
 357 }
 358
 359 /*! As we execute nonbonded workload in separate streams, before launching
 360    the kernel we need to make sure that he following operations have completed:
 361    - atomdata allocation and related H2D transfers (every nstlist step);
 362    - pair list H2D transfer (every nstlist step);
 363    - shift vector H2D transfer (every nstlist step);
 364    - force (+shift force and energy) output clearing (every step).
 365
 366    These operations are issued in the local stream at the beginning of the step
 367    and therefore always complete before the local kernel launch. The non-local
 368    kernel is launched after the local on the same device/context hence it is
 369    inherently scheduled after the operations in the local stream (including the
 370    above "misc_ops") on pre-GK110 devices with single hardware queue, but on later
 371    devices with multiple hardware queues the dependency needs to be enforced.
 372    We use the misc_ops_and_local_H2D_done event to record the point where
 373    the local x+q H2D (and all preceding) tasks are complete and synchronize
 374    with this event in the non-local stream before launching the non-bonded kernel.
 375  */
 376 void gpu_launch_kernel(gmx_nbnxn_cuda_t          *nb,
 377                        const int                  flags,
 378                        const InteractionLocality  iloc)
 379 {
 380     cu_atomdata_t       *adat    = nb->atdat;
 381     cu_nbparam_t        *nbp     = nb->nbparam;
 382     cu_plist_t          *plist   = nb->plist[iloc];
 383     cu_timers_t         *t       = nb->timers;
 384     cudaStream_t         stream  = nb->stream[iloc];
 385
 386     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
 387     bool                 bCalcFshift = flags & GMX_FORCE_VIRIAL;
 388     bool                 bDoTime     = nb->bDoTime;
 389
 390     /* Don't launch the non-local kernel if there is no work to do.
 391        Doing the same for the local kernel is more complicated, since the
 392        local part of the force array also depends on the non-local kernel.
 393        So to avoid complicating the code and to reduce the risk of bugs,
 394        we always call the local kernel, and later (not in
 395        this function) the stream wait, local f copyback and the f buffer
 396        clearing. All these operations, except for the local interaction kernel,
 397        are needed for the non-local interactions. The skip of the local kernel
 398        call is taken care of later in this function. */
 399     if (canSkipWork(*nb, iloc))
 400     {
 401         plist->haveFreshList = false;
 402
 403         return;
 404     }
 405
 406     if (nbp->useDynamicPruning && plist->haveFreshList)
 407     {
 408         /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
 409            (TODO: ATM that's the way the timing accounting can distinguish between
 410            separate prune kernel and combined force+prune, maybe we need a better way?).
 411          */
 412         gpu_launch_kernel_pruneonly(nb, iloc, 1);
 413     }
 414
 415     if (plist->nsci == 0)
 416     {
 417         /* Don't launch an empty local kernel (not allowed with CUDA) */
 418         return;
 419     }
 420
 421     /* beginning of timed nonbonded calculation section */
 422     if (bDoTime)
 423     {
 424         t->interaction[iloc].nb_k.openTimingRegion(stream);
 425     }
 426
 427     /* Kernel launch config:
 428      * - The thread block dimensions match the size of i-clusters, j-clusters,
 429      *   and j-cluster concurrency, in x, y, and z, respectively.
 430      * - The 1D block-grid contains as many blocks as super-clusters.
 431      */
 432     int num_threads_z = 1;
 433     if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7)
 434     {
 435         num_threads_z = 2;
 436     }
 437     int nblock    = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
 438
 439
 440     KernelLaunchConfig config;
 441     config.blockSize[0]     = c_clSize;
 442     config.blockSize[1]     = c_clSize;
 443     config.blockSize[2]     = num_threads_z;
 444     config.gridSize[0]      = nblock;
 445     config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp);
 446     config.stream           = stream;
 447
 448     if (debug)
 449     {
 450         fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %zux%zux%zu\n\t"
 451                 "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
 452                 "\tShMem: %zu\n",
 453                 config.blockSize[0], config.blockSize[1], config.blockSize[2],
 454                 config.gridSize[0], config.gridSize[1], plist->nsci*c_numClPerSupercl,
 455                 c_numClPerSupercl, plist->na_c,
 456                 config.sharedMemorySize);
 457     }
 458
 459     auto       *timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
 460     const auto  kernel      = select_nbnxn_kernel(nbp->eeltype,
 461                                                   nbp->vdwtype,
 462                                                   bCalcEner,
 463                                                   (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune),
 464                                                   nb->dev_info);
 465     const auto kernelArgs  = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &bCalcFshift);
 466     launchGpuKernel(kernel, config, timingEvent, "k_calc_nb", kernelArgs);
 467
 468     if (bDoTime)
 469     {
 470         t->interaction[iloc].nb_k.closeTimingRegion(stream);
 471     }
 472
 473     if (GMX_NATIVE_WINDOWS)
 474     {
 475         /* Windows: force flushing WDDM queue */
 476         cudaStreamQuery(stream);
 477     }
 478 }
 479
 480 /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
 481 static inline int calc_shmem_required_prune(const int num_threads_z)
 482 {
 483     int shmem;
 484
 485     /* i-atom x in shared memory */
 486     shmem  = c_numClPerSupercl * c_clSize * sizeof(float4);
 487     /* cj in shared memory, for each warp separately */
 488     shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
 489
 490     return shmem;
 491 }
 492
 493 void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t          *nb,
 494                                  const InteractionLocality  iloc,
 495                                  const int                  numParts)
 496 {
 497     cu_atomdata_t       *adat    = nb->atdat;
 498     cu_nbparam_t        *nbp     = nb->nbparam;
 499     cu_plist_t          *plist   = nb->plist[iloc];
 500     cu_timers_t         *t       = nb->timers;
 501     cudaStream_t         stream  = nb->stream[iloc];
 502
 503     bool                 bDoTime = nb->bDoTime;
 504
 505     if (plist->haveFreshList)
 506     {
 507         GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");
 508
 509         /* Set rollingPruningNumParts to signal that it is not set */
 510         plist->rollingPruningNumParts = 0;
 511         plist->rollingPruningPart     = 0;
 512     }
 513     else
 514     {
 515         if (plist->rollingPruningNumParts == 0)
 516         {
 517             plist->rollingPruningNumParts = numParts;
 518         }
 519         else
 520         {
 521             GMX_ASSERT(numParts == plist->rollingPruningNumParts, "It is not allowed to change numParts in between list generation steps");
 522         }
 523     }
 524
 525     /* Use a local variable for part and update in plist, so we can return here
 526      * without duplicating the part increment code.
 527      */
 528     int part = plist->rollingPruningPart;
 529
 530     plist->rollingPruningPart++;
 531     if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
 532     {
 533         plist->rollingPruningPart = 0;
 534     }
 535
 536     /* Compute the number of list entries to prune in this pass */
 537     int numSciInPart = (plist->nsci - part)/numParts;
 538
 539     /* Don't launch the kernel if there is no work to do (not allowed with CUDA) */
 540     if (numSciInPart <= 0)
 541     {
 542         plist->haveFreshList = false;
 543
 544         return;
 545     }
 546
 547     GpuRegionTimer *timer = nullptr;
 548     if (bDoTime)
 549     {
 550         timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
 551     }
 552
 553     /* beginning of timed prune calculation section */
 554     if (bDoTime)
 555     {
 556         timer->openTimingRegion(stream);
 557     }
 558
 559     /* Kernel launch config:
 560      * - The thread block dimensions match the size of i-clusters, j-clusters,
 561      *   and j-cluster concurrency, in x, y, and z, respectively.
 562      * - The 1D block-grid contains as many blocks as super-clusters.
 563      */
 564     int                num_threads_z  = c_cudaPruneKernelJ4Concurrency;
 565     int                nblock         = calc_nb_kernel_nblock(numSciInPart, nb->dev_info);
 566     KernelLaunchConfig config;
 567     config.blockSize[0]     = c_clSize;
 568     config.blockSize[1]     = c_clSize;
 569     config.blockSize[2]     = num_threads_z;
 570     config.gridSize[0]      = nblock;
 571     config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
 572     config.stream           = stream;
 573
 574     if (debug)
 575     {
 576         fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %zux%zux%zu\n\t"
 577                 "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
 578                 "\tShMem: %zu\n",
 579                 config.blockSize[0], config.blockSize[1], config.blockSize[2],
 580                 config.gridSize[0], config.gridSize[1], numSciInPart*c_numClPerSupercl,
 581                 c_numClPerSupercl, plist->na_c,
 582                 config.sharedMemorySize);
 583     }
 584
 585     auto          *timingEvent  = bDoTime ? timer->fetchNextEvent() : nullptr;
 586     constexpr char kernelName[] = "k_pruneonly";
 587     const auto     kernel       = plist->haveFreshList ? nbnxn_kernel_prune_cuda<true> : nbnxn_kernel_prune_cuda<false>;
 588     const auto     kernelArgs   = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &numParts, &part);
 589     launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
 590
 591     /* TODO: consider a more elegant way to track which kernel has been called
 592        (combined or separate 1st pass prune, rolling prune). */
 593     if (plist->haveFreshList)
 594     {
 595         plist->haveFreshList                   = false;
 596         /* Mark that pruning has been done */
 597         nb->timers->interaction[iloc].didPrune = true;
 598     }
 599     else
 600     {
 601         /* Mark that rolling pruning has been done */
 602         nb->timers->interaction[iloc].didRollingPrune = true;
 603     }
 604
 605     if (bDoTime)
 606     {
 607         timer->closeTimingRegion(stream);
 608     }
 609
 610     if (GMX_NATIVE_WINDOWS)
 611     {
 612         /* Windows: force flushing WDDM queue */
 613         cudaStreamQuery(stream);
 614     }
 615 }
 616
 617 void gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
 618                         nbnxn_atomdata_t       *nbatom,
 619                         const int               flags,
 620                         const AtomLocality      atomLocality,
 621                         const bool              haveOtherWork)
 622 {
 623     cudaError_t stat;
 624     int         adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 625
 626     /* determine interaction locality from atom locality */
 627     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 628
 629     /* extract the data */
 630     cu_atomdata_t   *adat    = nb->atdat;
 631     cu_timers_t     *t       = nb->timers;
 632     bool             bDoTime = nb->bDoTime;
 633     cudaStream_t     stream  = nb->stream[iloc];
 634
 635     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 636     bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
 637
 638     /* don't launch non-local copy-back if there was no non-local work to do */
 639     if (!haveOtherWork && canSkipWork(*nb, iloc))
 640     {
 641         return;
 642     }
 643
 644     getGpuAtomRange(adat, atomLocality, &adat_begin, &adat_len);
 645
 646     /* beginning of timed D2H section */
 647     if (bDoTime)
 648     {
 649         t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
 650     }
 651
 652     /* With DD the local D2H transfer can only start after the non-local
 653        kernel has finished. */
 654     if (iloc == InteractionLocality::Local && nb->bUseTwoStreams)
 655     {
 656         stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
 657         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 658     }
 659
 660     /* DtoH f */
 661     cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
 662                       (adat_len)*sizeof(*adat->f), stream);
 663
 664     /* After the non-local D2H is launched the nonlocal_done event can be
 665        recorded which signals that the local D2H can proceed. This event is not
 666        placed after the non-local kernel because we want the non-local data
 667        back first. */
 668     if (iloc == InteractionLocality::NonLocal)
 669     {
 670         stat = cudaEventRecord(nb->nonlocal_done, stream);
 671         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 672     }
 673
 674     /* only transfer energies in the local stream */
 675     if (iloc == InteractionLocality::Local)
 676     {
 677         /* DtoH fshift */
 678         if (bCalcFshift)
 679         {
 680             cu_copy_D2H_async(nb->nbst.fshift, adat->fshift,
 681                               SHIFTS * sizeof(*nb->nbst.fshift), stream);
 682         }
 683
 684         /* DtoH energies */
 685         if (bCalcEner)
 686         {
 687             cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj,
 688                               sizeof(*nb->nbst.e_lj), stream);
 689             cu_copy_D2H_async(nb->nbst.e_el, adat->e_el,
 690                               sizeof(*nb->nbst.e_el), stream);
 691         }
 692     }
 693
 694     if (bDoTime)
 695     {
 696         t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
 697     }
 698 }
 699
 700 void cuda_set_cacheconfig()
 701 {
 702     cudaError_t stat;
 703
 704     for (int i = 0; i < eelCuNR; i++)
 705     {
 706         for (int j = 0; j < evdwCuNR; j++)
 707         {
 708             /* Default kernel 32/32 kB Shared/L1 */
 709             cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferEqual);
 710             cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
 711             cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferEqual);
 712             stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
 713             CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 714         }
 715     }
 716 }
 717
 718 } // namespace Nbnxm