src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define CUDA implementation of nbnxn_gpu.h
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40 #include "gmxpre.h"
  41
  42 #include "config.h"
  43
  44 #include <assert.h>
  45 #include <stdlib.h>
  46
  47 #include "gromacs/mdlib/nbnxn_gpu.h"
  48
  49 #if defined(_MSVC)
  50 #include <limits>
  51 #endif
  52
  53
  54 #include "nbnxn_cuda.h"
  55
  56 #include "gromacs/gpu_utils/cudautils.cuh"
  57 #include "gromacs/mdlib/force_flags.h"
  58 #include "gromacs/mdlib/nb_verlet.h"
  59 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  60 #include "gromacs/mdlib/nbnxn_pairlist.h"
  61 #include "gromacs/timing/gpu_timing.h"
  62 #include "gromacs/utility/cstringutil.h"
  63 #include "gromacs/utility/gmxassert.h"
  64
  65 #include "nbnxn_cuda_types.h"
  66
  67 /*
  68  * Texture references are created at compile-time and need to be declared
  69  * at file scope as global variables (see http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#texture-reference-api).
  70  * The texture references below are used in two translation units;
  71  * we declare them here along the kernels that use them (when compiling legacy Fermi kernels),
  72  * and provide getters (see below) used by the data_mgmt module where the
  73  * textures are bound/unbound.
  74  * (In principle we could do it the other way arond, but that would likely require
  75  * device linking and we'd rather avoid technical hurdles.)
  76  */
  77 /*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
  78 texture<float, 1, cudaReadModeElementType> nbfp_texref;
  79
  80 /*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
  81 texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
  82
  83 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  84 texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  85
  86
  87 /***** The kernel declarations/definitions come here *****/
  88
  89 /* Top-level kernel declaration generation: will generate through multiple
  90  * inclusion the following flavors for all kernel declarations:
  91  * - force-only output;
  92  * - force and energy output;
  93  * - force-only with pair list pruning;
  94  * - force and energy output with pair list pruning.
  95  */
  96 #define FUNCTION_DECLARATION_ONLY
  97 /** Force only **/
  98 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  99 /** Force & energy **/
 100 #define CALC_ENERGIES
 101 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
 102 #undef CALC_ENERGIES
 103
 104 /*** Pair-list pruning kernels ***/
 105 /** Force only **/
 106 #define PRUNE_NBL
 107 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
 108 /** Force & energy **/
 109 #define CALC_ENERGIES
 110 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
 111 #undef CALC_ENERGIES
 112 #undef PRUNE_NBL
 113
 114 /* Prune-only kernels */
 115 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cuh"
 116 #undef FUNCTION_DECLARATION_ONLY
 117
 118 /* Now generate the function definitions if we are using a single compilation unit. */
 119 #if GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
 120 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_F_noprune.cu"
 121 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_F_prune.cu"
 122 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_VF_noprune.cu"
 123 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_VF_prune.cu"
 124 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cu"
 125 #else
 126 /* Prevent compilation in multiple compilation unit mode for CC 2.x. Although we have
 127  * build-time checks to prevent this, the user could manually tweaks nvcc flags
 128  * which would lead to buggy kernels getting compiled.
 129  */
 130 #if GMX_PTX_ARCH > 0 && GMX_PTX_ARCH <= 210
 131 #error Due to an CUDA compiler bug, the CUDA non-bonded module can not be compiled with multiple compilation units for CC 2.x devices. If you have changed the nvcc flags manually, either use the GMX_CUDA_TARGET_* variables instead or set GMX_CUDA_NB_SINGLE_COMPILATION_UNIT=ON CMake option.
 132 #endif
 133 #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
 134
 135
 136 /*! Nonbonded kernel function pointer type */
 137 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 138                                      const cu_nbparam_t,
 139                                      const cu_plist_t,
 140                                      bool);
 141
 142 /*********************************/
 143
 144 /* XXX switch between chevron and cudaLaunch (supported only in CUDA >=7.0)
 145    -- only for benchmarking purposes */
 146 static const bool bUseCudaLaunchKernel =
 147     (GMX_CUDA_VERSION >= 7000) && (getenv("GMX_DISABLE_CUDALAUNCH") == NULL);
 148
 149 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 150 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 151 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 152 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 153
 154
 155 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 156 static inline int calc_nb_kernel_nblock(int nwork_units, const gmx_device_info_t *dinfo)
 157 {
 158     int max_grid_x_size;
 159
 160     assert(dinfo);
 161     /* CUDA does not accept grid dimension of 0 (which can happen e.g. with an
 162        empty domain) and that case should be handled before this point. */
 163     assert(nwork_units > 0);
 164
 165     max_grid_x_size = dinfo->prop.maxGridSize[0];
 166
 167     /* do we exceed the grid x dimension limit? */
 168     if (nwork_units > max_grid_x_size)
 169     {
 170         gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 171                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 172                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 173     }
 174
 175     return nwork_units;
 176 }
 177
 178
 179 /* Constant arrays listing all kernel function pointers and enabling selection
 180    of a kernel in an elegant manner. */
 181
 182 /*! Pointers to the non-bonded kernels organized in 2-dim arrays by:
 183  *  electrostatics and VDW type.
 184  *
 185  *  Note that the row- and column-order of function pointers has to match the
 186  *  order of corresponding enumerated electrostatics and vdw types, resp.,
 187  *  defined in nbnxn_cuda_types.h.
 188  */
 189
 190 /*! Force-only kernel function pointers. */
 191 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] =
 192 {
 193     { nbnxn_kernel_ElecCut_VdwLJ_F_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_F_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_F_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda            },
 194     { nbnxn_kernel_ElecRF_VdwLJ_F_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_F_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_F_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda             },
 195     { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda        },
 196     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda },
 197     { nbnxn_kernel_ElecEw_VdwLJ_F_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_F_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_F_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda             },
 198     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda      }
 199 };
 200
 201 /*! Force + energy kernel function pointers. */
 202 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] =
 203 {
 204     { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda            },
 205     { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda             },
 206     { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda        },
 207     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda },
 208     { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda             },
 209     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda      }
 210 };
 211
 212 /*! Force + pruning kernel function pointers. */
 213 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] =
 214 {
 215     { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda             },
 216     { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda              },
 217     { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda         },
 218     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda  },
 219     { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda              },
 220     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda       }
 221 };
 222
 223 /*! Force + energy + pruning kernel function pointers. */
 224 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] =
 225 {
 226     { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda            },
 227     { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda             },
 228     { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda        },
 229     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda },
 230     { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda             },
 231     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda      }
 232 };
 233
 234 /*! Return a pointer to the kernel version to be executed at the current step. */
 235 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int                                  eeltype,
 236                                                        int                                  evdwtype,
 237                                                        bool                                 bDoEne,
 238                                                        bool                                 bDoPrune,
 239                                                        const gmx_device_info_t gmx_unused  *devInfo)
 240 {
 241     nbnxn_cu_kfunc_ptr_t res;
 242
 243     GMX_ASSERT(eeltype < eelCuNR,
 244                "The electrostatics type requested is not implemented in the CUDA kernels.");
 245     GMX_ASSERT(evdwtype < evdwCuNR,
 246                "The VdW type requested is not implemented in the CUDA kernels.");
 247
 248     /* assert assumptions made by the kernels */
 249     GMX_ASSERT(c_nbnxnGpuClusterSize*c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit == devInfo->prop.warpSize,
 250                "The CUDA kernels require the cluster_size_i*cluster_size_j/nbnxn_gpu_clusterpair_split to match the warp size of the architecture targeted.");
 251
 252     if (bDoEne)
 253     {
 254         if (bDoPrune)
 255         {
 256             res = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
 257         }
 258         else
 259         {
 260             res = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
 261         }
 262     }
 263     else
 264     {
 265         if (bDoPrune)
 266         {
 267             res = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
 268         }
 269         else
 270         {
 271             res = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
 272         }
 273     }
 274
 275     return res;
 276 }
 277
 278 /*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use. */
 279 static inline int calc_shmem_required_nonbonded(const int num_threads_z, const gmx_device_info_t gmx_unused *dinfo, const cu_nbparam_t *nbp)
 280 {
 281     int shmem;
 282
 283     assert(dinfo);
 284
 285     /* size of shmem (force-buffers/xq/atom type preloading) */
 286     /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 287     /* i-atom x+q in shared memory */
 288     shmem  = c_numClPerSupercl * c_clSize * sizeof(float4);
 289     /* cj in shared memory, for each warp separately */
 290     shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
 291     if (dinfo->prop.major >= 3)
 292     {
 293         if (nbp->vdwtype == evdwCuCUTCOMBGEOM ||
 294             nbp->vdwtype == evdwCuCUTCOMBLB)
 295         {
 296             /* i-atom LJ combination parameters in shared memory */
 297             shmem += c_numClPerSupercl * c_clSize * sizeof(float2);
 298         }
 299         else
 300         {
 301             /* i-atom types in shared memory */
 302             shmem += c_numClPerSupercl * c_clSize * sizeof(int);
 303         }
 304     }
 305     if (dinfo->prop.major < 3)
 306     {
 307         /* force reduction buffers in shared memory */
 308         shmem += c_clSize * c_clSize * 3 * sizeof(float);
 309     }
 310     return shmem;
 311 }
 312
 313 /*! As we execute nonbonded workload in separate streams, before launching
 314    the kernel we need to make sure that he following operations have completed:
 315    - atomdata allocation and related H2D transfers (every nstlist step);
 316    - pair list H2D transfer (every nstlist step);
 317    - shift vector H2D transfer (every nstlist step);
 318    - force (+shift force and energy) output clearing (every step).
 319
 320    These operations are issued in the local stream at the beginning of the step
 321    and therefore always complete before the local kernel launch. The non-local
 322    kernel is launched after the local on the same device/context hence it is
 323    inherently scheduled after the operations in the local stream (including the
 324    above "misc_ops") on pre-GK110 devices with single hardware queue, but on later
 325    devices with multiple hardware queues the dependency needs to be enforced.
 326    We use the misc_ops_and_local_H2D_done event to record the point where
 327    the local x+q H2D (and all preceding) tasks are complete and synchronize
 328    with this event in the non-local stream before launching the non-bonded kernel.
 329  */
 330 void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
 331                              const nbnxn_atomdata_t *nbatom,
 332                              int                     flags,
 333                              int                     iloc)
 334 {
 335     cudaError_t          stat;
 336     int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 337     /* CUDA kernel launch-related stuff */
 338     int                  shmem, nblock;
 339     dim3                 dim_block, dim_grid;
 340     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 341
 342     cu_atomdata_t       *adat    = nb->atdat;
 343     cu_nbparam_t        *nbp     = nb->nbparam;
 344     cu_plist_t          *plist   = nb->plist[iloc];
 345     cu_timers_t         *t       = nb->timers;
 346     cudaStream_t         stream  = nb->stream[iloc];
 347
 348     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
 349     bool                 bCalcFshift = flags & GMX_FORCE_VIRIAL;
 350     bool                 bDoTime     = nb->bDoTime;
 351
 352     /* turn energy calculation always on/off (for debugging/testing only) */
 353     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 354
 355     /* Don't launch the non-local kernel if there is no work to do.
 356        Doing the same for the local kernel is more complicated, since the
 357        local part of the force array also depends on the non-local kernel.
 358        So to avoid complicating the code and to reduce the risk of bugs,
 359        we always call the local kernel, the local x+q copy and later (not in
 360        this function) the stream wait, local f copyback and the f buffer
 361        clearing. All these operations, except for the local interaction kernel,
 362        are needed for the non-local interactions. The skip of the local kernel
 363        call is taken care of later in this function. */
 364     if (iloc == eintNonlocal && plist->nsci == 0)
 365     {
 366         plist->haveFreshList = false;
 367
 368         return;
 369     }
 370
 371     /* calculate the atom data index range based on locality */
 372     if (LOCAL_I(iloc))
 373     {
 374         adat_begin  = 0;
 375         adat_len    = adat->natoms_local;
 376     }
 377     else
 378     {
 379         adat_begin  = adat->natoms_local;
 380         adat_len    = adat->natoms - adat->natoms_local;
 381     }
 382
 383     /* beginning of timed HtoD section */
 384     if (bDoTime)
 385     {
 386         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 387         CU_RET_ERR(stat, "cudaEventRecord failed");
 388     }
 389
 390     /* HtoD x, q */
 391     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 392                       adat_len * sizeof(*adat->xq), stream);
 393
 394     /* When we get here all misc operations issues in the local stream as well as
 395        the local xq H2D are done,
 396        so we record that in the local stream and wait for it in the nonlocal one. */
 397     if (nb->bUseTwoStreams)
 398     {
 399         if (iloc == eintLocal)
 400         {
 401             stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
 402             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
 403         }
 404         else
 405         {
 406             stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
 407             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
 408         }
 409     }
 410
 411     if (bDoTime)
 412     {
 413         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 414         CU_RET_ERR(stat, "cudaEventRecord failed");
 415     }
 416
 417     if (nbp->useDynamicPruning && plist->haveFreshList)
 418     {
 419         /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
 420            (TODO: ATM that's the way the timing accounting can distinguish between
 421            separate prune kernel and combined force+prune, maybe we need a better way?).
 422          */
 423         nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1);
 424     }
 425
 426     if (plist->nsci == 0)
 427     {
 428         /* Don't launch an empty local kernel (not allowed with CUDA) */
 429         return;
 430     }
 431
 432     /* beginning of timed nonbonded calculation section */
 433     if (bDoTime)
 434     {
 435         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 436         CU_RET_ERR(stat, "cudaEventRecord failed");
 437     }
 438
 439     /* get the pointer to the kernel flavor we need to use */
 440     nb_kernel = select_nbnxn_kernel(nbp->eeltype,
 441                                     nbp->vdwtype,
 442                                     bCalcEner,
 443                                     (plist->haveFreshList && !nb->timers->didPrune[iloc]) || always_prune,
 444                                     nb->dev_info);
 445
 446     /* Kernel launch config:
 447      * - The thread block dimensions match the size of i-clusters, j-clusters,
 448      *   and j-cluster concurrency, in x, y, and z, respectively.
 449      * - The 1D block-grid contains as many blocks as super-clusters.
 450      */
 451     int num_threads_z = 1;
 452     if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7)
 453     {
 454         num_threads_z = 2;
 455     }
 456     nblock    = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
 457     dim_block = dim3(c_clSize, c_clSize, num_threads_z);
 458     dim_grid  = dim3(nblock, 1, 1);
 459     shmem     = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp);
 460
 461     if (debug)
 462     {
 463         fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %ux%ux%u\n\t"
 464                 "\tGrid: %ux%u\n\t#Super-clusters/clusters: %d/%d (%d)\n"
 465                 "\tShMem: %d\n",
 466                 dim_block.x, dim_block.y, dim_block.z,
 467                 dim_grid.x, dim_grid.y, plist->nsci*c_numClPerSupercl,
 468                 c_numClPerSupercl, plist->na_c,
 469                 shmem);
 470     }
 471
 472     if (bUseCudaLaunchKernel)
 473     {
 474         gmx_unused void* kernel_args[4];
 475         kernel_args[0] = adat;
 476         kernel_args[1] = nbp;
 477         kernel_args[2] = plist;
 478         kernel_args[3] = &bCalcFshift;
 479
 480 #if GMX_CUDA_VERSION >= 7000
 481         cudaLaunchKernel((void *)nb_kernel, dim_grid, dim_block, kernel_args, shmem, stream);
 482 #endif
 483     }
 484     else
 485     {
 486         nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
 487     }
 488     CU_LAUNCH_ERR("k_calc_nb");
 489
 490     if (bDoTime)
 491     {
 492         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 493         CU_RET_ERR(stat, "cudaEventRecord failed");
 494     }
 495
 496 #if (defined(WIN32) || defined( _WIN32 ))
 497     /* Windows: force flushing WDDM queue */
 498     stat = cudaStreamQuery(stream);
 499 #endif
 500 }
 501
 502 /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
 503 static inline int calc_shmem_required_prune(const int num_threads_z)
 504 {
 505     int shmem;
 506
 507     /* i-atom x in shared memory */
 508     shmem  = c_numClPerSupercl * c_clSize * sizeof(float4);
 509     /* cj in shared memory, for each warp separately */
 510     shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
 511
 512     return shmem;
 513 }
 514
 515 void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t       *nb,
 516                                        int                     iloc,
 517                                        int                     numParts)
 518 {
 519     cudaError_t          stat;
 520
 521     cu_atomdata_t       *adat    = nb->atdat;
 522     cu_nbparam_t        *nbp     = nb->nbparam;
 523     cu_plist_t          *plist   = nb->plist[iloc];
 524     cu_timers_t         *t       = nb->timers;
 525     cudaStream_t         stream  = nb->stream[iloc];
 526
 527     bool                 bDoTime = nb->bDoTime;
 528
 529     if (plist->haveFreshList)
 530     {
 531         GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");
 532
 533         /* Set rollingPruningNumParts to signal that it is not set */
 534         plist->rollingPruningNumParts = 0;
 535         plist->rollingPruningPart     = 0;
 536     }
 537     else
 538     {
 539         if (plist->rollingPruningNumParts == 0)
 540         {
 541             plist->rollingPruningNumParts = numParts;
 542         }
 543         else
 544         {
 545             GMX_ASSERT(numParts == plist->rollingPruningNumParts, "It is not allowed to change numParts in between list generation steps");
 546         }
 547     }
 548
 549     /* Use a local variable for part and update in plist, so we can return here
 550      * without duplicating the part increment code.
 551      */
 552     int part = plist->rollingPruningPart;
 553
 554     plist->rollingPruningPart++;
 555     if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
 556     {
 557         plist->rollingPruningPart = 0;
 558     }
 559
 560     /* Compute the number of list entries to prune in this pass */
 561     int numSciInPart = (plist->nsci - part)/numParts;
 562
 563     /* Don't launch the kernel if there is no work to do (not allowed with CUDA) */
 564     if (numSciInPart <= 0)
 565     {
 566         plist->haveFreshList = false;
 567
 568         return;
 569     }
 570
 571     cudaEvent_t startEvent, stopEvent;
 572     if (bDoTime)
 573     {
 574         startEvent = (plist->haveFreshList ? t->start_prune_k[iloc] : t->start_rollingPrune_k[iloc]);
 575         stopEvent  = (plist->haveFreshList ? t->stop_prune_k[iloc]  : t->stop_rollingPrune_k[iloc]);
 576     }
 577
 578     /* beginning of timed prune calculation section */
 579     if (bDoTime)
 580     {
 581         stat = cudaEventRecord(startEvent, stream);
 582         CU_RET_ERR(stat, "cudaEventRecord failed");
 583     }
 584
 585     /* Kernel launch config:
 586      * - The thread block dimensions match the size of i-clusters, j-clusters,
 587      *   and j-cluster concurrency, in x, y, and z, respectively.
 588      * - The 1D block-grid contains as many blocks as super-clusters.
 589      */
 590     int  num_threads_z  = c_cudaPruneKernelJ4Concurrency;
 591     int  nblock         = calc_nb_kernel_nblock(numSciInPart, nb->dev_info);
 592     dim3 dim_block      = dim3(c_clSize, c_clSize, num_threads_z);
 593     dim3 dim_grid       = dim3(nblock, 1, 1);
 594     int  shmem          = calc_shmem_required_prune(num_threads_z);
 595
 596     if (debug)
 597     {
 598         fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %dx%dx%d\n\t"
 599                 "\tGrid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
 600                 "\tShMem: %d\n",
 601                 dim_block.x, dim_block.y, dim_block.z,
 602                 dim_grid.x, dim_grid.y, numSciInPart*c_numClPerSupercl,
 603                 c_numClPerSupercl, plist->na_c,
 604                 shmem);
 605     }
 606
 607     if (bUseCudaLaunchKernel)
 608     {
 609         gmx_unused void* kernel_args[5];
 610         kernel_args[0] = adat;
 611         kernel_args[1] = nbp;
 612         kernel_args[2] = plist;
 613         kernel_args[3] = &numParts;
 614         kernel_args[4] = &part;
 615
 616 #if GMX_CUDA_VERSION >= 7000
 617         if (plist->haveFreshList)
 618         {
 619             cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<true>, dim_grid, dim_block, kernel_args, shmem, stream);
 620         }
 621         else
 622         {
 623             cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<false>, dim_grid, dim_block, kernel_args, shmem, stream);
 624         }
 625 #endif
 626     }
 627     else
 628     {
 629         if (plist->haveFreshList)
 630         {
 631             nbnxn_kernel_prune_cuda<true><<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, numParts, part);
 632         }
 633         else
 634         {
 635             nbnxn_kernel_prune_cuda<false><<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, numParts, part);
 636         }
 637     }
 638     CU_LAUNCH_ERR("k_pruneonly");
 639
 640     /* TODO: consider a more elegant way to track which kernel has been called
 641        (combined or separate 1st pass prune, rolling prune). */
 642     if (plist->haveFreshList)
 643     {
 644         plist->haveFreshList         = false;
 645         /* Mark that pruning has been done */
 646         nb->timers->didPrune[iloc] = true;
 647     }
 648     else
 649     {
 650         /* Mark that rolling pruning has been done */
 651         nb->timers->didRollingPrune[iloc] = true;
 652     }
 653
 654     if (bDoTime)
 655     {
 656         stat = cudaEventRecord(stopEvent, stream);
 657         CU_RET_ERR(stat, "cudaEventRecord failed");
 658     }
 659
 660 #if (defined(WIN32) || defined( _WIN32 ))
 661     /* Windows: force flushing WDDM queue */
 662     stat = cudaStreamQuery(stream);
 663 #endif
 664 }
 665
 666 void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
 667                               const nbnxn_atomdata_t *nbatom,
 668                               int                     flags,
 669                               int                     aloc)
 670 {
 671     cudaError_t stat;
 672     int         adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 673     int         iloc = -1;
 674
 675     /* determine interaction locality from atom locality */
 676     if (LOCAL_A(aloc))
 677     {
 678         iloc = eintLocal;
 679     }
 680     else if (NONLOCAL_A(aloc))
 681     {
 682         iloc = eintNonlocal;
 683     }
 684     else
 685     {
 686         char stmp[STRLEN];
 687         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 688                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 689         gmx_incons(stmp);
 690     }
 691
 692     cu_atomdata_t   *adat    = nb->atdat;
 693     cu_timers_t     *t       = nb->timers;
 694     bool             bDoTime = nb->bDoTime;
 695     cudaStream_t     stream  = nb->stream[iloc];
 696
 697     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 698     bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
 699
 700     /* don't launch non-local copy-back if there was no non-local work to do */
 701     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
 702     {
 703         return;
 704     }
 705
 706     /* calculate the atom data index range based on locality */
 707     if (LOCAL_A(aloc))
 708     {
 709         adat_begin  = 0;
 710         adat_len    = adat->natoms_local;
 711     }
 712     else
 713     {
 714         adat_begin  = adat->natoms_local;
 715         adat_len    = adat->natoms - adat->natoms_local;
 716     }
 717
 718     /* beginning of timed D2H section */
 719     if (bDoTime)
 720     {
 721         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 722         CU_RET_ERR(stat, "cudaEventRecord failed");
 723     }
 724
 725     /* With DD the local D2H transfer can only start after the non-local
 726        kernel has finished. */
 727     if (iloc == eintLocal && nb->bUseTwoStreams)
 728     {
 729         stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
 730         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 731     }
 732
 733     /* DtoH f */
 734     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 735                       (adat_len)*sizeof(*adat->f), stream);
 736
 737     /* After the non-local D2H is launched the nonlocal_done event can be
 738        recorded which signals that the local D2H can proceed. This event is not
 739        placed after the non-local kernel because we want the non-local data
 740        back first. */
 741     if (iloc == eintNonlocal)
 742     {
 743         stat = cudaEventRecord(nb->nonlocal_done, stream);
 744         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 745     }
 746
 747     /* only transfer energies in the local stream */
 748     if (LOCAL_I(iloc))
 749     {
 750         /* DtoH fshift */
 751         if (bCalcFshift)
 752         {
 753             cu_copy_D2H_async(nb->nbst.fshift, adat->fshift,
 754                               SHIFTS * sizeof(*nb->nbst.fshift), stream);
 755         }
 756
 757         /* DtoH energies */
 758         if (bCalcEner)
 759         {
 760             cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj,
 761                               sizeof(*nb->nbst.e_lj), stream);
 762             cu_copy_D2H_async(nb->nbst.e_el, adat->e_el,
 763                               sizeof(*nb->nbst.e_el), stream);
 764         }
 765     }
 766
 767     if (bDoTime)
 768     {
 769         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 770         CU_RET_ERR(stat, "cudaEventRecord failed");
 771     }
 772 }
 773
 774 /*! \brief Count pruning kernel time if either kernel has been triggered
 775  *
 776  *  We do the accounting for either of the two pruning kernel flavors:
 777  *   - 1st pass prune: ran during the current step (prior to the force kernel);
 778  *   - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 779  *
 780  * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
 781  * after calling this function.
 782  *
 783  * \param[in] timers      structs with CUDA timer objects
 784  * \param[inout] timings  GPU task timing data
 785  * \param[in] iloc        interaction locality
 786  */
 787 static void countPruneKernelTime(const cu_timers_t   *timers,
 788                                  gmx_wallclock_gpu_t *timings,
 789                                  const int            iloc)
 790 {
 791     // We might have not done any pruning (e.g. if we skipped with empty domains).
 792     if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
 793     {
 794         return;
 795     }
 796
 797     if (timers->didPrune[iloc])
 798     {
 799         timings->pruneTime.c++;
 800         timings->pruneTime.t += cu_event_elapsed(timers->start_prune_k[iloc],
 801                                                  timers->stop_prune_k[iloc]);
 802     }
 803     if (timers->didRollingPrune[iloc])
 804     {
 805         timings->dynamicPruneTime.c++;
 806         timings->dynamicPruneTime.t += cu_event_elapsed(timers->start_rollingPrune_k[iloc],
 807                                                         timers->stop_rollingPrune_k[iloc]);
 808     }
 809 }
 810
 811 void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb,
 812                             int flags, int aloc,
 813                             real *e_lj, real *e_el, rvec *fshift)
 814 {
 815     /* NOTE:  only implemented for single-precision at this time */
 816     cudaError_t stat;
 817     int         iloc = -1;
 818
 819     /* determine interaction locality from atom locality */
 820     if (LOCAL_A(aloc))
 821     {
 822         iloc = eintLocal;
 823     }
 824     else if (NONLOCAL_A(aloc))
 825     {
 826         iloc = eintNonlocal;
 827     }
 828     else
 829     {
 830         char stmp[STRLEN];
 831         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 832                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 833         gmx_incons(stmp);
 834     }
 835
 836     cu_plist_t                 *plist    = nb->plist[iloc];
 837     cu_timers_t                *timers   = nb->timers;
 838     struct gmx_wallclock_gpu_t *timings  = nb->timings;
 839     nb_staging                  nbst     = nb->nbst;
 840
 841     bool                        bCalcEner   = flags & GMX_FORCE_ENERGY;
 842     bool                        bCalcFshift = flags & GMX_FORCE_VIRIAL;
 843
 844     /* turn energy calculation always on/off (for debugging/testing only) */
 845     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 846
 847     /* Launch wait/update timers & counters and do reduction into staging buffers
 848        BUT skip it when during the non-local phase there was actually no work to do.
 849        This is consistent with nbnxn_gpu_launch_kernel.
 850
 851        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 852        counters could end up being inconsistent due to not being incremented
 853        on some of the nodes! */
 854     if (!(iloc == eintNonlocal && nb->plist[iloc]->nsci == 0))
 855     {
 856         stat = cudaStreamSynchronize(nb->stream[iloc]);
 857         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 858
 859         /* timing data accumulation */
 860         if (nb->bDoTime)
 861         {
 862             /* only increase counter once (at local F wait) */
 863             if (LOCAL_I(iloc))
 864             {
 865                 timings->nb_c++;
 866                 timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 867             }
 868
 869             /* kernel timings */
 870             timings->ktime[plist->haveFreshList ? 1 : 0][bCalcEner ? 1 : 0].t +=
 871                 cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 872
 873             /* X/q H2D and F D2H timings */
 874             timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 875                                                   timers->stop_nb_h2d[iloc]);
 876             timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 877                                                   timers->stop_nb_d2h[iloc]);
 878
 879             /* Count the pruning kernel times for both cases:1st pass (at search step)
 880                and rolling pruning (if called at the previous step).
 881                We do the accounting here as this is the only sync point where we
 882                know (without checking or additional sync-ing) that prune tasks in
 883                in the current stream have completed (having just blocking-waited
 884                for the force D2H). */
 885             countPruneKernelTime(timers, timings, iloc);
 886
 887             /* only count atdat and pair-list H2D at pair-search step */
 888             if (timers->didPairlistH2D[iloc])
 889             {
 890                 /* atdat transfer timing (add only once, at local F wait) */
 891                 if (LOCAL_A(aloc))
 892                 {
 893                     timings->pl_h2d_c++;
 894                     timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 895                                                           timers->stop_atdat);
 896                 }
 897
 898                 timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 899                                                       timers->stop_pl_h2d[iloc]);
 900
 901                 /* Clear the timing flag for the next step */
 902                 timers->didPairlistH2D[iloc] = false;
 903             }
 904         }
 905
 906         /* add up energies and shift forces (only once at local F wait) */
 907         if (LOCAL_I(iloc))
 908         {
 909             if (bCalcEner)
 910             {
 911                 *e_lj += *nbst.e_lj;
 912                 *e_el += *nbst.e_el;
 913             }
 914
 915             if (bCalcFshift)
 916             {
 917                 for (int i = 0; i < SHIFTS; i++)
 918                 {
 919                     fshift[i][0] += nbst.fshift[i].x;
 920                     fshift[i][1] += nbst.fshift[i].y;
 921                     fshift[i][2] += nbst.fshift[i].z;
 922                 }
 923             }
 924         }
 925     }
 926
 927     /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
 928     timers->didPrune[iloc] = timers->didRollingPrune[iloc] = false;
 929
 930     /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
 931     plist->haveFreshList = false;
 932 }
 933
 934 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
 935 {
 936     assert(!c_disableCudaTextures);
 937     return nbfp_texref;
 938 }
 939
 940 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref()
 941 {
 942     assert(!c_disableCudaTextures);
 943     return nbfp_comb_texref;
 944 }
 945
 946 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
 947 {
 948     assert(!c_disableCudaTextures);
 949     return coulomb_tab_texref;
 950 }
 951
 952 void nbnxn_cuda_set_cacheconfig(const gmx_device_info_t *devinfo)
 953 {
 954     cudaError_t stat;
 955
 956     for (int i = 0; i < eelCuNR; i++)
 957     {
 958         for (int j = 0; j < evdwCuNR; j++)
 959         {
 960             if (devinfo->prop.major >= 3)
 961             {
 962                 /* Default kernel on sm 3.x and later 32/32 kB Shared/L1 */
 963                 cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferEqual);
 964                 cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
 965                 cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferEqual);
 966                 stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
 967             }
 968             else
 969             {
 970                 /* On Fermi prefer L1 gives 2% higher performance */
 971                 /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 972                 cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
 973                 cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 974                 cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
 975                 stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 976             }
 977             CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 978         }
 979     }
 980 }