src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define CUDA implementation of nbnxn_gpu.h
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40 #include "gmxpre.h"
  41
  42 #include "config.h"
  43
  44 #include <assert.h>
  45 #include <stdlib.h>
  46
  47 #include "gromacs/mdlib/nbnxn_gpu.h"
  48
  49 #if defined(_MSVC)
  50 #include <limits>
  51 #endif
  52
  53 #include <cuda.h>
  54
  55 #ifdef TMPI_ATOMICS
  56 #include "thread_mpi/atomic.h"
  57 #endif
  58
  59 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
  60 #include "gromacs/legacyheaders/types/force_flags.h"
  61 #include "gromacs/legacyheaders/types/simple.h"
  62 #include "gromacs/mdlib/nb_verlet.h"
  63 #include "gromacs/mdlib/nbnxn_consts.h"
  64 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  65 #include "gromacs/mdlib/nbnxn_pairlist.h"
  66 #include "gromacs/pbcutil/ishift.h"
  67 #include "gromacs/timing/gpu_timing.h"
  68 #include "gromacs/utility/cstringutil.h"
  69
  70 #include "nbnxn_cuda_types.h"
  71
  72 #if defined HAVE_CUDA_TEXOBJ_SUPPORT && __CUDA_ARCH__ >= 300
  73 #define USE_TEXOBJ
  74 #endif
  75
  76 /*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
  77 texture<float, 1, cudaReadModeElementType> nbfp_texref;
  78
  79 /*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
  80 texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
  81
  82 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  83 texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  84
  85 /* Convenience defines */
  86 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  87 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  88
  89 /* NTHREAD_Z controls the number of j-clusters processed concurrently on NTHREAD_Z
  90  * warp-pairs per block.
  91  *
  92  * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 th/block with 16
  93  * blocks/multiproc, is the fastest even though this setup gives low occupancy.
  94  * NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks
  95  * per multiprocessor is reduced proportionally to get the original number of max
  96  * threads in flight (and slightly lower performance).
  97  * - On CC 3.7 there are enough registers to double the number of threads; using
  98  * NTHREADS_Z == 2 is fastest with 16 blocks (TODO: test with RF and other kernels
  99  * with low-register use).
 100  *
 101  * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
 102  * shuffle-based reduction, hence CC >= 3.0.
 103  */
 104
 105 /* Kernel launch bounds as function of NTHREAD_Z.
 106  * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
 107  * - CC 3.7:     NTHREAD_Z=2, (128, 16) bounds
 108  */
 109 #if __CUDA_ARCH__ == 370
 110 #define NTHREAD_Z           (2)
 111 #define MIN_BLOCKS_PER_MP   (16)
 112 #else
 113 #define NTHREAD_Z           (1)
 114 #define MIN_BLOCKS_PER_MP   (16)
 115 #endif
 116 #define THREADS_PER_BLOCK   (CL_SIZE*CL_SIZE*NTHREAD_Z)
 117
 118
 119 /***** The kernels come here *****/
 120 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
 121
 122 /* Top-level kernel generation: will generate through multiple inclusion the
 123  * following flavors for all kernels:
 124  * - force-only output;
 125  * - force and energy output;
 126  * - force-only with pair list pruning;
 127  * - force and energy output with pair list pruning.
 128  */
 129 /** Force only **/
 130 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
 131 /** Force & energy **/
 132 #define CALC_ENERGIES
 133 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
 134 #undef CALC_ENERGIES
 135
 136 /*** Pair-list pruning kernels ***/
 137 /** Force only **/
 138 #define PRUNE_NBL
 139 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
 140 /** Force & energy **/
 141 #define CALC_ENERGIES
 142 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
 143 #undef CALC_ENERGIES
 144 #undef PRUNE_NBL
 145
 146
 147 /*! Nonbonded kernel function pointer type */
 148 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 149                                      const cu_nbparam_t,
 150                                      const cu_plist_t,
 151                                      bool);
 152
 153 /*********************************/
 154
 155 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 156 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 157 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 158 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 159
 160
 161 /* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 162  * and corresponds to having the exponent set to the maximum (127 -- single
 163  * precision) and the mantissa to 0.
 164  */
 165 static unsigned int poll_wait_pattern = (0x7FU << 23);
 166
 167 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 168 static inline int calc_nb_kernel_nblock(int nwork_units, gmx_device_info_t *dinfo)
 169 {
 170     int max_grid_x_size;
 171
 172     assert(dinfo);
 173     /* CUDA does not accept grid dimension of 0 (which can happen e.g. with an
 174        empty domain) and that case should be handled before this point. */
 175     assert(nwork_units > 0);
 176
 177     max_grid_x_size = dinfo->prop.maxGridSize[0];
 178
 179     /* do we exceed the grid x dimension limit? */
 180     if (nwork_units > max_grid_x_size)
 181     {
 182         gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 183                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 184                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 185     }
 186
 187     return nwork_units;
 188 }
 189
 190
 191 /* Constant arrays listing all kernel function pointers and enabling selection
 192    of a kernel in an elegant manner. */
 193
 194 /*! Pointers to the non-bonded kernels organized in 2-dim arrays by:
 195  *  electrostatics and VDW type.
 196  *
 197  *  Note that the row- and column-order of function pointers has to match the
 198  *  order of corresponding enumerated electrostatics and vdw types, resp.,
 199  *  defined in nbnxn_cuda_types.h.
 200  */
 201
 202 /*! Force-only kernel function pointers. */
 203 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] =
 204 {
 205     { nbnxn_kernel_ElecCut_VdwLJ_F_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda            },
 206     { nbnxn_kernel_ElecRF_VdwLJ_F_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda             },
 207     { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda        },
 208     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda },
 209     { nbnxn_kernel_ElecEw_VdwLJ_F_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda             },
 210     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda      }
 211 };
 212
 213 /*! Force + energy kernel function pointers. */
 214 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] =
 215 {
 216     { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda              },
 217     { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda               },
 218     { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda          },
 219     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda     },
 220     { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda               },
 221     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda        }
 222 };
 223
 224 /*! Force + pruning kernel function pointers. */
 225 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] =
 226 {
 227     { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda            },
 228     { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda,              nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda             },
 229     { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda,         nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda        },
 230     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda,  nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda },
 231     { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda,              nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda             },
 232     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda,       nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda      }
 233 };
 234
 235 /*! Force + energy + pruning kernel function pointers. */
 236 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] =
 237 {
 238     { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda            },
 239     { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda             },
 240     { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda        },
 241     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda },
 242     { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda             },
 243     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda      }
 244 };
 245
 246 /*! Return a pointer to the kernel version to be executed at the current step. */
 247 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int  eeltype,
 248                                                        int  evdwtype,
 249                                                        bool bDoEne,
 250                                                        bool bDoPrune)
 251 {
 252     nbnxn_cu_kfunc_ptr_t res;
 253
 254     assert(eeltype < eelCuNR);
 255     assert(evdwtype < evdwCuNR);
 256
 257     if (bDoEne)
 258     {
 259         if (bDoPrune)
 260         {
 261             res = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
 262         }
 263         else
 264         {
 265             res = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
 266         }
 267     }
 268     else
 269     {
 270         if (bDoPrune)
 271         {
 272             res = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
 273         }
 274         else
 275         {
 276             res = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
 277         }
 278     }
 279
 280     return res;
 281 }
 282
 283 /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
 284 static inline int calc_shmem_required(const int num_threads_z)
 285 {
 286     int shmem;
 287
 288     /* size of shmem (force-buffers/xq/atom type preloading) */
 289     /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 290     /* i-atom x+q in shared memory */
 291     shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 292     /* cj in shared memory, for each warp separately */
 293     shmem += num_threads_z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 294 #ifdef IATYPE_SHMEM
 295     /* i-atom types in shared memory */
 296     shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 297 #endif
 298 #if __CUDA_ARCH__ < 300
 299     /* force reduction buffers in shared memory */
 300     shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 301 #endif
 302
 303     return shmem;
 304 }
 305
 306 /*! As we execute nonbonded workload in separate streams, before launching
 307    the kernel we need to make sure that he following operations have completed:
 308    - atomdata allocation and related H2D transfers (every nstlist step);
 309    - pair list H2D transfer (every nstlist step);
 310    - shift vector H2D transfer (every nstlist step);
 311    - force (+shift force and energy) output clearing (every step).
 312
 313    These operations are issued in the local stream at the beginning of the step
 314    and therefore always complete before the local kernel launch. The non-local
 315    kernel is launched after the local on the same device/context, so this is
 316    inherently scheduled after the operations in the local stream (including the
 317    above "misc_ops").
 318    However, for the sake of having a future-proof implementation, we use the
 319    misc_ops_done event to record the point in time when the above  operations
 320    are finished and synchronize with this event in the non-local stream.
 321  */
 322 void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
 323                              const nbnxn_atomdata_t *nbatom,
 324                              int                     flags,
 325                              int                     iloc)
 326 {
 327     cudaError_t          stat;
 328     int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 329     /* CUDA kernel launch-related stuff */
 330     int                  shmem, nblock;
 331     dim3                 dim_block, dim_grid;
 332     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 333
 334     cu_atomdata_t       *adat    = nb->atdat;
 335     cu_nbparam_t        *nbp     = nb->nbparam;
 336     cu_plist_t          *plist   = nb->plist[iloc];
 337     cu_timers_t         *t       = nb->timers;
 338     cudaStream_t         stream  = nb->stream[iloc];
 339
 340     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
 341     bool                 bCalcFshift = flags & GMX_FORCE_VIRIAL;
 342     bool                 bDoTime     = nb->bDoTime;
 343
 344     /* turn energy calculation always on/off (for debugging/testing only) */
 345     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 346
 347     /* Don't launch the non-local kernel if there is no work to do.
 348        Doing the same for the local kernel is more complicated, since the
 349        local part of the force array also depends on the non-local kernel.
 350        So to avoid complicating the code and to reduce the risk of bugs,
 351        we always call the local kernel, the local x+q copy and later (not in
 352        this function) the stream wait, local f copyback and the f buffer
 353        clearing. All these operations, except for the local interaction kernel,
 354        are needed for the non-local interactions. The skip of the local kernel
 355        call is taken care of later in this function. */
 356     if (iloc == eintNonlocal && plist->nsci == 0)
 357     {
 358         return;
 359     }
 360
 361     /* calculate the atom data index range based on locality */
 362     if (LOCAL_I(iloc))
 363     {
 364         adat_begin  = 0;
 365         adat_len    = adat->natoms_local;
 366     }
 367     else
 368     {
 369         adat_begin  = adat->natoms_local;
 370         adat_len    = adat->natoms - adat->natoms_local;
 371     }
 372
 373     /* When we get here all misc operations issues in the local stream are done,
 374        so we record that in the local stream and wait for it in the nonlocal one. */
 375     if (nb->bUseTwoStreams)
 376     {
 377         if (iloc == eintLocal)
 378         {
 379             stat = cudaEventRecord(nb->misc_ops_done, stream);
 380             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 381         }
 382         else
 383         {
 384             stat = cudaStreamWaitEvent(stream, nb->misc_ops_done, 0);
 385             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 386         }
 387     }
 388
 389     /* beginning of timed HtoD section */
 390     if (bDoTime)
 391     {
 392         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 393         CU_RET_ERR(stat, "cudaEventRecord failed");
 394     }
 395
 396     /* HtoD x, q */
 397     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 398                       adat_len * sizeof(*adat->xq), stream);
 399
 400     if (bDoTime)
 401     {
 402         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 403         CU_RET_ERR(stat, "cudaEventRecord failed");
 404     }
 405
 406     if (plist->nsci == 0)
 407     {
 408         /* Don't launch an empty local kernel (not allowed with CUDA) */
 409         return;
 410     }
 411
 412     /* beginning of timed nonbonded calculation section */
 413     if (bDoTime)
 414     {
 415         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 416         CU_RET_ERR(stat, "cudaEventRecord failed");
 417     }
 418
 419     /* get the pointer to the kernel flavor we need to use */
 420     nb_kernel = select_nbnxn_kernel(nbp->eeltype,
 421                                     nbp->vdwtype,
 422                                     bCalcEner,
 423                                     plist->bDoPrune || always_prune);
 424
 425     /* Kernel launch config:
 426      * - The thread block dimensions match the size of i-clusters, j-clusters,
 427      *   and j-cluster concurrency, in x, y, and z, respectively.
 428      * - The 1D block-grid contains as many blocks as super-clusters.
 429      */
 430     int num_threads_z = 1;
 431     if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7)
 432     {
 433         num_threads_z = 2;
 434     }
 435     nblock    = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
 436     dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
 437     dim_grid  = dim3(nblock, 1, 1);
 438     shmem     = calc_shmem_required(num_threads_z);
 439
 440     if (debug)
 441     {
 442         fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 443                 "\tGrid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
 444                 "\tShMem: %d\n",
 445                 dim_block.x, dim_block.y, dim_block.z,
 446                 dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 447                 NCL_PER_SUPERCL, plist->na_c,
 448                 shmem);
 449     }
 450
 451     nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
 452     CU_LAUNCH_ERR("k_calc_nb");
 453
 454     if (bDoTime)
 455     {
 456         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 457         CU_RET_ERR(stat, "cudaEventRecord failed");
 458     }
 459 }
 460
 461 void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
 462                               const nbnxn_atomdata_t *nbatom,
 463                               int                     flags,
 464                               int                     aloc)
 465 {
 466     cudaError_t stat;
 467     int         adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
 468     int         iloc = -1;
 469
 470     /* determine interaction locality from atom locality */
 471     if (LOCAL_A(aloc))
 472     {
 473         iloc = eintLocal;
 474     }
 475     else if (NONLOCAL_A(aloc))
 476     {
 477         iloc = eintNonlocal;
 478     }
 479     else
 480     {
 481         char stmp[STRLEN];
 482         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 483                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 484         gmx_incons(stmp);
 485     }
 486
 487     cu_atomdata_t   *adat    = nb->atdat;
 488     cu_timers_t     *t       = nb->timers;
 489     bool             bDoTime = nb->bDoTime;
 490     cudaStream_t     stream  = nb->stream[iloc];
 491
 492     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 493     bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
 494
 495     /* don't launch non-local copy-back if there was no non-local work to do */
 496     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
 497     {
 498         return;
 499     }
 500
 501     /* calculate the atom data index range based on locality */
 502     if (LOCAL_A(aloc))
 503     {
 504         adat_begin  = 0;
 505         adat_len    = adat->natoms_local;
 506         adat_end    = nb->atdat->natoms_local;
 507     }
 508     else
 509     {
 510         adat_begin  = adat->natoms_local;
 511         adat_len    = adat->natoms - adat->natoms_local;
 512         adat_end    = nb->atdat->natoms;
 513     }
 514
 515     /* beginning of timed D2H section */
 516     if (bDoTime)
 517     {
 518         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 519         CU_RET_ERR(stat, "cudaEventRecord failed");
 520     }
 521
 522     if (!nb->bUseStreamSync)
 523     {
 524         /* For safety reasons set a few (5%) forces to NaN. This way even if the
 525            polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 526         for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 527         {
 528 #ifdef NAN
 529             nbatom->out[0].f[i] = NAN;
 530 #else
 531 #  ifdef _MSVC
 532             if (numeric_limits<float>::has_quiet_NaN)
 533             {
 534                 nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 535             }
 536             else
 537 #  endif
 538             {
 539                 nbatom->out[0].f[i] = GMX_REAL_MAX;
 540             }
 541 #endif
 542         }
 543
 544         /* Set the last four bytes of the force array to a bit pattern
 545            which can't be the result of the force calculation:
 546            max exponent (127) and zero mantissa. */
 547         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 548     }
 549
 550     /* With DD the local D2H transfer can only start after the non-local
 551        kernel has finished. */
 552     if (iloc == eintLocal && nb->bUseTwoStreams)
 553     {
 554         stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
 555         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 556     }
 557
 558     /* DtoH f */
 559     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 560                       (adat_len)*sizeof(*adat->f), stream);
 561
 562     /* After the non-local D2H is launched the nonlocal_done event can be
 563        recorded which signals that the local D2H can proceed. This event is not
 564        placed after the non-local kernel because we want the non-local data
 565        back first. */
 566     if (iloc == eintNonlocal)
 567     {
 568         stat = cudaEventRecord(nb->nonlocal_done, stream);
 569         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 570     }
 571
 572     /* only transfer energies in the local stream */
 573     if (LOCAL_I(iloc))
 574     {
 575         /* DtoH fshift */
 576         if (bCalcFshift)
 577         {
 578             cu_copy_D2H_async(nb->nbst.fshift, adat->fshift,
 579                               SHIFTS * sizeof(*nb->nbst.fshift), stream);
 580         }
 581
 582         /* DtoH energies */
 583         if (bCalcEner)
 584         {
 585             cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj,
 586                               sizeof(*nb->nbst.e_lj), stream);
 587             cu_copy_D2H_async(nb->nbst.e_el, adat->e_el,
 588                               sizeof(*nb->nbst.e_el), stream);
 589         }
 590     }
 591
 592     if (bDoTime)
 593     {
 594         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 595         CU_RET_ERR(stat, "cudaEventRecord failed");
 596     }
 597 }
 598
 599 /* Atomic compare-exchange operation on unsigned values. It is used in
 600  * polling wait for the GPU.
 601  */
 602 static inline bool atomic_cas(volatile unsigned int *ptr,
 603                               unsigned int           oldval,
 604                               unsigned int           newval)
 605 {
 606     assert(ptr);
 607
 608 #ifdef TMPI_ATOMICS
 609     return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 610 #else
 611     gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 612     return true;
 613 #endif
 614 }
 615
 616 void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb,
 617                             const nbnxn_atomdata_t *nbatom,
 618                             int flags, int aloc,
 619                             real *e_lj, real *e_el, rvec *fshift)
 620 {
 621     /* NOTE:  only implemented for single-precision at this time */
 622     cudaError_t            stat;
 623     int                    i, adat_end, iloc = -1;
 624     volatile unsigned int *poll_word;
 625
 626     /* determine interaction locality from atom locality */
 627     if (LOCAL_A(aloc))
 628     {
 629         iloc = eintLocal;
 630     }
 631     else if (NONLOCAL_A(aloc))
 632     {
 633         iloc = eintNonlocal;
 634     }
 635     else
 636     {
 637         char stmp[STRLEN];
 638         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 639                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 640         gmx_incons(stmp);
 641     }
 642
 643     cu_plist_t                 *plist    = nb->plist[iloc];
 644     cu_timers_t                *timers   = nb->timers;
 645     struct gmx_wallclock_gpu_t *timings  = nb->timings;
 646     nb_staging                  nbst     = nb->nbst;
 647
 648     bool                        bCalcEner   = flags & GMX_FORCE_ENERGY;
 649     bool                        bCalcFshift = flags & GMX_FORCE_VIRIAL;
 650
 651     /* turn energy calculation always on/off (for debugging/testing only) */
 652     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 653
 654     /* Launch wait/update timers & counters, unless doing the non-local phase
 655        when there is not actually work to do. This is consistent with
 656        nbnxn_cuda_launch_kernel.
 657
 658        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 659        counters could end up being inconsistent due to not being incremented
 660        on some of the nodes! */
 661     if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
 662     {
 663         return;
 664     }
 665
 666     /* calculate the atom data index range based on locality */
 667     if (LOCAL_A(aloc))
 668     {
 669         adat_end = nb->atdat->natoms_local;
 670     }
 671     else
 672     {
 673         adat_end = nb->atdat->natoms;
 674     }
 675
 676     if (nb->bUseStreamSync)
 677     {
 678         stat = cudaStreamSynchronize(nb->stream[iloc]);
 679         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 680     }
 681     else
 682     {
 683         /* Busy-wait until we get the signal pattern set in last byte
 684          * of the l/nl float vector. This pattern corresponds to a floating
 685          * point number which can't be the result of the force calculation
 686          * (maximum, 127 exponent and 0 mantissa).
 687          * The polling uses atomic compare-exchange.
 688          */
 689         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 690         while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern))
 691         {
 692         }
 693     }
 694
 695     /* timing data accumulation */
 696     if (nb->bDoTime)
 697     {
 698         /* only increase counter once (at local F wait) */
 699         if (LOCAL_I(iloc))
 700         {
 701             timings->nb_c++;
 702             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 703         }
 704
 705         /* kernel timings */
 706         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 707             cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 708
 709         /* X/q H2D and F D2H timings */
 710         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 711                                               timers->stop_nb_h2d[iloc]);
 712         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 713                                               timers->stop_nb_d2h[iloc]);
 714
 715         /* only count atdat and pair-list H2D at pair-search step */
 716         if (plist->bDoPrune)
 717         {
 718             /* atdat transfer timing (add only once, at local F wait) */
 719             if (LOCAL_A(aloc))
 720             {
 721                 timings->pl_h2d_c++;
 722                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 723                                                       timers->stop_atdat);
 724             }
 725
 726             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 727                                                   timers->stop_pl_h2d[iloc]);
 728         }
 729     }
 730
 731     /* add up energies and shift forces (only once at local F wait) */
 732     if (LOCAL_I(iloc))
 733     {
 734         if (bCalcEner)
 735         {
 736             *e_lj += *nbst.e_lj;
 737             *e_el += *nbst.e_el;
 738         }
 739
 740         if (bCalcFshift)
 741         {
 742             for (i = 0; i < SHIFTS; i++)
 743             {
 744                 fshift[i][0] += nbst.fshift[i].x;
 745                 fshift[i][1] += nbst.fshift[i].y;
 746                 fshift[i][2] += nbst.fshift[i].z;
 747             }
 748         }
 749     }
 750
 751     /* turn off pruning (doesn't matter if this is pair-search step or not) */
 752     plist->bDoPrune = false;
 753 }
 754
 755 /*! Return the reference to the nbfp texture. */
 756 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
 757 {
 758     return nbfp_texref;
 759 }
 760
 761 /*! Return the reference to the nbfp_comb texture. */
 762 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref()
 763 {
 764     return nbfp_comb_texref;
 765 }
 766
 767 /*! Return the reference to the coulomb_tab. */
 768 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
 769 {
 770     return coulomb_tab_texref;
 771 }
 772
 773 /*! Set up the cache configuration for the non-bonded kernels,
 774  */
 775 void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo)
 776 {
 777     cudaError_t stat;
 778
 779     for (int i = 0; i < eelCuNR; i++)
 780     {
 781         for (int j = 0; j < evdwCuNR; j++)
 782         {
 783             if (devinfo->prop.major >= 3)
 784             {
 785                 /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 786                 cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
 787                 cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 788                 cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
 789                 stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 790             }
 791             else
 792             {
 793                 /* On Fermi prefer L1 gives 2% higher performance */
 794                 /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 795                 cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
 796                 cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 797                 cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
 798                 stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 799             }
 800             CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 801         }
 802     }
 803 }