src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #include "gmxpre.h"
  36
  37 #include "nbnxn_cuda.h"
  38
  39 #include "config.h"
  40
  41 #include <assert.h>
  42 #include <stdlib.h>
  43
  44 #if defined(_MSVC)
  45 #include <limits>
  46 #endif
  47
  48 #include <cuda.h>
  49
  50 #ifdef TMPI_ATOMICS
  51 #include "thread_mpi/atomic.h"
  52 #endif
  53
  54 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
  55 #include "gromacs/legacyheaders/types/force_flags.h"
  56 #include "gromacs/legacyheaders/types/simple.h"
  57 #include "gromacs/mdlib/nb_verlet.h"
  58 #include "gromacs/mdlib/nbnxn_consts.h"
  59 #include "gromacs/mdlib/nbnxn_pairlist.h"
  60 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
  61 #include "gromacs/pbcutil/ishift.h"
  62 #include "gromacs/utility/cstringutil.h"
  63
  64 #include "nbnxn_cuda_types.h"
  65
  66 #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  67 #define USE_TEXOBJ
  68 #endif
  69
  70 /*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
  71 texture<float, 1, cudaReadModeElementType> nbfp_texref;
  72
  73 /*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
  74 texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
  75
  76 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  77 texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  78
  79 /* Convenience defines */
  80 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  81 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  82
  83 /* NTHREAD_Z controls the number of j-clusters processed concurrently on NTHREAD_Z
  84  * warp-pairs per block.
  85  *
  86  * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 th/block with 16
  87  * blocks/multiproc, is the fastest even though this setup gives low occupancy.
  88  * NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks
  89  * per multiprocessor is reduced proportionally to get the original number of max
  90  * threads in flight (and slightly lower performance).
  91  * - On CC 3.7 there are enough registers to double the number of threads; using
  92  * NTHREADS_Z == 2 is fastest with 16 blocks (TODO: test with RF and other kernels
  93  * with low-register use).
  94  *
  95  * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
  96  * shuffle-based reduction, hence CC >= 3.0.
  97  */
  98
  99 /* Kernel launch bounds as function of NTHREAD_Z.
 100  * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
 101  * - CC 3.7:     NTHREAD_Z=2, (128, 16) bounds
 102  */
 103 #if __CUDA_ARCH__ == 370
 104 #define NTHREAD_Z           (2)
 105 #define MIN_BLOCKS_PER_MP   (16)
 106 #else
 107 #define NTHREAD_Z           (1)
 108 #define MIN_BLOCKS_PER_MP   (16)
 109 #endif
 110 #define THREADS_PER_BLOCK   (CL_SIZE*CL_SIZE*NTHREAD_Z)
 111
 112
 113 /***** The kernels come here *****/
 114 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
 115
 116 /* Top-level kernel generation: will generate through multiple inclusion the
 117  * following flavors for all kernels:
 118  * - force-only output;
 119  * - force and energy output;
 120  * - force-only with pair list pruning;
 121  * - force and energy output with pair list pruning.
 122  */
 123 /** Force only **/
 124 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
 125 /** Force & energy **/
 126 #define CALC_ENERGIES
 127 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
 128 #undef CALC_ENERGIES
 129
 130 /*** Pair-list pruning kernels ***/
 131 /** Force only **/
 132 #define PRUNE_NBL
 133 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
 134 /** Force & energy **/
 135 #define CALC_ENERGIES
 136 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
 137 #undef CALC_ENERGIES
 138 #undef PRUNE_NBL
 139
 140 /*! Nonbonded kernel function pointer type */
 141 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 142                                      const cu_nbparam_t,
 143                                      const cu_plist_t,
 144                                      bool);
 145
 146 /*********************************/
 147
 148 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 149 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 150 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 151 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 152
 153
 154 /* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 155  * and corresponds to having the exponent set to the maximum (127 -- single
 156  * precision) and the mantissa to 0.
 157  */
 158 static unsigned int poll_wait_pattern = (0x7FU << 23);
 159
 160 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 161 static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 162 {
 163     int max_grid_x_size;
 164
 165     assert(dinfo);
 166
 167     max_grid_x_size = dinfo->prop.maxGridSize[0];
 168
 169     /* do we exceed the grid x dimension limit? */
 170     if (nwork_units > max_grid_x_size)
 171     {
 172         gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 173                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 174                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 175     }
 176
 177     return nwork_units;
 178 }
 179
 180
 181 /* Constant arrays listing all kernel function pointers and enabling selection
 182    of a kernel in an elegant manner. */
 183
 184 /*! Pointers to the non-bonded kernels organized in 2-dim arrays by:
 185  *  electrostatics and VDW type.
 186  *
 187  *  Note that the row- and column-order of function pointers has to match the
 188  *  order of corresponding enumerated electrostatics and vdw types, resp.,
 189  *  defined in nbnxn_cuda_types.h.
 190  */
 191
 192 /*! Force-only kernel function pointers. */
 193 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] =
 194 {
 195     { nbnxn_kernel_ElecCut_VdwLJ_F_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda            },
 196     { nbnxn_kernel_ElecRF_VdwLJ_F_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda             },
 197     { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda        },
 198     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda },
 199     { nbnxn_kernel_ElecEw_VdwLJ_F_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda             },
 200     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda      }
 201 };
 202
 203 /*! Force + energy kernel function pointers. */
 204 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] =
 205 {
 206     { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda              },
 207     { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda               },
 208     { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda          },
 209     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda     },
 210     { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda               },
 211     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda        }
 212 };
 213
 214 /*! Force + pruning kernel function pointers. */
 215 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] =
 216 {
 217     { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda            },
 218     { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda,              nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda             },
 219     { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda,         nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda        },
 220     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda,  nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda },
 221     { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda,              nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda             },
 222     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda,       nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda      }
 223 };
 224
 225 /*! Force + energy + pruning kernel function pointers. */
 226 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] =
 227 {
 228     { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda            },
 229     { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda             },
 230     { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda        },
 231     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda },
 232     { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda             },
 233     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda      }
 234 };
 235
 236 /*! Return a pointer to the kernel version to be executed at the current step. */
 237 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int  eeltype,
 238                                                        int  evdwtype,
 239                                                        bool bDoEne,
 240                                                        bool bDoPrune)
 241 {
 242     nbnxn_cu_kfunc_ptr_t res;
 243
 244     assert(eeltype < eelCuNR);
 245     assert(evdwtype < evdwCuNR);
 246
 247     if (bDoEne)
 248     {
 249         if (bDoPrune)
 250         {
 251             res = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
 252         }
 253         else
 254         {
 255             res = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
 256         }
 257     }
 258     else
 259     {
 260         if (bDoPrune)
 261         {
 262             res = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
 263         }
 264         else
 265         {
 266             res = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
 267         }
 268     }
 269
 270     return res;
 271 }
 272
 273 /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
 274 static inline int calc_shmem_required(const int num_threads_z)
 275 {
 276     int shmem;
 277
 278     /* size of shmem (force-buffers/xq/atom type preloading) */
 279     /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 280     /* i-atom x+q in shared memory */
 281     shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 282     /* cj in shared memory, for each warp separately */
 283     shmem += num_threads_z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 284 #ifdef IATYPE_SHMEM
 285     /* i-atom types in shared memory */
 286     shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 287 #endif
 288 #if __CUDA_ARCH__ < 300
 289     /* force reduction buffers in shared memory */
 290     shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 291 #endif
 292
 293     return shmem;
 294 }
 295
 296 /*! As we execute nonbonded workload in separate streams, before launching
 297    the kernel we need to make sure that he following operations have completed:
 298    - atomdata allocation and related H2D transfers (every nstlist step);
 299    - pair list H2D transfer (every nstlist step);
 300    - shift vector H2D transfer (every nstlist step);
 301    - force (+shift force and energy) output clearing (every step).
 302
 303    These operations are issued in the local stream at the beginning of the step
 304    and therefore always complete before the local kernel launch. The non-local
 305    kernel is launched after the local on the same device/context, so this is
 306    inherently scheduled after the operations in the local stream (including the
 307    above "misc_ops").
 308    However, for the sake of having a future-proof implementation, we use the
 309    misc_ops_done event to record the point in time when the above  operations
 310    are finished and synchronize with this event in the non-local stream.
 311  */
 312 void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t        cu_nb,
 313                               const nbnxn_atomdata_t *nbatom,
 314                               int                     flags,
 315                               int                     iloc)
 316 {
 317     cudaError_t          stat;
 318     int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 319     /* CUDA kernel launch-related stuff */
 320     int                  shmem, nblock;
 321     dim3                 dim_block, dim_grid;
 322     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 323
 324     cu_atomdata_t       *adat    = cu_nb->atdat;
 325     cu_nbparam_t        *nbp     = cu_nb->nbparam;
 326     cu_plist_t          *plist   = cu_nb->plist[iloc];
 327     cu_timers_t         *t       = cu_nb->timers;
 328     cudaStream_t         stream  = cu_nb->stream[iloc];
 329
 330     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
 331     bool                 bCalcFshift = flags & GMX_FORCE_VIRIAL;
 332     bool                 bDoTime     = cu_nb->bDoTime;
 333
 334     /* turn energy calculation always on/off (for debugging/testing only) */
 335     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 336
 337     /* don't launch the kernel if there is no work to do */
 338     if (plist->nsci == 0)
 339     {
 340         return;
 341     }
 342
 343     /* calculate the atom data index range based on locality */
 344     if (LOCAL_I(iloc))
 345     {
 346         adat_begin  = 0;
 347         adat_len    = adat->natoms_local;
 348     }
 349     else
 350     {
 351         adat_begin  = adat->natoms_local;
 352         adat_len    = adat->natoms - adat->natoms_local;
 353     }
 354
 355     /* When we get here all misc operations issues in the local stream are done,
 356        so we record that in the local stream and wait for it in the nonlocal one. */
 357     if (cu_nb->bUseTwoStreams)
 358     {
 359         if (iloc == eintLocal)
 360         {
 361             stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 362             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 363         }
 364         else
 365         {
 366             stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 367             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 368         }
 369     }
 370
 371     /* beginning of timed HtoD section */
 372     if (bDoTime)
 373     {
 374         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 375         CU_RET_ERR(stat, "cudaEventRecord failed");
 376     }
 377
 378     /* HtoD x, q */
 379     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 380                       adat_len * sizeof(*adat->xq), stream);
 381
 382     if (bDoTime)
 383     {
 384         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 385         CU_RET_ERR(stat, "cudaEventRecord failed");
 386     }
 387
 388     /* beginning of timed nonbonded calculation section */
 389     if (bDoTime)
 390     {
 391         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 392         CU_RET_ERR(stat, "cudaEventRecord failed");
 393     }
 394
 395     /* get the pointer to the kernel flavor we need to use */
 396     nb_kernel = select_nbnxn_kernel(nbp->eeltype,
 397                                     nbp->vdwtype,
 398                                     bCalcEner,
 399                                     plist->bDoPrune || always_prune);
 400
 401     /* Kernel launch config:
 402      * - The thread block dimensions match the size of i-clusters, j-clusters,
 403      *   and j-cluster concurrency, in x, y, and z, respectively.
 404      * - The 1D block-grid contains as many blocks as super-clusters.
 405      */
 406     int num_threads_z = 1;
 407     if (cu_nb->dev_info->prop.major == 3 && cu_nb->dev_info->prop.minor == 7)
 408     {
 409         num_threads_z = 2;
 410     }
 411     nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 412     dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
 413     dim_grid  = dim3(nblock, 1, 1);
 414     shmem     = calc_shmem_required(num_threads_z);
 415
 416     if (debug)
 417     {
 418         fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 419                 "\tGrid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
 420                 "\tShMem: %d\n",
 421                 dim_block.x, dim_block.y, dim_block.z,
 422                 dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 423                 NCL_PER_SUPERCL, plist->na_c,
 424                 shmem);
 425     }
 426
 427     nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
 428     CU_LAUNCH_ERR("k_calc_nb");
 429
 430     if (bDoTime)
 431     {
 432         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 433         CU_RET_ERR(stat, "cudaEventRecord failed");
 434     }
 435 }
 436
 437 void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t        cu_nb,
 438                                const nbnxn_atomdata_t *nbatom,
 439                                int                     flags,
 440                                int                     aloc)
 441 {
 442     cudaError_t stat;
 443     int         adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
 444     int         iloc = -1;
 445
 446     /* determine interaction locality from atom locality */
 447     if (LOCAL_A(aloc))
 448     {
 449         iloc = eintLocal;
 450     }
 451     else if (NONLOCAL_A(aloc))
 452     {
 453         iloc = eintNonlocal;
 454     }
 455     else
 456     {
 457         char stmp[STRLEN];
 458         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 459                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 460         gmx_incons(stmp);
 461     }
 462
 463     cu_atomdata_t   *adat    = cu_nb->atdat;
 464     cu_timers_t     *t       = cu_nb->timers;
 465     bool             bDoTime = cu_nb->bDoTime;
 466     cudaStream_t     stream  = cu_nb->stream[iloc];
 467
 468     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 469     bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
 470
 471     /* don't launch copy-back if there was no work to do */
 472     if (cu_nb->plist[iloc]->nsci == 0)
 473     {
 474         return;
 475     }
 476
 477     /* calculate the atom data index range based on locality */
 478     if (LOCAL_A(aloc))
 479     {
 480         adat_begin  = 0;
 481         adat_len    = adat->natoms_local;
 482         adat_end    = cu_nb->atdat->natoms_local;
 483     }
 484     else
 485     {
 486         adat_begin  = adat->natoms_local;
 487         adat_len    = adat->natoms - adat->natoms_local;
 488         adat_end    = cu_nb->atdat->natoms;
 489     }
 490
 491     /* beginning of timed D2H section */
 492     if (bDoTime)
 493     {
 494         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 495         CU_RET_ERR(stat, "cudaEventRecord failed");
 496     }
 497
 498     if (!cu_nb->bUseStreamSync)
 499     {
 500         /* For safety reasons set a few (5%) forces to NaN. This way even if the
 501            polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 502         for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 503         {
 504 #ifdef NAN
 505             nbatom->out[0].f[i] = NAN;
 506 #else
 507 #  ifdef _MSVC
 508             if (numeric_limits<float>::has_quiet_NaN)
 509             {
 510                 nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 511             }
 512             else
 513 #  endif
 514             {
 515                 nbatom->out[0].f[i] = GMX_REAL_MAX;
 516             }
 517 #endif
 518         }
 519
 520         /* Set the last four bytes of the force array to a bit pattern
 521            which can't be the result of the force calculation:
 522            max exponent (127) and zero mantissa. */
 523         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 524     }
 525
 526     /* With DD the local D2H transfer can only start after the non-local
 527        has been launched. */
 528     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 529     {
 530         stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 531         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 532     }
 533
 534     /* DtoH f */
 535     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 536                       (adat_len)*sizeof(*adat->f), stream);
 537
 538     /* After the non-local D2H is launched the nonlocal_done event can be
 539        recorded which signals that the local D2H can proceed. This event is not
 540        placed after the non-local kernel because we first need the non-local
 541        data back first. */
 542     if (iloc == eintNonlocal)
 543     {
 544         stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 545         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 546     }
 547
 548     /* only transfer energies in the local stream */
 549     if (LOCAL_I(iloc))
 550     {
 551         /* DtoH fshift */
 552         if (bCalcFshift)
 553         {
 554             cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 555                               SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 556         }
 557
 558         /* DtoH energies */
 559         if (bCalcEner)
 560         {
 561             cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 562                               sizeof(*cu_nb->nbst.e_lj), stream);
 563             cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 564                               sizeof(*cu_nb->nbst.e_el), stream);
 565         }
 566     }
 567
 568     if (bDoTime)
 569     {
 570         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 571         CU_RET_ERR(stat, "cudaEventRecord failed");
 572     }
 573 }
 574
 575 /* Atomic compare-exchange operation on unsigned values. It is used in
 576  * polling wait for the GPU.
 577  */
 578 static inline bool atomic_cas(volatile unsigned int *ptr,
 579                               unsigned int           oldval,
 580                               unsigned int           newval)
 581 {
 582     assert(ptr);
 583
 584 #ifdef TMPI_ATOMICS
 585     return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 586 #else
 587     gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 588     return true;
 589 #endif
 590 }
 591
 592 void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 593                          const nbnxn_atomdata_t *nbatom,
 594                          int flags, int aloc,
 595                          real *e_lj, real *e_el, rvec *fshift)
 596 {
 597     /* NOTE:  only implemented for single-precision at this time */
 598     cudaError_t            stat;
 599     int                    i, adat_end, iloc = -1;
 600     volatile unsigned int *poll_word;
 601
 602     /* determine interaction locality from atom locality */
 603     if (LOCAL_A(aloc))
 604     {
 605         iloc = eintLocal;
 606     }
 607     else if (NONLOCAL_A(aloc))
 608     {
 609         iloc = eintNonlocal;
 610     }
 611     else
 612     {
 613         char stmp[STRLEN];
 614         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 615                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 616         gmx_incons(stmp);
 617     }
 618
 619     cu_plist_t      *plist    = cu_nb->plist[iloc];
 620     cu_timers_t     *timers   = cu_nb->timers;
 621     wallclock_gpu_t *timings  = cu_nb->timings;
 622     nb_staging       nbst     = cu_nb->nbst;
 623
 624     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 625     bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
 626
 627     /* turn energy calculation always on/off (for debugging/testing only) */
 628     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 629
 630     /* don't launch wait/update timers & counters if there was no work to do
 631
 632        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 633        counters could end up being inconsistent due to not being incremented
 634        on some of the nodes! */
 635     if (cu_nb->plist[iloc]->nsci == 0)
 636     {
 637         return;
 638     }
 639
 640     /* calculate the atom data index range based on locality */
 641     if (LOCAL_A(aloc))
 642     {
 643         adat_end = cu_nb->atdat->natoms_local;
 644     }
 645     else
 646     {
 647         adat_end = cu_nb->atdat->natoms;
 648     }
 649
 650     if (cu_nb->bUseStreamSync)
 651     {
 652         stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 653         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 654     }
 655     else
 656     {
 657         /* Busy-wait until we get the signal pattern set in last byte
 658          * of the l/nl float vector. This pattern corresponds to a floating
 659          * point number which can't be the result of the force calculation
 660          * (maximum, 127 exponent and 0 mantissa).
 661          * The polling uses atomic compare-exchange.
 662          */
 663         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 664         while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern))
 665         {
 666         }
 667     }
 668
 669     /* timing data accumulation */
 670     if (cu_nb->bDoTime)
 671     {
 672         /* only increase counter once (at local F wait) */
 673         if (LOCAL_I(iloc))
 674         {
 675             timings->nb_c++;
 676             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 677         }
 678
 679         /* kernel timings */
 680         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 681             cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 682
 683         /* X/q H2D and F D2H timings */
 684         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 685                                               timers->stop_nb_h2d[iloc]);
 686         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 687                                               timers->stop_nb_d2h[iloc]);
 688
 689         /* only count atdat and pair-list H2D at pair-search step */
 690         if (plist->bDoPrune)
 691         {
 692             /* atdat transfer timing (add only once, at local F wait) */
 693             if (LOCAL_A(aloc))
 694             {
 695                 timings->pl_h2d_c++;
 696                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 697                                                       timers->stop_atdat);
 698             }
 699
 700             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 701                                                   timers->stop_pl_h2d[iloc]);
 702         }
 703     }
 704
 705     /* add up energies and shift forces (only once at local F wait) */
 706     if (LOCAL_I(iloc))
 707     {
 708         if (bCalcEner)
 709         {
 710             *e_lj += *nbst.e_lj;
 711             *e_el += *nbst.e_el;
 712         }
 713
 714         if (bCalcFshift)
 715         {
 716             for (i = 0; i < SHIFTS; i++)
 717             {
 718                 fshift[i][0] += nbst.fshift[i].x;
 719                 fshift[i][1] += nbst.fshift[i].y;
 720                 fshift[i][2] += nbst.fshift[i].z;
 721             }
 722         }
 723     }
 724
 725     /* turn off pruning (doesn't matter if this is pair-search step or not) */
 726     plist->bDoPrune = false;
 727 }
 728
 729 /*! Return the reference to the nbfp texture. */
 730 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
 731 {
 732     return nbfp_texref;
 733 }
 734
 735 /*! Return the reference to the nbfp_comb texture. */
 736 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref()
 737 {
 738     return nbfp_comb_texref;
 739 }
 740
 741 /*! Return the reference to the coulomb_tab. */
 742 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
 743 {
 744     return coulomb_tab_texref;
 745 }
 746
 747 /*! Set up the cache configuration for the non-bonded kernels,
 748  */
 749 void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 750 {
 751     cudaError_t stat;
 752
 753     for (int i = 0; i < eelCuNR; i++)
 754     {
 755         for (int j = 0; j < evdwCuNR; j++)
 756         {
 757             if (devinfo->prop.major >= 3)
 758             {
 759                 /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 760                 cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
 761                 cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 762                 cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
 763                 stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 764             }
 765             else
 766             {
 767                 /* On Fermi prefer L1 gives 2% higher performance */
 768                 /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 769                 cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
 770                 cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 771                 cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
 772                 stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 773             }
 774             CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 775         }
 776     }
 777 }