src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifdef HAVE_CONFIG_H
  36 #include <config.h>
  37 #endif
  38
  39 #include <stdlib.h>
  40 #include <assert.h>
  41
  42 #if defined(_MSVC)
  43 #include <limits>
  44 #endif
  45
  46 #include <cuda.h>
  47
  48 #include "types/simple.h"
  49 #include "types/nbnxn_pairlist.h"
  50 #include "types/nb_verlet.h"
  51 #include "types/ishift.h"
  52 #include "types/force_flags.h"
  53 #include "../nbnxn_consts.h"
  54
  55 #ifdef TMPI_ATOMICS
  56 #include "thread_mpi/atomic.h"
  57 #endif
  58
  59 #include "nbnxn_cuda_types.h"
  60 #include "../../gmxlib/cuda_tools/cudautils.cuh"
  61 #include "nbnxn_cuda.h"
  62 #include "nbnxn_cuda_data_mgmt.h"
  63
  64 #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  65 #define USE_TEXOBJ
  66 #endif
  67
  68 /*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
  69 texture<float, 1, cudaReadModeElementType> nbfp_texref;
  70
  71 /*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
  72 texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
  73
  74 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  75 texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  76
  77 /* Convenience defines */
  78 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  79 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  80
  81 /***** The kernels come here *****/
  82 #include "nbnxn_cuda_kernel_utils.cuh"
  83
  84 /* Top-level kernel generation: will generate through multiple inclusion the
  85  * following flavors for all kernels:
  86  * - force-only output;
  87  * - force and energy output;
  88  * - force-only with pair list pruning;
  89  * - force and energy output with pair list pruning.
  90  */
  91 /** Force only **/
  92 #include "nbnxn_cuda_kernels.cuh"
  93 /** Force & energy **/
  94 #define CALC_ENERGIES
  95 #include "nbnxn_cuda_kernels.cuh"
  96 #undef CALC_ENERGIES
  97
  98 /*** Pair-list pruning kernels ***/
  99 /** Force only **/
 100 #define PRUNE_NBL
 101 #include "nbnxn_cuda_kernels.cuh"
 102 /** Force & energy **/
 103 #define CALC_ENERGIES
 104 #include "nbnxn_cuda_kernels.cuh"
 105 #undef CALC_ENERGIES
 106 #undef PRUNE_NBL
 107
 108 /*! Nonbonded kernel function pointer type */
 109 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 110                                      const cu_nbparam_t,
 111                                      const cu_plist_t,
 112                                      bool);
 113
 114 /*********************************/
 115
 116 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 117 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 118 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 119 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 120
 121
 122 /* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 123  * and corresponds to having the exponent set to the maximum (127 -- single
 124  * precision) and the mantissa to 0.
 125  */
 126 static unsigned int poll_wait_pattern = (0x7FU << 23);
 127
 128 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 129 static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 130 {
 131     int max_grid_x_size;
 132
 133     assert(dinfo);
 134     /* CUDA does not accept grid dimension of 0 (which can happen e.g. with an
 135        empty domain) and that case should be handled before this point. */
 136     assert(nwork_units > 0);
 137
 138     max_grid_x_size = dinfo->prop.maxGridSize[0];
 139
 140     /* do we exceed the grid x dimension limit? */
 141     if (nwork_units > max_grid_x_size)
 142     {
 143         gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 144                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 145                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 146     }
 147
 148     return nwork_units;
 149 }
 150
 151
 152 /* Constant arrays listing all kernel function pointers and enabling selection
 153    of a kernel in an elegant manner. */
 154
 155 /*! Pointers to the non-bonded kernels organized in 2-dim arrays by:
 156  *  electrostatics and VDW type.
 157  *
 158  *  Note that the row- and column-order of function pointers has to match the
 159  *  order of corresponding enumerated electrostatics and vdw types, resp.,
 160  *  defined in nbnxn_cuda_types.h.
 161  */
 162
 163 /*! Force-only kernel function pointers. */
 164 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] =
 165 {
 166     { nbnxn_kernel_ElecCut_VdwLJ_F_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda            },
 167     { nbnxn_kernel_ElecRF_VdwLJ_F_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda             },
 168     { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda        },
 169     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda },
 170     { nbnxn_kernel_ElecEw_VdwLJ_F_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda             },
 171     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda      }
 172 };
 173
 174 /*! Force + energy kernel function pointers. */
 175 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] =
 176 {
 177     { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda              },
 178     { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda               },
 179     { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda          },
 180     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda     },
 181     { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda               },
 182     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda        }
 183 };
 184
 185 /*! Force + pruning kernel function pointers. */
 186 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] =
 187 {
 188     { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda            },
 189     { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda,              nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda             },
 190     { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda,         nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda        },
 191     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda,  nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda },
 192     { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda,              nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda             },
 193     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda,       nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda      }
 194 };
 195
 196 /*! Force + energy + pruning kernel function pointers. */
 197 static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] =
 198 {
 199     { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda            },
 200     { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda             },
 201     { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda        },
 202     { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda },
 203     { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda             },
 204     { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda      }
 205 };
 206
 207 /*! Return a pointer to the kernel version to be executed at the current step. */
 208 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int  eeltype,
 209                                                        int  evdwtype,
 210                                                        bool bDoEne,
 211                                                        bool bDoPrune)
 212 {
 213     nbnxn_cu_kfunc_ptr_t res;
 214
 215     assert(eeltype < eelCuNR);
 216     assert(evdwtype < evdwCuNR);
 217
 218     if (bDoEne)
 219     {
 220         if (bDoPrune)
 221         {
 222             res = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
 223         }
 224         else
 225         {
 226             res = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
 227         }
 228     }
 229     else
 230     {
 231         if (bDoPrune)
 232         {
 233             res = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
 234         }
 235         else
 236         {
 237             res = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
 238         }
 239     }
 240
 241     return res;
 242 }
 243
 244 /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
 245 static inline int calc_shmem_required()
 246 {
 247     int shmem;
 248
 249     /* size of shmem (force-buffers/xq/atom type preloading) */
 250     /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 251     /* i-atom x+q in shared memory */
 252     shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 253     /* cj in shared memory, for both warps separately */
 254     shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 255 #ifdef IATYPE_SHMEM
 256     /* i-atom types in shared memory */
 257     shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 258 #endif
 259 #if __CUDA_ARCH__ < 300
 260     /* force reduction buffers in shared memory */
 261     shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 262 #endif
 263
 264     return shmem;
 265 }
 266
 267 /*! As we execute nonbonded workload in separate streams, before launching
 268    the kernel we need to make sure that he following operations have completed:
 269    - atomdata allocation and related H2D transfers (every nstlist step);
 270    - pair list H2D transfer (every nstlist step);
 271    - shift vector H2D transfer (every nstlist step);
 272    - force (+shift force and energy) output clearing (every step).
 273
 274    These operations are issued in the local stream at the beginning of the step
 275    and therefore always complete before the local kernel launch. The non-local
 276    kernel is launched after the local on the same device/context hence it is
 277    inherently scheduled after the operations in the local stream (including the
 278    above "misc_ops") on pre-GK110 devices with single hardware queue, but on later
 279    devices with multiple hardware queues the dependency needs to be enforced.
 280    We use the misc_ops_and_local_H2D_done event to record the point where
 281    the local x+q H2D (and all preceding) tasks are complete and synchronize
 282    with this event in the non-local stream before launching the non-bonded kernel.
 283  */
 284 void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t        cu_nb,
 285                               const nbnxn_atomdata_t *nbatom,
 286                               int                     flags,
 287                               int                     iloc)
 288 {
 289     cudaError_t          stat;
 290     int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 291     /* CUDA kernel launch-related stuff */
 292     int                  shmem, nblock;
 293     dim3                 dim_block, dim_grid;
 294     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 295
 296     cu_atomdata_t       *adat    = cu_nb->atdat;
 297     cu_nbparam_t        *nbp     = cu_nb->nbparam;
 298     cu_plist_t          *plist   = cu_nb->plist[iloc];
 299     cu_timers_t         *t       = cu_nb->timers;
 300     cudaStream_t         stream  = cu_nb->stream[iloc];
 301
 302     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
 303     bool                 bCalcFshift = flags & GMX_FORCE_VIRIAL;
 304     bool                 bDoTime     = cu_nb->bDoTime;
 305
 306     /* turn energy calculation always on/off (for debugging/testing only) */
 307     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 308
 309     /* Don't launch the non-local kernel if there is no work to do.
 310        Doing the same for the local kernel is more complicated, since the
 311        local part of the force array also depends on the non-local kernel.
 312        So to avoid complicating the code and to reduce the risk of bugs,
 313        we always call the local kernel, the local x+q copy and later (not in
 314        this function) the stream wait, local f copyback and the f buffer
 315        clearing. All these operations, except for the local interaction kernel,
 316        are needed for the non-local interactions. The skip of the local kernel
 317        call is taken care of later in this function. */
 318     if (iloc == eintNonlocal && plist->nsci == 0)
 319     {
 320         return;
 321     }
 322
 323     /* calculate the atom data index range based on locality */
 324     if (LOCAL_I(iloc))
 325     {
 326         adat_begin  = 0;
 327         adat_len    = adat->natoms_local;
 328     }
 329     else
 330     {
 331         adat_begin  = adat->natoms_local;
 332         adat_len    = adat->natoms - adat->natoms_local;
 333     }
 334
 335     /* beginning of timed HtoD section */
 336     if (bDoTime)
 337     {
 338         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 339         CU_RET_ERR(stat, "cudaEventRecord failed");
 340     }
 341
 342     /* HtoD x, q */
 343     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 344                       adat_len * sizeof(*adat->xq), stream);
 345
 346     /* When we get here all misc operations issues in the local stream as well as
 347        the local xq H2D are done,
 348        so we record that in the local stream and wait for it in the nonlocal one. */
 349     if (cu_nb->bUseTwoStreams)
 350     {
 351         if (iloc == eintLocal)
 352         {
 353             stat = cudaEventRecord(cu_nb->misc_ops_and_local_H2D_done, stream);
 354             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
 355         }
 356         else
 357         {
 358             stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_and_local_H2D_done, 0);
 359             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
 360         }
 361     }
 362
 363     if (bDoTime)
 364     {
 365         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 366         CU_RET_ERR(stat, "cudaEventRecord failed");
 367     }
 368
 369     if (plist->nsci == 0)
 370     {
 371         /* Don't launch an empty local kernel (not allowed with CUDA) */
 372         return;
 373     }
 374
 375     /* beginning of timed nonbonded calculation section */
 376     if (bDoTime)
 377     {
 378         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 379         CU_RET_ERR(stat, "cudaEventRecord failed");
 380     }
 381
 382     /* get the pointer to the kernel flavor we need to use */
 383     nb_kernel = select_nbnxn_kernel(nbp->eeltype,
 384                                     nbp->vdwtype,
 385                                     bCalcEner,
 386                                     plist->bDoPrune || always_prune);
 387
 388     /* kernel launch config */
 389     nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 390     dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 391     dim_grid  = dim3(nblock, 1, 1);
 392     shmem     = calc_shmem_required();
 393
 394     if (debug)
 395     {
 396         fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 397                 "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 398                 dim_block.x, dim_block.y, dim_block.z,
 399                 dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 400                 NCL_PER_SUPERCL, plist->na_c);
 401     }
 402
 403     nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
 404     CU_LAUNCH_ERR("k_calc_nb");
 405
 406     if (bDoTime)
 407     {
 408         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 409         CU_RET_ERR(stat, "cudaEventRecord failed");
 410     }
 411 }
 412
 413 void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t        cu_nb,
 414                                const nbnxn_atomdata_t *nbatom,
 415                                int                     flags,
 416                                int                     aloc)
 417 {
 418     cudaError_t stat;
 419     int         adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
 420     int         iloc = -1;
 421
 422     /* determine interaction locality from atom locality */
 423     if (LOCAL_A(aloc))
 424     {
 425         iloc = eintLocal;
 426     }
 427     else if (NONLOCAL_A(aloc))
 428     {
 429         iloc = eintNonlocal;
 430     }
 431     else
 432     {
 433         char stmp[STRLEN];
 434         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 435                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 436         gmx_incons(stmp);
 437     }
 438
 439     cu_atomdata_t   *adat    = cu_nb->atdat;
 440     cu_timers_t     *t       = cu_nb->timers;
 441     bool             bDoTime = cu_nb->bDoTime;
 442     cudaStream_t     stream  = cu_nb->stream[iloc];
 443
 444     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 445     bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
 446
 447     /* don't launch non-local copy-back if there was no non-local work to do */
 448     if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
 449     {
 450         return;
 451     }
 452
 453     /* calculate the atom data index range based on locality */
 454     if (LOCAL_A(aloc))
 455     {
 456         adat_begin  = 0;
 457         adat_len    = adat->natoms_local;
 458         adat_end    = cu_nb->atdat->natoms_local;
 459     }
 460     else
 461     {
 462         adat_begin  = adat->natoms_local;
 463         adat_len    = adat->natoms - adat->natoms_local;
 464         adat_end    = cu_nb->atdat->natoms;
 465     }
 466
 467     /* beginning of timed D2H section */
 468     if (bDoTime)
 469     {
 470         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 471         CU_RET_ERR(stat, "cudaEventRecord failed");
 472     }
 473
 474     if (!cu_nb->bUseStreamSync)
 475     {
 476         /* For safety reasons set a few (5%) forces to NaN. This way even if the
 477            polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 478         for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 479         {
 480 #ifdef NAN
 481             nbatom->out[0].f[i] = NAN;
 482 #else
 483 #  ifdef _MSVC
 484             if (numeric_limits<float>::has_quiet_NaN)
 485             {
 486                 nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 487             }
 488             else
 489 #  endif
 490             {
 491                 nbatom->out[0].f[i] = GMX_REAL_MAX;
 492             }
 493 #endif
 494         }
 495
 496         /* Set the last four bytes of the force array to a bit pattern
 497            which can't be the result of the force calculation:
 498            max exponent (127) and zero mantissa. */
 499         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 500     }
 501
 502     /* With DD the local D2H transfer can only start after the non-local
 503        kernel has finished. */
 504     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 505     {
 506         stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 507         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 508     }
 509
 510     /* DtoH f */
 511     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 512                       (adat_len)*sizeof(*adat->f), stream);
 513
 514     /* After the non-local D2H is launched the nonlocal_done event can be
 515        recorded which signals that the local D2H can proceed. This event is not
 516        placed after the non-local kernel because we want the non-local data
 517        back first. */
 518     if (iloc == eintNonlocal)
 519     {
 520         stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 521         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 522     }
 523
 524     /* only transfer energies in the local stream */
 525     if (LOCAL_I(iloc))
 526     {
 527         /* DtoH fshift */
 528         if (bCalcFshift)
 529         {
 530             cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 531                               SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 532         }
 533
 534         /* DtoH energies */
 535         if (bCalcEner)
 536         {
 537             cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 538                               sizeof(*cu_nb->nbst.e_lj), stream);
 539             cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 540                               sizeof(*cu_nb->nbst.e_el), stream);
 541         }
 542     }
 543
 544     if (bDoTime)
 545     {
 546         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 547         CU_RET_ERR(stat, "cudaEventRecord failed");
 548     }
 549 }
 550
 551 /* Atomic compare-exchange operation on unsigned values. It is used in
 552  * polling wait for the GPU.
 553  */
 554 static inline bool atomic_cas(volatile unsigned int *ptr,
 555                               unsigned int           oldval,
 556                               unsigned int           newval)
 557 {
 558     assert(ptr);
 559
 560 #ifdef TMPI_ATOMICS
 561     return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 562 #else
 563     gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 564     return true;
 565 #endif
 566 }
 567
 568 void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 569                          const nbnxn_atomdata_t *nbatom,
 570                          int flags, int aloc,
 571                          real *e_lj, real *e_el, rvec *fshift)
 572 {
 573     /* NOTE:  only implemented for single-precision at this time */
 574     cudaError_t            stat;
 575     int                    i, adat_end, iloc = -1;
 576     volatile unsigned int *poll_word;
 577
 578     /* determine interaction locality from atom locality */
 579     if (LOCAL_A(aloc))
 580     {
 581         iloc = eintLocal;
 582     }
 583     else if (NONLOCAL_A(aloc))
 584     {
 585         iloc = eintNonlocal;
 586     }
 587     else
 588     {
 589         char stmp[STRLEN];
 590         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 591                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 592         gmx_incons(stmp);
 593     }
 594
 595     cu_plist_t      *plist    = cu_nb->plist[iloc];
 596     cu_timers_t     *timers   = cu_nb->timers;
 597     wallclock_gpu_t *timings  = cu_nb->timings;
 598     nb_staging       nbst     = cu_nb->nbst;
 599
 600     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
 601     bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
 602
 603     /* turn energy calculation always on/off (for debugging/testing only) */
 604     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 605
 606     /* Launch wait/update timers & counters, unless doing the non-local phase
 607        when there is not actually work to do. This is consistent with
 608        nbnxn_cuda_launch_kernel.
 609
 610        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 611        counters could end up being inconsistent due to not being incremented
 612        on some of the nodes! */
 613     if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
 614     {
 615         return;
 616     }
 617
 618     /* calculate the atom data index range based on locality */
 619     if (LOCAL_A(aloc))
 620     {
 621         adat_end = cu_nb->atdat->natoms_local;
 622     }
 623     else
 624     {
 625         adat_end = cu_nb->atdat->natoms;
 626     }
 627
 628     if (cu_nb->bUseStreamSync)
 629     {
 630         stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 631         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 632     }
 633     else
 634     {
 635         /* Busy-wait until we get the signal pattern set in last byte
 636          * of the l/nl float vector. This pattern corresponds to a floating
 637          * point number which can't be the result of the force calculation
 638          * (maximum, 127 exponent and 0 mantissa).
 639          * The polling uses atomic compare-exchange.
 640          */
 641         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 642         while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern))
 643         {
 644         }
 645     }
 646
 647     /* timing data accumulation */
 648     if (cu_nb->bDoTime)
 649     {
 650         /* only increase counter once (at local F wait) */
 651         if (LOCAL_I(iloc))
 652         {
 653             timings->nb_c++;
 654             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 655         }
 656
 657         /* kernel timings */
 658         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 659             cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 660
 661         /* X/q H2D and F D2H timings */
 662         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 663                                               timers->stop_nb_h2d[iloc]);
 664         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 665                                               timers->stop_nb_d2h[iloc]);
 666
 667         /* only count atdat and pair-list H2D at pair-search step */
 668         if (plist->bDoPrune)
 669         {
 670             /* atdat transfer timing (add only once, at local F wait) */
 671             if (LOCAL_A(aloc))
 672             {
 673                 timings->pl_h2d_c++;
 674                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 675                                                       timers->stop_atdat);
 676             }
 677
 678             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 679                                                   timers->stop_pl_h2d[iloc]);
 680         }
 681     }
 682
 683     /* add up energies and shift forces (only once at local F wait) */
 684     if (LOCAL_I(iloc))
 685     {
 686         if (bCalcEner)
 687         {
 688             *e_lj += *nbst.e_lj;
 689             *e_el += *nbst.e_el;
 690         }
 691
 692         if (bCalcFshift)
 693         {
 694             for (i = 0; i < SHIFTS; i++)
 695             {
 696                 fshift[i][0] += nbst.fshift[i].x;
 697                 fshift[i][1] += nbst.fshift[i].y;
 698                 fshift[i][2] += nbst.fshift[i].z;
 699             }
 700         }
 701     }
 702
 703     /* turn off pruning (doesn't matter if this is pair-search step or not) */
 704     plist->bDoPrune = false;
 705 }
 706
 707 /*! Return the reference to the nbfp texture. */
 708 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
 709 {
 710     return nbfp_texref;
 711 }
 712
 713 /*! Return the reference to the nbfp_comb texture. */
 714 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref()
 715 {
 716     return nbfp_comb_texref;
 717 }
 718
 719 /*! Return the reference to the coulomb_tab. */
 720 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
 721 {
 722     return coulomb_tab_texref;
 723 }
 724
 725 /*! Set up the cache configuration for the non-bonded kernels,
 726  */
 727 void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 728 {
 729     cudaError_t stat;
 730
 731     for (int i = 0; i < eelCuNR; i++)
 732     {
 733         for (int j = 0; j < evdwCuNR; j++)
 734         {
 735             if (devinfo->prop.major >= 3)
 736             {
 737                 /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 738                 stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
 739                 stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 740                 stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
 741                 stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 742             }
 743             else
 744             {
 745                 /* On Fermi prefer L1 gives 2% higher performance */
 746                 /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 747                 stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
 748                 stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 749                 stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
 750                 stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 751             }
 752             CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 753         }
 754     }
 755 }