/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 *  \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
 *
 *  \author Szilard Pall <pall.szilard@gmail.com>
 */
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_profiler_api.h>
#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
#include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
#include "gromacs/legacyheaders/gmx_detect_hardware.h"
#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/legacyheaders/types/enums.h"
#include "gromacs/legacyheaders/types/force_flags.h"
#include "gromacs/legacyheaders/types/interaction_const.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"

#include "nbnxn_cuda_types.h"
static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */

/* This is a heuristically determined parameter for the Fermi, Kepler
 * and Maxwell architectures: the minimum size of ci lists is obtained by
 * multiplying this constant by the # of multiprocessors on the current device.
 * Since the maximum number of resident blocks per multiprocessor is 16, the
 * ideal total count for small systems is 32 or 48 blocks per multiprocessor.
 * Because the generated block counts fluctuate a bit, we use a target of 44
 * instead of the ideal value of 48.
 */
static unsigned int gpu_min_ci_balanced_factor = 44;
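
/* For instance, on a device with 16 multiprocessors,
 * nbnxn_gpu_min_ci_balanced() (defined at the end of this file) returns
 * 44*16 = 704, i.e. the pair search is asked to produce at least ~704 ci
 * list entries so that every multiprocessor stays busy.
 */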

/* Functions from nbnxn_cuda.cu */
extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
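
/* These texture references are the legacy fetch path for the non-bonded
 * parameter and Ewald tables, used on devices with CC < 3.0; on CC >= 3.0
 * the code below creates texture objects instead (see use_texobj()).
 */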

/* We should actually be using md_print_warn in md_logging.c,
 * but we can't include mpi.h in CUDA code.
 */
static void md_print_warn(FILE       *fplog,
                          const char *fmt, ...)
{
    va_list ap;

    if (fplog != NULL)
    {
        /* We should only print to stderr on the master node;
         * in most cases fplog is only set on the master node, so this works. */
        va_start(ap, fmt);
        fprintf(stderr, "\n");
        vfprintf(stderr, fmt, ap);
        fprintf(stderr, "\n");
        va_end(ap);

        va_start(ap, fmt);
        fprintf(fplog, "\n");
        vfprintf(fplog, fmt, ap);
        fprintf(fplog, "\n");
        va_end(ap);
    }
}

/* Forward declarations */
static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);

static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
                                          const gmx_device_info_t *dev_info);

#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
static bool use_texobj(const gmx_device_info_t *dev_info)
{
    /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
    return (dev_info->prop.major >= 3);
}
#endif /* HAVE_CUDA_TEXOBJ_SUPPORT */

/*! Tabulates the Ewald Coulomb force and initializes the size/scale
    and the table GPU array. If called with an already allocated table,
    it just re-uploads the table.
 */
static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
                                           cu_nbparam_t              *nbp,
                                           const gmx_device_info_t   *dev_info)
{
    float       *coul_tab;
    cudaError_t  stat;

    if (nbp->coulomb_tab != NULL)
    {
        nbnxn_cuda_free_nbparam_table(nbp, dev_info);
    }

    stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab));
    CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");

    nbp->coulomb_tab = coul_tab;

#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
    /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
    if (use_texobj(dev_info))
    {
        cudaResourceDesc rd;
        memset(&rd, 0, sizeof(rd));
        rd.resType                = cudaResourceTypeLinear;
        rd.res.linear.devPtr      = nbp->coulomb_tab;
        rd.res.linear.desc.f      = cudaChannelFormatKindFloat;
        rd.res.linear.desc.x      = 32;
        rd.res.linear.sizeInBytes = ic->tabq_size*sizeof(*coul_tab);

        cudaTextureDesc td;
        memset(&td, 0, sizeof(td));
        td.readMode = cudaReadModeElementType;
        stat        = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
        CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
    }
    else
#endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
    {
        GMX_UNUSED_VALUE(dev_info);
        cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
                               coul_tab, &cd,
                               ic->tabq_size*sizeof(*coul_tab));
        CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
    }

    cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab));

    nbp->coulomb_tab_size  = ic->tabq_size;
    nbp->coulomb_tab_scale = ic->tabq_scale;
}
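
/* Note: the tabulated-Ewald kernels fetch this table through
 * coulomb_tab_texobj (or coulomb_tab_texref on CC < 3.0);
 * coulomb_tab_scale is stored alongside so the kernels can convert
 * distances into table indices.
 */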

/*! Initializes the atomdata structure the first time; it only gets filled
    at pair-search. */
static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
{
    cudaError_t stat;

    ad->ntypes = ntypes;
    stat       = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
    CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
    ad->bShiftVecUploaded = false;

    stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
    CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");

    stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
    stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");

    /* initialize to NULL pointers to data that is not allocated here and will
       need reallocation in nbnxn_cuda_init_atomdata */
    ad->xq = NULL;
    ad->f  = NULL;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    ad->natoms = -1;
    ad->nalloc = -1;
}

/*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
    earlier GPUs, single or twin cut-off. */
static int pick_ewald_kernel_type(bool                     bTwinCut,
                                  const gmx_device_info_t *dev_info)
{
    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
    int  kernel_type;

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
    bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);

    if (bForceAnalyticalEwald && bForceTabulatedEwald)
    {
        gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
                   "requested through environment variables.");
    }

    /* By default, use analytical Ewald on SM 3.0 and later, tabulated on earlier GPUs. */
    if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
    {
        bUseAnalyticalEwald = true;

        if (debug)
        {
            fprintf(debug, "Using analytical Ewald CUDA kernels\n");
        }
    }
    else
    {
        bUseAnalyticalEwald = false;

        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
    {
        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
    }
    else
    {
        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
    }

    return kernel_type;
}
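
/* In short, the selection above yields one of four kernel flavors:
 *   eelCuEWALD_ANA / eelCuEWALD_TAB           - single cut-off (rcoulomb == rvdw)
 *   eelCuEWALD_ANA_TWIN / eelCuEWALD_TAB_TWIN - twin cut-off (rcoulomb != rvdw)
 */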

/*! Copies all parameters related to the cut-off from ic to nbp */
static void set_cutoff_parameters(cu_nbparam_t              *nbp,
                                  const interaction_const_t *ic)
{
    nbp->ewald_beta  = ic->ewaldcoeff_q;
    nbp->sh_ewald    = ic->sh_ewald;
    nbp->epsfac      = ic->epsfac;
    nbp->two_k_rf    = 2.0 * ic->k_rf;
    nbp->c_rf        = ic->c_rf;
    nbp->rvdw_sq     = ic->rvdw * ic->rvdw;
    nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
    nbp->rlist_sq    = ic->rlist * ic->rlist;

    nbp->sh_lj_ewald   = ic->sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;

    nbp->rvdw_switch      = ic->rvdw_switch;
    nbp->dispersion_shift = ic->dispersion_shift;
    nbp->repulsion_shift  = ic->repulsion_shift;
    nbp->vdw_switch       = ic->vdw_switch;
}

/*! Initializes the nonbonded parameter data structure. */
static void init_nbparam(cu_nbparam_t              *nbp,
                         const interaction_const_t *ic,
                         const nbnxn_atomdata_t    *nbat,
                         const gmx_device_info_t   *dev_info)
{
    cudaError_t stat;
    int         ntypes, nnbfp, nnbfp_comb;

    ntypes = nbat->ntype;

    set_cutoff_parameters(nbp, ic);

    if (ic->vdwtype == evdwCUT)
    {
        switch (ic->vdw_modifier)
        {
            case eintmodNONE:
            case eintmodPOTSHIFT:
                nbp->vdwtype = evdwCuCUT;
                break;
            case eintmodFORCESWITCH:
                nbp->vdwtype = evdwCuFSWITCH;
                break;
            case eintmodPOTSWITCH:
                nbp->vdwtype = evdwCuPSWITCH;
                break;
            default:
                gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
                break;
        }
    }
    else if (ic->vdwtype == evdwPME)
    {
        if (ic->ljpme_comb_rule == ljcrGEOM)
        {
            assert(nbat->comb_rule == ljcrGEOM);
            nbp->vdwtype = evdwCuEWALDGEOM;
        }
        else
        {
            assert(nbat->comb_rule == ljcrLB);
            nbp->vdwtype = evdwCuEWALDLB;
        }
    }
    else
    {
        gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
    }

    if (ic->eeltype == eelCUT)
    {
        nbp->eeltype = eelCuCUT;
    }
    else if (EEL_RF(ic->eeltype))
    {
        nbp->eeltype = eelCuRF;
    }
    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
    {
        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
        nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing the Verlet scheme */
        gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
    }

    /* generate table for PME */
    nbp->coulomb_tab = NULL;
    if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
    {
        init_ewald_coulomb_force_table(ic, nbp, dev_info);
    }

    nnbfp      = 2*ntypes*ntypes;
    nnbfp_comb = 2*ntypes;
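
    /* The factor of 2 reflects the table layouts: nbfp is a flattened
     * ntypes x ntypes matrix of (C6, C12) LJ parameter pairs, and nbfp_comb
     * holds one (C6, C12) pair per atom type for the LJ-PME
     * combination-rule kernels.
     */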
    stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
    CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
    cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));

    if (ic->vdwtype == evdwPME)
    {
        stat = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
        CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
        cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
    }

#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
    /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
    if (use_texobj(dev_info))
    {
        cudaResourceDesc rd;
        cudaTextureDesc  td;

        memset(&rd, 0, sizeof(rd));
        rd.resType                = cudaResourceTypeLinear;
        rd.res.linear.devPtr      = nbp->nbfp;
        rd.res.linear.desc.f      = cudaChannelFormatKindFloat;
        rd.res.linear.desc.x      = 32;
        rd.res.linear.sizeInBytes = nnbfp*sizeof(*nbp->nbfp);

        memset(&td, 0, sizeof(td));
        td.readMode = cudaReadModeElementType;
        stat        = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
        CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");

        if (ic->vdwtype == evdwPME)
        {
            memset(&rd, 0, sizeof(rd));
            rd.resType                = cudaResourceTypeLinear;
            rd.res.linear.devPtr      = nbp->nbfp_comb;
            rd.res.linear.desc.f      = cudaChannelFormatKindFloat;
            rd.res.linear.desc.x      = 32;
            rd.res.linear.sizeInBytes = nnbfp_comb*sizeof(*nbp->nbfp_comb);

            memset(&td, 0, sizeof(td));
            td.readMode = cudaReadModeElementType;
            stat        = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
            CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
        }
    }
    else
#endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
    {
        cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
                               nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
        CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");

        if (ic->vdwtype == evdwPME)
        {
            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
                                   nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
            CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
        }
    }
}

/*! Re-generates the GPU Ewald force table, resets rlist, and updates the
 *  electrostatics type, switching to twin cut-off (or back) if needed. */
void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t  *nbv,
                                        const interaction_const_t *ic)
{
    if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
    {
        return;
    }
    gmx_nbnxn_cuda_t *nb  = nbv->gpu_nbv;
    cu_nbparam_t     *nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic);

    nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
                                          nb->dev_info);

    init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
}
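
/* PME load balancing scales rcoulomb (together with the Ewald coefficient)
 * while rvdw stays fixed, so the kernel flavor has to be re-picked here with
 * bTwinCut = (ic->rcoulomb != ic->rvdw) and the force table re-tabulated
 * for the new coefficient.
 */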

/*! Initializes the pair list data structure. */
static void init_plist(cu_plist_t *pl)
{
    /* initialize to NULL pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci  = NULL;
    pl->cj4  = NULL;
    pl->excl = NULL;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c        = -1;
    pl->nsci        = -1;
    pl->sci_nalloc  = -1;
    pl->ncj4        = -1;
    pl->cj4_nalloc  = -1;
    pl->nexcl       = -1;
    pl->excl_nalloc = -1;
    pl->bDoPrune    = false;
}

/*! Initializes the timer data structure. */
static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
{
    cudaError_t stat;
    int         eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );

    stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
    CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
    stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
    CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");

    /* The non-local counters/stream (second in the array) are needed only with DD. */
    for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
    {
        stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
        stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");

        stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
        stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");

        stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
        stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");

        stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
        stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
    }
}

/*! Initializes the timings data structure. */
static void init_timings(gmx_wallclock_gpu_t *t)
{
    int i, j;

    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;

    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
}

void nbnxn_gpu_init(FILE                 *fplog,
                    gmx_nbnxn_cuda_t    **p_nb,
                    const gmx_gpu_info_t *gpu_info,
                    const gmx_gpu_opt_t  *gpu_opt,
                    int                   my_gpu_index,
                    gmx_bool              bLocalAndNonlocal)
{
    cudaError_t       stat;
    gmx_nbnxn_cuda_t *nb;
    char              sbuf[STRLEN];
    bool              bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
    int               cuda_drv_ver;

    assert(gpu_info);

    if (p_nb == NULL)
    {
        return;
    }

    snew(nb, 1);
    snew(nb->atdat, 1);
    snew(nb->nbparam, 1);
    snew(nb->plist[eintLocal], 1);
    if (bLocalAndNonlocal)
    {
        snew(nb->plist[eintNonlocal], 1);
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    snew(nb->timers, 1);
    snew(nb->timings, 1);

    /* init nbst */
    pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
    pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
    pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
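
    /* nbst is the host-side staging buffer for energies and shift forces;
     * pmalloc (pmalloc_cuda.h) returns page-locked memory so that the
     * device-to-host copies of these results can run asynchronously.
     */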

    init_plist(nb->plist[eintLocal]);

    /* set device info, just point it to the right GPU among the detected ones */
    nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];

    /* local/non-local GPU streams */
    stat = cudaStreamCreate(&nb->stream[eintLocal]);
    CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[eintNonlocal]);

        /* CUDA stream priority available in the CUDA RT 5.5 API.
         * Note that the device we're running on does not have to support
         * priorities, because we are querying the priority range, which in
         * that case will be a single value.
         */
#if GMX_CUDA_VERSION >= 5050
        {
            int highest_priority;
            stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
            CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");

            stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
                                                cudaStreamDefault,
                                                highest_priority);
            CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
        }
#else
        stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
        CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
#endif
    }
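
    /* Creating the non-local stream at the highest available priority is
     * intended to let non-local kernels and transfers be scheduled ahead of
     * the (typically longer) local work, so that the halo results needed for
     * domain-decomposition communication are ready as early as possible.
     */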

    /* init events for synchronization (timing disabled for performance reasons!) */
    stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
    CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
    stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_done failed");

    /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
     * (which increases with shorter time/step) caused by a known CUDA driver bug.
     * To work around the issue we'll use an (admittedly fragile) memory polling
     * wait to preserve performance. This requires support for atomic
     * operations and only works on x86/x86_64.
     * With polling wait, event timing also needs to be disabled.
     *
     * The overhead is greatly reduced in API v5.0 drivers and the improvement
     * is independent of runtime version. Hence, with API v5.0 drivers and later
     * we won't switch to polling.
     *
     * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI
     * ranks, so we will also disable it in that case.
     */

    bStreamSync   = getenv("GMX_CUDA_STREAMSYNC") != NULL;
    bNoStreamSync = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;

#ifdef TMPI_ATOMICS
    bTMPIAtomics = true;
#else
    bTMPIAtomics = false;
#endif

#ifdef GMX_TARGET_X86
    bX86 = true;
#else
    bX86 = false;
#endif

    if (bStreamSync && bNoStreamSync)
    {
        gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
    }

    stat = cudaDriverGetVersion(&cuda_drv_ver);
    CU_RET_ERR(stat, "cudaDriverGetVersion failed");

    bOldDriver = (cuda_drv_ver < 5000);

    if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
    {
        /* Polling wait should be used instead of cudaStreamSynchronize only if:
         *   - ECC is ON & driver is old (checked above),
         *   - we're on x86/x86_64,
         *   - atomics are available, and
         *   - GPUs are not being shared.
         */
        bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
                                   (gmx_count_gpu_dev_shared(gpu_opt) < 1));

        if (bStreamSync)
        {
            nb->bUseStreamSync = true;

            /* only warn if polling should be used */
            if (bShouldUsePollSync)
            {
                md_print_warn(fplog,
                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
                              "      cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
            }
        }
        else
        {
            nb->bUseStreamSync = !bShouldUsePollSync;

            if (bShouldUsePollSync)
            {
                md_print_warn(fplog,
                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
                              "      cause performance loss. Switching to the alternative polling GPU wait.\n"
                              "      If you encounter issues, switch back to standard GPU waiting by setting\n"
                              "      the GMX_CUDA_STREAMSYNC environment variable.\n");
            }
            else
            {
                /* Tell the user that the ECC+old driver combination can be bad */
                sprintf(sbuf,
                        "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
                        "      A known bug in this driver version can cause performance loss.\n"
                        "      However, the polling wait workaround can not be used because\n%s\n"
                        "      Consider updating the driver or turning ECC off.",
                        (bX86 && bTMPIAtomics) ?
                        "      GPU(s) are being oversubscribed." :
                        "      atomic operations are not supported by the platform/CPU+compiler.");
                md_print_warn(fplog, sbuf);
            }
        }
    }
    else
    {
        if (bNoStreamSync)
        {
            nb->bUseStreamSync = false;

            md_print_warn(fplog,
                          "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
        }
        else
        {
            /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
            nb->bUseStreamSync = true;
        }
    }

    /* CUDA timing disabled as event timers don't work:
       - with multiple streams = domain-decomposition;
       - with the polling waiting hack (without cudaStreamSynchronize);
       - when turned off by GMX_DISABLE_CUDA_TIMING.
     */
    nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
                   (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));

    if (nb->bDoTime)
    {
        init_timers(nb->timers, nb->bUseTwoStreams);
        init_timings(nb->timings);
    }

    /* set the kernel type for the current GPU */
    /* pick L1 cache configuration */
    nbnxn_cuda_set_cacheconfig(nb->dev_info);

    *p_nb = nb;

    if (debug)
    {
        fprintf(debug, "Initialized CUDA data structures.\n");
    }
}

void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t               *nb,
                          const interaction_const_t      *ic,
                          const nonbonded_verlet_group_t *nbv_group)
{
    init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);

    /* clear energy and shift force outputs */
    nbnxn_cuda_clear_e_fshift(nb);
}

void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
                             const nbnxn_pairlist_t *h_plist,
                             int                     iloc)
{
    char          sbuf[STRLEN];
    cudaError_t   stat;
    bool          bDoTime = nb->bDoTime;
    cudaStream_t  stream  = nb->stream[iloc];
    cu_plist_t   *d_plist = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c, h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    if (bDoTime)
    {
        stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
                        &d_plist->nsci, &d_plist->sci_nalloc,
                        h_plist->nsci,
                        stream, true);

    cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
                        &d_plist->ncj4, &d_plist->cj4_nalloc,
                        h_plist->ncj4,
                        stream, true);

    cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
                        &d_plist->nexcl, &d_plist->excl_nalloc,
                        h_plist->nexcl,
                        stream, true);

    if (bDoTime)
    {
        stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    /* need to prune the pair list during the next step */
    d_plist->bDoPrune = true;
}
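
/* A note on cu_realloc_buffered (cudautils.cuh) as used above: it copies the
 * host data to the device buffer, first growing the device allocation
 * whenever the requested count exceeds the stored *_nalloc capacity; the
 * trailing (stream, true) arguments make the H2D copies asynchronous in the
 * given stream.
 */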

void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t       *nb,
                               const nbnxn_atomdata_t *nbatom)
{
    cu_atomdata_t *adat = nb->atdat;
    cudaStream_t   ls   = nb->stream[eintLocal];

    /* only if we have a dynamic box */
    if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
    {
        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
                          SHIFTS * sizeof(*adat->shift_vec), ls);
        adat->bShiftVecUploaded = true;
    }
}

/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
{
    cudaError_t    stat;
    cu_atomdata_t *adat = nb->atdat;
    cudaStream_t   ls   = nb->stream[eintLocal];

    stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
    CU_RET_ERR(stat, "cudaMemsetAsync on f failed");
}

/*! Clears nonbonded shift force output array and energy outputs on the GPU. */
static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
{
    cudaError_t    stat;
    cu_atomdata_t *adat = nb->atdat;
    cudaStream_t   ls   = nb->stream[eintLocal];

    stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
    CU_RET_ERR(stat, "cudaMemsetAsync on fshift failed");
    stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
    CU_RET_ERR(stat, "cudaMemsetAsync on e_lj failed");
    stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
    CU_RET_ERR(stat, "cudaMemsetAsync on e_el failed");
}

void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
{
    nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
    /* clear shift force array and energies if the outputs were
       used in the current step */
    if (flags & GMX_FORCE_VIRIAL)
    {
        nbnxn_cuda_clear_e_fshift(nb);
    }
}

void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
                             const struct nbnxn_atomdata_t *nbat)
{
    cudaError_t    stat;
    int            nalloc, natoms;
    bool           realloced;
    bool           bDoTime  = nb->bDoTime;
    cu_timers_t   *timers   = nb->timers;
    cu_atomdata_t *d_atdat  = nb->atdat;
    cudaStream_t   ls       = nb->stream[eintLocal];

    natoms    = nbat->natoms;
    realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        stat = cudaEventRecord(timers->start_atdat, ls);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. d_atdat->natoms == -1 */
    if (natoms > d_atdat->nalloc)
    {
        nalloc = over_alloc_small(natoms);

        /* free up first if the arrays have already been initialized */
        if (d_atdat->nalloc != -1)
        {
            cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
            cu_free_buffered(d_atdat->xq);
            cu_free_buffered(d_atdat->atom_types);
        }

        stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
        stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");

        stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");

        d_atdat->nalloc = nalloc;
        realloced       = true;
    }

    d_atdat->natoms       = natoms;
    d_atdat->natoms_local = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        nbnxn_cuda_clear_f(nb, nalloc);
    }

    cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
                      natoms*sizeof(*d_atdat->atom_types), ls);

    if (bDoTime)
    {
        stat = cudaEventRecord(timers->stop_atdat, ls);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }
}
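
/* A sketch of the reallocation policy above: over_alloc_small() requests
 * somewhat more space than currently needed (roughly a 20% margin), so that
 * small fluctuations in the local atom count under domain decomposition do
 * not trigger a free/malloc cycle at every repartitioning.
 */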

static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
                                          const gmx_device_info_t *dev_info)
{
    cudaError_t stat;

    if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
    {
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
        /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
        if (use_texobj(dev_info))
        {
            stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
            CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
        }
        else
#endif
        {
            GMX_UNUSED_VALUE(dev_info);
            stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
            CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
        }

        cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
    }
}

void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
{
    cudaError_t      stat;
    cu_atomdata_t   *atdat;
    cu_nbparam_t    *nbparam;
    cu_plist_t      *plist, *plist_nl;
    cu_timers_t     *timers;

    /* Stopping the nvidia profiler here allows us to eliminate the subsequent
       uninitialization API calls from the trace. */
    if (getenv("NVPROF_ID") != NULL)
    {
        stat = cudaProfilerStop();
        CU_RET_ERR(stat, "cudaProfilerStop failed");
    }

    if (nb == NULL)
    {
        return;
    }

    atdat    = nb->atdat;
    nbparam  = nb->nbparam;
    plist    = nb->plist[eintLocal];
    plist_nl = nb->plist[eintNonlocal];
    timers   = nb->timers;

    nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);

    stat = cudaEventDestroy(nb->nonlocal_done);
    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
    stat = cudaEventDestroy(nb->misc_ops_done);
    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");

    if (nb->bDoTime)
    {
        stat = cudaEventDestroy(timers->start_atdat);
        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
        stat = cudaEventDestroy(timers->stop_atdat);
        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");

        /* The non-local counters/stream (second in the array) are needed only with DD. */
        for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
        {
            stat = cudaEventDestroy(timers->start_nb_k[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
            stat = cudaEventDestroy(timers->stop_nb_k[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");

            stat = cudaEventDestroy(timers->start_pl_h2d[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
            stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");

            stat = cudaStreamDestroy(nb->stream[i]);
            CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");

            stat = cudaEventDestroy(timers->start_nb_h2d[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
            stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");

            stat = cudaEventDestroy(timers->start_nb_d2h[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
            stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
        }
    }

#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
    /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
    if (use_texobj(nb->dev_info))
    {
        stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
        CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
    }
    else
#endif
    {
        stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
        CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
    }
    cu_free_buffered(nbparam->nbfp);

    if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
    {
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
        /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
        if (use_texobj(nb->dev_info))
        {
            stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
            CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
        }
        else
#endif
        {
            stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
            CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
        }
        cu_free_buffered(nbparam->nbfp_comb);
    }

    stat = cudaFree(atdat->shift_vec);
    CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
    stat = cudaFree(atdat->fshift);
    CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");

    stat = cudaFree(atdat->e_lj);
    CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
    stat = cudaFree(atdat->e_el);
    CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");

    cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
    cu_free_buffered(atdat->xq);
    cu_free_buffered(atdat->atom_types, &atdat->ntypes);

    cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
    cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
    cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
    if (nb->bUseTwoStreams)
    {
        cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
        cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
        cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist_nl->excl_nalloc);
    }

    sfree(atdat);
    sfree(nbparam);
    sfree(plist);
    if (nb->bUseTwoStreams)
    {
        sfree(plist_nl);
    }
    sfree(timers);
    sfree(nb->timings);
    sfree(nb);

    if (debug)
    {
        fprintf(debug, "Cleaned up CUDA data structures.\n");
    }
}

void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
{
    cudaError_t  stat;
    cudaStream_t stream = nb->stream[iloc];

    stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
    CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
}
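
/* This enqueues a cross-stream dependency: work subsequently submitted to
 * stream[iloc] will not start on the device until the atomdata H2D copy,
 * recorded as the stop_atdat event on the local stream, has completed.
 */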

gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
{
    return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
}

void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    /* The NVPROF_ID environment variable is set by nvprof and indicates that
       mdrun is executed in the CUDA profiler.
       If nvprof was run with "--profile-from-start off", the profiler will
       be started here. This way we can avoid tracing the CUDA events from the
       first part of the run. Starting the profiler again does nothing.
     */
    if (getenv("NVPROF_ID") != NULL)
    {
        cudaError_t stat;
        stat = cudaProfilerStart();
        CU_RET_ERR(stat, "cudaProfilerStart failed");
    }

    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
{
    return nb != NULL ?
           gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
}

gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
{
    return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
            (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
}