src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40 #include "gmxpre.h"
  41
  42 #include "config.h"
  43
  44 #include <assert.h>
  45 #include <stdarg.h>
  46 #include <stdio.h>
  47 #include <stdlib.h>
  48
  49 #include <cuda_profiler_api.h>
  50
  51 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
  52 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
  53 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
  54 #include "gromacs/legacyheaders/gmx_detect_hardware.h"
  55 #include "gromacs/legacyheaders/tables.h"
  56 #include "gromacs/legacyheaders/typedefs.h"
  57 #include "gromacs/legacyheaders/types/enums.h"
  58 #include "gromacs/legacyheaders/types/force_flags.h"
  59 #include "gromacs/legacyheaders/types/interaction_const.h"
  60 #include "gromacs/mdlib/nb_verlet.h"
  61 #include "gromacs/mdlib/nbnxn_consts.h"
  62 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  63 #include "gromacs/pbcutil/ishift.h"
  64 #include "gromacs/timing/gpu_timing.h"
  65 #include "gromacs/utility/basedefinitions.h"
  66 #include "gromacs/utility/cstringutil.h"
  67 #include "gromacs/utility/fatalerror.h"
  68 #include "gromacs/utility/smalloc.h"
  69
  70 #include "nbnxn_cuda_types.h"
  71
  72 static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
  73
  74 /* This is a heuristically determined parameter for the Fermi architecture for
  75  * the minimum size of ci lists by multiplying this constant with the # of
  76  * multiprocessors on the current device.
  77  */
  78 static unsigned int gpu_min_ci_balanced_factor = 40;
  79
  80 /* Functions from nbnxn_cuda.cu */
  81 extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
  82 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
  83 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
  84 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
  85
  86 /* We should actually be using md_print_warn in md_logging.c,
  87  * but we can't include mpi.h in CUDA code.
  88  */
  89 static void md_print_warn(FILE       *fplog,
  90                           const char *fmt, ...)
  91 {
  92     va_list ap;
  93
  94     if (fplog != NULL)
  95     {
  96         /* We should only print to stderr on the master node,
  97          * in most cases fplog is only set on the master node, so this works.
  98          */
  99         va_start(ap, fmt);
 100         fprintf(stderr, "\n");
 101         vfprintf(stderr, fmt, ap);
 102         fprintf(stderr, "\n");
 103         va_end(ap);
 104
 105         va_start(ap, fmt);
 106         fprintf(fplog, "\n");
 107         vfprintf(fplog, fmt, ap);
 108         fprintf(fplog, "\n");
 109         va_end(ap);
 110     }
 111 }
 112
 113
 114 /* Fw. decl. */
 115 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
 116
 117
 118 /*! Tabulates the Ewald Coulomb force and initializes the size/scale
 119     and the table GPU array. If called with an already allocated table,
 120     it just re-uploads the table.
 121  */
 122 static void init_ewald_coulomb_force_table(cu_nbparam_t            *nbp,
 123                                            const gmx_device_info_t *dev_info)
 124 {
 125     float       *ftmp, *coul_tab;
 126     int          tabsize;
 127     double       tabscale;
 128     cudaError_t  stat;
 129
 130     tabsize     = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
 131     /* Subtract 2 iso 1 to avoid access out of range due to rounding */
 132     tabscale    = (tabsize - 2) / sqrt(nbp->rcoulomb_sq);
 133
 134     pmalloc((void**)&ftmp, tabsize*sizeof(*ftmp));
 135
 136     table_spline3_fill_ewald_lr(ftmp, NULL, NULL, tabsize,
 137                                 1/tabscale, nbp->ewald_beta, v_q_ewald_lr);
 138
 139     /* If the table pointer == NULL the table is generated the first time =>
 140        the array pointer will be saved to nbparam and the texture is bound.
 141      */
 142     coul_tab = nbp->coulomb_tab;
 143     if (coul_tab == NULL)
 144     {
 145         stat = cudaMalloc((void **)&coul_tab, tabsize*sizeof(*coul_tab));
 146         CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
 147
 148         nbp->coulomb_tab = coul_tab;
 149
 150 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 151         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 152         if (dev_info->prop.major >= 3)
 153         {
 154             cudaResourceDesc rd;
 155             memset(&rd, 0, sizeof(rd));
 156             rd.resType                  = cudaResourceTypeLinear;
 157             rd.res.linear.devPtr        = nbp->coulomb_tab;
 158             rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 159             rd.res.linear.desc.x        = 32;
 160             rd.res.linear.sizeInBytes   = tabsize*sizeof(*coul_tab);
 161
 162             cudaTextureDesc td;
 163             memset(&td, 0, sizeof(td));
 164             td.readMode                 = cudaReadModeElementType;
 165             stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
 166             CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
 167         }
 168         else
 169 #endif  /* HAVE_CUDA_TEXOBJ_SUPPORT */
 170         {
 171             GMX_UNUSED_VALUE(dev_info);
 172             cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
 173             stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
 174                                    coul_tab, &cd, tabsize*sizeof(*coul_tab));
 175             CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
 176         }
 177     }
 178
 179     cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab));
 180
 181     nbp->coulomb_tab_size     = tabsize;
 182     nbp->coulomb_tab_scale    = tabscale;
 183
 184     pfree(ftmp);
 185 }
 186
 187
 188 /*! Initializes the atomdata structure first time, it only gets filled at
 189     pair-search. */
 190 static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
 191 {
 192     cudaError_t stat;
 193
 194     ad->ntypes  = ntypes;
 195     stat        = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
 196     CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
 197     ad->bShiftVecUploaded = false;
 198
 199     stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
 200     CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
 201
 202     stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
 203     CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
 204     stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
 205     CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
 206
 207     /* initialize to NULL poiters to data that is not allocated here and will
 208        need reallocation in nbnxn_cuda_init_atomdata */
 209     ad->xq = NULL;
 210     ad->f  = NULL;
 211
 212     /* size -1 indicates that the respective array hasn't been initialized yet */
 213     ad->natoms = -1;
 214     ad->nalloc = -1;
 215 }
 216
 217 /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
 218     earlier GPUs, single or twin cut-off. */
 219 static int pick_ewald_kernel_type(bool                     bTwinCut,
 220                                   const gmx_device_info_t *dev_info)
 221 {
 222     bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
 223     int  kernel_type;
 224
 225     /* Benchmarking/development environment variables to force the use of
 226        analytical or tabulated Ewald kernel. */
 227     bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
 228     bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
 229
 230     if (bForceAnalyticalEwald && bForceTabulatedEwald)
 231     {
 232         gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
 233                    "requested through environment variables.");
 234     }
 235
 236     /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
 237     if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
 238     {
 239         bUseAnalyticalEwald = true;
 240
 241         if (debug)
 242         {
 243             fprintf(debug, "Using analytical Ewald CUDA kernels\n");
 244         }
 245     }
 246     else
 247     {
 248         bUseAnalyticalEwald = false;
 249
 250         if (debug)
 251         {
 252             fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
 253         }
 254     }
 255
 256     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
 257        forces it (use it for debugging/benchmarking only). */
 258     if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
 259     {
 260         kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
 261     }
 262     else
 263     {
 264         kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
 265     }
 266
 267     return kernel_type;
 268 }
 269
 270 /*! Copies all parameters related to the cut-off from ic to nbp */
 271 static void set_cutoff_parameters(cu_nbparam_t              *nbp,
 272                                   const interaction_const_t *ic)
 273 {
 274     nbp->ewald_beta       = ic->ewaldcoeff_q;
 275     nbp->sh_ewald         = ic->sh_ewald;
 276     nbp->epsfac           = ic->epsfac;
 277     nbp->two_k_rf         = 2.0 * ic->k_rf;
 278     nbp->c_rf             = ic->c_rf;
 279     nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
 280     nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
 281     nbp->rlist_sq         = ic->rlist * ic->rlist;
 282
 283     nbp->sh_lj_ewald      = ic->sh_lj_ewald;
 284     nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
 285
 286     nbp->rvdw_switch      = ic->rvdw_switch;
 287     nbp->dispersion_shift = ic->dispersion_shift;
 288     nbp->repulsion_shift  = ic->repulsion_shift;
 289     nbp->vdw_switch       = ic->vdw_switch;
 290 }
 291
 292 /*! Initializes the nonbonded parameter data structure. */
 293 static void init_nbparam(cu_nbparam_t              *nbp,
 294                          const interaction_const_t *ic,
 295                          const nbnxn_atomdata_t    *nbat,
 296                          const gmx_device_info_t   *dev_info)
 297 {
 298     cudaError_t stat;
 299     int         ntypes, nnbfp, nnbfp_comb;
 300
 301     ntypes  = nbat->ntype;
 302
 303     set_cutoff_parameters(nbp, ic);
 304
 305     if (ic->vdwtype == evdwCUT)
 306     {
 307         switch (ic->vdw_modifier)
 308         {
 309             case eintmodNONE:
 310             case eintmodPOTSHIFT:
 311                 nbp->vdwtype = evdwCuCUT;
 312                 break;
 313             case eintmodFORCESWITCH:
 314                 nbp->vdwtype = evdwCuFSWITCH;
 315                 break;
 316             case eintmodPOTSWITCH:
 317                 nbp->vdwtype = evdwCuPSWITCH;
 318                 break;
 319             default:
 320                 gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
 321                 break;
 322         }
 323     }
 324     else if (ic->vdwtype == evdwPME)
 325     {
 326         if (ic->ljpme_comb_rule == ljcrGEOM)
 327         {
 328             assert(nbat->comb_rule == ljcrGEOM);
 329             nbp->vdwtype = evdwCuEWALDGEOM;
 330         }
 331         else
 332         {
 333             assert(nbat->comb_rule == ljcrLB);
 334             nbp->vdwtype = evdwCuEWALDLB;
 335         }
 336     }
 337     else
 338     {
 339         gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
 340     }
 341
 342     if (ic->eeltype == eelCUT)
 343     {
 344         nbp->eeltype = eelCuCUT;
 345     }
 346     else if (EEL_RF(ic->eeltype))
 347     {
 348         nbp->eeltype = eelCuRF;
 349     }
 350     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
 351     {
 352         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
 353         nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
 354     }
 355     else
 356     {
 357         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 358         gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
 359     }
 360
 361     /* generate table for PME */
 362     nbp->coulomb_tab = NULL;
 363     if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
 364     {
 365         init_ewald_coulomb_force_table(nbp, dev_info);
 366     }
 367
 368     nnbfp      = 2*ntypes*ntypes;
 369     nnbfp_comb = 2*ntypes;
 370
 371     stat  = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
 372     CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
 373     cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 374
 375
 376     if (ic->vdwtype == evdwPME)
 377     {
 378         stat  = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 379         CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
 380         cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 381     }
 382
 383 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 384     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 385     if (dev_info->prop.major >= 3)
 386     {
 387         cudaResourceDesc rd;
 388         cudaTextureDesc  td;
 389
 390         memset(&rd, 0, sizeof(rd));
 391         rd.resType                  = cudaResourceTypeLinear;
 392         rd.res.linear.devPtr        = nbp->nbfp;
 393         rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 394         rd.res.linear.desc.x        = 32;
 395         rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
 396
 397         memset(&td, 0, sizeof(td));
 398         td.readMode                 = cudaReadModeElementType;
 399         stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
 400         CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
 401
 402         if (ic->vdwtype == evdwPME)
 403         {
 404             memset(&rd, 0, sizeof(rd));
 405             rd.resType                  = cudaResourceTypeLinear;
 406             rd.res.linear.devPtr        = nbp->nbfp_comb;
 407             rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 408             rd.res.linear.desc.x        = 32;
 409             rd.res.linear.sizeInBytes   = nnbfp_comb*sizeof(*nbp->nbfp_comb);
 410
 411             memset(&td, 0, sizeof(td));
 412             td.readMode = cudaReadModeElementType;
 413             stat        = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
 414             CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
 415         }
 416     }
 417     else
 418 #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
 419     {
 420         cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
 421         stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
 422                                nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
 423         CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
 424
 425         if (ic->vdwtype == evdwPME)
 426         {
 427             stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
 428                                    nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 429             CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
 430         }
 431     }
 432 }
 433
 434 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
 435  *  electrostatic type switching to twin cut-off (or back) if needed. */
 436 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
 437                                         const interaction_const_t   *ic)
 438 {
 439     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
 440     {
 441         return;
 442     }
 443     gmx_nbnxn_cuda_t *nb    = nbv->gpu_nbv;
 444     cu_nbparam_t     *nbp   = nb->nbparam;
 445
 446     set_cutoff_parameters(nbp, ic);
 447
 448     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
 449                                                  nb->dev_info);
 450
 451     init_ewald_coulomb_force_table(nb->nbparam, nb->dev_info);
 452 }
 453
 454 /*! Initializes the pair list data structure. */
 455 static void init_plist(cu_plist_t *pl)
 456 {
 457     /* initialize to NULL pointers to data that is not allocated here and will
 458        need reallocation in nbnxn_gpu_init_pairlist */
 459     pl->sci     = NULL;
 460     pl->cj4     = NULL;
 461     pl->excl    = NULL;
 462
 463     /* size -1 indicates that the respective array hasn't been initialized yet */
 464     pl->na_c        = -1;
 465     pl->nsci        = -1;
 466     pl->sci_nalloc  = -1;
 467     pl->ncj4        = -1;
 468     pl->cj4_nalloc  = -1;
 469     pl->nexcl       = -1;
 470     pl->excl_nalloc = -1;
 471     pl->bDoPrune    = false;
 472 }
 473
 474 /*! Initializes the timer data structure. */
 475 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
 476 {
 477     cudaError_t stat;
 478     int         eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
 479
 480     stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
 481     CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
 482     stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
 483     CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
 484
 485     /* The non-local counters/stream (second in the array) are needed only with DD. */
 486     for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
 487     {
 488         stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
 489         CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
 490         stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
 491         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
 492
 493
 494         stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
 495         CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
 496         stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
 497         CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
 498
 499         stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
 500         CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
 501         stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
 502         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
 503
 504         stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
 505         CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
 506         stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
 507         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
 508     }
 509 }
 510
 511 /*! Initializes the timings data structure. */
 512 static void init_timings(gmx_wallclock_gpu_t *t)
 513 {
 514     int i, j;
 515
 516     t->nb_h2d_t = 0.0;
 517     t->nb_d2h_t = 0.0;
 518     t->nb_c     = 0;
 519     t->pl_h2d_t = 0.0;
 520     t->pl_h2d_c = 0;
 521     for (i = 0; i < 2; i++)
 522     {
 523         for (j = 0; j < 2; j++)
 524         {
 525             t->ktime[i][j].t = 0.0;
 526             t->ktime[i][j].c = 0;
 527         }
 528     }
 529 }
 530
 531 void nbnxn_gpu_init(FILE                 *fplog,
 532                     gmx_nbnxn_cuda_t    **p_nb,
 533                     const gmx_gpu_info_t *gpu_info,
 534                     const gmx_gpu_opt_t  *gpu_opt,
 535                     int                   my_gpu_index,
 536                     gmx_bool              bLocalAndNonlocal)
 537 {
 538     cudaError_t       stat;
 539     gmx_nbnxn_cuda_t *nb;
 540     char              sbuf[STRLEN];
 541     bool              bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
 542     int               cuda_drv_ver;
 543
 544     assert(gpu_info);
 545
 546     if (p_nb == NULL)
 547     {
 548         return;
 549     }
 550
 551     snew(nb, 1);
 552     snew(nb->atdat, 1);
 553     snew(nb->nbparam, 1);
 554     snew(nb->plist[eintLocal], 1);
 555     if (bLocalAndNonlocal)
 556     {
 557         snew(nb->plist[eintNonlocal], 1);
 558     }
 559
 560     nb->bUseTwoStreams = bLocalAndNonlocal;
 561
 562     snew(nb->timers, 1);
 563     snew(nb->timings, 1);
 564
 565     /* init nbst */
 566     pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
 567     pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
 568     pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
 569
 570     init_plist(nb->plist[eintLocal]);
 571
 572     /* set device info, just point it to the right GPU among the detected ones */
 573     nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
 574
 575     /* local/non-local GPU streams */
 576     stat = cudaStreamCreate(&nb->stream[eintLocal]);
 577     CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
 578     if (nb->bUseTwoStreams)
 579     {
 580         init_plist(nb->plist[eintNonlocal]);
 581
 582         /* CUDA stream priority available in the CUDA RT 5.5 API.
 583          * Note that the device we're running on does not have to support
 584          * priorities, because we are querying the priority range which in this
 585          * case will be a single value.
 586          */
 587 #if GMX_CUDA_VERSION >= 5050
 588         {
 589             int highest_priority;
 590             stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
 591             CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
 592
 593             stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
 594                                                 cudaStreamDefault,
 595                                                 highest_priority);
 596             CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
 597         }
 598 #else
 599         stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
 600         CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
 601 #endif
 602     }
 603
 604     /* init events for sychronization (timing disabled for performance reasons!) */
 605     stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
 606     CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
 607     stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
 608     CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
 609
 610     /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
 611      * (which increases with shorter time/step) caused by a known CUDA driver bug.
 612      * To work around the issue we'll use an (admittedly fragile) memory polling
 613      * waiting to preserve performance. This requires support for atomic
 614      * operations and only works on x86/x86_64.
 615      * With polling wait event-timing also needs to be disabled.
 616      *
 617      * The overhead is greatly reduced in API v5.0 drivers and the improvement
 618      * is independent of runtime version. Hence, with API v5.0 drivers and later
 619      * we won't switch to polling.
 620      *
 621      * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI,
 622      * ranks so we will also disable it in that case.
 623      */
 624
 625     bStreamSync    = getenv("GMX_CUDA_STREAMSYNC") != NULL;
 626     bNoStreamSync  = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
 627
 628 #ifdef TMPI_ATOMICS
 629     bTMPIAtomics = true;
 630 #else
 631     bTMPIAtomics = false;
 632 #endif
 633
 634 #ifdef GMX_TARGET_X86
 635     bX86 = true;
 636 #else
 637     bX86 = false;
 638 #endif
 639
 640     if (bStreamSync && bNoStreamSync)
 641     {
 642         gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
 643     }
 644
 645     stat = cudaDriverGetVersion(&cuda_drv_ver);
 646     CU_RET_ERR(stat, "cudaDriverGetVersion failed");
 647
 648     bOldDriver = (cuda_drv_ver < 5000);
 649
 650     if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
 651     {
 652         /* Polling wait should be used instead of cudaStreamSynchronize only if:
 653          *   - ECC is ON & driver is old (checked above),
 654          *   - we're on x86/x86_64,
 655          *   - atomics are available, and
 656          *   - GPUs are not being shared.
 657          */
 658         bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
 659                                    (gmx_count_gpu_dev_shared(gpu_opt) < 1));
 660
 661         if (bStreamSync)
 662         {
 663             nb->bUseStreamSync = true;
 664
 665             /* only warn if polling should be used */
 666             if (bShouldUsePollSync)
 667             {
 668                 md_print_warn(fplog,
 669                               "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
 670                               "      cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
 671             }
 672         }
 673         else
 674         {
 675             nb->bUseStreamSync = !bShouldUsePollSync;
 676
 677             if (bShouldUsePollSync)
 678             {
 679                 md_print_warn(fplog,
 680                               "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
 681                               "      cause performance loss. Switching to the alternative polling GPU wait.\n"
 682                               "      If you encounter issues, switch back to standard GPU waiting by setting\n"
 683                               "      the GMX_CUDA_STREAMSYNC environment variable.\n");
 684             }
 685             else
 686             {
 687                 /* Tell the user that the ECC+old driver combination can be bad */
 688                 sprintf(sbuf,
 689                         "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
 690                         "      A known bug in this driver version can cause performance loss.\n"
 691                         "      However, the polling wait workaround can not be used because\n%s\n"
 692                         "      Consider updating the driver or turning ECC off.",
 693                         (bX86 && bTMPIAtomics) ?
 694                         "      GPU(s) are being oversubscribed." :
 695                         "      atomic operations are not supported by the platform/CPU+compiler.");
 696                 md_print_warn(fplog, sbuf);
 697             }
 698         }
 699     }
 700     else
 701     {
 702         if (bNoStreamSync)
 703         {
 704             nb->bUseStreamSync = false;
 705
 706             md_print_warn(fplog,
 707                           "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
 708         }
 709         else
 710         {
 711             /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
 712             nb->bUseStreamSync = true;
 713         }
 714     }
 715
 716     /* CUDA timing disabled as event timers don't work:
 717        - with multiple streams = domain-decomposition;
 718        - with the polling waiting hack (without cudaStreamSynchronize);
 719        - when turned off by GMX_DISABLE_CUDA_TIMING.
 720      */
 721     nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
 722                    (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
 723
 724     if (nb->bDoTime)
 725     {
 726         init_timers(nb->timers, nb->bUseTwoStreams);
 727         init_timings(nb->timings);
 728     }
 729
 730     /* set the kernel type for the current GPU */
 731     /* pick L1 cache configuration */
 732     nbnxn_cuda_set_cacheconfig(nb->dev_info);
 733
 734     *p_nb = nb;
 735
 736     if (debug)
 737     {
 738         fprintf(debug, "Initialized CUDA data structures.\n");
 739     }
 740 }
 741
 742 void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t               *nb,
 743                           const interaction_const_t      *ic,
 744                           const nonbonded_verlet_group_t *nbv_group)
 745 {
 746     init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
 747     init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
 748
 749     /* clear energy and shift force outputs */
 750     nbnxn_cuda_clear_e_fshift(nb);
 751 }
 752
 753 void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
 754                              const nbnxn_pairlist_t *h_plist,
 755                              int                     iloc)
 756 {
 757     char          sbuf[STRLEN];
 758     cudaError_t   stat;
 759     bool          bDoTime    = nb->bDoTime;
 760     cudaStream_t  stream     = nb->stream[iloc];
 761     cu_plist_t   *d_plist    = nb->plist[iloc];
 762
 763     if (d_plist->na_c < 0)
 764     {
 765         d_plist->na_c = h_plist->na_ci;
 766     }
 767     else
 768     {
 769         if (d_plist->na_c != h_plist->na_ci)
 770         {
 771             sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 772                     d_plist->na_c, h_plist->na_ci);
 773             gmx_incons(sbuf);
 774         }
 775     }
 776
 777     if (bDoTime)
 778     {
 779         stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
 780         CU_RET_ERR(stat, "cudaEventRecord failed");
 781     }
 782
 783     cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
 784                         &d_plist->nsci, &d_plist->sci_nalloc,
 785                         h_plist->nsci,
 786                         stream, true);
 787
 788     cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
 789                         &d_plist->ncj4, &d_plist->cj4_nalloc,
 790                         h_plist->ncj4,
 791                         stream, true);
 792
 793     cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
 794                         &d_plist->nexcl, &d_plist->excl_nalloc,
 795                         h_plist->nexcl,
 796                         stream, true);
 797
 798     if (bDoTime)
 799     {
 800         stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
 801         CU_RET_ERR(stat, "cudaEventRecord failed");
 802     }
 803
 804     /* need to prune the pair list during the next step */
 805     d_plist->bDoPrune = true;
 806 }
 807
 808 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t       *nb,
 809                                const nbnxn_atomdata_t *nbatom)
 810 {
 811     cu_atomdata_t *adat  = nb->atdat;
 812     cudaStream_t   ls    = nb->stream[eintLocal];
 813
 814     /* only if we have a dynamic box */
 815     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 816     {
 817         cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
 818                           SHIFTS * sizeof(*adat->shift_vec), ls);
 819         adat->bShiftVecUploaded = true;
 820     }
 821 }
 822
 823 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
 824 static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
 825 {
 826     cudaError_t    stat;
 827     cu_atomdata_t *adat  = nb->atdat;
 828     cudaStream_t   ls    = nb->stream[eintLocal];
 829
 830     stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
 831     CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
 832 }
 833
 834 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
 835 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
 836 {
 837     cudaError_t    stat;
 838     cu_atomdata_t *adat  = nb->atdat;
 839     cudaStream_t   ls    = nb->stream[eintLocal];
 840
 841     stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
 842     CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
 843     stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
 844     CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
 845     stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
 846     CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
 847 }
 848
 849 void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
 850 {
 851     nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
 852     /* clear shift force array and energies if the outputs were
 853        used in the current step */
 854     if (flags & GMX_FORCE_VIRIAL)
 855     {
 856         nbnxn_cuda_clear_e_fshift(nb);
 857     }
 858 }
 859
 860 void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
 861                              const struct nbnxn_atomdata_t *nbat)
 862 {
 863     cudaError_t    stat;
 864     int            nalloc, natoms;
 865     bool           realloced;
 866     bool           bDoTime   = nb->bDoTime;
 867     cu_timers_t   *timers    = nb->timers;
 868     cu_atomdata_t *d_atdat   = nb->atdat;
 869     cudaStream_t   ls        = nb->stream[eintLocal];
 870
 871     natoms    = nbat->natoms;
 872     realloced = false;
 873
 874     if (bDoTime)
 875     {
 876         /* time async copy */
 877         stat = cudaEventRecord(timers->start_atdat, ls);
 878         CU_RET_ERR(stat, "cudaEventRecord failed");
 879     }
 880
 881     /* need to reallocate if we have to copy more atoms than the amount of space
 882        available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
 883     if (natoms > d_atdat->nalloc)
 884     {
 885         nalloc = over_alloc_small(natoms);
 886
 887         /* free up first if the arrays have already been initialized */
 888         if (d_atdat->nalloc != -1)
 889         {
 890             cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
 891             cu_free_buffered(d_atdat->xq);
 892             cu_free_buffered(d_atdat->atom_types);
 893         }
 894
 895         stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
 896         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
 897         stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
 898         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
 899
 900         stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
 901         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
 902
 903         d_atdat->nalloc = nalloc;
 904         realloced       = true;
 905     }
 906
 907     d_atdat->natoms       = natoms;
 908     d_atdat->natoms_local = nbat->natoms_local;
 909
 910     /* need to clear GPU f output if realloc happened */
 911     if (realloced)
 912     {
 913         nbnxn_cuda_clear_f(nb, nalloc);
 914     }
 915
 916     cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
 917                       natoms*sizeof(*d_atdat->atom_types), ls);
 918
 919     if (bDoTime)
 920     {
 921         stat = cudaEventRecord(timers->stop_atdat, ls);
 922         CU_RET_ERR(stat, "cudaEventRecord failed");
 923     }
 924 }
 925
 926 void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
 927 {
 928     cudaError_t      stat;
 929     cu_atomdata_t   *atdat;
 930     cu_nbparam_t    *nbparam;
 931     cu_plist_t      *plist, *plist_nl;
 932     cu_timers_t     *timers;
 933
 934     /* Stopping the nvidia profiler here allows us to eliminate the subsequent
 935        uninitialization API calls from the trace. */
 936     if (getenv("NVPROF_ID") != NULL)
 937     {
 938         stat = cudaProfilerStop();
 939         CU_RET_ERR(stat, "cudaProfilerStop failed");
 940     }
 941
 942     if (nb == NULL)
 943     {
 944         return;
 945     }
 946
 947     atdat       = nb->atdat;
 948     nbparam     = nb->nbparam;
 949     plist       = nb->plist[eintLocal];
 950     plist_nl    = nb->plist[eintNonlocal];
 951     timers      = nb->timers;
 952
 953     if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
 954     {
 955
 956 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 957         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 958         if (nb->dev_info->prop.major >= 3)
 959         {
 960             stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
 961             CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
 962         }
 963         else
 964 #endif
 965         {
 966             stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
 967             CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
 968         }
 969         cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
 970     }
 971
 972     stat = cudaEventDestroy(nb->nonlocal_done);
 973     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
 974     stat = cudaEventDestroy(nb->misc_ops_done);
 975     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
 976
 977     if (nb->bDoTime)
 978     {
 979         stat = cudaEventDestroy(timers->start_atdat);
 980         CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
 981         stat = cudaEventDestroy(timers->stop_atdat);
 982         CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
 983
 984         /* The non-local counters/stream (second in the array) are needed only with DD. */
 985         for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
 986         {
 987             stat = cudaEventDestroy(timers->start_nb_k[i]);
 988             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
 989             stat = cudaEventDestroy(timers->stop_nb_k[i]);
 990             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
 991
 992             stat = cudaEventDestroy(timers->start_pl_h2d[i]);
 993             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
 994             stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
 995             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
 996
 997             stat = cudaStreamDestroy(nb->stream[i]);
 998             CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
 999
1000             stat = cudaEventDestroy(timers->start_nb_h2d[i]);
1001             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
1002             stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
1003             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
1004
1005             stat = cudaEventDestroy(timers->start_nb_d2h[i]);
1006             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
1007             stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
1008             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
1009         }
1010     }
1011
1012 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1013     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1014     if (nb->dev_info->prop.major >= 3)
1015     {
1016         stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
1017         CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
1018     }
1019     else
1020 #endif
1021     {
1022         stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
1023         CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
1024     }
1025     cu_free_buffered(nbparam->nbfp);
1026
1027     if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
1028     {
1029 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1030         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1031         if (nb->dev_info->prop.major >= 3)
1032         {
1033             stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
1034             CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
1035         }
1036         else
1037 #endif
1038         {
1039             stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
1040             CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
1041         }
1042         cu_free_buffered(nbparam->nbfp_comb);
1043     }
1044
1045     stat = cudaFree(atdat->shift_vec);
1046     CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
1047     stat = cudaFree(atdat->fshift);
1048     CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
1049
1050     stat = cudaFree(atdat->e_lj);
1051     CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
1052     stat = cudaFree(atdat->e_el);
1053     CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
1054
1055     cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
1056     cu_free_buffered(atdat->xq);
1057     cu_free_buffered(atdat->atom_types, &atdat->ntypes);
1058
1059     cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
1060     cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
1061     cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
1062     if (nb->bUseTwoStreams)
1063     {
1064         cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
1065         cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
1066         cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist->excl_nalloc);
1067     }
1068
1069     sfree(atdat);
1070     sfree(nbparam);
1071     sfree(plist);
1072     if (nb->bUseTwoStreams)
1073     {
1074         sfree(plist_nl);
1075     }
1076     sfree(timers);
1077     sfree(nb->timings);
1078     sfree(nb);
1079
1080     if (debug)
1081     {
1082         fprintf(debug, "Cleaned up CUDA data structures.\n");
1083     }
1084 }
1085
1086 void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
1087 {
1088     cudaError_t  stat;
1089     cudaStream_t stream = nb->stream[iloc];
1090
1091     stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
1092     CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
1093 }
1094
1095 gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
1096 {
1097     return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
1098 }
1099
1100 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
1101 {
1102     /* The NVPROF_ID environment variable is set by nvprof and indicates that
1103        mdrun is executed in the CUDA profiler.
1104        If nvprof was run is with "--profile-from-start off", the profiler will
1105        be started here. This way we can avoid tracing the CUDA events from the
1106        first part of the run. Starting the profiler again does nothing.
1107      */
1108     if (getenv("NVPROF_ID") != NULL)
1109     {
1110         cudaError_t stat;
1111         stat = cudaProfilerStart();
1112         CU_RET_ERR(stat, "cudaProfilerStart failed");
1113     }
1114
1115     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1116     {
1117         init_timings(nbv->gpu_nbv->timings);
1118     }
1119 }
1120
1121 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
1122 {
1123     return nb != NULL ?
1124            gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
1125
1126 }
1127
1128 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
1129 {
1130     return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
1131             (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
1132 }