src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40 #include "gmxpre.h"
  41
  42 #include "config.h"
  43
  44 #include <assert.h>
  45 #include <stdarg.h>
  46 #include <stdio.h>
  47 #include <stdlib.h>
  48
  49 #include <cuda_profiler_api.h>
  50
  51 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
  52 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
  53 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
  54 #include "gromacs/legacyheaders/gmx_detect_hardware.h"
  55 #include "gromacs/legacyheaders/typedefs.h"
  56 #include "gromacs/legacyheaders/types/enums.h"
  57 #include "gromacs/legacyheaders/types/force_flags.h"
  58 #include "gromacs/legacyheaders/types/interaction_const.h"
  59 #include "gromacs/mdlib/nb_verlet.h"
  60 #include "gromacs/mdlib/nbnxn_consts.h"
  61 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  62 #include "gromacs/pbcutil/ishift.h"
  63 #include "gromacs/timing/gpu_timing.h"
  64 #include "gromacs/utility/basedefinitions.h"
  65 #include "gromacs/utility/cstringutil.h"
  66 #include "gromacs/utility/fatalerror.h"
  67 #include "gromacs/utility/smalloc.h"
  68
  69 #include "nbnxn_cuda_types.h"
  70
  71 static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
  72
  73 /* This is a heuristically determined parameter for the Fermi architecture for
  74  * the minimum size of ci lists by multiplying this constant with the # of
  75  * multiprocessors on the current device.
  76  */
  77 static unsigned int gpu_min_ci_balanced_factor = 40;
  78
  79 /* Functions from nbnxn_cuda.cu */
  80 extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
  81 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
  82 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
  83 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
  84
  85 /* We should actually be using md_print_warn in md_logging.c,
  86  * but we can't include mpi.h in CUDA code.
  87  */
  88 static void md_print_warn(FILE       *fplog,
  89                           const char *fmt, ...)
  90 {
  91     va_list ap;
  92
  93     if (fplog != NULL)
  94     {
  95         /* We should only print to stderr on the master node,
  96          * in most cases fplog is only set on the master node, so this works.
  97          */
  98         va_start(ap, fmt);
  99         fprintf(stderr, "\n");
 100         vfprintf(stderr, fmt, ap);
 101         fprintf(stderr, "\n");
 102         va_end(ap);
 103
 104         va_start(ap, fmt);
 105         fprintf(fplog, "\n");
 106         vfprintf(fplog, fmt, ap);
 107         fprintf(fplog, "\n");
 108         va_end(ap);
 109     }
 110 }
 111
 112
 113 /* Fw. decl. */
 114 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
 115
 116 /* Fw. decl, */
 117 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
 118                                           const gmx_device_info_t *dev_info);
 119
 120
 121 /*! Tabulates the Ewald Coulomb force and initializes the size/scale
 122     and the table GPU array. If called with an already allocated table,
 123     it just re-uploads the table.
 124  */
 125 static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
 126                                            cu_nbparam_t              *nbp,
 127                                            const gmx_device_info_t   *dev_info)
 128 {
 129     float       *coul_tab;
 130     cudaError_t  stat;
 131
 132     if (nbp->coulomb_tab != NULL)
 133     {
 134         nbnxn_cuda_free_nbparam_table(nbp, dev_info);
 135     }
 136
 137     stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab));
 138     CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
 139
 140     nbp->coulomb_tab = coul_tab;
 141
 142 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 143     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 144     if (dev_info->prop.major >= 3)
 145     {
 146         cudaResourceDesc rd;
 147         memset(&rd, 0, sizeof(rd));
 148         rd.resType                  = cudaResourceTypeLinear;
 149         rd.res.linear.devPtr        = nbp->coulomb_tab;
 150         rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 151         rd.res.linear.desc.x        = 32;
 152         rd.res.linear.sizeInBytes   = ic->tabq_size*sizeof(*coul_tab);
 153
 154         cudaTextureDesc td;
 155         memset(&td, 0, sizeof(td));
 156         td.readMode                 = cudaReadModeElementType;
 157         stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
 158         CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
 159     }
 160     else
 161 #endif  /* HAVE_CUDA_TEXOBJ_SUPPORT */
 162     {
 163         GMX_UNUSED_VALUE(dev_info);
 164         cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
 165         stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
 166                                coul_tab, &cd,
 167                                ic->tabq_size*sizeof(*coul_tab));
 168         CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
 169     }
 170
 171     cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab));
 172
 173     nbp->coulomb_tab_size     = ic->tabq_size;
 174     nbp->coulomb_tab_scale    = ic->tabq_scale;
 175 }
 176
 177
 178 /*! Initializes the atomdata structure first time, it only gets filled at
 179     pair-search. */
 180 static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
 181 {
 182     cudaError_t stat;
 183
 184     ad->ntypes  = ntypes;
 185     stat        = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
 186     CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
 187     ad->bShiftVecUploaded = false;
 188
 189     stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
 190     CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
 191
 192     stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
 193     CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
 194     stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
 195     CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
 196
 197     /* initialize to NULL poiters to data that is not allocated here and will
 198        need reallocation in nbnxn_cuda_init_atomdata */
 199     ad->xq = NULL;
 200     ad->f  = NULL;
 201
 202     /* size -1 indicates that the respective array hasn't been initialized yet */
 203     ad->natoms = -1;
 204     ad->nalloc = -1;
 205 }
 206
 207 /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
 208     earlier GPUs, single or twin cut-off. */
 209 static int pick_ewald_kernel_type(bool                     bTwinCut,
 210                                   const gmx_device_info_t *dev_info)
 211 {
 212     bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
 213     int  kernel_type;
 214
 215     /* Benchmarking/development environment variables to force the use of
 216        analytical or tabulated Ewald kernel. */
 217     bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
 218     bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
 219
 220     if (bForceAnalyticalEwald && bForceTabulatedEwald)
 221     {
 222         gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
 223                    "requested through environment variables.");
 224     }
 225
 226     /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
 227     if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
 228     {
 229         bUseAnalyticalEwald = true;
 230
 231         if (debug)
 232         {
 233             fprintf(debug, "Using analytical Ewald CUDA kernels\n");
 234         }
 235     }
 236     else
 237     {
 238         bUseAnalyticalEwald = false;
 239
 240         if (debug)
 241         {
 242             fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
 243         }
 244     }
 245
 246     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
 247        forces it (use it for debugging/benchmarking only). */
 248     if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
 249     {
 250         kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
 251     }
 252     else
 253     {
 254         kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
 255     }
 256
 257     return kernel_type;
 258 }
 259
 260 /*! Copies all parameters related to the cut-off from ic to nbp */
 261 static void set_cutoff_parameters(cu_nbparam_t              *nbp,
 262                                   const interaction_const_t *ic)
 263 {
 264     nbp->ewald_beta       = ic->ewaldcoeff_q;
 265     nbp->sh_ewald         = ic->sh_ewald;
 266     nbp->epsfac           = ic->epsfac;
 267     nbp->two_k_rf         = 2.0 * ic->k_rf;
 268     nbp->c_rf             = ic->c_rf;
 269     nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
 270     nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
 271     nbp->rlist_sq         = ic->rlist * ic->rlist;
 272
 273     nbp->sh_lj_ewald      = ic->sh_lj_ewald;
 274     nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
 275
 276     nbp->rvdw_switch      = ic->rvdw_switch;
 277     nbp->dispersion_shift = ic->dispersion_shift;
 278     nbp->repulsion_shift  = ic->repulsion_shift;
 279     nbp->vdw_switch       = ic->vdw_switch;
 280 }
 281
 282 /*! Initializes the nonbonded parameter data structure. */
 283 static void init_nbparam(cu_nbparam_t              *nbp,
 284                          const interaction_const_t *ic,
 285                          const nbnxn_atomdata_t    *nbat,
 286                          const gmx_device_info_t   *dev_info)
 287 {
 288     cudaError_t stat;
 289     int         ntypes, nnbfp, nnbfp_comb;
 290
 291     ntypes  = nbat->ntype;
 292
 293     set_cutoff_parameters(nbp, ic);
 294
 295     if (ic->vdwtype == evdwCUT)
 296     {
 297         switch (ic->vdw_modifier)
 298         {
 299             case eintmodNONE:
 300             case eintmodPOTSHIFT:
 301                 nbp->vdwtype = evdwCuCUT;
 302                 break;
 303             case eintmodFORCESWITCH:
 304                 nbp->vdwtype = evdwCuFSWITCH;
 305                 break;
 306             case eintmodPOTSWITCH:
 307                 nbp->vdwtype = evdwCuPSWITCH;
 308                 break;
 309             default:
 310                 gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
 311                 break;
 312         }
 313     }
 314     else if (ic->vdwtype == evdwPME)
 315     {
 316         if (ic->ljpme_comb_rule == ljcrGEOM)
 317         {
 318             assert(nbat->comb_rule == ljcrGEOM);
 319             nbp->vdwtype = evdwCuEWALDGEOM;
 320         }
 321         else
 322         {
 323             assert(nbat->comb_rule == ljcrLB);
 324             nbp->vdwtype = evdwCuEWALDLB;
 325         }
 326     }
 327     else
 328     {
 329         gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
 330     }
 331
 332     if (ic->eeltype == eelCUT)
 333     {
 334         nbp->eeltype = eelCuCUT;
 335     }
 336     else if (EEL_RF(ic->eeltype))
 337     {
 338         nbp->eeltype = eelCuRF;
 339     }
 340     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
 341     {
 342         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
 343         nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
 344     }
 345     else
 346     {
 347         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 348         gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
 349     }
 350
 351     /* generate table for PME */
 352     nbp->coulomb_tab = NULL;
 353     if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
 354     {
 355         init_ewald_coulomb_force_table(ic, nbp, dev_info);
 356     }
 357
 358     nnbfp      = 2*ntypes*ntypes;
 359     nnbfp_comb = 2*ntypes;
 360
 361     stat  = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
 362     CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
 363     cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 364
 365
 366     if (ic->vdwtype == evdwPME)
 367     {
 368         stat  = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 369         CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
 370         cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 371     }
 372
 373 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 374     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 375     if (dev_info->prop.major >= 3)
 376     {
 377         cudaResourceDesc rd;
 378         cudaTextureDesc  td;
 379
 380         memset(&rd, 0, sizeof(rd));
 381         rd.resType                  = cudaResourceTypeLinear;
 382         rd.res.linear.devPtr        = nbp->nbfp;
 383         rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 384         rd.res.linear.desc.x        = 32;
 385         rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
 386
 387         memset(&td, 0, sizeof(td));
 388         td.readMode                 = cudaReadModeElementType;
 389         stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
 390         CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
 391
 392         if (ic->vdwtype == evdwPME)
 393         {
 394             memset(&rd, 0, sizeof(rd));
 395             rd.resType                  = cudaResourceTypeLinear;
 396             rd.res.linear.devPtr        = nbp->nbfp_comb;
 397             rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 398             rd.res.linear.desc.x        = 32;
 399             rd.res.linear.sizeInBytes   = nnbfp_comb*sizeof(*nbp->nbfp_comb);
 400
 401             memset(&td, 0, sizeof(td));
 402             td.readMode = cudaReadModeElementType;
 403             stat        = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
 404             CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
 405         }
 406     }
 407     else
 408 #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
 409     {
 410         cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
 411         stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
 412                                nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
 413         CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
 414
 415         if (ic->vdwtype == evdwPME)
 416         {
 417             stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
 418                                    nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 419             CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
 420         }
 421     }
 422 }
 423
 424 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
 425  *  electrostatic type switching to twin cut-off (or back) if needed. */
 426 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
 427                                         const interaction_const_t   *ic)
 428 {
 429     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
 430     {
 431         return;
 432     }
 433     gmx_nbnxn_cuda_t *nb    = nbv->gpu_nbv;
 434     cu_nbparam_t     *nbp   = nb->nbparam;
 435
 436     set_cutoff_parameters(nbp, ic);
 437
 438     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
 439                                                  nb->dev_info);
 440
 441     init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
 442 }
 443
 444 /*! Initializes the pair list data structure. */
 445 static void init_plist(cu_plist_t *pl)
 446 {
 447     /* initialize to NULL pointers to data that is not allocated here and will
 448        need reallocation in nbnxn_gpu_init_pairlist */
 449     pl->sci     = NULL;
 450     pl->cj4     = NULL;
 451     pl->excl    = NULL;
 452
 453     /* size -1 indicates that the respective array hasn't been initialized yet */
 454     pl->na_c        = -1;
 455     pl->nsci        = -1;
 456     pl->sci_nalloc  = -1;
 457     pl->ncj4        = -1;
 458     pl->cj4_nalloc  = -1;
 459     pl->nexcl       = -1;
 460     pl->excl_nalloc = -1;
 461     pl->bDoPrune    = false;
 462 }
 463
 464 /*! Initializes the timer data structure. */
 465 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
 466 {
 467     cudaError_t stat;
 468     int         eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
 469
 470     stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
 471     CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
 472     stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
 473     CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
 474
 475     /* The non-local counters/stream (second in the array) are needed only with DD. */
 476     for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
 477     {
 478         stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
 479         CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
 480         stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
 481         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
 482
 483
 484         stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
 485         CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
 486         stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
 487         CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
 488
 489         stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
 490         CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
 491         stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
 492         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
 493
 494         stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
 495         CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
 496         stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
 497         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
 498     }
 499 }
 500
 501 /*! Initializes the timings data structure. */
 502 static void init_timings(gmx_wallclock_gpu_t *t)
 503 {
 504     int i, j;
 505
 506     t->nb_h2d_t = 0.0;
 507     t->nb_d2h_t = 0.0;
 508     t->nb_c     = 0;
 509     t->pl_h2d_t = 0.0;
 510     t->pl_h2d_c = 0;
 511     for (i = 0; i < 2; i++)
 512     {
 513         for (j = 0; j < 2; j++)
 514         {
 515             t->ktime[i][j].t = 0.0;
 516             t->ktime[i][j].c = 0;
 517         }
 518     }
 519 }
 520
 521 void nbnxn_gpu_init(FILE                 *fplog,
 522                     gmx_nbnxn_cuda_t    **p_nb,
 523                     const gmx_gpu_info_t *gpu_info,
 524                     const gmx_gpu_opt_t  *gpu_opt,
 525                     int                   my_gpu_index,
 526                     gmx_bool              bLocalAndNonlocal)
 527 {
 528     cudaError_t       stat;
 529     gmx_nbnxn_cuda_t *nb;
 530     char              sbuf[STRLEN];
 531     bool              bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
 532     int               cuda_drv_ver;
 533
 534     assert(gpu_info);
 535
 536     if (p_nb == NULL)
 537     {
 538         return;
 539     }
 540
 541     snew(nb, 1);
 542     snew(nb->atdat, 1);
 543     snew(nb->nbparam, 1);
 544     snew(nb->plist[eintLocal], 1);
 545     if (bLocalAndNonlocal)
 546     {
 547         snew(nb->plist[eintNonlocal], 1);
 548     }
 549
 550     nb->bUseTwoStreams = bLocalAndNonlocal;
 551
 552     snew(nb->timers, 1);
 553     snew(nb->timings, 1);
 554
 555     /* init nbst */
 556     pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
 557     pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
 558     pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
 559
 560     init_plist(nb->plist[eintLocal]);
 561
 562     /* set device info, just point it to the right GPU among the detected ones */
 563     nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
 564
 565     /* local/non-local GPU streams */
 566     stat = cudaStreamCreate(&nb->stream[eintLocal]);
 567     CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
 568     if (nb->bUseTwoStreams)
 569     {
 570         init_plist(nb->plist[eintNonlocal]);
 571
 572         /* CUDA stream priority available in the CUDA RT 5.5 API.
 573          * Note that the device we're running on does not have to support
 574          * priorities, because we are querying the priority range which in this
 575          * case will be a single value.
 576          */
 577 #if GMX_CUDA_VERSION >= 5050
 578         {
 579             int highest_priority;
 580             stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
 581             CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
 582
 583             stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
 584                                                 cudaStreamDefault,
 585                                                 highest_priority);
 586             CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
 587         }
 588 #else
 589         stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
 590         CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
 591 #endif
 592     }
 593
 594     /* init events for sychronization (timing disabled for performance reasons!) */
 595     stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
 596     CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
 597     stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
 598     CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
 599
 600     /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
 601      * (which increases with shorter time/step) caused by a known CUDA driver bug.
 602      * To work around the issue we'll use an (admittedly fragile) memory polling
 603      * waiting to preserve performance. This requires support for atomic
 604      * operations and only works on x86/x86_64.
 605      * With polling wait event-timing also needs to be disabled.
 606      *
 607      * The overhead is greatly reduced in API v5.0 drivers and the improvement
 608      * is independent of runtime version. Hence, with API v5.0 drivers and later
 609      * we won't switch to polling.
 610      *
 611      * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI,
 612      * ranks so we will also disable it in that case.
 613      */
 614
 615     bStreamSync    = getenv("GMX_CUDA_STREAMSYNC") != NULL;
 616     bNoStreamSync  = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
 617
 618 #ifdef TMPI_ATOMICS
 619     bTMPIAtomics = true;
 620 #else
 621     bTMPIAtomics = false;
 622 #endif
 623
 624 #ifdef GMX_TARGET_X86
 625     bX86 = true;
 626 #else
 627     bX86 = false;
 628 #endif
 629
 630     if (bStreamSync && bNoStreamSync)
 631     {
 632         gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
 633     }
 634
 635     stat = cudaDriverGetVersion(&cuda_drv_ver);
 636     CU_RET_ERR(stat, "cudaDriverGetVersion failed");
 637
 638     bOldDriver = (cuda_drv_ver < 5000);
 639
 640     if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
 641     {
 642         /* Polling wait should be used instead of cudaStreamSynchronize only if:
 643          *   - ECC is ON & driver is old (checked above),
 644          *   - we're on x86/x86_64,
 645          *   - atomics are available, and
 646          *   - GPUs are not being shared.
 647          */
 648         bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
 649                                    (gmx_count_gpu_dev_shared(gpu_opt) < 1));
 650
 651         if (bStreamSync)
 652         {
 653             nb->bUseStreamSync = true;
 654
 655             /* only warn if polling should be used */
 656             if (bShouldUsePollSync)
 657             {
 658                 md_print_warn(fplog,
 659                               "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
 660                               "      cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
 661             }
 662         }
 663         else
 664         {
 665             nb->bUseStreamSync = !bShouldUsePollSync;
 666
 667             if (bShouldUsePollSync)
 668             {
 669                 md_print_warn(fplog,
 670                               "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
 671                               "      cause performance loss. Switching to the alternative polling GPU wait.\n"
 672                               "      If you encounter issues, switch back to standard GPU waiting by setting\n"
 673                               "      the GMX_CUDA_STREAMSYNC environment variable.\n");
 674             }
 675             else
 676             {
 677                 /* Tell the user that the ECC+old driver combination can be bad */
 678                 sprintf(sbuf,
 679                         "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
 680                         "      A known bug in this driver version can cause performance loss.\n"
 681                         "      However, the polling wait workaround can not be used because\n%s\n"
 682                         "      Consider updating the driver or turning ECC off.",
 683                         (bX86 && bTMPIAtomics) ?
 684                         "      GPU(s) are being oversubscribed." :
 685                         "      atomic operations are not supported by the platform/CPU+compiler.");
 686                 md_print_warn(fplog, sbuf);
 687             }
 688         }
 689     }
 690     else
 691     {
 692         if (bNoStreamSync)
 693         {
 694             nb->bUseStreamSync = false;
 695
 696             md_print_warn(fplog,
 697                           "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
 698         }
 699         else
 700         {
 701             /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
 702             nb->bUseStreamSync = true;
 703         }
 704     }
 705
 706     /* CUDA timing disabled as event timers don't work:
 707        - with multiple streams = domain-decomposition;
 708        - with the polling waiting hack (without cudaStreamSynchronize);
 709        - when turned off by GMX_DISABLE_CUDA_TIMING.
 710      */
 711     nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
 712                    (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
 713
 714     if (nb->bDoTime)
 715     {
 716         init_timers(nb->timers, nb->bUseTwoStreams);
 717         init_timings(nb->timings);
 718     }
 719
 720     /* set the kernel type for the current GPU */
 721     /* pick L1 cache configuration */
 722     nbnxn_cuda_set_cacheconfig(nb->dev_info);
 723
 724     *p_nb = nb;
 725
 726     if (debug)
 727     {
 728         fprintf(debug, "Initialized CUDA data structures.\n");
 729     }
 730 }
 731
 732 void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t               *nb,
 733                           const interaction_const_t      *ic,
 734                           const nonbonded_verlet_group_t *nbv_group)
 735 {
 736     init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
 737     init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
 738
 739     /* clear energy and shift force outputs */
 740     nbnxn_cuda_clear_e_fshift(nb);
 741 }
 742
 743 void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
 744                              const nbnxn_pairlist_t *h_plist,
 745                              int                     iloc)
 746 {
 747     char          sbuf[STRLEN];
 748     cudaError_t   stat;
 749     bool          bDoTime    = nb->bDoTime;
 750     cudaStream_t  stream     = nb->stream[iloc];
 751     cu_plist_t   *d_plist    = nb->plist[iloc];
 752
 753     if (d_plist->na_c < 0)
 754     {
 755         d_plist->na_c = h_plist->na_ci;
 756     }
 757     else
 758     {
 759         if (d_plist->na_c != h_plist->na_ci)
 760         {
 761             sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 762                     d_plist->na_c, h_plist->na_ci);
 763             gmx_incons(sbuf);
 764         }
 765     }
 766
 767     if (bDoTime)
 768     {
 769         stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
 770         CU_RET_ERR(stat, "cudaEventRecord failed");
 771     }
 772
 773     cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
 774                         &d_plist->nsci, &d_plist->sci_nalloc,
 775                         h_plist->nsci,
 776                         stream, true);
 777
 778     cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
 779                         &d_plist->ncj4, &d_plist->cj4_nalloc,
 780                         h_plist->ncj4,
 781                         stream, true);
 782
 783     cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
 784                         &d_plist->nexcl, &d_plist->excl_nalloc,
 785                         h_plist->nexcl,
 786                         stream, true);
 787
 788     if (bDoTime)
 789     {
 790         stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
 791         CU_RET_ERR(stat, "cudaEventRecord failed");
 792     }
 793
 794     /* need to prune the pair list during the next step */
 795     d_plist->bDoPrune = true;
 796 }
 797
 798 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t       *nb,
 799                                const nbnxn_atomdata_t *nbatom)
 800 {
 801     cu_atomdata_t *adat  = nb->atdat;
 802     cudaStream_t   ls    = nb->stream[eintLocal];
 803
 804     /* only if we have a dynamic box */
 805     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 806     {
 807         cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
 808                           SHIFTS * sizeof(*adat->shift_vec), ls);
 809         adat->bShiftVecUploaded = true;
 810     }
 811 }
 812
 813 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
 814 static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
 815 {
 816     cudaError_t    stat;
 817     cu_atomdata_t *adat  = nb->atdat;
 818     cudaStream_t   ls    = nb->stream[eintLocal];
 819
 820     stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
 821     CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
 822 }
 823
 824 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
 825 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
 826 {
 827     cudaError_t    stat;
 828     cu_atomdata_t *adat  = nb->atdat;
 829     cudaStream_t   ls    = nb->stream[eintLocal];
 830
 831     stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
 832     CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
 833     stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
 834     CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
 835     stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
 836     CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
 837 }
 838
 839 void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
 840 {
 841     nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
 842     /* clear shift force array and energies if the outputs were
 843        used in the current step */
 844     if (flags & GMX_FORCE_VIRIAL)
 845     {
 846         nbnxn_cuda_clear_e_fshift(nb);
 847     }
 848 }
 849
 850 void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
 851                              const struct nbnxn_atomdata_t *nbat)
 852 {
 853     cudaError_t    stat;
 854     int            nalloc, natoms;
 855     bool           realloced;
 856     bool           bDoTime   = nb->bDoTime;
 857     cu_timers_t   *timers    = nb->timers;
 858     cu_atomdata_t *d_atdat   = nb->atdat;
 859     cudaStream_t   ls        = nb->stream[eintLocal];
 860
 861     natoms    = nbat->natoms;
 862     realloced = false;
 863
 864     if (bDoTime)
 865     {
 866         /* time async copy */
 867         stat = cudaEventRecord(timers->start_atdat, ls);
 868         CU_RET_ERR(stat, "cudaEventRecord failed");
 869     }
 870
 871     /* need to reallocate if we have to copy more atoms than the amount of space
 872        available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
 873     if (natoms > d_atdat->nalloc)
 874     {
 875         nalloc = over_alloc_small(natoms);
 876
 877         /* free up first if the arrays have already been initialized */
 878         if (d_atdat->nalloc != -1)
 879         {
 880             cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
 881             cu_free_buffered(d_atdat->xq);
 882             cu_free_buffered(d_atdat->atom_types);
 883         }
 884
 885         stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
 886         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
 887         stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
 888         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
 889
 890         stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
 891         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
 892
 893         d_atdat->nalloc = nalloc;
 894         realloced       = true;
 895     }
 896
 897     d_atdat->natoms       = natoms;
 898     d_atdat->natoms_local = nbat->natoms_local;
 899
 900     /* need to clear GPU f output if realloc happened */
 901     if (realloced)
 902     {
 903         nbnxn_cuda_clear_f(nb, nalloc);
 904     }
 905
 906     cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
 907                       natoms*sizeof(*d_atdat->atom_types), ls);
 908
 909     if (bDoTime)
 910     {
 911         stat = cudaEventRecord(timers->stop_atdat, ls);
 912         CU_RET_ERR(stat, "cudaEventRecord failed");
 913     }
 914 }
 915
 916 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
 917                                           const gmx_device_info_t *dev_info)
 918 {
 919     cudaError_t stat;
 920
 921     if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
 922     {
 923 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 924         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 925         if (dev_info->prop.major >= 3)
 926         {
 927             stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
 928             CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
 929         }
 930         else
 931 #endif
 932         {
 933             GMX_UNUSED_VALUE(dev_info);
 934             stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
 935             CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
 936         }
 937         cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
 938     }
 939 }
 940
 941 void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
 942 {
 943     cudaError_t      stat;
 944     cu_atomdata_t   *atdat;
 945     cu_nbparam_t    *nbparam;
 946     cu_plist_t      *plist, *plist_nl;
 947     cu_timers_t     *timers;
 948
 949     /* Stopping the nvidia profiler here allows us to eliminate the subsequent
 950        uninitialization API calls from the trace. */
 951     if (getenv("NVPROF_ID") != NULL)
 952     {
 953         stat = cudaProfilerStop();
 954         CU_RET_ERR(stat, "cudaProfilerStop failed");
 955     }
 956
 957     if (nb == NULL)
 958     {
 959         return;
 960     }
 961
 962     atdat       = nb->atdat;
 963     nbparam     = nb->nbparam;
 964     plist       = nb->plist[eintLocal];
 965     plist_nl    = nb->plist[eintNonlocal];
 966     timers      = nb->timers;
 967
 968     nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);
 969
 970     stat = cudaEventDestroy(nb->nonlocal_done);
 971     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
 972     stat = cudaEventDestroy(nb->misc_ops_done);
 973     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
 974
 975     if (nb->bDoTime)
 976     {
 977         stat = cudaEventDestroy(timers->start_atdat);
 978         CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
 979         stat = cudaEventDestroy(timers->stop_atdat);
 980         CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
 981
 982         /* The non-local counters/stream (second in the array) are needed only with DD. */
 983         for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
 984         {
 985             stat = cudaEventDestroy(timers->start_nb_k[i]);
 986             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
 987             stat = cudaEventDestroy(timers->stop_nb_k[i]);
 988             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
 989
 990             stat = cudaEventDestroy(timers->start_pl_h2d[i]);
 991             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
 992             stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
 993             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
 994
 995             stat = cudaStreamDestroy(nb->stream[i]);
 996             CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
 997
 998             stat = cudaEventDestroy(timers->start_nb_h2d[i]);
 999             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
1000             stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
1001             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
1002
1003             stat = cudaEventDestroy(timers->start_nb_d2h[i]);
1004             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
1005             stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
1006             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
1007         }
1008     }
1009
1010 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1011     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1012     if (nb->dev_info->prop.major >= 3)
1013     {
1014         stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
1015         CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
1016     }
1017     else
1018 #endif
1019     {
1020         stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
1021         CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
1022     }
1023     cu_free_buffered(nbparam->nbfp);
1024
1025     if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
1026     {
1027 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1028         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1029         if (nb->dev_info->prop.major >= 3)
1030         {
1031             stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
1032             CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
1033         }
1034         else
1035 #endif
1036         {
1037             stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
1038             CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
1039         }
1040         cu_free_buffered(nbparam->nbfp_comb);
1041     }
1042
1043     stat = cudaFree(atdat->shift_vec);
1044     CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
1045     stat = cudaFree(atdat->fshift);
1046     CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
1047
1048     stat = cudaFree(atdat->e_lj);
1049     CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
1050     stat = cudaFree(atdat->e_el);
1051     CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
1052
1053     cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
1054     cu_free_buffered(atdat->xq);
1055     cu_free_buffered(atdat->atom_types, &atdat->ntypes);
1056
1057     cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
1058     cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
1059     cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
1060     if (nb->bUseTwoStreams)
1061     {
1062         cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
1063         cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
1064         cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist->excl_nalloc);
1065     }
1066
1067     sfree(atdat);
1068     sfree(nbparam);
1069     sfree(plist);
1070     if (nb->bUseTwoStreams)
1071     {
1072         sfree(plist_nl);
1073     }
1074     sfree(timers);
1075     sfree(nb->timings);
1076     sfree(nb);
1077
1078     if (debug)
1079     {
1080         fprintf(debug, "Cleaned up CUDA data structures.\n");
1081     }
1082 }
1083
1084 void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
1085 {
1086     cudaError_t  stat;
1087     cudaStream_t stream = nb->stream[iloc];
1088
1089     stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
1090     CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
1091 }
1092
1093 gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
1094 {
1095     return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
1096 }
1097
1098 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
1099 {
1100     /* The NVPROF_ID environment variable is set by nvprof and indicates that
1101        mdrun is executed in the CUDA profiler.
1102        If nvprof was run is with "--profile-from-start off", the profiler will
1103        be started here. This way we can avoid tracing the CUDA events from the
1104        first part of the run. Starting the profiler again does nothing.
1105      */
1106     if (getenv("NVPROF_ID") != NULL)
1107     {
1108         cudaError_t stat;
1109         stat = cudaProfilerStart();
1110         CU_RET_ERR(stat, "cudaProfilerStart failed");
1111     }
1112
1113     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1114     {
1115         init_timings(nbv->gpu_nbv->timings);
1116     }
1117 }
1118
1119 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
1120 {
1121     return nb != NULL ?
1122            gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
1123
1124 }
1125
1126 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
1127 {
1128     return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
1129             (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
1130 }