src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \file
  36  *  \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
  37  *
  38  *  \author Szilard Pall <pall.szilard@gmail.com>
  39  */
  40 #include "gmxpre.h"
  41
  42 #include "config.h"
  43
  44 #include <assert.h>
  45 #include <stdarg.h>
  46 #include <stdio.h>
  47 #include <stdlib.h>
  48
  49 #include <cuda_profiler_api.h>
  50
  51 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
  52 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
  53 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
  54 #include "gromacs/legacyheaders/gmx_detect_hardware.h"
  55 #include "gromacs/legacyheaders/typedefs.h"
  56 #include "gromacs/legacyheaders/types/enums.h"
  57 #include "gromacs/legacyheaders/types/force_flags.h"
  58 #include "gromacs/legacyheaders/types/interaction_const.h"
  59 #include "gromacs/mdlib/nb_verlet.h"
  60 #include "gromacs/mdlib/nbnxn_consts.h"
  61 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  62 #include "gromacs/pbcutil/ishift.h"
  63 #include "gromacs/timing/gpu_timing.h"
  64 #include "gromacs/utility/basedefinitions.h"
  65 #include "gromacs/utility/cstringutil.h"
  66 #include "gromacs/utility/fatalerror.h"
  67 #include "gromacs/utility/smalloc.h"
  68
  69 #include "nbnxn_cuda_types.h"
  70
  71 static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
  72
  73 /* This is a heuristically determined parameter for the Fermi, Kepler
  74  * and Maxwell architectures for the minimum size of ci lists by multiplying
  75  * this constant with the # of multiprocessors on the current device.
  76  * Since the maximum number of blocks per multiprocessor is 16, the ideal
  77  * count for small systems is 32 or 48 blocks per multiprocessor. Because
  78  * there is a bit of fluctuations in the generated block counts, we use
  79  * a target of 44 instead of the ideal value of 48.
  80  */
  81 static unsigned int gpu_min_ci_balanced_factor = 44;
  82
  83 /* Functions from nbnxn_cuda.cu */
  84 extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
  85 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
  86 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
  87 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
  88
  89 /* We should actually be using md_print_warn in md_logging.c,
  90  * but we can't include mpi.h in CUDA code.
  91  */
  92 static void md_print_warn(FILE       *fplog,
  93                           const char *fmt, ...)
  94 {
  95     va_list ap;
  96
  97     if (fplog != NULL)
  98     {
  99         /* We should only print to stderr on the master node,
 100          * in most cases fplog is only set on the master node, so this works.
 101          */
 102         va_start(ap, fmt);
 103         fprintf(stderr, "\n");
 104         vfprintf(stderr, fmt, ap);
 105         fprintf(stderr, "\n");
 106         va_end(ap);
 107
 108         va_start(ap, fmt);
 109         fprintf(fplog, "\n");
 110         vfprintf(fplog, fmt, ap);
 111         fprintf(fplog, "\n");
 112         va_end(ap);
 113     }
 114 }
 115
 116
 117 /* Fw. decl. */
 118 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
 119
 120 /* Fw. decl, */
 121 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
 122                                           const gmx_device_info_t *dev_info);
 123
 124
 125
 126 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 127 static bool use_texobj(const gmx_device_info_t *dev_info)
 128 {
 129     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 130     return (dev_info->prop.major >= 3);
 131 }
 132 #endif
 133
 134 /*! Tabulates the Ewald Coulomb force and initializes the size/scale
 135     and the table GPU array. If called with an already allocated table,
 136     it just re-uploads the table.
 137  */
 138 static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
 139                                            cu_nbparam_t              *nbp,
 140                                            const gmx_device_info_t   *dev_info)
 141 {
 142     float       *coul_tab;
 143     cudaError_t  stat;
 144
 145     if (nbp->coulomb_tab != NULL)
 146     {
 147         nbnxn_cuda_free_nbparam_table(nbp, dev_info);
 148     }
 149
 150     stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab));
 151     CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
 152
 153     nbp->coulomb_tab = coul_tab;
 154
 155 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 156     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 157     if (use_texobj(dev_info))
 158     {
 159         cudaResourceDesc rd;
 160         memset(&rd, 0, sizeof(rd));
 161         rd.resType                  = cudaResourceTypeLinear;
 162         rd.res.linear.devPtr        = nbp->coulomb_tab;
 163         rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 164         rd.res.linear.desc.x        = 32;
 165         rd.res.linear.sizeInBytes   = ic->tabq_size*sizeof(*coul_tab);
 166
 167         cudaTextureDesc td;
 168         memset(&td, 0, sizeof(td));
 169         td.readMode                 = cudaReadModeElementType;
 170         stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
 171         CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
 172     }
 173     else
 174 #endif  /* HAVE_CUDA_TEXOBJ_SUPPORT */
 175     {
 176         GMX_UNUSED_VALUE(dev_info);
 177         cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
 178         stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
 179                                coul_tab, &cd,
 180                                ic->tabq_size*sizeof(*coul_tab));
 181         CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
 182     }
 183
 184     cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab));
 185
 186     nbp->coulomb_tab_size     = ic->tabq_size;
 187     nbp->coulomb_tab_scale    = ic->tabq_scale;
 188 }
 189
 190
 191 /*! Initializes the atomdata structure first time, it only gets filled at
 192     pair-search. */
 193 static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
 194 {
 195     cudaError_t stat;
 196
 197     ad->ntypes  = ntypes;
 198     stat        = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
 199     CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
 200     ad->bShiftVecUploaded = false;
 201
 202     stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
 203     CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
 204
 205     stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
 206     CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
 207     stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
 208     CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
 209
 210     /* initialize to NULL poiters to data that is not allocated here and will
 211        need reallocation in nbnxn_cuda_init_atomdata */
 212     ad->xq = NULL;
 213     ad->f  = NULL;
 214
 215     /* size -1 indicates that the respective array hasn't been initialized yet */
 216     ad->natoms = -1;
 217     ad->nalloc = -1;
 218 }
 219
 220 /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
 221     earlier GPUs, single or twin cut-off. */
 222 static int pick_ewald_kernel_type(bool                     bTwinCut,
 223                                   const gmx_device_info_t *dev_info)
 224 {
 225     bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
 226     int  kernel_type;
 227
 228     /* Benchmarking/development environment variables to force the use of
 229        analytical or tabulated Ewald kernel. */
 230     bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
 231     bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
 232
 233     if (bForceAnalyticalEwald && bForceTabulatedEwald)
 234     {
 235         gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
 236                    "requested through environment variables.");
 237     }
 238
 239     /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
 240     if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
 241     {
 242         bUseAnalyticalEwald = true;
 243
 244         if (debug)
 245         {
 246             fprintf(debug, "Using analytical Ewald CUDA kernels\n");
 247         }
 248     }
 249     else
 250     {
 251         bUseAnalyticalEwald = false;
 252
 253         if (debug)
 254         {
 255             fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
 256         }
 257     }
 258
 259     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
 260        forces it (use it for debugging/benchmarking only). */
 261     if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
 262     {
 263         kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
 264     }
 265     else
 266     {
 267         kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
 268     }
 269
 270     return kernel_type;
 271 }
 272
 273 /*! Copies all parameters related to the cut-off from ic to nbp */
 274 static void set_cutoff_parameters(cu_nbparam_t              *nbp,
 275                                   const interaction_const_t *ic)
 276 {
 277     nbp->ewald_beta       = ic->ewaldcoeff_q;
 278     nbp->sh_ewald         = ic->sh_ewald;
 279     nbp->epsfac           = ic->epsfac;
 280     nbp->two_k_rf         = 2.0 * ic->k_rf;
 281     nbp->c_rf             = ic->c_rf;
 282     nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
 283     nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
 284     nbp->rlist_sq         = ic->rlist * ic->rlist;
 285
 286     nbp->sh_lj_ewald      = ic->sh_lj_ewald;
 287     nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
 288
 289     nbp->rvdw_switch      = ic->rvdw_switch;
 290     nbp->dispersion_shift = ic->dispersion_shift;
 291     nbp->repulsion_shift  = ic->repulsion_shift;
 292     nbp->vdw_switch       = ic->vdw_switch;
 293 }
 294
 295 /*! Initializes the nonbonded parameter data structure. */
 296 static void init_nbparam(cu_nbparam_t              *nbp,
 297                          const interaction_const_t *ic,
 298                          const nbnxn_atomdata_t    *nbat,
 299                          const gmx_device_info_t   *dev_info)
 300 {
 301     cudaError_t stat;
 302     int         ntypes, nnbfp, nnbfp_comb;
 303
 304     ntypes  = nbat->ntype;
 305
 306     set_cutoff_parameters(nbp, ic);
 307
 308     if (ic->vdwtype == evdwCUT)
 309     {
 310         switch (ic->vdw_modifier)
 311         {
 312             case eintmodNONE:
 313             case eintmodPOTSHIFT:
 314                 nbp->vdwtype = evdwCuCUT;
 315                 break;
 316             case eintmodFORCESWITCH:
 317                 nbp->vdwtype = evdwCuFSWITCH;
 318                 break;
 319             case eintmodPOTSWITCH:
 320                 nbp->vdwtype = evdwCuPSWITCH;
 321                 break;
 322             default:
 323                 gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
 324                 break;
 325         }
 326     }
 327     else if (ic->vdwtype == evdwPME)
 328     {
 329         if (ic->ljpme_comb_rule == ljcrGEOM)
 330         {
 331             assert(nbat->comb_rule == ljcrGEOM);
 332             nbp->vdwtype = evdwCuEWALDGEOM;
 333         }
 334         else
 335         {
 336             assert(nbat->comb_rule == ljcrLB);
 337             nbp->vdwtype = evdwCuEWALDLB;
 338         }
 339     }
 340     else
 341     {
 342         gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
 343     }
 344
 345     if (ic->eeltype == eelCUT)
 346     {
 347         nbp->eeltype = eelCuCUT;
 348     }
 349     else if (EEL_RF(ic->eeltype))
 350     {
 351         nbp->eeltype = eelCuRF;
 352     }
 353     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
 354     {
 355         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
 356         nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
 357     }
 358     else
 359     {
 360         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 361         gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
 362     }
 363
 364     /* generate table for PME */
 365     nbp->coulomb_tab = NULL;
 366     if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
 367     {
 368         init_ewald_coulomb_force_table(ic, nbp, dev_info);
 369     }
 370
 371     nnbfp      = 2*ntypes*ntypes;
 372     nnbfp_comb = 2*ntypes;
 373
 374     stat  = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
 375     CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
 376     cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 377
 378
 379     if (ic->vdwtype == evdwPME)
 380     {
 381         stat  = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 382         CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
 383         cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 384     }
 385
 386 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 387     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 388     if (use_texobj(dev_info))
 389     {
 390         cudaResourceDesc rd;
 391         cudaTextureDesc  td;
 392
 393         memset(&rd, 0, sizeof(rd));
 394         rd.resType                  = cudaResourceTypeLinear;
 395         rd.res.linear.devPtr        = nbp->nbfp;
 396         rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 397         rd.res.linear.desc.x        = 32;
 398         rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
 399
 400         memset(&td, 0, sizeof(td));
 401         td.readMode                 = cudaReadModeElementType;
 402         stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
 403         CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
 404
 405         if (ic->vdwtype == evdwPME)
 406         {
 407             memset(&rd, 0, sizeof(rd));
 408             rd.resType                  = cudaResourceTypeLinear;
 409             rd.res.linear.devPtr        = nbp->nbfp_comb;
 410             rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 411             rd.res.linear.desc.x        = 32;
 412             rd.res.linear.sizeInBytes   = nnbfp_comb*sizeof(*nbp->nbfp_comb);
 413
 414             memset(&td, 0, sizeof(td));
 415             td.readMode = cudaReadModeElementType;
 416             stat        = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
 417             CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
 418         }
 419     }
 420     else
 421 #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
 422     {
 423         cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
 424         stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
 425                                nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
 426         CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
 427
 428         if (ic->vdwtype == evdwPME)
 429         {
 430             stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
 431                                    nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 432             CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
 433         }
 434     }
 435 }
 436
 437 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
 438  *  electrostatic type switching to twin cut-off (or back) if needed. */
 439 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
 440                                         const interaction_const_t   *ic)
 441 {
 442     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
 443     {
 444         return;
 445     }
 446     gmx_nbnxn_cuda_t *nb    = nbv->gpu_nbv;
 447     cu_nbparam_t     *nbp   = nb->nbparam;
 448
 449     set_cutoff_parameters(nbp, ic);
 450
 451     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
 452                                                  nb->dev_info);
 453
 454     init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
 455 }
 456
 457 /*! Initializes the pair list data structure. */
 458 static void init_plist(cu_plist_t *pl)
 459 {
 460     /* initialize to NULL pointers to data that is not allocated here and will
 461        need reallocation in nbnxn_gpu_init_pairlist */
 462     pl->sci     = NULL;
 463     pl->cj4     = NULL;
 464     pl->excl    = NULL;
 465
 466     /* size -1 indicates that the respective array hasn't been initialized yet */
 467     pl->na_c        = -1;
 468     pl->nsci        = -1;
 469     pl->sci_nalloc  = -1;
 470     pl->ncj4        = -1;
 471     pl->cj4_nalloc  = -1;
 472     pl->nexcl       = -1;
 473     pl->excl_nalloc = -1;
 474     pl->bDoPrune    = false;
 475 }
 476
 477 /*! Initializes the timer data structure. */
 478 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
 479 {
 480     cudaError_t stat;
 481     int         eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
 482
 483     stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
 484     CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
 485     stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
 486     CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
 487
 488     /* The non-local counters/stream (second in the array) are needed only with DD. */
 489     for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
 490     {
 491         stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
 492         CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
 493         stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
 494         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
 495
 496
 497         stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
 498         CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
 499         stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
 500         CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
 501
 502         stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
 503         CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
 504         stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
 505         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
 506
 507         stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
 508         CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
 509         stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
 510         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
 511     }
 512 }
 513
 514 /*! Initializes the timings data structure. */
 515 static void init_timings(gmx_wallclock_gpu_t *t)
 516 {
 517     int i, j;
 518
 519     t->nb_h2d_t = 0.0;
 520     t->nb_d2h_t = 0.0;
 521     t->nb_c     = 0;
 522     t->pl_h2d_t = 0.0;
 523     t->pl_h2d_c = 0;
 524     for (i = 0; i < 2; i++)
 525     {
 526         for (j = 0; j < 2; j++)
 527         {
 528             t->ktime[i][j].t = 0.0;
 529             t->ktime[i][j].c = 0;
 530         }
 531     }
 532 }
 533
 534 /*! Initializes simulation constant data. */
 535 static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
 536                                   const interaction_const_t      *ic,
 537                                   const nonbonded_verlet_group_t *nbv_group)
 538 {
 539     init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
 540     init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
 541
 542     /* clear energy and shift force outputs */
 543     nbnxn_cuda_clear_e_fshift(nb);
 544 }
 545
 546 void nbnxn_gpu_init(FILE                      *fplog,
 547                     gmx_nbnxn_cuda_t         **p_nb,
 548                     const gmx_gpu_info_t      *gpu_info,
 549                     const gmx_gpu_opt_t       *gpu_opt,
 550                     const interaction_const_t *ic,
 551                     nonbonded_verlet_group_t  *nbv_grp,
 552                     int                        my_gpu_index,
 553                     int                        /*rank*/,
 554                     gmx_bool                   bLocalAndNonlocal)
 555 {
 556     cudaError_t       stat;
 557     gmx_nbnxn_cuda_t *nb;
 558     char              sbuf[STRLEN];
 559     bool              bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
 560     int               cuda_drv_ver;
 561
 562     assert(gpu_info);
 563
 564     if (p_nb == NULL)
 565     {
 566         return;
 567     }
 568
 569     snew(nb, 1);
 570     snew(nb->atdat, 1);
 571     snew(nb->nbparam, 1);
 572     snew(nb->plist[eintLocal], 1);
 573     if (bLocalAndNonlocal)
 574     {
 575         snew(nb->plist[eintNonlocal], 1);
 576     }
 577
 578     nb->bUseTwoStreams = bLocalAndNonlocal;
 579
 580     snew(nb->timers, 1);
 581     snew(nb->timings, 1);
 582
 583     /* init nbst */
 584     pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
 585     pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
 586     pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
 587
 588     init_plist(nb->plist[eintLocal]);
 589
 590     /* set device info, just point it to the right GPU among the detected ones */
 591     nb->dev_info = &gpu_info->gpu_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
 592
 593     /* local/non-local GPU streams */
 594     stat = cudaStreamCreate(&nb->stream[eintLocal]);
 595     CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
 596     if (nb->bUseTwoStreams)
 597     {
 598         init_plist(nb->plist[eintNonlocal]);
 599
 600         /* CUDA stream priority available in the CUDA RT 5.5 API.
 601          * Note that the device we're running on does not have to support
 602          * priorities, because we are querying the priority range which in this
 603          * case will be a single value.
 604          */
 605 #if GMX_CUDA_VERSION >= 5050
 606         {
 607             int highest_priority;
 608             stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
 609             CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
 610
 611             stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
 612                                                 cudaStreamDefault,
 613                                                 highest_priority);
 614             CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
 615         }
 616 #else
 617         stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
 618         CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
 619 #endif
 620     }
 621
 622     /* init events for sychronization (timing disabled for performance reasons!) */
 623     stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
 624     CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
 625     stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
 626     CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
 627
 628     /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
 629      * (which increases with shorter time/step) caused by a known CUDA driver bug.
 630      * To work around the issue we'll use an (admittedly fragile) memory polling
 631      * waiting to preserve performance. This requires support for atomic
 632      * operations and only works on x86/x86_64.
 633      * With polling wait event-timing also needs to be disabled.
 634      *
 635      * The overhead is greatly reduced in API v5.0 drivers and the improvement
 636      * is independent of runtime version. Hence, with API v5.0 drivers and later
 637      * we won't switch to polling.
 638      *
 639      * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI,
 640      * ranks so we will also disable it in that case.
 641      */
 642
 643     bStreamSync    = getenv("GMX_CUDA_STREAMSYNC") != NULL;
 644     bNoStreamSync  = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
 645
 646 #ifdef TMPI_ATOMICS
 647     bTMPIAtomics = true;
 648 #else
 649     bTMPIAtomics = false;
 650 #endif
 651
 652 #ifdef GMX_TARGET_X86
 653     bX86 = true;
 654 #else
 655     bX86 = false;
 656 #endif
 657
 658     if (bStreamSync && bNoStreamSync)
 659     {
 660         gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
 661     }
 662
 663     stat = cudaDriverGetVersion(&cuda_drv_ver);
 664     CU_RET_ERR(stat, "cudaDriverGetVersion failed");
 665
 666     bOldDriver = (cuda_drv_ver < 5000);
 667
 668     if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
 669     {
 670         /* Polling wait should be used instead of cudaStreamSynchronize only if:
 671          *   - ECC is ON & driver is old (checked above),
 672          *   - we're on x86/x86_64,
 673          *   - atomics are available, and
 674          *   - GPUs are not being shared.
 675          */
 676         bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
 677                                    (gmx_count_gpu_dev_shared(gpu_opt) < 1));
 678
 679         if (bStreamSync)
 680         {
 681             nb->bUseStreamSync = true;
 682
 683             /* only warn if polling should be used */
 684             if (bShouldUsePollSync)
 685             {
 686                 md_print_warn(fplog,
 687                               "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
 688                               "      cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
 689             }
 690         }
 691         else
 692         {
 693             nb->bUseStreamSync = !bShouldUsePollSync;
 694
 695             if (bShouldUsePollSync)
 696             {
 697                 md_print_warn(fplog,
 698                               "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
 699                               "      cause performance loss. Switching to the alternative polling GPU wait.\n"
 700                               "      If you encounter issues, switch back to standard GPU waiting by setting\n"
 701                               "      the GMX_CUDA_STREAMSYNC environment variable.\n");
 702             }
 703             else
 704             {
 705                 /* Tell the user that the ECC+old driver combination can be bad */
 706                 sprintf(sbuf,
 707                         "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
 708                         "      A known bug in this driver version can cause performance loss.\n"
 709                         "      However, the polling wait workaround can not be used because\n%s\n"
 710                         "      Consider updating the driver or turning ECC off.",
 711                         (bX86 && bTMPIAtomics) ?
 712                         "      GPU(s) are being oversubscribed." :
 713                         "      atomic operations are not supported by the platform/CPU+compiler.");
 714                 md_print_warn(fplog, sbuf);
 715             }
 716         }
 717     }
 718     else
 719     {
 720         if (bNoStreamSync)
 721         {
 722             nb->bUseStreamSync = false;
 723
 724             md_print_warn(fplog,
 725                           "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
 726         }
 727         else
 728         {
 729             /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
 730             nb->bUseStreamSync = true;
 731         }
 732     }
 733
 734     /* CUDA timing disabled as event timers don't work:
 735        - with multiple streams = domain-decomposition;
 736        - with the polling waiting hack (without cudaStreamSynchronize);
 737        - when turned off by GMX_DISABLE_CUDA_TIMING.
 738      */
 739     nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
 740                    (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
 741
 742     if (nb->bDoTime)
 743     {
 744         init_timers(nb->timers, nb->bUseTwoStreams);
 745         init_timings(nb->timings);
 746     }
 747
 748     /* set the kernel type for the current GPU */
 749     /* pick L1 cache configuration */
 750     nbnxn_cuda_set_cacheconfig(nb->dev_info);
 751
 752     nbnxn_cuda_init_const(nb, ic, nbv_grp);
 753
 754     *p_nb = nb;
 755
 756     if (debug)
 757     {
 758         fprintf(debug, "Initialized CUDA data structures.\n");
 759     }
 760 }
 761
 762 void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
 763                              const nbnxn_pairlist_t *h_plist,
 764                              int                     iloc)
 765 {
 766     char          sbuf[STRLEN];
 767     cudaError_t   stat;
 768     bool          bDoTime    = nb->bDoTime;
 769     cudaStream_t  stream     = nb->stream[iloc];
 770     cu_plist_t   *d_plist    = nb->plist[iloc];
 771
 772     if (d_plist->na_c < 0)
 773     {
 774         d_plist->na_c = h_plist->na_ci;
 775     }
 776     else
 777     {
 778         if (d_plist->na_c != h_plist->na_ci)
 779         {
 780             sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 781                     d_plist->na_c, h_plist->na_ci);
 782             gmx_incons(sbuf);
 783         }
 784     }
 785
 786     if (bDoTime)
 787     {
 788         stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
 789         CU_RET_ERR(stat, "cudaEventRecord failed");
 790     }
 791
 792     cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
 793                         &d_plist->nsci, &d_plist->sci_nalloc,
 794                         h_plist->nsci,
 795                         stream, true);
 796
 797     cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
 798                         &d_plist->ncj4, &d_plist->cj4_nalloc,
 799                         h_plist->ncj4,
 800                         stream, true);
 801
 802     cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
 803                         &d_plist->nexcl, &d_plist->excl_nalloc,
 804                         h_plist->nexcl,
 805                         stream, true);
 806
 807     if (bDoTime)
 808     {
 809         stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
 810         CU_RET_ERR(stat, "cudaEventRecord failed");
 811     }
 812
 813     /* need to prune the pair list during the next step */
 814     d_plist->bDoPrune = true;
 815 }
 816
 817 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t       *nb,
 818                                const nbnxn_atomdata_t *nbatom)
 819 {
 820     cu_atomdata_t *adat  = nb->atdat;
 821     cudaStream_t   ls    = nb->stream[eintLocal];
 822
 823     /* only if we have a dynamic box */
 824     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 825     {
 826         cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
 827                           SHIFTS * sizeof(*adat->shift_vec), ls);
 828         adat->bShiftVecUploaded = true;
 829     }
 830 }
 831
 832 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
 833 static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
 834 {
 835     cudaError_t    stat;
 836     cu_atomdata_t *adat  = nb->atdat;
 837     cudaStream_t   ls    = nb->stream[eintLocal];
 838
 839     stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
 840     CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
 841 }
 842
 843 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
 844 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
 845 {
 846     cudaError_t    stat;
 847     cu_atomdata_t *adat  = nb->atdat;
 848     cudaStream_t   ls    = nb->stream[eintLocal];
 849
 850     stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
 851     CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
 852     stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
 853     CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
 854     stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
 855     CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
 856 }
 857
 858 void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
 859 {
 860     nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
 861     /* clear shift force array and energies if the outputs were
 862        used in the current step */
 863     if (flags & GMX_FORCE_VIRIAL)
 864     {
 865         nbnxn_cuda_clear_e_fshift(nb);
 866     }
 867 }
 868
 869 void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
 870                              const struct nbnxn_atomdata_t *nbat)
 871 {
 872     cudaError_t    stat;
 873     int            nalloc, natoms;
 874     bool           realloced;
 875     bool           bDoTime   = nb->bDoTime;
 876     cu_timers_t   *timers    = nb->timers;
 877     cu_atomdata_t *d_atdat   = nb->atdat;
 878     cudaStream_t   ls        = nb->stream[eintLocal];
 879
 880     natoms    = nbat->natoms;
 881     realloced = false;
 882
 883     if (bDoTime)
 884     {
 885         /* time async copy */
 886         stat = cudaEventRecord(timers->start_atdat, ls);
 887         CU_RET_ERR(stat, "cudaEventRecord failed");
 888     }
 889
 890     /* need to reallocate if we have to copy more atoms than the amount of space
 891        available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
 892     if (natoms > d_atdat->nalloc)
 893     {
 894         nalloc = over_alloc_small(natoms);
 895
 896         /* free up first if the arrays have already been initialized */
 897         if (d_atdat->nalloc != -1)
 898         {
 899             cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
 900             cu_free_buffered(d_atdat->xq);
 901             cu_free_buffered(d_atdat->atom_types);
 902         }
 903
 904         stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
 905         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
 906         stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
 907         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
 908
 909         stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
 910         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
 911
 912         d_atdat->nalloc = nalloc;
 913         realloced       = true;
 914     }
 915
 916     d_atdat->natoms       = natoms;
 917     d_atdat->natoms_local = nbat->natoms_local;
 918
 919     /* need to clear GPU f output if realloc happened */
 920     if (realloced)
 921     {
 922         nbnxn_cuda_clear_f(nb, nalloc);
 923     }
 924
 925     cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
 926                       natoms*sizeof(*d_atdat->atom_types), ls);
 927
 928     if (bDoTime)
 929     {
 930         stat = cudaEventRecord(timers->stop_atdat, ls);
 931         CU_RET_ERR(stat, "cudaEventRecord failed");
 932     }
 933 }
 934
 935 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
 936                                           const gmx_device_info_t *dev_info)
 937 {
 938     cudaError_t stat;
 939
 940     if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
 941     {
 942 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 943         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 944         if (use_texobj(dev_info))
 945         {
 946             stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
 947             CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
 948         }
 949         else
 950 #endif
 951         {
 952             GMX_UNUSED_VALUE(dev_info);
 953             stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
 954             CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
 955         }
 956         cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
 957     }
 958 }
 959
 960 void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
 961 {
 962     cudaError_t      stat;
 963     cu_atomdata_t   *atdat;
 964     cu_nbparam_t    *nbparam;
 965     cu_plist_t      *plist, *plist_nl;
 966     cu_timers_t     *timers;
 967
 968     /* Stopping the nvidia profiler here allows us to eliminate the subsequent
 969        uninitialization API calls from the trace. */
 970     if (getenv("NVPROF_ID") != NULL)
 971     {
 972         stat = cudaProfilerStop();
 973         CU_RET_ERR(stat, "cudaProfilerStop failed");
 974     }
 975
 976     if (nb == NULL)
 977     {
 978         return;
 979     }
 980
 981     atdat       = nb->atdat;
 982     nbparam     = nb->nbparam;
 983     plist       = nb->plist[eintLocal];
 984     plist_nl    = nb->plist[eintNonlocal];
 985     timers      = nb->timers;
 986
 987     nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);
 988
 989     stat = cudaEventDestroy(nb->nonlocal_done);
 990     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
 991     stat = cudaEventDestroy(nb->misc_ops_done);
 992     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
 993
 994     if (nb->bDoTime)
 995     {
 996         stat = cudaEventDestroy(timers->start_atdat);
 997         CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
 998         stat = cudaEventDestroy(timers->stop_atdat);
 999         CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
1000
1001         /* The non-local counters/stream (second in the array) are needed only with DD. */
1002         for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
1003         {
1004             stat = cudaEventDestroy(timers->start_nb_k[i]);
1005             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
1006             stat = cudaEventDestroy(timers->stop_nb_k[i]);
1007             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
1008
1009             stat = cudaEventDestroy(timers->start_pl_h2d[i]);
1010             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
1011             stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
1012             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
1013
1014             stat = cudaStreamDestroy(nb->stream[i]);
1015             CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
1016
1017             stat = cudaEventDestroy(timers->start_nb_h2d[i]);
1018             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
1019             stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
1020             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
1021
1022             stat = cudaEventDestroy(timers->start_nb_d2h[i]);
1023             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
1024             stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
1025             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
1026         }
1027     }
1028
1029 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1030     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1031     if (use_texobj(nb->dev_info))
1032     {
1033         stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
1034         CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
1035     }
1036     else
1037 #endif
1038     {
1039         stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
1040         CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
1041     }
1042     cu_free_buffered(nbparam->nbfp);
1043
1044     if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
1045     {
1046 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1047         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1048         if (use_texobj(nb->dev_info))
1049         {
1050             stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
1051             CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
1052         }
1053         else
1054 #endif
1055         {
1056             stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
1057             CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
1058         }
1059         cu_free_buffered(nbparam->nbfp_comb);
1060     }
1061
1062     stat = cudaFree(atdat->shift_vec);
1063     CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
1064     stat = cudaFree(atdat->fshift);
1065     CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
1066
1067     stat = cudaFree(atdat->e_lj);
1068     CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
1069     stat = cudaFree(atdat->e_el);
1070     CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
1071
1072     cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
1073     cu_free_buffered(atdat->xq);
1074     cu_free_buffered(atdat->atom_types, &atdat->ntypes);
1075
1076     cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
1077     cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
1078     cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
1079     if (nb->bUseTwoStreams)
1080     {
1081         cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
1082         cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
1083         cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist->excl_nalloc);
1084     }
1085
1086     sfree(atdat);
1087     sfree(nbparam);
1088     sfree(plist);
1089     if (nb->bUseTwoStreams)
1090     {
1091         sfree(plist_nl);
1092     }
1093     sfree(timers);
1094     sfree(nb->timings);
1095     sfree(nb);
1096
1097     if (debug)
1098     {
1099         fprintf(debug, "Cleaned up CUDA data structures.\n");
1100     }
1101 }
1102
1103 void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
1104 {
1105     cudaError_t  stat;
1106     cudaStream_t stream = nb->stream[iloc];
1107
1108     stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
1109     CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
1110 }
1111
1112 gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
1113 {
1114     return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
1115 }
1116
1117 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
1118 {
1119     /* The NVPROF_ID environment variable is set by nvprof and indicates that
1120        mdrun is executed in the CUDA profiler.
1121        If nvprof was run is with "--profile-from-start off", the profiler will
1122        be started here. This way we can avoid tracing the CUDA events from the
1123        first part of the run. Starting the profiler again does nothing.
1124      */
1125     if (getenv("NVPROF_ID") != NULL)
1126     {
1127         cudaError_t stat;
1128         stat = cudaProfilerStart();
1129         CU_RET_ERR(stat, "cudaProfilerStart failed");
1130     }
1131
1132     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1133     {
1134         init_timings(nbv->gpu_nbv->timings);
1135     }
1136 }
1137
1138 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
1139 {
1140     return nb != NULL ?
1141            gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
1142
1143 }
1144
1145 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
1146 {
1147     return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
1148             (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
1149 }