src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifdef HAVE_CONFIG_H
  36 #include <config.h>
  37 #endif
  38
  39 #include <stdlib.h>
  40 #include <stdio.h>
  41 #include <assert.h>
  42
  43 #include <cuda.h>
  44
  45 #include "gmx_fatal.h"
  46 #include "smalloc.h"
  47 #include "tables.h"
  48 #include "typedefs.h"
  49 #include "types/enums.h"
  50 #include "types/nb_verlet.h"
  51 #include "types/interaction_const.h"
  52 #include "types/force_flags.h"
  53 #include "../nbnxn_consts.h"
  54 #include "gmx_detect_hardware.h"
  55
  56 #include "nbnxn_cuda_types.h"
  57 #include "../../gmxlib/cuda_tools/cudautils.cuh"
  58 #include "nbnxn_cuda_data_mgmt.h"
  59 #include "pmalloc_cuda.h"
  60 #include "gpu_utils.h"
  61
  62 #include "gromacs/utility/common.h"
  63
  64 static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
  65
  66 /* This is a heuristically determined parameter for the Fermi architecture for
  67  * the minimum size of ci lists by multiplying this constant with the # of
  68  * multiprocessors on the current device.
  69  */
  70 static unsigned int gpu_min_ci_balanced_factor = 40;
  71
  72 /* Functions from nbnxn_cuda.cu */
  73 extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
  74 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
  75 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
  76 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
  77
  78 /* We should actually be using md_print_warn in md_logging.c,
  79  * but we can't include mpi.h in CUDA code.
  80  */
  81 static void md_print_warn(FILE       *fplog,
  82                           const char *fmt, ...)
  83 {
  84     va_list ap;
  85
  86     if (fplog != NULL)
  87     {
  88         /* We should only print to stderr on the master node,
  89          * in most cases fplog is only set on the master node, so this works.
  90          */
  91         va_start(ap, fmt);
  92         fprintf(stderr, "\n");
  93         vfprintf(stderr, fmt, ap);
  94         fprintf(stderr, "\n");
  95         va_end(ap);
  96
  97         va_start(ap, fmt);
  98         fprintf(fplog, "\n");
  99         vfprintf(fplog, fmt, ap);
 100         fprintf(fplog, "\n");
 101         va_end(ap);
 102     }
 103 }
 104
 105
 106 /* Fw. decl. */
 107 static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
 108
 109
 110 /*! Tabulates the Ewald Coulomb force and initializes the size/scale
 111     and the table GPU array. If called with an already allocated table,
 112     it just re-uploads the table.
 113  */
 114 static void init_ewald_coulomb_force_table(cu_nbparam_t          *nbp,
 115                                            const cuda_dev_info_t *dev_info)
 116 {
 117     float       *ftmp, *coul_tab;
 118     int          tabsize;
 119     double       tabscale;
 120     cudaError_t  stat;
 121
 122     tabsize     = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
 123     /* Subtract 2 iso 1 to avoid access out of range due to rounding */
 124     tabscale    = (tabsize - 2) / sqrt(nbp->rcoulomb_sq);
 125
 126     pmalloc((void**)&ftmp, tabsize*sizeof(*ftmp));
 127
 128     table_spline3_fill_ewald_lr(ftmp, NULL, NULL, tabsize,
 129                                 1/tabscale, nbp->ewald_beta);
 130
 131     /* If the table pointer == NULL the table is generated the first time =>
 132        the array pointer will be saved to nbparam and the texture is bound.
 133      */
 134     coul_tab = nbp->coulomb_tab;
 135     if (coul_tab == NULL)
 136     {
 137         stat = cudaMalloc((void **)&coul_tab, tabsize*sizeof(*coul_tab));
 138         CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
 139
 140         nbp->coulomb_tab = coul_tab;
 141
 142 #ifdef TEXOBJ_SUPPORTED
 143         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 144         if (dev_info->prop.major >= 3)
 145         {
 146             cudaResourceDesc rd;
 147             memset(&rd, 0, sizeof(rd));
 148             rd.resType                  = cudaResourceTypeLinear;
 149             rd.res.linear.devPtr        = nbp->coulomb_tab;
 150             rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 151             rd.res.linear.desc.x        = 32;
 152             rd.res.linear.sizeInBytes   = tabsize*sizeof(*coul_tab);
 153
 154             cudaTextureDesc td;
 155             memset(&td, 0, sizeof(td));
 156             td.readMode                 = cudaReadModeElementType;
 157             stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
 158             CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
 159         }
 160         else
 161 #endif
 162         {
 163             GMX_UNUSED_VALUE(dev_info);
 164             cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
 165             stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
 166                                    coul_tab, &cd, tabsize*sizeof(*coul_tab));
 167             CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
 168         }
 169     }
 170
 171     cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab));
 172
 173     nbp->coulomb_tab_size     = tabsize;
 174     nbp->coulomb_tab_scale    = tabscale;
 175
 176     pfree(ftmp);
 177 }
 178
 179
 180 /*! Initializes the atomdata structure first time, it only gets filled at
 181     pair-search. */
 182 static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
 183 {
 184     cudaError_t stat;
 185
 186     ad->ntypes  = ntypes;
 187     stat        = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
 188     CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
 189     ad->bShiftVecUploaded = false;
 190
 191     stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
 192     CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
 193
 194     stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
 195     CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
 196     stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
 197     CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
 198
 199     /* initialize to NULL poiters to data that is not allocated here and will
 200        need reallocation in nbnxn_cuda_init_atomdata */
 201     ad->xq = NULL;
 202     ad->f  = NULL;
 203
 204     /* size -1 indicates that the respective array hasn't been initialized yet */
 205     ad->natoms = -1;
 206     ad->nalloc = -1;
 207 }
 208
 209 /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
 210     earlier GPUs, single or twin cut-off. */
 211 static int pick_ewald_kernel_type(bool                   bTwinCut,
 212                                   const cuda_dev_info_t *dev_info)
 213 {
 214     bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
 215     int  kernel_type;
 216
 217     /* Benchmarking/development environment variables to force the use of
 218        analytical or tabulated Ewald kernel. */
 219     bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
 220     bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
 221
 222     if (bForceAnalyticalEwald && bForceTabulatedEwald)
 223     {
 224         gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
 225                    "requested through environment variables.");
 226     }
 227
 228     /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
 229     if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
 230     {
 231         bUseAnalyticalEwald = true;
 232
 233         if (debug)
 234         {
 235             fprintf(debug, "Using analytical Ewald CUDA kernels\n");
 236         }
 237     }
 238     else
 239     {
 240         bUseAnalyticalEwald = false;
 241
 242         if (debug)
 243         {
 244             fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
 245         }
 246     }
 247
 248     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
 249        forces it (use it for debugging/benchmarking only). */
 250     if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
 251     {
 252         kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
 253     }
 254     else
 255     {
 256         kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
 257     }
 258
 259     return kernel_type;
 260 }
 261
 262
 263 /*! Initializes the nonbonded parameter data structure. */
 264 static void init_nbparam(cu_nbparam_t              *nbp,
 265                          const interaction_const_t *ic,
 266                          const nbnxn_atomdata_t    *nbat,
 267                          const cuda_dev_info_t     *dev_info)
 268 {
 269     cudaError_t stat;
 270     int         ntypes, nnbfp, nnbfp_comb;
 271
 272     ntypes  = nbat->ntype;
 273
 274     nbp->ewald_beta       = ic->ewaldcoeff_q;
 275     nbp->sh_ewald         = ic->sh_ewald;
 276     nbp->epsfac           = ic->epsfac;
 277     nbp->two_k_rf         = 2.0 * ic->k_rf;
 278     nbp->c_rf             = ic->c_rf;
 279     nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
 280     nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
 281     nbp->rlist_sq         = ic->rlist * ic->rlist;
 282
 283     nbp->sh_lj_ewald      = ic->sh_lj_ewald;
 284     nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
 285
 286     nbp->rvdw_switch      = ic->rvdw_switch;
 287     nbp->dispersion_shift = ic->dispersion_shift;
 288     nbp->repulsion_shift  = ic->repulsion_shift;
 289     nbp->vdw_switch       = ic->vdw_switch;
 290
 291     if (ic->vdwtype == evdwCUT)
 292     {
 293         switch (ic->vdw_modifier)
 294         {
 295             case eintmodNONE:
 296             case eintmodPOTSHIFT:
 297                 nbp->vdwtype = evdwCuCUT;
 298                 break;
 299             case eintmodFORCESWITCH:
 300                 nbp->vdwtype = evdwCuFSWITCH;
 301                 break;
 302             case eintmodPOTSWITCH:
 303                 nbp->vdwtype = evdwCuPSWITCH;
 304                 break;
 305             default:
 306                 gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
 307                 break;
 308         }
 309     }
 310     else if (ic->vdwtype == evdwPME)
 311     {
 312         if (ic->ljpme_comb_rule == ljcrGEOM)
 313         {
 314             assert(nbat->comb_rule == ljcrGEOM);
 315             nbp->vdwtype = evdwCuEWALDGEOM;
 316         }
 317         else
 318         {
 319             assert(nbat->comb_rule == ljcrLB);
 320             nbp->vdwtype = evdwCuEWALDLB;
 321         }
 322     }
 323     else
 324     {
 325         gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
 326     }
 327
 328     if (ic->eeltype == eelCUT)
 329     {
 330         nbp->eeltype = eelCuCUT;
 331     }
 332     else if (EEL_RF(ic->eeltype))
 333     {
 334         nbp->eeltype = eelCuRF;
 335     }
 336     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
 337     {
 338         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
 339         nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
 340     }
 341     else
 342     {
 343         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 344         gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
 345     }
 346
 347     /* generate table for PME */
 348     nbp->coulomb_tab = NULL;
 349     if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
 350     {
 351         init_ewald_coulomb_force_table(nbp, dev_info);
 352     }
 353
 354     nnbfp      = 2*ntypes*ntypes;
 355     nnbfp_comb = 2*ntypes;
 356
 357     stat  = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
 358     CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
 359     cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 360
 361
 362     if (ic->vdwtype == evdwPME)
 363     {
 364         stat  = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 365         CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
 366         cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 367     }
 368
 369 #ifdef TEXOBJ_SUPPORTED
 370     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 371     if (dev_info->prop.major >= 3)
 372     {
 373         cudaResourceDesc rd;
 374         cudaTextureDesc  td;
 375
 376         memset(&rd, 0, sizeof(rd));
 377         rd.resType                  = cudaResourceTypeLinear;
 378         rd.res.linear.devPtr        = nbp->nbfp;
 379         rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 380         rd.res.linear.desc.x        = 32;
 381         rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
 382
 383         memset(&td, 0, sizeof(td));
 384         td.readMode                 = cudaReadModeElementType;
 385         stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
 386         CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
 387
 388         if (ic->vdwtype == evdwPME)
 389         {
 390             memset(&rd, 0, sizeof(rd));
 391             rd.resType                  = cudaResourceTypeLinear;
 392             rd.res.linear.devPtr        = nbp->nbfp_comb;
 393             rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
 394             rd.res.linear.desc.x        = 32;
 395             rd.res.linear.sizeInBytes   = nnbfp_comb*sizeof(*nbp->nbfp_comb);
 396
 397             memset(&td, 0, sizeof(td));
 398             td.readMode = cudaReadModeElementType;
 399             stat        = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
 400             CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
 401         }
 402     }
 403     else
 404 #endif
 405     {
 406         cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
 407         stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
 408                                nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
 409         CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
 410
 411         if (ic->vdwtype == evdwPME)
 412         {
 413             stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
 414                                    nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
 415             CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
 416         }
 417     }
 418 }
 419
 420 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
 421  *  electrostatic type switching to twin cut-off (or back) if needed. */
 422 void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t           cu_nb,
 423                                          const interaction_const_t *ic)
 424 {
 425     cu_nbparam_t *nbp = cu_nb->nbparam;
 426
 427     nbp->rlist_sq       = ic->rlist * ic->rlist;
 428     nbp->rcoulomb_sq    = ic->rcoulomb * ic->rcoulomb;
 429     nbp->ewald_beta     = ic->ewaldcoeff_q;
 430
 431     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
 432                                                  cu_nb->dev_info);
 433
 434     init_ewald_coulomb_force_table(cu_nb->nbparam, cu_nb->dev_info);
 435 }
 436
 437 /*! Initializes the pair list data structure. */
 438 static void init_plist(cu_plist_t *pl)
 439 {
 440     /* initialize to NULL pointers to data that is not allocated here and will
 441        need reallocation in nbnxn_cuda_init_pairlist */
 442     pl->sci     = NULL;
 443     pl->cj4     = NULL;
 444     pl->excl    = NULL;
 445
 446     /* size -1 indicates that the respective array hasn't been initialized yet */
 447     pl->na_c        = -1;
 448     pl->nsci        = -1;
 449     pl->sci_nalloc  = -1;
 450     pl->ncj4        = -1;
 451     pl->cj4_nalloc  = -1;
 452     pl->nexcl       = -1;
 453     pl->excl_nalloc = -1;
 454     pl->bDoPrune    = false;
 455 }
 456
 457 /*! Initializes the timer data structure. */
 458 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
 459 {
 460     cudaError_t stat;
 461     int         eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
 462
 463     stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
 464     CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
 465     stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
 466     CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
 467
 468     /* The non-local counters/stream (second in the array) are needed only with DD. */
 469     for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
 470     {
 471         stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
 472         CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
 473         stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
 474         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
 475
 476
 477         stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
 478         CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
 479         stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
 480         CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
 481
 482         stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
 483         CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
 484         stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
 485         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
 486
 487         stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
 488         CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
 489         stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
 490         CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
 491     }
 492 }
 493
 494 /*! Initializes the timings data structure. */
 495 static void init_timings(wallclock_gpu_t *t)
 496 {
 497     int i, j;
 498
 499     t->nb_h2d_t = 0.0;
 500     t->nb_d2h_t = 0.0;
 501     t->nb_c     = 0;
 502     t->pl_h2d_t = 0.0;
 503     t->pl_h2d_c = 0;
 504     for (i = 0; i < 2; i++)
 505     {
 506         for (j = 0; j < 2; j++)
 507         {
 508             t->ktime[i][j].t = 0.0;
 509             t->ktime[i][j].c = 0;
 510         }
 511     }
 512 }
 513
 514 void nbnxn_cuda_init(FILE                 *fplog,
 515                      nbnxn_cuda_ptr_t     *p_cu_nb,
 516                      const gmx_gpu_info_t *gpu_info,
 517                      const gmx_gpu_opt_t  *gpu_opt,
 518                      int                   my_gpu_index,
 519                      gmx_bool              bLocalAndNonlocal)
 520 {
 521     cudaError_t       stat;
 522     nbnxn_cuda_ptr_t  nb;
 523     char              sbuf[STRLEN];
 524     bool              bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
 525     int               cuda_drv_ver;
 526
 527     assert(gpu_info);
 528
 529     if (p_cu_nb == NULL)
 530     {
 531         return;
 532     }
 533
 534     snew(nb, 1);
 535     snew(nb->atdat, 1);
 536     snew(nb->nbparam, 1);
 537     snew(nb->plist[eintLocal], 1);
 538     if (bLocalAndNonlocal)
 539     {
 540         snew(nb->plist[eintNonlocal], 1);
 541     }
 542
 543     nb->bUseTwoStreams = bLocalAndNonlocal;
 544
 545     snew(nb->timers, 1);
 546     snew(nb->timings, 1);
 547
 548     /* init nbst */
 549     pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
 550     pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
 551     pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
 552
 553     init_plist(nb->plist[eintLocal]);
 554
 555     /* set device info, just point it to the right GPU among the detected ones */
 556     nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
 557
 558     /* local/non-local GPU streams */
 559     stat = cudaStreamCreate(&nb->stream[eintLocal]);
 560     CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
 561     if (nb->bUseTwoStreams)
 562     {
 563         init_plist(nb->plist[eintNonlocal]);
 564
 565         /* CUDA stream priority available in the CUDA RT 5.5 API.
 566          * Note that the device we're running on does not have to support
 567          * priorities, because we are querying the priority range which in this
 568          * case will be a single value.
 569          */
 570 #if CUDA_VERSION >= 5500
 571         {
 572             int highest_priority;
 573             stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
 574             CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
 575
 576             stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
 577                                                 cudaStreamDefault,
 578                                                 highest_priority);
 579             CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
 580         }
 581 #else
 582         stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
 583         CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
 584 #endif
 585     }
 586
 587     /* init events for sychronization (timing disabled for performance reasons!) */
 588     stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
 589     CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
 590     stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
 591     CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
 592
 593     /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
 594      * (which increases with shorter time/step) caused by a known CUDA driver bug.
 595      * To work around the issue we'll use an (admittedly fragile) memory polling
 596      * waiting to preserve performance. This requires support for atomic
 597      * operations and only works on x86/x86_64.
 598      * With polling wait event-timing also needs to be disabled.
 599      *
 600      * The overhead is greatly reduced in API v5.0 drivers and the improvement
 601      * is independent of runtime version. Hence, with API v5.0 drivers and later
 602      * we won't switch to polling.
 603      *
 604      * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI,
 605      * ranks so we will also disable it in that case.
 606      */
 607
 608     bStreamSync    = getenv("GMX_CUDA_STREAMSYNC") != NULL;
 609     bNoStreamSync  = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
 610
 611 #ifdef TMPI_ATOMICS
 612     bTMPIAtomics = true;
 613 #else
 614     bTMPIAtomics = false;
 615 #endif
 616
 617 #ifdef GMX_TARGET_X86
 618     bX86 = true;
 619 #else
 620     bX86 = false;
 621 #endif
 622
 623     if (bStreamSync && bNoStreamSync)
 624     {
 625         gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
 626     }
 627
 628     stat = cudaDriverGetVersion(&cuda_drv_ver);
 629     CU_RET_ERR(stat, "cudaDriverGetVersion failed");
 630
 631     bOldDriver = (cuda_drv_ver < 5000);
 632
 633     if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
 634     {
 635         /* Polling wait should be used instead of cudaStreamSynchronize only if:
 636          *   - ECC is ON & driver is old (checked above),
 637          *   - we're on x86/x86_64,
 638          *   - atomics are available, and
 639          *   - GPUs are not being shared.
 640          */
 641         bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
 642                                    (gmx_count_gpu_dev_shared(gpu_opt) < 1));
 643
 644         if (bStreamSync)
 645         {
 646             nb->bUseStreamSync = true;
 647
 648             /* only warn if polling should be used */
 649             if (bShouldUsePollSync)
 650             {
 651                 md_print_warn(fplog,
 652                               "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
 653                               "      cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
 654             }
 655         }
 656         else
 657         {
 658             nb->bUseStreamSync = !bShouldUsePollSync;
 659
 660             if (bShouldUsePollSync)
 661             {
 662                 md_print_warn(fplog,
 663                               "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
 664                               "      cause performance loss. Switching to the alternative polling GPU wait.\n"
 665                               "      If you encounter issues, switch back to standard GPU waiting by setting\n"
 666                               "      the GMX_CUDA_STREAMSYNC environment variable.\n");
 667             }
 668             else
 669             {
 670                 /* Tell the user that the ECC+old driver combination can be bad */
 671                 sprintf(sbuf,
 672                         "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
 673                         "      A known bug in this driver version can cause performance loss.\n"
 674                         "      However, the polling wait workaround can not be used because\n%s\n"
 675                         "      Consider updating the driver or turning ECC off.",
 676                         (bX86 && bTMPIAtomics) ?
 677                         "      GPU(s) are being oversubscribed." :
 678                         "      atomic operations are not supported by the platform/CPU+compiler.");
 679                 md_print_warn(fplog, sbuf);
 680             }
 681         }
 682     }
 683     else
 684     {
 685         if (bNoStreamSync)
 686         {
 687             nb->bUseStreamSync = false;
 688
 689             md_print_warn(fplog,
 690                           "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
 691         }
 692         else
 693         {
 694             /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
 695             nb->bUseStreamSync = true;
 696         }
 697     }
 698
 699     /* CUDA timing disabled as event timers don't work:
 700        - with multiple streams = domain-decomposition;
 701        - with the polling waiting hack (without cudaStreamSynchronize);
 702        - when turned off by GMX_DISABLE_CUDA_TIMING.
 703      */
 704     nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
 705                    (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
 706
 707     if (nb->bDoTime)
 708     {
 709         init_timers(nb->timers, nb->bUseTwoStreams);
 710         init_timings(nb->timings);
 711     }
 712
 713     /* set the kernel type for the current GPU */
 714     /* pick L1 cache configuration */
 715     nbnxn_cuda_set_cacheconfig(nb->dev_info);
 716
 717     *p_cu_nb = nb;
 718
 719     if (debug)
 720     {
 721         fprintf(debug, "Initialized CUDA data structures.\n");
 722     }
 723 }
 724
 725 void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t                cu_nb,
 726                            const interaction_const_t      *ic,
 727                            const nonbonded_verlet_group_t *nbv_group)
 728 {
 729     init_atomdata_first(cu_nb->atdat, nbv_group[0].nbat->ntype);
 730     init_nbparam(cu_nb->nbparam, ic, nbv_group[0].nbat, cu_nb->dev_info);
 731
 732     /* clear energy and shift force outputs */
 733     nbnxn_cuda_clear_e_fshift(cu_nb);
 734 }
 735
 736 void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t        cu_nb,
 737                               const nbnxn_pairlist_t *h_plist,
 738                               int                     iloc)
 739 {
 740     char          sbuf[STRLEN];
 741     cudaError_t   stat;
 742     bool          bDoTime    = cu_nb->bDoTime;
 743     cudaStream_t  stream     = cu_nb->stream[iloc];
 744     cu_plist_t   *d_plist    = cu_nb->plist[iloc];
 745
 746     if (d_plist->na_c < 0)
 747     {
 748         d_plist->na_c = h_plist->na_ci;
 749     }
 750     else
 751     {
 752         if (d_plist->na_c != h_plist->na_ci)
 753         {
 754             sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
 755                     d_plist->na_c, h_plist->na_ci);
 756             gmx_incons(sbuf);
 757         }
 758     }
 759
 760     if (bDoTime)
 761     {
 762         stat = cudaEventRecord(cu_nb->timers->start_pl_h2d[iloc], stream);
 763         CU_RET_ERR(stat, "cudaEventRecord failed");
 764     }
 765
 766     cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
 767                         &d_plist->nsci, &d_plist->sci_nalloc,
 768                         h_plist->nsci,
 769                         stream, true);
 770
 771     cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
 772                         &d_plist->ncj4, &d_plist->cj4_nalloc,
 773                         h_plist->ncj4,
 774                         stream, true);
 775
 776     cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
 777                         &d_plist->nexcl, &d_plist->excl_nalloc,
 778                         h_plist->nexcl,
 779                         stream, true);
 780
 781     if (bDoTime)
 782     {
 783         stat = cudaEventRecord(cu_nb->timers->stop_pl_h2d[iloc], stream);
 784         CU_RET_ERR(stat, "cudaEventRecord failed");
 785     }
 786
 787     /* need to prune the pair list during the next step */
 788     d_plist->bDoPrune = true;
 789 }
 790
 791 void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t        cu_nb,
 792                                 const nbnxn_atomdata_t *nbatom)
 793 {
 794     cu_atomdata_t *adat  = cu_nb->atdat;
 795     cudaStream_t   ls    = cu_nb->stream[eintLocal];
 796
 797     /* only if we have a dynamic box */
 798     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
 799     {
 800         cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
 801                           SHIFTS * sizeof(*adat->shift_vec), ls);
 802         adat->bShiftVecUploaded = true;
 803     }
 804 }
 805
 806 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
 807 static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
 808 {
 809     cudaError_t    stat;
 810     cu_atomdata_t *adat  = cu_nb->atdat;
 811     cudaStream_t   ls    = cu_nb->stream[eintLocal];
 812
 813     stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
 814     CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
 815 }
 816
 817 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
 818 static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
 819 {
 820     cudaError_t    stat;
 821     cu_atomdata_t *adat  = cu_nb->atdat;
 822     cudaStream_t   ls    = cu_nb->stream[eintLocal];
 823
 824     stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
 825     CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
 826     stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
 827     CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
 828     stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
 829     CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
 830 }
 831
 832 void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
 833 {
 834     nbnxn_cuda_clear_f(cu_nb, cu_nb->atdat->natoms);
 835     /* clear shift force array and energies if the outputs were
 836        used in the current step */
 837     if (flags & GMX_FORCE_VIRIAL)
 838     {
 839         nbnxn_cuda_clear_e_fshift(cu_nb);
 840     }
 841 }
 842
 843 void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t        cu_nb,
 844                               const nbnxn_atomdata_t *nbat)
 845 {
 846     cudaError_t    stat;
 847     int            nalloc, natoms;
 848     bool           realloced;
 849     bool           bDoTime   = cu_nb->bDoTime;
 850     cu_timers_t   *timers    = cu_nb->timers;
 851     cu_atomdata_t *d_atdat   = cu_nb->atdat;
 852     cudaStream_t   ls        = cu_nb->stream[eintLocal];
 853
 854     natoms    = nbat->natoms;
 855     realloced = false;
 856
 857     if (bDoTime)
 858     {
 859         /* time async copy */
 860         stat = cudaEventRecord(timers->start_atdat, ls);
 861         CU_RET_ERR(stat, "cudaEventRecord failed");
 862     }
 863
 864     /* need to reallocate if we have to copy more atoms than the amount of space
 865        available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
 866     if (natoms > d_atdat->nalloc)
 867     {
 868         nalloc = over_alloc_small(natoms);
 869
 870         /* free up first if the arrays have already been initialized */
 871         if (d_atdat->nalloc != -1)
 872         {
 873             cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
 874             cu_free_buffered(d_atdat->xq);
 875             cu_free_buffered(d_atdat->atom_types);
 876         }
 877
 878         stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
 879         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
 880         stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
 881         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
 882
 883         stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
 884         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
 885
 886         d_atdat->nalloc = nalloc;
 887         realloced       = true;
 888     }
 889
 890     d_atdat->natoms       = natoms;
 891     d_atdat->natoms_local = nbat->natoms_local;
 892
 893     /* need to clear GPU f output if realloc happened */
 894     if (realloced)
 895     {
 896         nbnxn_cuda_clear_f(cu_nb, nalloc);
 897     }
 898
 899     cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
 900                       natoms*sizeof(*d_atdat->atom_types), ls);
 901
 902     if (bDoTime)
 903     {
 904         stat = cudaEventRecord(timers->stop_atdat, ls);
 905         CU_RET_ERR(stat, "cudaEventRecord failed");
 906     }
 907 }
 908
 909 void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
 910 {
 911     cudaError_t      stat;
 912     cu_atomdata_t   *atdat;
 913     cu_nbparam_t    *nbparam;
 914     cu_plist_t      *plist, *plist_nl;
 915     cu_timers_t     *timers;
 916
 917     if (cu_nb == NULL)
 918     {
 919         return;
 920     }
 921
 922     atdat       = cu_nb->atdat;
 923     nbparam     = cu_nb->nbparam;
 924     plist       = cu_nb->plist[eintLocal];
 925     plist_nl    = cu_nb->plist[eintNonlocal];
 926     timers      = cu_nb->timers;
 927
 928     if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
 929     {
 930
 931 #ifdef TEXOBJ_SUPPORTED
 932         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 933         if (cu_nb->dev_info->prop.major >= 3)
 934         {
 935             stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
 936             CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
 937         }
 938         else
 939 #endif
 940         {
 941             stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
 942             CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
 943         }
 944         cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
 945     }
 946
 947     stat = cudaEventDestroy(cu_nb->nonlocal_done);
 948     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
 949     stat = cudaEventDestroy(cu_nb->misc_ops_done);
 950     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
 951
 952     if (cu_nb->bDoTime)
 953     {
 954         stat = cudaEventDestroy(timers->start_atdat);
 955         CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
 956         stat = cudaEventDestroy(timers->stop_atdat);
 957         CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
 958
 959         /* The non-local counters/stream (second in the array) are needed only with DD. */
 960         for (int i = 0; i <= (cu_nb->bUseTwoStreams ? 1 : 0); i++)
 961         {
 962             stat = cudaEventDestroy(timers->start_nb_k[i]);
 963             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
 964             stat = cudaEventDestroy(timers->stop_nb_k[i]);
 965             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
 966
 967             stat = cudaEventDestroy(timers->start_pl_h2d[i]);
 968             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
 969             stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
 970             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
 971
 972             stat = cudaStreamDestroy(cu_nb->stream[i]);
 973             CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
 974
 975             stat = cudaEventDestroy(timers->start_nb_h2d[i]);
 976             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
 977             stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
 978             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
 979
 980             stat = cudaEventDestroy(timers->start_nb_d2h[i]);
 981             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
 982             stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
 983             CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
 984         }
 985     }
 986
 987 #ifdef TEXOBJ_SUPPORTED
 988     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
 989     if (cu_nb->dev_info->prop.major >= 3)
 990     {
 991         stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
 992         CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
 993     }
 994     else
 995 #endif
 996     {
 997         stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
 998         CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
 999     }
1000     cu_free_buffered(nbparam->nbfp);
1001
1002     if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
1003     {
1004 #ifdef TEXOBJ_SUPPORTED
1005         /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1006         if (cu_nb->dev_info->prop.major >= 3)
1007         {
1008             stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
1009             CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
1010         }
1011         else
1012 #endif
1013         {
1014             stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
1015             CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
1016         }
1017         cu_free_buffered(nbparam->nbfp_comb);
1018     }
1019
1020     stat = cudaFree(atdat->shift_vec);
1021     CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
1022     stat = cudaFree(atdat->fshift);
1023     CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
1024
1025     stat = cudaFree(atdat->e_lj);
1026     CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
1027     stat = cudaFree(atdat->e_el);
1028     CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
1029
1030     cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
1031     cu_free_buffered(atdat->xq);
1032     cu_free_buffered(atdat->atom_types, &atdat->ntypes);
1033
1034     cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
1035     cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
1036     cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
1037     if (cu_nb->bUseTwoStreams)
1038     {
1039         cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
1040         cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
1041         cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist->excl_nalloc);
1042     }
1043
1044     sfree(atdat);
1045     sfree(nbparam);
1046     sfree(plist);
1047     if (cu_nb->bUseTwoStreams)
1048     {
1049         sfree(plist_nl);
1050     }
1051     sfree(timers);
1052     sfree(cu_nb->timings);
1053     sfree(cu_nb);
1054
1055     if (debug)
1056     {
1057         fprintf(debug, "Cleaned up CUDA data structures.\n");
1058     }
1059 }
1060
1061 void cu_synchstream_atdat(nbnxn_cuda_ptr_t cu_nb, int iloc)
1062 {
1063     cudaError_t  stat;
1064     cudaStream_t stream = cu_nb->stream[iloc];
1065
1066     stat = cudaStreamWaitEvent(stream, cu_nb->timers->stop_atdat, 0);
1067     CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
1068 }
1069
1070 wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
1071 {
1072     return (cu_nb != NULL && cu_nb->bDoTime) ? cu_nb->timings : NULL;
1073 }
1074
1075 void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb)
1076 {
1077     if (cu_nb->bDoTime)
1078     {
1079         init_timings(cu_nb->timings);
1080     }
1081 }
1082
1083 int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
1084 {
1085     return cu_nb != NULL ?
1086            gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
1087
1088 }
1089
1090 gmx_bool nbnxn_cuda_is_kernel_ewald_analytical(const nbnxn_cuda_ptr_t cu_nb)
1091 {
1092     return ((cu_nb->nbparam->eeltype == eelCuEWALD_ANA) ||
1093             (cu_nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
1094 }