2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
38 * \author Szilard Pall <pall.szilard@gmail.com>
49 #include <cuda_profiler_api.h>
51 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
52 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
53 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
54 #include "gromacs/legacyheaders/gmx_detect_hardware.h"
55 #include "gromacs/legacyheaders/typedefs.h"
56 #include "gromacs/legacyheaders/types/enums.h"
57 #include "gromacs/legacyheaders/types/force_flags.h"
58 #include "gromacs/legacyheaders/types/interaction_const.h"
59 #include "gromacs/mdlib/nb_verlet.h"
60 #include "gromacs/mdlib/nbnxn_consts.h"
61 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
62 #include "gromacs/pbcutil/ishift.h"
63 #include "gromacs/timing/gpu_timing.h"
64 #include "gromacs/utility/basedefinitions.h"
65 #include "gromacs/utility/cstringutil.h"
66 #include "gromacs/utility/fatalerror.h"
67 #include "gromacs/utility/smalloc.h"
69 #include "nbnxn_cuda_types.h"
71 static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
73 /* This is a heuristically determined parameter for the Fermi, Kepler
74 * and Maxwell architectures for the minimum size of ci lists by multiplying
75 * this constant with the # of multiprocessors on the current device.
76 * Since the maximum number of blocks per multiprocessor is 16, the ideal
77 * count for small systems is 32 or 48 blocks per multiprocessor. Because
78 * there is a bit of fluctuations in the generated block counts, we use
79 * a target of 44 instead of the ideal value of 48.
81 static unsigned int gpu_min_ci_balanced_factor = 44;
83 /* Functions from nbnxn_cuda.cu */
84 extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
85 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
86 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
87 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
89 /* We should actually be using md_print_warn in md_logging.c,
90 * but we can't include mpi.h in CUDA code.
92 static void md_print_warn(FILE *fplog,
99 /* We should only print to stderr on the master node,
100 * in most cases fplog is only set on the master node, so this works.
103 fprintf(stderr, "\n");
104 vfprintf(stderr, fmt, ap);
105 fprintf(stderr, "\n");
109 fprintf(fplog, "\n");
110 vfprintf(fplog, fmt, ap);
111 fprintf(fplog, "\n");
118 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
121 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam,
122 const gmx_device_info_t *dev_info);
126 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
127 static bool use_texobj(const gmx_device_info_t *dev_info)
129 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
130 return (dev_info->prop.major >= 3);
134 /*! Tabulates the Ewald Coulomb force and initializes the size/scale
135 and the table GPU array. If called with an already allocated table,
136 it just re-uploads the table.
138 static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
140 const gmx_device_info_t *dev_info)
145 if (nbp->coulomb_tab != NULL)
147 nbnxn_cuda_free_nbparam_table(nbp, dev_info);
150 stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab));
151 CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
153 nbp->coulomb_tab = coul_tab;
155 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
156 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
157 if (use_texobj(dev_info))
160 memset(&rd, 0, sizeof(rd));
161 rd.resType = cudaResourceTypeLinear;
162 rd.res.linear.devPtr = nbp->coulomb_tab;
163 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
164 rd.res.linear.desc.x = 32;
165 rd.res.linear.sizeInBytes = ic->tabq_size*sizeof(*coul_tab);
168 memset(&td, 0, sizeof(td));
169 td.readMode = cudaReadModeElementType;
170 stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
171 CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
174 #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
176 GMX_UNUSED_VALUE(dev_info);
177 cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
178 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
180 ic->tabq_size*sizeof(*coul_tab));
181 CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
184 cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab));
186 nbp->coulomb_tab_size = ic->tabq_size;
187 nbp->coulomb_tab_scale = ic->tabq_scale;
191 /*! Initializes the atomdata structure first time, it only gets filled at
193 static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
198 stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
199 CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
200 ad->bShiftVecUploaded = false;
202 stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
203 CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
205 stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
206 CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
207 stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
208 CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
210 /* initialize to NULL poiters to data that is not allocated here and will
211 need reallocation in nbnxn_cuda_init_atomdata */
215 /* size -1 indicates that the respective array hasn't been initialized yet */
220 /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
221 earlier GPUs, single or twin cut-off. */
222 static int pick_ewald_kernel_type(bool bTwinCut,
223 const gmx_device_info_t *dev_info)
225 bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
228 /* Benchmarking/development environment variables to force the use of
229 analytical or tabulated Ewald kernel. */
230 bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
231 bForceTabulatedEwald = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
233 if (bForceAnalyticalEwald && bForceTabulatedEwald)
235 gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
236 "requested through environment variables.");
239 /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
240 if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
242 bUseAnalyticalEwald = true;
246 fprintf(debug, "Using analytical Ewald CUDA kernels\n");
251 bUseAnalyticalEwald = false;
255 fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
259 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
260 forces it (use it for debugging/benchmarking only). */
261 if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
263 kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
267 kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
273 /*! Copies all parameters related to the cut-off from ic to nbp */
274 static void set_cutoff_parameters(cu_nbparam_t *nbp,
275 const interaction_const_t *ic)
277 nbp->ewald_beta = ic->ewaldcoeff_q;
278 nbp->sh_ewald = ic->sh_ewald;
279 nbp->epsfac = ic->epsfac;
280 nbp->two_k_rf = 2.0 * ic->k_rf;
281 nbp->c_rf = ic->c_rf;
282 nbp->rvdw_sq = ic->rvdw * ic->rvdw;
283 nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
284 nbp->rlist_sq = ic->rlist * ic->rlist;
286 nbp->sh_lj_ewald = ic->sh_lj_ewald;
287 nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
289 nbp->rvdw_switch = ic->rvdw_switch;
290 nbp->dispersion_shift = ic->dispersion_shift;
291 nbp->repulsion_shift = ic->repulsion_shift;
292 nbp->vdw_switch = ic->vdw_switch;
295 /*! Initializes the nonbonded parameter data structure. */
296 static void init_nbparam(cu_nbparam_t *nbp,
297 const interaction_const_t *ic,
298 const nbnxn_atomdata_t *nbat,
299 const gmx_device_info_t *dev_info)
302 int ntypes, nnbfp, nnbfp_comb;
304 ntypes = nbat->ntype;
306 set_cutoff_parameters(nbp, ic);
308 if (ic->vdwtype == evdwCUT)
310 switch (ic->vdw_modifier)
313 case eintmodPOTSHIFT:
314 nbp->vdwtype = evdwCuCUT;
316 case eintmodFORCESWITCH:
317 nbp->vdwtype = evdwCuFSWITCH;
319 case eintmodPOTSWITCH:
320 nbp->vdwtype = evdwCuPSWITCH;
323 gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
327 else if (ic->vdwtype == evdwPME)
329 if (ic->ljpme_comb_rule == ljcrGEOM)
331 assert(nbat->comb_rule == ljcrGEOM);
332 nbp->vdwtype = evdwCuEWALDGEOM;
336 assert(nbat->comb_rule == ljcrLB);
337 nbp->vdwtype = evdwCuEWALDLB;
342 gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
345 if (ic->eeltype == eelCUT)
347 nbp->eeltype = eelCuCUT;
349 else if (EEL_RF(ic->eeltype))
351 nbp->eeltype = eelCuRF;
353 else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
355 /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
356 nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
360 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
361 gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
364 /* generate table for PME */
365 nbp->coulomb_tab = NULL;
366 if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
368 init_ewald_coulomb_force_table(ic, nbp, dev_info);
371 nnbfp = 2*ntypes*ntypes;
372 nnbfp_comb = 2*ntypes;
374 stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
375 CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
376 cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
379 if (ic->vdwtype == evdwPME)
381 stat = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
382 CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
383 cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
386 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
387 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
388 if (use_texobj(dev_info))
393 memset(&rd, 0, sizeof(rd));
394 rd.resType = cudaResourceTypeLinear;
395 rd.res.linear.devPtr = nbp->nbfp;
396 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
397 rd.res.linear.desc.x = 32;
398 rd.res.linear.sizeInBytes = nnbfp*sizeof(*nbp->nbfp);
400 memset(&td, 0, sizeof(td));
401 td.readMode = cudaReadModeElementType;
402 stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
403 CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
405 if (ic->vdwtype == evdwPME)
407 memset(&rd, 0, sizeof(rd));
408 rd.resType = cudaResourceTypeLinear;
409 rd.res.linear.devPtr = nbp->nbfp_comb;
410 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
411 rd.res.linear.desc.x = 32;
412 rd.res.linear.sizeInBytes = nnbfp_comb*sizeof(*nbp->nbfp_comb);
414 memset(&td, 0, sizeof(td));
415 td.readMode = cudaReadModeElementType;
416 stat = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
417 CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
421 #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
423 cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
424 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
425 nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
426 CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
428 if (ic->vdwtype == evdwPME)
430 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
431 nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
432 CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
437 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
438 * electrostatic type switching to twin cut-off (or back) if needed. */
439 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
440 const interaction_const_t *ic)
442 if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
446 gmx_nbnxn_cuda_t *nb = nbv->gpu_nbv;
447 cu_nbparam_t *nbp = nb->nbparam;
449 set_cutoff_parameters(nbp, ic);
451 nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
454 init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
457 /*! Initializes the pair list data structure. */
458 static void init_plist(cu_plist_t *pl)
460 /* initialize to NULL pointers to data that is not allocated here and will
461 need reallocation in nbnxn_gpu_init_pairlist */
466 /* size -1 indicates that the respective array hasn't been initialized yet */
473 pl->excl_nalloc = -1;
474 pl->bDoPrune = false;
477 /*! Initializes the timer data structure. */
478 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
481 int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
483 stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
484 CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
485 stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
486 CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
488 /* The non-local counters/stream (second in the array) are needed only with DD. */
489 for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
491 stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
492 CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
493 stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
494 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
497 stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
498 CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
499 stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
500 CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
502 stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
503 CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
504 stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
505 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
507 stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
508 CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
509 stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
510 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
514 /*! Initializes the timings data structure. */
515 static void init_timings(gmx_wallclock_gpu_t *t)
524 for (i = 0; i < 2; i++)
526 for (j = 0; j < 2; j++)
528 t->ktime[i][j].t = 0.0;
529 t->ktime[i][j].c = 0;
534 /*! Initializes simulation constant data. */
535 static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb,
536 const interaction_const_t *ic,
537 const nonbonded_verlet_group_t *nbv_group)
539 init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
540 init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
542 /* clear energy and shift force outputs */
543 nbnxn_cuda_clear_e_fshift(nb);
546 void nbnxn_gpu_init(FILE *fplog,
547 gmx_nbnxn_cuda_t **p_nb,
548 const gmx_gpu_info_t *gpu_info,
549 const gmx_gpu_opt_t *gpu_opt,
550 const interaction_const_t *ic,
551 nonbonded_verlet_group_t *nbv_grp,
554 gmx_bool bLocalAndNonlocal)
557 gmx_nbnxn_cuda_t *nb;
559 bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
571 snew(nb->nbparam, 1);
572 snew(nb->plist[eintLocal], 1);
573 if (bLocalAndNonlocal)
575 snew(nb->plist[eintNonlocal], 1);
578 nb->bUseTwoStreams = bLocalAndNonlocal;
581 snew(nb->timings, 1);
584 pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
585 pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
586 pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
588 init_plist(nb->plist[eintLocal]);
590 /* set device info, just point it to the right GPU among the detected ones */
591 nb->dev_info = &gpu_info->gpu_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
593 /* local/non-local GPU streams */
594 stat = cudaStreamCreate(&nb->stream[eintLocal]);
595 CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
596 if (nb->bUseTwoStreams)
598 init_plist(nb->plist[eintNonlocal]);
600 /* CUDA stream priority available in the CUDA RT 5.5 API.
601 * Note that the device we're running on does not have to support
602 * priorities, because we are querying the priority range which in this
603 * case will be a single value.
605 #if GMX_CUDA_VERSION >= 5050
607 int highest_priority;
608 stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
609 CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
611 stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
614 CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
617 stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
618 CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
622 /* init events for sychronization (timing disabled for performance reasons!) */
623 stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
624 CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
625 stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
626 CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
628 /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
629 * (which increases with shorter time/step) caused by a known CUDA driver bug.
630 * To work around the issue we'll use an (admittedly fragile) memory polling
631 * waiting to preserve performance. This requires support for atomic
632 * operations and only works on x86/x86_64.
633 * With polling wait event-timing also needs to be disabled.
635 * The overhead is greatly reduced in API v5.0 drivers and the improvement
636 * is independent of runtime version. Hence, with API v5.0 drivers and later
637 * we won't switch to polling.
639 * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI,
640 * ranks so we will also disable it in that case.
643 bStreamSync = getenv("GMX_CUDA_STREAMSYNC") != NULL;
644 bNoStreamSync = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
649 bTMPIAtomics = false;
652 #ifdef GMX_TARGET_X86
658 if (bStreamSync && bNoStreamSync)
660 gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
663 stat = cudaDriverGetVersion(&cuda_drv_ver);
664 CU_RET_ERR(stat, "cudaDriverGetVersion failed");
666 bOldDriver = (cuda_drv_ver < 5000);
668 if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
670 /* Polling wait should be used instead of cudaStreamSynchronize only if:
671 * - ECC is ON & driver is old (checked above),
672 * - we're on x86/x86_64,
673 * - atomics are available, and
674 * - GPUs are not being shared.
676 bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
677 (gmx_count_gpu_dev_shared(gpu_opt) < 1));
681 nb->bUseStreamSync = true;
683 /* only warn if polling should be used */
684 if (bShouldUsePollSync)
687 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
688 " cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
693 nb->bUseStreamSync = !bShouldUsePollSync;
695 if (bShouldUsePollSync)
698 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
699 " cause performance loss. Switching to the alternative polling GPU wait.\n"
700 " If you encounter issues, switch back to standard GPU waiting by setting\n"
701 " the GMX_CUDA_STREAMSYNC environment variable.\n");
705 /* Tell the user that the ECC+old driver combination can be bad */
707 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
708 " A known bug in this driver version can cause performance loss.\n"
709 " However, the polling wait workaround can not be used because\n%s\n"
710 " Consider updating the driver or turning ECC off.",
711 (bX86 && bTMPIAtomics) ?
712 " GPU(s) are being oversubscribed." :
713 " atomic operations are not supported by the platform/CPU+compiler.");
714 md_print_warn(fplog, sbuf);
722 nb->bUseStreamSync = false;
725 "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
729 /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
730 nb->bUseStreamSync = true;
734 /* CUDA timing disabled as event timers don't work:
735 - with multiple streams = domain-decomposition;
736 - with the polling waiting hack (without cudaStreamSynchronize);
737 - when turned off by GMX_DISABLE_CUDA_TIMING.
739 nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
740 (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
744 init_timers(nb->timers, nb->bUseTwoStreams);
745 init_timings(nb->timings);
748 /* set the kernel type for the current GPU */
749 /* pick L1 cache configuration */
750 nbnxn_cuda_set_cacheconfig(nb->dev_info);
752 nbnxn_cuda_init_const(nb, ic, nbv_grp);
758 fprintf(debug, "Initialized CUDA data structures.\n");
762 void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb,
763 const nbnxn_pairlist_t *h_plist,
768 bool bDoTime = nb->bDoTime;
769 cudaStream_t stream = nb->stream[iloc];
770 cu_plist_t *d_plist = nb->plist[iloc];
772 if (d_plist->na_c < 0)
774 d_plist->na_c = h_plist->na_ci;
778 if (d_plist->na_c != h_plist->na_ci)
780 sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
781 d_plist->na_c, h_plist->na_ci);
788 stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
789 CU_RET_ERR(stat, "cudaEventRecord failed");
792 cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
793 &d_plist->nsci, &d_plist->sci_nalloc,
797 cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
798 &d_plist->ncj4, &d_plist->cj4_nalloc,
802 cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
803 &d_plist->nexcl, &d_plist->excl_nalloc,
809 stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
810 CU_RET_ERR(stat, "cudaEventRecord failed");
813 /* need to prune the pair list during the next step */
814 d_plist->bDoPrune = true;
817 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb,
818 const nbnxn_atomdata_t *nbatom)
820 cu_atomdata_t *adat = nb->atdat;
821 cudaStream_t ls = nb->stream[eintLocal];
823 /* only if we have a dynamic box */
824 if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
826 cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
827 SHIFTS * sizeof(*adat->shift_vec), ls);
828 adat->bShiftVecUploaded = true;
832 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
833 static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
836 cu_atomdata_t *adat = nb->atdat;
837 cudaStream_t ls = nb->stream[eintLocal];
839 stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
840 CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
843 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
844 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
847 cu_atomdata_t *adat = nb->atdat;
848 cudaStream_t ls = nb->stream[eintLocal];
850 stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
851 CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
852 stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
853 CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
854 stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
855 CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
858 void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
860 nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
861 /* clear shift force array and energies if the outputs were
862 used in the current step */
863 if (flags & GMX_FORCE_VIRIAL)
865 nbnxn_cuda_clear_e_fshift(nb);
869 void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb,
870 const struct nbnxn_atomdata_t *nbat)
875 bool bDoTime = nb->bDoTime;
876 cu_timers_t *timers = nb->timers;
877 cu_atomdata_t *d_atdat = nb->atdat;
878 cudaStream_t ls = nb->stream[eintLocal];
880 natoms = nbat->natoms;
885 /* time async copy */
886 stat = cudaEventRecord(timers->start_atdat, ls);
887 CU_RET_ERR(stat, "cudaEventRecord failed");
890 /* need to reallocate if we have to copy more atoms than the amount of space
891 available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
892 if (natoms > d_atdat->nalloc)
894 nalloc = over_alloc_small(natoms);
896 /* free up first if the arrays have already been initialized */
897 if (d_atdat->nalloc != -1)
899 cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
900 cu_free_buffered(d_atdat->xq);
901 cu_free_buffered(d_atdat->atom_types);
904 stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
905 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
906 stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
907 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
909 stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
910 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
912 d_atdat->nalloc = nalloc;
916 d_atdat->natoms = natoms;
917 d_atdat->natoms_local = nbat->natoms_local;
919 /* need to clear GPU f output if realloc happened */
922 nbnxn_cuda_clear_f(nb, nalloc);
925 cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
926 natoms*sizeof(*d_atdat->atom_types), ls);
930 stat = cudaEventRecord(timers->stop_atdat, ls);
931 CU_RET_ERR(stat, "cudaEventRecord failed");
935 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam,
936 const gmx_device_info_t *dev_info)
940 if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
942 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
943 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
944 if (use_texobj(dev_info))
946 stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
947 CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
952 GMX_UNUSED_VALUE(dev_info);
953 stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
954 CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
956 cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
960 void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
963 cu_atomdata_t *atdat;
964 cu_nbparam_t *nbparam;
965 cu_plist_t *plist, *plist_nl;
968 /* Stopping the nvidia profiler here allows us to eliminate the subsequent
969 uninitialization API calls from the trace. */
970 if (getenv("NVPROF_ID") != NULL)
972 stat = cudaProfilerStop();
973 CU_RET_ERR(stat, "cudaProfilerStop failed");
982 nbparam = nb->nbparam;
983 plist = nb->plist[eintLocal];
984 plist_nl = nb->plist[eintNonlocal];
987 nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);
989 stat = cudaEventDestroy(nb->nonlocal_done);
990 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
991 stat = cudaEventDestroy(nb->misc_ops_done);
992 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
996 stat = cudaEventDestroy(timers->start_atdat);
997 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
998 stat = cudaEventDestroy(timers->stop_atdat);
999 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
1001 /* The non-local counters/stream (second in the array) are needed only with DD. */
1002 for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
1004 stat = cudaEventDestroy(timers->start_nb_k[i]);
1005 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
1006 stat = cudaEventDestroy(timers->stop_nb_k[i]);
1007 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
1009 stat = cudaEventDestroy(timers->start_pl_h2d[i]);
1010 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
1011 stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
1012 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
1014 stat = cudaStreamDestroy(nb->stream[i]);
1015 CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
1017 stat = cudaEventDestroy(timers->start_nb_h2d[i]);
1018 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
1019 stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
1020 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
1022 stat = cudaEventDestroy(timers->start_nb_d2h[i]);
1023 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
1024 stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
1025 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
1029 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1030 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1031 if (use_texobj(nb->dev_info))
1033 stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
1034 CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
1039 stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
1040 CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
1042 cu_free_buffered(nbparam->nbfp);
1044 if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
1046 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1047 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1048 if (use_texobj(nb->dev_info))
1050 stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
1051 CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
1056 stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
1057 CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
1059 cu_free_buffered(nbparam->nbfp_comb);
1062 stat = cudaFree(atdat->shift_vec);
1063 CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
1064 stat = cudaFree(atdat->fshift);
1065 CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
1067 stat = cudaFree(atdat->e_lj);
1068 CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
1069 stat = cudaFree(atdat->e_el);
1070 CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
1072 cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
1073 cu_free_buffered(atdat->xq);
1074 cu_free_buffered(atdat->atom_types, &atdat->ntypes);
1076 cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
1077 cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
1078 cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
1079 if (nb->bUseTwoStreams)
1081 cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
1082 cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
1083 cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist->excl_nalloc);
1089 if (nb->bUseTwoStreams)
1099 fprintf(debug, "Cleaned up CUDA data structures.\n");
1103 void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
1106 cudaStream_t stream = nb->stream[iloc];
1108 stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
1109 CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
1112 gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
1114 return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
1117 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
1119 /* The NVPROF_ID environment variable is set by nvprof and indicates that
1120 mdrun is executed in the CUDA profiler.
1121 If nvprof was run is with "--profile-from-start off", the profiler will
1122 be started here. This way we can avoid tracing the CUDA events from the
1123 first part of the run. Starting the profiler again does nothing.
1125 if (getenv("NVPROF_ID") != NULL)
1128 stat = cudaProfilerStart();
1129 CU_RET_ERR(stat, "cudaProfilerStart failed");
1132 if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1134 init_timings(nbv->gpu_nbv->timings);
1138 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
1141 gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
1145 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
1147 return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
1148 (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));