2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
38 * \author Szilard Pall <pall.szilard@gmail.com>
49 #include <cuda_profiler_api.h>
51 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
52 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
53 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
54 #include "gromacs/legacyheaders/gmx_detect_hardware.h"
55 #include "gromacs/legacyheaders/typedefs.h"
56 #include "gromacs/legacyheaders/types/enums.h"
57 #include "gromacs/legacyheaders/types/force_flags.h"
58 #include "gromacs/legacyheaders/types/interaction_const.h"
59 #include "gromacs/mdlib/nb_verlet.h"
60 #include "gromacs/mdlib/nbnxn_consts.h"
61 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
62 #include "gromacs/pbcutil/ishift.h"
63 #include "gromacs/timing/gpu_timing.h"
64 #include "gromacs/utility/basedefinitions.h"
65 #include "gromacs/utility/cstringutil.h"
66 #include "gromacs/utility/fatalerror.h"
67 #include "gromacs/utility/smalloc.h"
69 #include "nbnxn_cuda_types.h"
71 static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
73 /* This is a heuristically determined parameter for the Fermi architecture for
74 * the minimum size of ci lists by multiplying this constant with the # of
75 * multiprocessors on the current device.
77 static unsigned int gpu_min_ci_balanced_factor = 40;
79 /* Functions from nbnxn_cuda.cu */
80 extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
81 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
82 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
83 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
85 /* We should actually be using md_print_warn in md_logging.c,
86 * but we can't include mpi.h in CUDA code.
88 static void md_print_warn(FILE *fplog,
95 /* We should only print to stderr on the master node,
96 * in most cases fplog is only set on the master node, so this works.
99 fprintf(stderr, "\n");
100 vfprintf(stderr, fmt, ap);
101 fprintf(stderr, "\n");
105 fprintf(fplog, "\n");
106 vfprintf(fplog, fmt, ap);
107 fprintf(fplog, "\n");
114 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
117 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam,
118 const gmx_device_info_t *dev_info);
122 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
123 static bool use_texobj(const gmx_device_info_t *dev_info)
125 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
126 return (dev_info->prop.major >= 3);
130 /*! Tabulates the Ewald Coulomb force and initializes the size/scale
131 and the table GPU array. If called with an already allocated table,
132 it just re-uploads the table.
134 static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
136 const gmx_device_info_t *dev_info)
141 if (nbp->coulomb_tab != NULL)
143 nbnxn_cuda_free_nbparam_table(nbp, dev_info);
146 stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab));
147 CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
149 nbp->coulomb_tab = coul_tab;
151 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
152 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
153 if (use_texobj(dev_info))
156 memset(&rd, 0, sizeof(rd));
157 rd.resType = cudaResourceTypeLinear;
158 rd.res.linear.devPtr = nbp->coulomb_tab;
159 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
160 rd.res.linear.desc.x = 32;
161 rd.res.linear.sizeInBytes = ic->tabq_size*sizeof(*coul_tab);
164 memset(&td, 0, sizeof(td));
165 td.readMode = cudaReadModeElementType;
166 stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
167 CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
170 #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
172 GMX_UNUSED_VALUE(dev_info);
173 cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
174 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
176 ic->tabq_size*sizeof(*coul_tab));
177 CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
180 cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab));
182 nbp->coulomb_tab_size = ic->tabq_size;
183 nbp->coulomb_tab_scale = ic->tabq_scale;
187 /*! Initializes the atomdata structure first time, it only gets filled at
189 static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
194 stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
195 CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
196 ad->bShiftVecUploaded = false;
198 stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
199 CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
201 stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
202 CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
203 stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
204 CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
206 /* initialize to NULL poiters to data that is not allocated here and will
207 need reallocation in nbnxn_cuda_init_atomdata */
211 /* size -1 indicates that the respective array hasn't been initialized yet */
216 /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
217 earlier GPUs, single or twin cut-off. */
218 static int pick_ewald_kernel_type(bool bTwinCut,
219 const gmx_device_info_t *dev_info)
221 bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
224 /* Benchmarking/development environment variables to force the use of
225 analytical or tabulated Ewald kernel. */
226 bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
227 bForceTabulatedEwald = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
229 if (bForceAnalyticalEwald && bForceTabulatedEwald)
231 gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
232 "requested through environment variables.");
235 /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
236 if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
238 bUseAnalyticalEwald = true;
242 fprintf(debug, "Using analytical Ewald CUDA kernels\n");
247 bUseAnalyticalEwald = false;
251 fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
255 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
256 forces it (use it for debugging/benchmarking only). */
257 if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
259 kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
263 kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
269 /*! Copies all parameters related to the cut-off from ic to nbp */
270 static void set_cutoff_parameters(cu_nbparam_t *nbp,
271 const interaction_const_t *ic)
273 nbp->ewald_beta = ic->ewaldcoeff_q;
274 nbp->sh_ewald = ic->sh_ewald;
275 nbp->epsfac = ic->epsfac;
276 nbp->two_k_rf = 2.0 * ic->k_rf;
277 nbp->c_rf = ic->c_rf;
278 nbp->rvdw_sq = ic->rvdw * ic->rvdw;
279 nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
280 nbp->rlist_sq = ic->rlist * ic->rlist;
282 nbp->sh_lj_ewald = ic->sh_lj_ewald;
283 nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
285 nbp->rvdw_switch = ic->rvdw_switch;
286 nbp->dispersion_shift = ic->dispersion_shift;
287 nbp->repulsion_shift = ic->repulsion_shift;
288 nbp->vdw_switch = ic->vdw_switch;
291 /*! Initializes the nonbonded parameter data structure. */
292 static void init_nbparam(cu_nbparam_t *nbp,
293 const interaction_const_t *ic,
294 const nbnxn_atomdata_t *nbat,
295 const gmx_device_info_t *dev_info)
298 int ntypes, nnbfp, nnbfp_comb;
300 ntypes = nbat->ntype;
302 set_cutoff_parameters(nbp, ic);
304 if (ic->vdwtype == evdwCUT)
306 switch (ic->vdw_modifier)
309 case eintmodPOTSHIFT:
310 nbp->vdwtype = evdwCuCUT;
312 case eintmodFORCESWITCH:
313 nbp->vdwtype = evdwCuFSWITCH;
315 case eintmodPOTSWITCH:
316 nbp->vdwtype = evdwCuPSWITCH;
319 gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
323 else if (ic->vdwtype == evdwPME)
325 if (ic->ljpme_comb_rule == ljcrGEOM)
327 assert(nbat->comb_rule == ljcrGEOM);
328 nbp->vdwtype = evdwCuEWALDGEOM;
332 assert(nbat->comb_rule == ljcrLB);
333 nbp->vdwtype = evdwCuEWALDLB;
338 gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
341 if (ic->eeltype == eelCUT)
343 nbp->eeltype = eelCuCUT;
345 else if (EEL_RF(ic->eeltype))
347 nbp->eeltype = eelCuRF;
349 else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
351 /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
352 nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
356 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
357 gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
360 /* generate table for PME */
361 nbp->coulomb_tab = NULL;
362 if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
364 init_ewald_coulomb_force_table(ic, nbp, dev_info);
367 nnbfp = 2*ntypes*ntypes;
368 nnbfp_comb = 2*ntypes;
370 stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
371 CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
372 cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
375 if (ic->vdwtype == evdwPME)
377 stat = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
378 CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
379 cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
382 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
383 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
384 if (use_texobj(dev_info))
389 memset(&rd, 0, sizeof(rd));
390 rd.resType = cudaResourceTypeLinear;
391 rd.res.linear.devPtr = nbp->nbfp;
392 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
393 rd.res.linear.desc.x = 32;
394 rd.res.linear.sizeInBytes = nnbfp*sizeof(*nbp->nbfp);
396 memset(&td, 0, sizeof(td));
397 td.readMode = cudaReadModeElementType;
398 stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
399 CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
401 if (ic->vdwtype == evdwPME)
403 memset(&rd, 0, sizeof(rd));
404 rd.resType = cudaResourceTypeLinear;
405 rd.res.linear.devPtr = nbp->nbfp_comb;
406 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
407 rd.res.linear.desc.x = 32;
408 rd.res.linear.sizeInBytes = nnbfp_comb*sizeof(*nbp->nbfp_comb);
410 memset(&td, 0, sizeof(td));
411 td.readMode = cudaReadModeElementType;
412 stat = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
413 CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
417 #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
419 cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
420 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
421 nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
422 CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
424 if (ic->vdwtype == evdwPME)
426 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
427 nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
428 CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
433 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
434 * electrostatic type switching to twin cut-off (or back) if needed. */
435 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
436 const interaction_const_t *ic)
438 if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
442 gmx_nbnxn_cuda_t *nb = nbv->gpu_nbv;
443 cu_nbparam_t *nbp = nb->nbparam;
445 set_cutoff_parameters(nbp, ic);
447 nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
450 init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
453 /*! Initializes the pair list data structure. */
454 static void init_plist(cu_plist_t *pl)
456 /* initialize to NULL pointers to data that is not allocated here and will
457 need reallocation in nbnxn_gpu_init_pairlist */
462 /* size -1 indicates that the respective array hasn't been initialized yet */
469 pl->excl_nalloc = -1;
470 pl->bDoPrune = false;
473 /*! Initializes the timer data structure. */
474 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
477 int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
479 stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
480 CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
481 stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
482 CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
484 /* The non-local counters/stream (second in the array) are needed only with DD. */
485 for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
487 stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
488 CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
489 stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
490 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
493 stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
494 CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
495 stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
496 CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
498 stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
499 CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
500 stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
501 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
503 stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
504 CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
505 stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
506 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
510 /*! Initializes the timings data structure. */
511 static void init_timings(gmx_wallclock_gpu_t *t)
520 for (i = 0; i < 2; i++)
522 for (j = 0; j < 2; j++)
524 t->ktime[i][j].t = 0.0;
525 t->ktime[i][j].c = 0;
530 void nbnxn_gpu_init(FILE *fplog,
531 gmx_nbnxn_cuda_t **p_nb,
532 const gmx_gpu_info_t *gpu_info,
533 const gmx_gpu_opt_t *gpu_opt,
535 gmx_bool bLocalAndNonlocal)
538 gmx_nbnxn_cuda_t *nb;
540 bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
552 snew(nb->nbparam, 1);
553 snew(nb->plist[eintLocal], 1);
554 if (bLocalAndNonlocal)
556 snew(nb->plist[eintNonlocal], 1);
559 nb->bUseTwoStreams = bLocalAndNonlocal;
562 snew(nb->timings, 1);
565 pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
566 pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
567 pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
569 init_plist(nb->plist[eintLocal]);
571 /* set device info, just point it to the right GPU among the detected ones */
572 nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
574 /* local/non-local GPU streams */
575 stat = cudaStreamCreate(&nb->stream[eintLocal]);
576 CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
577 if (nb->bUseTwoStreams)
579 init_plist(nb->plist[eintNonlocal]);
581 /* CUDA stream priority available in the CUDA RT 5.5 API.
582 * Note that the device we're running on does not have to support
583 * priorities, because we are querying the priority range which in this
584 * case will be a single value.
586 #if GMX_CUDA_VERSION >= 5050
588 int highest_priority;
589 stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
590 CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
592 stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
595 CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
598 stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
599 CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
603 /* init events for sychronization (timing disabled for performance reasons!) */
604 stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
605 CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
606 stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
607 CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
609 /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
610 * (which increases with shorter time/step) caused by a known CUDA driver bug.
611 * To work around the issue we'll use an (admittedly fragile) memory polling
612 * waiting to preserve performance. This requires support for atomic
613 * operations and only works on x86/x86_64.
614 * With polling wait event-timing also needs to be disabled.
616 * The overhead is greatly reduced in API v5.0 drivers and the improvement
617 * is independent of runtime version. Hence, with API v5.0 drivers and later
618 * we won't switch to polling.
620 * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI,
621 * ranks so we will also disable it in that case.
624 bStreamSync = getenv("GMX_CUDA_STREAMSYNC") != NULL;
625 bNoStreamSync = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
630 bTMPIAtomics = false;
633 #ifdef GMX_TARGET_X86
639 if (bStreamSync && bNoStreamSync)
641 gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
644 stat = cudaDriverGetVersion(&cuda_drv_ver);
645 CU_RET_ERR(stat, "cudaDriverGetVersion failed");
647 bOldDriver = (cuda_drv_ver < 5000);
649 if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
651 /* Polling wait should be used instead of cudaStreamSynchronize only if:
652 * - ECC is ON & driver is old (checked above),
653 * - we're on x86/x86_64,
654 * - atomics are available, and
655 * - GPUs are not being shared.
657 bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
658 (gmx_count_gpu_dev_shared(gpu_opt) < 1));
662 nb->bUseStreamSync = true;
664 /* only warn if polling should be used */
665 if (bShouldUsePollSync)
668 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
669 " cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
674 nb->bUseStreamSync = !bShouldUsePollSync;
676 if (bShouldUsePollSync)
679 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
680 " cause performance loss. Switching to the alternative polling GPU wait.\n"
681 " If you encounter issues, switch back to standard GPU waiting by setting\n"
682 " the GMX_CUDA_STREAMSYNC environment variable.\n");
686 /* Tell the user that the ECC+old driver combination can be bad */
688 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
689 " A known bug in this driver version can cause performance loss.\n"
690 " However, the polling wait workaround can not be used because\n%s\n"
691 " Consider updating the driver or turning ECC off.",
692 (bX86 && bTMPIAtomics) ?
693 " GPU(s) are being oversubscribed." :
694 " atomic operations are not supported by the platform/CPU+compiler.");
695 md_print_warn(fplog, sbuf);
703 nb->bUseStreamSync = false;
706 "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
710 /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
711 nb->bUseStreamSync = true;
715 /* CUDA timing disabled as event timers don't work:
716 - with multiple streams = domain-decomposition;
717 - with the polling waiting hack (without cudaStreamSynchronize);
718 - when turned off by GMX_DISABLE_CUDA_TIMING.
720 nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
721 (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
725 init_timers(nb->timers, nb->bUseTwoStreams);
726 init_timings(nb->timings);
729 /* set the kernel type for the current GPU */
730 /* pick L1 cache configuration */
731 nbnxn_cuda_set_cacheconfig(nb->dev_info);
737 fprintf(debug, "Initialized CUDA data structures.\n");
741 void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t *nb,
742 const interaction_const_t *ic,
743 const nonbonded_verlet_group_t *nbv_group)
745 init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
746 init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
748 /* clear energy and shift force outputs */
749 nbnxn_cuda_clear_e_fshift(nb);
752 void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb,
753 const nbnxn_pairlist_t *h_plist,
758 bool bDoTime = nb->bDoTime;
759 cudaStream_t stream = nb->stream[iloc];
760 cu_plist_t *d_plist = nb->plist[iloc];
762 if (d_plist->na_c < 0)
764 d_plist->na_c = h_plist->na_ci;
768 if (d_plist->na_c != h_plist->na_ci)
770 sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
771 d_plist->na_c, h_plist->na_ci);
778 stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
779 CU_RET_ERR(stat, "cudaEventRecord failed");
782 cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
783 &d_plist->nsci, &d_plist->sci_nalloc,
787 cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
788 &d_plist->ncj4, &d_plist->cj4_nalloc,
792 cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
793 &d_plist->nexcl, &d_plist->excl_nalloc,
799 stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
800 CU_RET_ERR(stat, "cudaEventRecord failed");
803 /* need to prune the pair list during the next step */
804 d_plist->bDoPrune = true;
807 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb,
808 const nbnxn_atomdata_t *nbatom)
810 cu_atomdata_t *adat = nb->atdat;
811 cudaStream_t ls = nb->stream[eintLocal];
813 /* only if we have a dynamic box */
814 if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
816 cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
817 SHIFTS * sizeof(*adat->shift_vec), ls);
818 adat->bShiftVecUploaded = true;
822 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
823 static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
826 cu_atomdata_t *adat = nb->atdat;
827 cudaStream_t ls = nb->stream[eintLocal];
829 stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
830 CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
833 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
834 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
837 cu_atomdata_t *adat = nb->atdat;
838 cudaStream_t ls = nb->stream[eintLocal];
840 stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
841 CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
842 stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
843 CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
844 stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
845 CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
848 void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
850 nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
851 /* clear shift force array and energies if the outputs were
852 used in the current step */
853 if (flags & GMX_FORCE_VIRIAL)
855 nbnxn_cuda_clear_e_fshift(nb);
859 void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb,
860 const struct nbnxn_atomdata_t *nbat)
865 bool bDoTime = nb->bDoTime;
866 cu_timers_t *timers = nb->timers;
867 cu_atomdata_t *d_atdat = nb->atdat;
868 cudaStream_t ls = nb->stream[eintLocal];
870 natoms = nbat->natoms;
875 /* time async copy */
876 stat = cudaEventRecord(timers->start_atdat, ls);
877 CU_RET_ERR(stat, "cudaEventRecord failed");
880 /* need to reallocate if we have to copy more atoms than the amount of space
881 available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
882 if (natoms > d_atdat->nalloc)
884 nalloc = over_alloc_small(natoms);
886 /* free up first if the arrays have already been initialized */
887 if (d_atdat->nalloc != -1)
889 cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
890 cu_free_buffered(d_atdat->xq);
891 cu_free_buffered(d_atdat->atom_types);
894 stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
895 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
896 stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
897 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
899 stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
900 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
902 d_atdat->nalloc = nalloc;
906 d_atdat->natoms = natoms;
907 d_atdat->natoms_local = nbat->natoms_local;
909 /* need to clear GPU f output if realloc happened */
912 nbnxn_cuda_clear_f(nb, nalloc);
915 cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
916 natoms*sizeof(*d_atdat->atom_types), ls);
920 stat = cudaEventRecord(timers->stop_atdat, ls);
921 CU_RET_ERR(stat, "cudaEventRecord failed");
925 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam,
926 const gmx_device_info_t *dev_info)
930 if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
932 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
933 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
934 if (use_texobj(dev_info))
936 stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
937 CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
942 GMX_UNUSED_VALUE(dev_info);
943 stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
944 CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
946 cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
950 void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
953 cu_atomdata_t *atdat;
954 cu_nbparam_t *nbparam;
955 cu_plist_t *plist, *plist_nl;
958 /* Stopping the nvidia profiler here allows us to eliminate the subsequent
959 uninitialization API calls from the trace. */
960 if (getenv("NVPROF_ID") != NULL)
962 stat = cudaProfilerStop();
963 CU_RET_ERR(stat, "cudaProfilerStop failed");
972 nbparam = nb->nbparam;
973 plist = nb->plist[eintLocal];
974 plist_nl = nb->plist[eintNonlocal];
977 nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);
979 stat = cudaEventDestroy(nb->nonlocal_done);
980 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
981 stat = cudaEventDestroy(nb->misc_ops_done);
982 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
986 stat = cudaEventDestroy(timers->start_atdat);
987 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
988 stat = cudaEventDestroy(timers->stop_atdat);
989 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
991 /* The non-local counters/stream (second in the array) are needed only with DD. */
992 for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
994 stat = cudaEventDestroy(timers->start_nb_k[i]);
995 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
996 stat = cudaEventDestroy(timers->stop_nb_k[i]);
997 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
999 stat = cudaEventDestroy(timers->start_pl_h2d[i]);
1000 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
1001 stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
1002 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
1004 stat = cudaStreamDestroy(nb->stream[i]);
1005 CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
1007 stat = cudaEventDestroy(timers->start_nb_h2d[i]);
1008 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
1009 stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
1010 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
1012 stat = cudaEventDestroy(timers->start_nb_d2h[i]);
1013 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
1014 stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
1015 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
1019 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1020 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1021 if (use_texobj(nb->dev_info))
1023 stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
1024 CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
1029 stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
1030 CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
1032 cu_free_buffered(nbparam->nbfp);
1034 if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
1036 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1037 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1038 if (use_texobj(nb->dev_info))
1040 stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
1041 CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
1046 stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
1047 CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
1049 cu_free_buffered(nbparam->nbfp_comb);
1052 stat = cudaFree(atdat->shift_vec);
1053 CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
1054 stat = cudaFree(atdat->fshift);
1055 CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
1057 stat = cudaFree(atdat->e_lj);
1058 CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
1059 stat = cudaFree(atdat->e_el);
1060 CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
1062 cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
1063 cu_free_buffered(atdat->xq);
1064 cu_free_buffered(atdat->atom_types, &atdat->ntypes);
1066 cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
1067 cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
1068 cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
1069 if (nb->bUseTwoStreams)
1071 cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
1072 cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
1073 cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist->excl_nalloc);
1079 if (nb->bUseTwoStreams)
1089 fprintf(debug, "Cleaned up CUDA data structures.\n");
1093 void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
1096 cudaStream_t stream = nb->stream[iloc];
1098 stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
1099 CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
1102 gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
1104 return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
1107 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
1109 /* The NVPROF_ID environment variable is set by nvprof and indicates that
1110 mdrun is executed in the CUDA profiler.
1111 If nvprof was run is with "--profile-from-start off", the profiler will
1112 be started here. This way we can avoid tracing the CUDA events from the
1113 first part of the run. Starting the profiler again does nothing.
1115 if (getenv("NVPROF_ID") != NULL)
1118 stat = cudaProfilerStart();
1119 CU_RET_ERR(stat, "cudaProfilerStart failed");
1122 if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1124 init_timings(nbv->gpu_nbv->timings);
1128 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
1131 gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
1135 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
1137 return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
1138 (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));