2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
38 * \author Szilard Pall <pall.szilard@gmail.com>
49 #include <cuda_profiler_api.h>
51 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
52 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
53 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
54 #include "gromacs/legacyheaders/gmx_detect_hardware.h"
55 #include "gromacs/legacyheaders/typedefs.h"
56 #include "gromacs/legacyheaders/types/enums.h"
57 #include "gromacs/legacyheaders/types/force_flags.h"
58 #include "gromacs/legacyheaders/types/interaction_const.h"
59 #include "gromacs/mdlib/nb_verlet.h"
60 #include "gromacs/mdlib/nbnxn_consts.h"
61 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
62 #include "gromacs/pbcutil/ishift.h"
63 #include "gromacs/timing/gpu_timing.h"
64 #include "gromacs/utility/basedefinitions.h"
65 #include "gromacs/utility/cstringutil.h"
66 #include "gromacs/utility/fatalerror.h"
67 #include "gromacs/utility/smalloc.h"
69 #include "nbnxn_cuda_types.h"
71 static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
73 /* This is a heuristically determined parameter for the Fermi architecture for
74 * the minimum size of ci lists by multiplying this constant with the # of
75 * multiprocessors on the current device.
77 static unsigned int gpu_min_ci_balanced_factor = 40;
79 /* Functions from nbnxn_cuda.cu */
80 extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
81 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
82 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
83 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
85 /* We should actually be using md_print_warn in md_logging.c,
86 * but we can't include mpi.h in CUDA code.
88 static void md_print_warn(FILE *fplog,
95 /* We should only print to stderr on the master node,
96 * in most cases fplog is only set on the master node, so this works.
99 fprintf(stderr, "\n");
100 vfprintf(stderr, fmt, ap);
101 fprintf(stderr, "\n");
105 fprintf(fplog, "\n");
106 vfprintf(fplog, fmt, ap);
107 fprintf(fplog, "\n");
114 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
117 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam,
118 const gmx_device_info_t *dev_info);
121 /*! Tabulates the Ewald Coulomb force and initializes the size/scale
122 and the table GPU array. If called with an already allocated table,
123 it just re-uploads the table.
125 static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
127 const gmx_device_info_t *dev_info)
132 if (nbp->coulomb_tab != NULL)
134 nbnxn_cuda_free_nbparam_table(nbp, dev_info);
137 stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab));
138 CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
140 nbp->coulomb_tab = coul_tab;
142 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
143 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
144 if (dev_info->prop.major >= 3)
147 memset(&rd, 0, sizeof(rd));
148 rd.resType = cudaResourceTypeLinear;
149 rd.res.linear.devPtr = nbp->coulomb_tab;
150 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
151 rd.res.linear.desc.x = 32;
152 rd.res.linear.sizeInBytes = ic->tabq_size*sizeof(*coul_tab);
155 memset(&td, 0, sizeof(td));
156 td.readMode = cudaReadModeElementType;
157 stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
158 CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
161 #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
163 GMX_UNUSED_VALUE(dev_info);
164 cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
165 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
167 ic->tabq_size*sizeof(*coul_tab));
168 CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
171 cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab));
173 nbp->coulomb_tab_size = ic->tabq_size;
174 nbp->coulomb_tab_scale = ic->tabq_scale;
178 /*! Initializes the atomdata structure first time, it only gets filled at
180 static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
185 stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
186 CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
187 ad->bShiftVecUploaded = false;
189 stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
190 CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
192 stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
193 CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
194 stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
195 CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
197 /* initialize to NULL poiters to data that is not allocated here and will
198 need reallocation in nbnxn_cuda_init_atomdata */
202 /* size -1 indicates that the respective array hasn't been initialized yet */
207 /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
208 earlier GPUs, single or twin cut-off. */
209 static int pick_ewald_kernel_type(bool bTwinCut,
210 const gmx_device_info_t *dev_info)
212 bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
215 /* Benchmarking/development environment variables to force the use of
216 analytical or tabulated Ewald kernel. */
217 bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
218 bForceTabulatedEwald = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
220 if (bForceAnalyticalEwald && bForceTabulatedEwald)
222 gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
223 "requested through environment variables.");
226 /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
227 if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
229 bUseAnalyticalEwald = true;
233 fprintf(debug, "Using analytical Ewald CUDA kernels\n");
238 bUseAnalyticalEwald = false;
242 fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
246 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
247 forces it (use it for debugging/benchmarking only). */
248 if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
250 kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
254 kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
260 /*! Copies all parameters related to the cut-off from ic to nbp */
261 static void set_cutoff_parameters(cu_nbparam_t *nbp,
262 const interaction_const_t *ic)
264 nbp->ewald_beta = ic->ewaldcoeff_q;
265 nbp->sh_ewald = ic->sh_ewald;
266 nbp->epsfac = ic->epsfac;
267 nbp->two_k_rf = 2.0 * ic->k_rf;
268 nbp->c_rf = ic->c_rf;
269 nbp->rvdw_sq = ic->rvdw * ic->rvdw;
270 nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
271 nbp->rlist_sq = ic->rlist * ic->rlist;
273 nbp->sh_lj_ewald = ic->sh_lj_ewald;
274 nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
276 nbp->rvdw_switch = ic->rvdw_switch;
277 nbp->dispersion_shift = ic->dispersion_shift;
278 nbp->repulsion_shift = ic->repulsion_shift;
279 nbp->vdw_switch = ic->vdw_switch;
282 /*! Initializes the nonbonded parameter data structure. */
283 static void init_nbparam(cu_nbparam_t *nbp,
284 const interaction_const_t *ic,
285 const nbnxn_atomdata_t *nbat,
286 const gmx_device_info_t *dev_info)
289 int ntypes, nnbfp, nnbfp_comb;
291 ntypes = nbat->ntype;
293 set_cutoff_parameters(nbp, ic);
295 if (ic->vdwtype == evdwCUT)
297 switch (ic->vdw_modifier)
300 case eintmodPOTSHIFT:
301 nbp->vdwtype = evdwCuCUT;
303 case eintmodFORCESWITCH:
304 nbp->vdwtype = evdwCuFSWITCH;
306 case eintmodPOTSWITCH:
307 nbp->vdwtype = evdwCuPSWITCH;
310 gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
314 else if (ic->vdwtype == evdwPME)
316 if (ic->ljpme_comb_rule == ljcrGEOM)
318 assert(nbat->comb_rule == ljcrGEOM);
319 nbp->vdwtype = evdwCuEWALDGEOM;
323 assert(nbat->comb_rule == ljcrLB);
324 nbp->vdwtype = evdwCuEWALDLB;
329 gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
332 if (ic->eeltype == eelCUT)
334 nbp->eeltype = eelCuCUT;
336 else if (EEL_RF(ic->eeltype))
338 nbp->eeltype = eelCuRF;
340 else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
342 /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
343 nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
347 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
348 gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
351 /* generate table for PME */
352 nbp->coulomb_tab = NULL;
353 if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
355 init_ewald_coulomb_force_table(ic, nbp, dev_info);
358 nnbfp = 2*ntypes*ntypes;
359 nnbfp_comb = 2*ntypes;
361 stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
362 CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
363 cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
366 if (ic->vdwtype == evdwPME)
368 stat = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
369 CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
370 cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
373 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
374 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
375 if (dev_info->prop.major >= 3)
380 memset(&rd, 0, sizeof(rd));
381 rd.resType = cudaResourceTypeLinear;
382 rd.res.linear.devPtr = nbp->nbfp;
383 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
384 rd.res.linear.desc.x = 32;
385 rd.res.linear.sizeInBytes = nnbfp*sizeof(*nbp->nbfp);
387 memset(&td, 0, sizeof(td));
388 td.readMode = cudaReadModeElementType;
389 stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
390 CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
392 if (ic->vdwtype == evdwPME)
394 memset(&rd, 0, sizeof(rd));
395 rd.resType = cudaResourceTypeLinear;
396 rd.res.linear.devPtr = nbp->nbfp_comb;
397 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
398 rd.res.linear.desc.x = 32;
399 rd.res.linear.sizeInBytes = nnbfp_comb*sizeof(*nbp->nbfp_comb);
401 memset(&td, 0, sizeof(td));
402 td.readMode = cudaReadModeElementType;
403 stat = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
404 CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
408 #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
410 cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
411 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
412 nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
413 CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
415 if (ic->vdwtype == evdwPME)
417 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
418 nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
419 CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
424 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
425 * electrostatic type switching to twin cut-off (or back) if needed. */
426 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
427 const interaction_const_t *ic)
429 if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
433 gmx_nbnxn_cuda_t *nb = nbv->gpu_nbv;
434 cu_nbparam_t *nbp = nb->nbparam;
436 set_cutoff_parameters(nbp, ic);
438 nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
441 init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
444 /*! Initializes the pair list data structure. */
445 static void init_plist(cu_plist_t *pl)
447 /* initialize to NULL pointers to data that is not allocated here and will
448 need reallocation in nbnxn_gpu_init_pairlist */
453 /* size -1 indicates that the respective array hasn't been initialized yet */
460 pl->excl_nalloc = -1;
461 pl->bDoPrune = false;
464 /*! Initializes the timer data structure. */
465 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
468 int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
470 stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
471 CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
472 stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
473 CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
475 /* The non-local counters/stream (second in the array) are needed only with DD. */
476 for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
478 stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
479 CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
480 stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
481 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
484 stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
485 CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
486 stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
487 CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
489 stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
490 CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
491 stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
492 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
494 stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
495 CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
496 stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
497 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
501 /*! Initializes the timings data structure. */
502 static void init_timings(gmx_wallclock_gpu_t *t)
511 for (i = 0; i < 2; i++)
513 for (j = 0; j < 2; j++)
515 t->ktime[i][j].t = 0.0;
516 t->ktime[i][j].c = 0;
521 void nbnxn_gpu_init(FILE *fplog,
522 gmx_nbnxn_cuda_t **p_nb,
523 const gmx_gpu_info_t *gpu_info,
524 const gmx_gpu_opt_t *gpu_opt,
526 gmx_bool bLocalAndNonlocal)
529 gmx_nbnxn_cuda_t *nb;
531 bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
543 snew(nb->nbparam, 1);
544 snew(nb->plist[eintLocal], 1);
545 if (bLocalAndNonlocal)
547 snew(nb->plist[eintNonlocal], 1);
550 nb->bUseTwoStreams = bLocalAndNonlocal;
553 snew(nb->timings, 1);
556 pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
557 pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
558 pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
560 init_plist(nb->plist[eintLocal]);
562 /* set device info, just point it to the right GPU among the detected ones */
563 nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
565 /* local/non-local GPU streams */
566 stat = cudaStreamCreate(&nb->stream[eintLocal]);
567 CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
568 if (nb->bUseTwoStreams)
570 init_plist(nb->plist[eintNonlocal]);
572 /* CUDA stream priority available in the CUDA RT 5.5 API.
573 * Note that the device we're running on does not have to support
574 * priorities, because we are querying the priority range which in this
575 * case will be a single value.
577 #if GMX_CUDA_VERSION >= 5050
579 int highest_priority;
580 stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
581 CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
583 stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
586 CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
589 stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
590 CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
594 /* init events for sychronization (timing disabled for performance reasons!) */
595 stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
596 CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
597 stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
598 CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
600 /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
601 * (which increases with shorter time/step) caused by a known CUDA driver bug.
602 * To work around the issue we'll use an (admittedly fragile) memory polling
603 * waiting to preserve performance. This requires support for atomic
604 * operations and only works on x86/x86_64.
605 * With polling wait event-timing also needs to be disabled.
607 * The overhead is greatly reduced in API v5.0 drivers and the improvement
608 * is independent of runtime version. Hence, with API v5.0 drivers and later
609 * we won't switch to polling.
611 * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI,
612 * ranks so we will also disable it in that case.
615 bStreamSync = getenv("GMX_CUDA_STREAMSYNC") != NULL;
616 bNoStreamSync = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
621 bTMPIAtomics = false;
624 #ifdef GMX_TARGET_X86
630 if (bStreamSync && bNoStreamSync)
632 gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
635 stat = cudaDriverGetVersion(&cuda_drv_ver);
636 CU_RET_ERR(stat, "cudaDriverGetVersion failed");
638 bOldDriver = (cuda_drv_ver < 5000);
640 if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
642 /* Polling wait should be used instead of cudaStreamSynchronize only if:
643 * - ECC is ON & driver is old (checked above),
644 * - we're on x86/x86_64,
645 * - atomics are available, and
646 * - GPUs are not being shared.
648 bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
649 (gmx_count_gpu_dev_shared(gpu_opt) < 1));
653 nb->bUseStreamSync = true;
655 /* only warn if polling should be used */
656 if (bShouldUsePollSync)
659 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
660 " cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
665 nb->bUseStreamSync = !bShouldUsePollSync;
667 if (bShouldUsePollSync)
670 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
671 " cause performance loss. Switching to the alternative polling GPU wait.\n"
672 " If you encounter issues, switch back to standard GPU waiting by setting\n"
673 " the GMX_CUDA_STREAMSYNC environment variable.\n");
677 /* Tell the user that the ECC+old driver combination can be bad */
679 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
680 " A known bug in this driver version can cause performance loss.\n"
681 " However, the polling wait workaround can not be used because\n%s\n"
682 " Consider updating the driver or turning ECC off.",
683 (bX86 && bTMPIAtomics) ?
684 " GPU(s) are being oversubscribed." :
685 " atomic operations are not supported by the platform/CPU+compiler.");
686 md_print_warn(fplog, sbuf);
694 nb->bUseStreamSync = false;
697 "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
701 /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
702 nb->bUseStreamSync = true;
706 /* CUDA timing disabled as event timers don't work:
707 - with multiple streams = domain-decomposition;
708 - with the polling waiting hack (without cudaStreamSynchronize);
709 - when turned off by GMX_DISABLE_CUDA_TIMING.
711 nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
712 (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
716 init_timers(nb->timers, nb->bUseTwoStreams);
717 init_timings(nb->timings);
720 /* set the kernel type for the current GPU */
721 /* pick L1 cache configuration */
722 nbnxn_cuda_set_cacheconfig(nb->dev_info);
728 fprintf(debug, "Initialized CUDA data structures.\n");
732 void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t *nb,
733 const interaction_const_t *ic,
734 const nonbonded_verlet_group_t *nbv_group)
736 init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
737 init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
739 /* clear energy and shift force outputs */
740 nbnxn_cuda_clear_e_fshift(nb);
743 void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb,
744 const nbnxn_pairlist_t *h_plist,
749 bool bDoTime = nb->bDoTime;
750 cudaStream_t stream = nb->stream[iloc];
751 cu_plist_t *d_plist = nb->plist[iloc];
753 if (d_plist->na_c < 0)
755 d_plist->na_c = h_plist->na_ci;
759 if (d_plist->na_c != h_plist->na_ci)
761 sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
762 d_plist->na_c, h_plist->na_ci);
769 stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
770 CU_RET_ERR(stat, "cudaEventRecord failed");
773 cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
774 &d_plist->nsci, &d_plist->sci_nalloc,
778 cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
779 &d_plist->ncj4, &d_plist->cj4_nalloc,
783 cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
784 &d_plist->nexcl, &d_plist->excl_nalloc,
790 stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
791 CU_RET_ERR(stat, "cudaEventRecord failed");
794 /* need to prune the pair list during the next step */
795 d_plist->bDoPrune = true;
798 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb,
799 const nbnxn_atomdata_t *nbatom)
801 cu_atomdata_t *adat = nb->atdat;
802 cudaStream_t ls = nb->stream[eintLocal];
804 /* only if we have a dynamic box */
805 if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
807 cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
808 SHIFTS * sizeof(*adat->shift_vec), ls);
809 adat->bShiftVecUploaded = true;
813 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
814 static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
817 cu_atomdata_t *adat = nb->atdat;
818 cudaStream_t ls = nb->stream[eintLocal];
820 stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
821 CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
824 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
825 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
828 cu_atomdata_t *adat = nb->atdat;
829 cudaStream_t ls = nb->stream[eintLocal];
831 stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
832 CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
833 stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
834 CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
835 stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
836 CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
839 void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
841 nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
842 /* clear shift force array and energies if the outputs were
843 used in the current step */
844 if (flags & GMX_FORCE_VIRIAL)
846 nbnxn_cuda_clear_e_fshift(nb);
850 void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb,
851 const struct nbnxn_atomdata_t *nbat)
856 bool bDoTime = nb->bDoTime;
857 cu_timers_t *timers = nb->timers;
858 cu_atomdata_t *d_atdat = nb->atdat;
859 cudaStream_t ls = nb->stream[eintLocal];
861 natoms = nbat->natoms;
866 /* time async copy */
867 stat = cudaEventRecord(timers->start_atdat, ls);
868 CU_RET_ERR(stat, "cudaEventRecord failed");
871 /* need to reallocate if we have to copy more atoms than the amount of space
872 available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
873 if (natoms > d_atdat->nalloc)
875 nalloc = over_alloc_small(natoms);
877 /* free up first if the arrays have already been initialized */
878 if (d_atdat->nalloc != -1)
880 cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
881 cu_free_buffered(d_atdat->xq);
882 cu_free_buffered(d_atdat->atom_types);
885 stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
886 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
887 stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
888 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
890 stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
891 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
893 d_atdat->nalloc = nalloc;
897 d_atdat->natoms = natoms;
898 d_atdat->natoms_local = nbat->natoms_local;
900 /* need to clear GPU f output if realloc happened */
903 nbnxn_cuda_clear_f(nb, nalloc);
906 cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
907 natoms*sizeof(*d_atdat->atom_types), ls);
911 stat = cudaEventRecord(timers->stop_atdat, ls);
912 CU_RET_ERR(stat, "cudaEventRecord failed");
916 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam,
917 const gmx_device_info_t *dev_info)
921 if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
923 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
924 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
925 if (dev_info->prop.major >= 3)
927 stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
928 CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
933 GMX_UNUSED_VALUE(dev_info);
934 stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
935 CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
937 cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
941 void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
944 cu_atomdata_t *atdat;
945 cu_nbparam_t *nbparam;
946 cu_plist_t *plist, *plist_nl;
949 /* Stopping the nvidia profiler here allows us to eliminate the subsequent
950 uninitialization API calls from the trace. */
951 if (getenv("NVPROF_ID") != NULL)
953 stat = cudaProfilerStop();
954 CU_RET_ERR(stat, "cudaProfilerStop failed");
963 nbparam = nb->nbparam;
964 plist = nb->plist[eintLocal];
965 plist_nl = nb->plist[eintNonlocal];
968 nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);
970 stat = cudaEventDestroy(nb->nonlocal_done);
971 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
972 stat = cudaEventDestroy(nb->misc_ops_done);
973 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
977 stat = cudaEventDestroy(timers->start_atdat);
978 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
979 stat = cudaEventDestroy(timers->stop_atdat);
980 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
982 /* The non-local counters/stream (second in the array) are needed only with DD. */
983 for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
985 stat = cudaEventDestroy(timers->start_nb_k[i]);
986 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
987 stat = cudaEventDestroy(timers->stop_nb_k[i]);
988 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
990 stat = cudaEventDestroy(timers->start_pl_h2d[i]);
991 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
992 stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
993 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
995 stat = cudaStreamDestroy(nb->stream[i]);
996 CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
998 stat = cudaEventDestroy(timers->start_nb_h2d[i]);
999 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
1000 stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
1001 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
1003 stat = cudaEventDestroy(timers->start_nb_d2h[i]);
1004 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
1005 stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
1006 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
1010 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1011 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1012 if (nb->dev_info->prop.major >= 3)
1014 stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
1015 CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
1020 stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
1021 CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
1023 cu_free_buffered(nbparam->nbfp);
1025 if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
1027 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1028 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1029 if (nb->dev_info->prop.major >= 3)
1031 stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
1032 CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
1037 stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
1038 CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
1040 cu_free_buffered(nbparam->nbfp_comb);
1043 stat = cudaFree(atdat->shift_vec);
1044 CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
1045 stat = cudaFree(atdat->fshift);
1046 CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
1048 stat = cudaFree(atdat->e_lj);
1049 CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
1050 stat = cudaFree(atdat->e_el);
1051 CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
1053 cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
1054 cu_free_buffered(atdat->xq);
1055 cu_free_buffered(atdat->atom_types, &atdat->ntypes);
1057 cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
1058 cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
1059 cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
1060 if (nb->bUseTwoStreams)
1062 cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
1063 cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
1064 cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist->excl_nalloc);
1070 if (nb->bUseTwoStreams)
1080 fprintf(debug, "Cleaned up CUDA data structures.\n");
1084 void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
1087 cudaStream_t stream = nb->stream[iloc];
1089 stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
1090 CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
1093 gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
1095 return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
1098 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
1100 /* The NVPROF_ID environment variable is set by nvprof and indicates that
1101 mdrun is executed in the CUDA profiler.
1102 If nvprof was run is with "--profile-from-start off", the profiler will
1103 be started here. This way we can avoid tracing the CUDA events from the
1104 first part of the run. Starting the profiler again does nothing.
1106 if (getenv("NVPROF_ID") != NULL)
1109 stat = cudaProfilerStart();
1110 CU_RET_ERR(stat, "cudaProfilerStart failed");
1113 if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1115 init_timings(nbv->gpu_nbv->timings);
1119 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
1122 gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
1126 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
1128 return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
1129 (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));