/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 *  \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
 *
 *  \author Szilard Pall <pall.szilard@gmail.com>
 */
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_profiler_api.h>
#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
#include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
#include "gromacs/legacyheaders/gmx_detect_hardware.h"
#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/legacyheaders/types/enums.h"
#include "gromacs/legacyheaders/types/force_flags.h"
#include "gromacs/legacyheaders/types/interaction_const.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"

#include "nbnxn_cuda_types.h"
static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */

/* This is a heuristically determined parameter for the Fermi, Kepler
 * and Maxwell architectures: the minimum size of ci lists is obtained by
 * multiplying this constant by the # of multiprocessors on the current device.
 * Since the maximum number of resident blocks per multiprocessor is 16, the
 * ideal total count for small systems is 32 or 48 blocks per multiprocessor.
 * Because the generated block counts fluctuate a bit, we use a target of 44
 * instead of the ideal value of 48.
 */
static unsigned int gpu_min_ci_balanced_factor = 44;
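
/* For instance, on a device with 16 multiprocessors,
 * nbnxn_gpu_min_ci_balanced() (defined at the end of this file) returns
 * 44*16 = 704, i.e. the pair search is asked to produce at least ~704 ci
 * list entries so that every multiprocessor stays busy.
 */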

/* Functions from nbnxn_cuda.cu */
extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
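
/* These texture references are the legacy fetch path for the non-bonded
 * parameter and Ewald tables, used on devices with CC < 3.0; on CC >= 3.0
 * the code below creates texture objects instead (see use_texobj()).
 */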

/* We should actually be using md_print_warn in md_logging.c,
 * but we can't include mpi.h in CUDA code.
 */
static void md_print_warn(FILE       *fplog,
                          const char *fmt, ...)
{
    va_list ap;

    if (fplog != NULL)
    {
        /* We should only print to stderr on the master node;
         * in most cases fplog is only set on the master node, so this works. */
        va_start(ap, fmt);
        fprintf(stderr, "\n");
        vfprintf(stderr, fmt, ap);
        fprintf(stderr, "\n");
        va_end(ap);

        va_start(ap, fmt);
        fprintf(fplog, "\n");
        vfprintf(fplog, fmt, ap);
        fprintf(fplog, "\n");
        va_end(ap);
    }
}

/* Forward declarations */
static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);

static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
                                          const gmx_device_info_t *dev_info);

#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
static bool use_texobj(const gmx_device_info_t *dev_info)
{
    /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
    return (dev_info->prop.major >= 3);
}
#endif /* HAVE_CUDA_TEXOBJ_SUPPORT */

/*! Tabulates the Ewald Coulomb force and initializes the size/scale
    and the table GPU array. If called with an already allocated table,
    it just re-uploads the table.
 */
static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
                                           cu_nbparam_t              *nbp,
                                           const gmx_device_info_t   *dev_info)
{
    float       *coul_tab;
    cudaError_t  stat;

    if (nbp->coulomb_tab != NULL)
    {
        nbnxn_cuda_free_nbparam_table(nbp, dev_info);
    }

    stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab));
    CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");

    nbp->coulomb_tab = coul_tab;

#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
    /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
    if (use_texobj(dev_info))
    {
        cudaResourceDesc rd;
        memset(&rd, 0, sizeof(rd));
        rd.resType                = cudaResourceTypeLinear;
        rd.res.linear.devPtr      = nbp->coulomb_tab;
        rd.res.linear.desc.f      = cudaChannelFormatKindFloat;
        rd.res.linear.desc.x      = 32;
        rd.res.linear.sizeInBytes = ic->tabq_size*sizeof(*coul_tab);

        cudaTextureDesc td;
        memset(&td, 0, sizeof(td));
        td.readMode = cudaReadModeElementType;
        stat        = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
        CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
    }
    else
#endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
    {
        GMX_UNUSED_VALUE(dev_info);
        cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
                               coul_tab, &cd,
                               ic->tabq_size*sizeof(*coul_tab));
        CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
    }

    cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab));

    nbp->coulomb_tab_size  = ic->tabq_size;
    nbp->coulomb_tab_scale = ic->tabq_scale;
}
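
/* Note: the tabulated-Ewald kernels fetch this table through
 * coulomb_tab_texobj (or coulomb_tab_texref on CC < 3.0);
 * coulomb_tab_scale is stored alongside so the kernels can convert
 * distances into table indices.
 */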

/*! Initializes the atomdata structure the first time; it only gets filled
    at pair-search. */
static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
{
    cudaError_t stat;

    ad->ntypes = ntypes;
    stat       = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
    CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
    ad->bShiftVecUploaded = false;

    stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
    CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");

    stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
    stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");

    /* initialize to NULL pointers to data that is not allocated here and will
       need reallocation in nbnxn_cuda_init_atomdata */
    ad->xq = NULL;
    ad->f  = NULL;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    ad->natoms = -1;
    ad->nalloc = -1;
}

/*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
    earlier GPUs, single or twin cut-off. */
static int pick_ewald_kernel_type(bool                     bTwinCut,
                                  const gmx_device_info_t *dev_info)
{
    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
    int  kernel_type;

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
    bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);

    if (bForceAnalyticalEwald && bForceTabulatedEwald)
    {
        gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
                   "requested through environment variables.");
    }

    /* By default, use analytical Ewald on SM 3.0 and later, tabulated on earlier GPUs. */
    if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
    {
        bUseAnalyticalEwald = true;

        if (debug)
        {
            fprintf(debug, "Using analytical Ewald CUDA kernels\n");
        }
    }
    else
    {
        bUseAnalyticalEwald = false;

        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
    {
        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
    }
    else
    {
        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
    }

    return kernel_type;
}
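
/* In short, the selection above yields one of four kernel flavors:
 *   eelCuEWALD_ANA / eelCuEWALD_TAB           - single cut-off (rcoulomb == rvdw)
 *   eelCuEWALD_ANA_TWIN / eelCuEWALD_TAB_TWIN - twin cut-off (rcoulomb != rvdw)
 */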

/*! Copies all parameters related to the cut-off from ic to nbp */
static void set_cutoff_parameters(cu_nbparam_t              *nbp,
                                  const interaction_const_t *ic)
{
    nbp->ewald_beta  = ic->ewaldcoeff_q;
    nbp->sh_ewald    = ic->sh_ewald;
    nbp->epsfac      = ic->epsfac;
    nbp->two_k_rf    = 2.0 * ic->k_rf;
    nbp->c_rf        = ic->c_rf;
    nbp->rvdw_sq     = ic->rvdw * ic->rvdw;
    nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
    nbp->rlist_sq    = ic->rlist * ic->rlist;

    nbp->sh_lj_ewald   = ic->sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;

    nbp->rvdw_switch      = ic->rvdw_switch;
    nbp->dispersion_shift = ic->dispersion_shift;
    nbp->repulsion_shift  = ic->repulsion_shift;
    nbp->vdw_switch       = ic->vdw_switch;
}

/*! Initializes the nonbonded parameter data structure. */
static void init_nbparam(cu_nbparam_t              *nbp,
                         const interaction_const_t *ic,
                         const nbnxn_atomdata_t    *nbat,
                         const gmx_device_info_t   *dev_info)
{
    cudaError_t stat;
    int         ntypes, nnbfp, nnbfp_comb;

    ntypes = nbat->ntype;

    set_cutoff_parameters(nbp, ic);

    if (ic->vdwtype == evdwCUT)
    {
        switch (ic->vdw_modifier)
        {
            case eintmodNONE:
            case eintmodPOTSHIFT:
                nbp->vdwtype = evdwCuCUT;
                break;
            case eintmodFORCESWITCH:
                nbp->vdwtype = evdwCuFSWITCH;
                break;
            case eintmodPOTSWITCH:
                nbp->vdwtype = evdwCuPSWITCH;
                break;
            default:
                gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
                break;
        }
    }
    else if (ic->vdwtype == evdwPME)
    {
        if (ic->ljpme_comb_rule == ljcrGEOM)
        {
            assert(nbat->comb_rule == ljcrGEOM);
            nbp->vdwtype = evdwCuEWALDGEOM;
        }
        else
        {
            assert(nbat->comb_rule == ljcrLB);
            nbp->vdwtype = evdwCuEWALDLB;
        }
    }
    else
    {
        gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
    }

    if (ic->eeltype == eelCUT)
    {
        nbp->eeltype = eelCuCUT;
    }
    else if (EEL_RF(ic->eeltype))
    {
        nbp->eeltype = eelCuRF;
    }
    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
    {
        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
        nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing the Verlet scheme */
        gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
    }

    /* generate table for PME */
    nbp->coulomb_tab = NULL;
    if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
    {
        init_ewald_coulomb_force_table(ic, nbp, dev_info);
    }

    nnbfp      = 2*ntypes*ntypes;
    nnbfp_comb = 2*ntypes;
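
    /* The factor of 2 reflects the table layouts: nbfp is a flattened
     * ntypes x ntypes matrix of (C6, C12) LJ parameter pairs, and nbfp_comb
     * holds one (C6, C12) pair per atom type for the LJ-PME
     * combination-rule kernels.
     */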
    stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
    CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
    cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));

    if (ic->vdwtype == evdwPME)
    {
        stat = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
        CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
        cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
    }

#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
    /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
    if (use_texobj(dev_info))
    {
        cudaResourceDesc rd;
        cudaTextureDesc  td;

        memset(&rd, 0, sizeof(rd));
        rd.resType                = cudaResourceTypeLinear;
        rd.res.linear.devPtr      = nbp->nbfp;
        rd.res.linear.desc.f      = cudaChannelFormatKindFloat;
        rd.res.linear.desc.x      = 32;
        rd.res.linear.sizeInBytes = nnbfp*sizeof(*nbp->nbfp);

        memset(&td, 0, sizeof(td));
        td.readMode = cudaReadModeElementType;
        stat        = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
        CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");

        if (ic->vdwtype == evdwPME)
        {
            memset(&rd, 0, sizeof(rd));
            rd.resType                = cudaResourceTypeLinear;
            rd.res.linear.devPtr      = nbp->nbfp_comb;
            rd.res.linear.desc.f      = cudaChannelFormatKindFloat;
            rd.res.linear.desc.x      = 32;
            rd.res.linear.sizeInBytes = nnbfp_comb*sizeof(*nbp->nbfp_comb);

            memset(&td, 0, sizeof(td));
            td.readMode = cudaReadModeElementType;
            stat        = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
            CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
        }
    }
    else
#endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
    {
        cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
                               nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
        CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");

        if (ic->vdwtype == evdwPME)
        {
            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
                                   nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
            CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
        }
    }
}

/*! Re-generates the GPU Ewald force table, resets rlist, and updates the
 *  electrostatics type, switching to twin cut-off (or back) if needed. */
void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t  *nbv,
                                        const interaction_const_t *ic)
{
    if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
    {
        return;
    }
    gmx_nbnxn_cuda_t *nb  = nbv->gpu_nbv;
    cu_nbparam_t     *nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic);

    nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
                                          nb->dev_info);

    init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
}
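
/* PME load balancing scales rcoulomb (together with the Ewald coefficient)
 * while rvdw stays fixed, so the kernel flavor has to be re-picked here with
 * bTwinCut = (ic->rcoulomb != ic->rvdw) and the force table re-tabulated
 * for the new coefficient.
 */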

/*! Initializes the pair list data structure. */
static void init_plist(cu_plist_t *pl)
{
    /* initialize to NULL pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci  = NULL;
    pl->cj4  = NULL;
    pl->excl = NULL;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c        = -1;
    pl->nsci        = -1;
    pl->sci_nalloc  = -1;
    pl->ncj4        = -1;
    pl->cj4_nalloc  = -1;
    pl->nexcl       = -1;
    pl->excl_nalloc = -1;
    pl->bDoPrune    = false;
}

/*! Initializes the timer data structure. */
static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
{
    cudaError_t stat;
    int         eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );

    stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
    CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
    stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
    CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");

    /* The non-local counters/stream (second in the array) are needed only with DD. */
    for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
    {
        stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
        stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");

        stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
        stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");

        stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
        stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");

        stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
        stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
    }
}

/*! Initializes the timings data structure. */
static void init_timings(gmx_wallclock_gpu_t *t)
{
    int i, j;

    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;

    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
}

void nbnxn_gpu_init(FILE                 *fplog,
                    gmx_nbnxn_cuda_t    **p_nb,
                    const gmx_gpu_info_t *gpu_info,
                    const gmx_gpu_opt_t  *gpu_opt,
                    int                   my_gpu_index,
                    gmx_bool              bLocalAndNonlocal)
{
    cudaError_t       stat;
    gmx_nbnxn_cuda_t *nb;
    char              sbuf[STRLEN];
    bool              bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
    int               cuda_drv_ver;

    assert(gpu_info);

    if (p_nb == NULL)
    {
        return;
    }

    snew(nb, 1);
    snew(nb->atdat, 1);
    snew(nb->nbparam, 1);
    snew(nb->plist[eintLocal], 1);
    if (bLocalAndNonlocal)
    {
        snew(nb->plist[eintNonlocal], 1);
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    snew(nb->timers, 1);
    snew(nb->timings, 1);

    /* init nbst */
    pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
    pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
    pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
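
    /* nbst is the host-side staging buffer for energies and shift forces;
     * pmalloc (pmalloc_cuda.h) returns page-locked memory so that the
     * device-to-host copies of these results can run asynchronously.
     */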

    init_plist(nb->plist[eintLocal]);

    /* set device info, just point it to the right GPU among the detected ones */
    nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];

    /* local/non-local GPU streams */
    stat = cudaStreamCreate(&nb->stream[eintLocal]);
    CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[eintNonlocal]);

        /* CUDA stream priority available in the CUDA RT 5.5 API.
         * Note that the device we're running on does not have to support
         * priorities, because we are querying the priority range, which in
         * that case will be a single value.
         */
#if GMX_CUDA_VERSION >= 5050
        {
            int highest_priority;
            stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
            CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");

            stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
                                                cudaStreamDefault,
                                                highest_priority);
            CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
        }
#else
        stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
        CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
#endif
    }
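
    /* Creating the non-local stream at the highest available priority is
     * intended to let non-local kernels and transfers be scheduled ahead of
     * the (typically longer) local work, so that the halo results needed for
     * domain-decomposition communication are ready as early as possible.
     */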

    /* init events for synchronization (timing disabled for performance reasons!) */
    stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
    CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
    stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_done failed");

    /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
     * (which increases with shorter time/step) caused by a known CUDA driver bug.
     * To work around the issue we'll use an (admittedly fragile) memory polling
     * wait to preserve performance. This requires support for atomic
     * operations and only works on x86/x86_64.
     * With polling wait, event timing also needs to be disabled.
     *
     * The overhead is greatly reduced in API v5.0 drivers and the improvement
     * is independent of runtime version. Hence, with API v5.0 drivers and later
     * we won't switch to polling.
     *
     * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI
     * ranks, so we will also disable it in that case.
     */

    bStreamSync   = getenv("GMX_CUDA_STREAMSYNC") != NULL;
    bNoStreamSync = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;

#ifdef TMPI_ATOMICS
    bTMPIAtomics = true;
#else
    bTMPIAtomics = false;
#endif

#ifdef GMX_TARGET_X86
    bX86 = true;
#else
    bX86 = false;
#endif

    if (bStreamSync && bNoStreamSync)
    {
        gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
    }

    stat = cudaDriverGetVersion(&cuda_drv_ver);
    CU_RET_ERR(stat, "cudaDriverGetVersion failed");

    bOldDriver = (cuda_drv_ver < 5000);

    if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
    {
        /* Polling wait should be used instead of cudaStreamSynchronize only if:
         *   - ECC is ON & driver is old (checked above),
         *   - we're on x86/x86_64,
         *   - atomics are available, and
         *   - GPUs are not being shared.
         */
        bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
                                   (gmx_count_gpu_dev_shared(gpu_opt) < 1));

        if (bStreamSync)
        {
            nb->bUseStreamSync = true;

            /* only warn if polling should be used */
            if (bShouldUsePollSync)
            {
                md_print_warn(fplog,
                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
                              "      cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
            }
        }
        else
        {
            nb->bUseStreamSync = !bShouldUsePollSync;

            if (bShouldUsePollSync)
            {
                md_print_warn(fplog,
                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
                              "      cause performance loss. Switching to the alternative polling GPU wait.\n"
                              "      If you encounter issues, switch back to standard GPU waiting by setting\n"
                              "      the GMX_CUDA_STREAMSYNC environment variable.\n");
            }
            else
            {
                /* Tell the user that the ECC+old driver combination can be bad */
                sprintf(sbuf,
                        "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
                        "      A known bug in this driver version can cause performance loss.\n"
                        "      However, the polling wait workaround can not be used because\n%s\n"
                        "      Consider updating the driver or turning ECC off.",
                        (bX86 && bTMPIAtomics) ?
                        "      GPU(s) are being oversubscribed." :
                        "      atomic operations are not supported by the platform/CPU+compiler.");
                md_print_warn(fplog, sbuf);
            }
        }
    }
    else
    {
        if (bNoStreamSync)
        {
            nb->bUseStreamSync = false;

            md_print_warn(fplog,
                          "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
        }
        else
        {
            /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
            nb->bUseStreamSync = true;
        }
    }

    /* CUDA timing disabled as event timers don't work:
       - with multiple streams = domain-decomposition;
       - with the polling waiting hack (without cudaStreamSynchronize);
       - when turned off by GMX_DISABLE_CUDA_TIMING.
     */
    nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
                   (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));

    if (nb->bDoTime)
    {
        init_timers(nb->timers, nb->bUseTwoStreams);
        init_timings(nb->timings);
    }

    /* set the kernel type for the current GPU */
    /* pick L1 cache configuration */
    nbnxn_cuda_set_cacheconfig(nb->dev_info);

    *p_nb = nb;

    if (debug)
    {
        fprintf(debug, "Initialized CUDA data structures.\n");
    }
}

void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t               *nb,
                          const interaction_const_t      *ic,
                          const nonbonded_verlet_group_t *nbv_group)
{
    init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);

    /* clear energy and shift force outputs */
    nbnxn_cuda_clear_e_fshift(nb);
}

void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
                             const nbnxn_pairlist_t *h_plist,
                             int                     iloc)
{
    char          sbuf[STRLEN];
    cudaError_t   stat;
    bool          bDoTime = nb->bDoTime;
    cudaStream_t  stream  = nb->stream[iloc];
    cu_plist_t   *d_plist = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c, h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    if (bDoTime)
    {
        stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
                        &d_plist->nsci, &d_plist->sci_nalloc,
                        h_plist->nsci,
                        stream, true);

    cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
                        &d_plist->ncj4, &d_plist->cj4_nalloc,
                        h_plist->ncj4,
                        stream, true);

    cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
                        &d_plist->nexcl, &d_plist->excl_nalloc,
                        h_plist->nexcl,
                        stream, true);

    if (bDoTime)
    {
        stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    /* need to prune the pair list during the next step */
    d_plist->bDoPrune = true;
}
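
/* A note on cu_realloc_buffered (cudautils.cuh) as used above: it copies the
 * host data to the device buffer, first growing the device allocation
 * whenever the requested count exceeds the stored *_nalloc capacity; the
 * trailing (stream, true) arguments make the H2D copies asynchronous in the
 * given stream.
 */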

void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t       *nb,
                               const nbnxn_atomdata_t *nbatom)
{
    cu_atomdata_t *adat = nb->atdat;
    cudaStream_t   ls   = nb->stream[eintLocal];

    /* only if we have a dynamic box */
    if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
    {
        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
                          SHIFTS * sizeof(*adat->shift_vec), ls);
        adat->bShiftVecUploaded = true;
    }
}

/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
{
    cudaError_t    stat;
    cu_atomdata_t *adat = nb->atdat;
    cudaStream_t   ls   = nb->stream[eintLocal];

    stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
    CU_RET_ERR(stat, "cudaMemsetAsync on f failed");
}

/*! Clears nonbonded shift force output array and energy outputs on the GPU. */
static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
{
    cudaError_t    stat;
    cu_atomdata_t *adat = nb->atdat;
    cudaStream_t   ls   = nb->stream[eintLocal];

    stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
    CU_RET_ERR(stat, "cudaMemsetAsync on fshift failed");
    stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
    CU_RET_ERR(stat, "cudaMemsetAsync on e_lj failed");
    stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
    CU_RET_ERR(stat, "cudaMemsetAsync on e_el failed");
}

void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
{
    nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
    /* clear shift force array and energies if the outputs were
       used in the current step */
    if (flags & GMX_FORCE_VIRIAL)
    {
        nbnxn_cuda_clear_e_fshift(nb);
    }
}

void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
                             const struct nbnxn_atomdata_t *nbat)
{
    cudaError_t    stat;
    int            nalloc, natoms;
    bool           realloced;
    bool           bDoTime  = nb->bDoTime;
    cu_timers_t   *timers   = nb->timers;
    cu_atomdata_t *d_atdat  = nb->atdat;
    cudaStream_t   ls       = nb->stream[eintLocal];

    natoms    = nbat->natoms;
    realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        stat = cudaEventRecord(timers->start_atdat, ls);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. d_atdat->natoms == -1 */
    if (natoms > d_atdat->nalloc)
    {
        nalloc = over_alloc_small(natoms);

        /* free up first if the arrays have already been initialized */
        if (d_atdat->nalloc != -1)
        {
            cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
            cu_free_buffered(d_atdat->xq);
            cu_free_buffered(d_atdat->atom_types);
        }

        stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
        stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");

        stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");

        d_atdat->nalloc = nalloc;
        realloced       = true;
    }

    d_atdat->natoms       = natoms;
    d_atdat->natoms_local = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        nbnxn_cuda_clear_f(nb, nalloc);
    }

    cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
                      natoms*sizeof(*d_atdat->atom_types), ls);

    if (bDoTime)
    {
        stat = cudaEventRecord(timers->stop_atdat, ls);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }
}
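
/* A sketch of the reallocation policy above: over_alloc_small() requests
 * somewhat more space than currently needed (roughly a 20% margin), so that
 * small fluctuations in the local atom count under domain decomposition do
 * not trigger a free/malloc cycle at every repartitioning.
 */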

static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
                                          const gmx_device_info_t *dev_info)
{
    cudaError_t stat;

    if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
    {
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
        /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
        if (use_texobj(dev_info))
        {
            stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
            CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
        }
        else
#endif
        {
            GMX_UNUSED_VALUE(dev_info);
            stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
            CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
        }

        cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
    }
}

void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
{
    cudaError_t      stat;
    cu_atomdata_t   *atdat;
    cu_nbparam_t    *nbparam;
    cu_plist_t      *plist, *plist_nl;
    cu_timers_t     *timers;

    /* Stopping the nvidia profiler here allows us to eliminate the subsequent
       uninitialization API calls from the trace. */
    if (getenv("NVPROF_ID") != NULL)
    {
        stat = cudaProfilerStop();
        CU_RET_ERR(stat, "cudaProfilerStop failed");
    }

    if (nb == NULL)
    {
        return;
    }

    atdat    = nb->atdat;
    nbparam  = nb->nbparam;
    plist    = nb->plist[eintLocal];
    plist_nl = nb->plist[eintNonlocal];
    timers   = nb->timers;

    nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);

    stat = cudaEventDestroy(nb->nonlocal_done);
    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
    stat = cudaEventDestroy(nb->misc_ops_done);
    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");

    if (nb->bDoTime)
    {
        stat = cudaEventDestroy(timers->start_atdat);
        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
        stat = cudaEventDestroy(timers->stop_atdat);
        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");

        /* The non-local counters/stream (second in the array) are needed only with DD. */
        for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
        {
            stat = cudaEventDestroy(timers->start_nb_k[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
            stat = cudaEventDestroy(timers->stop_nb_k[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");

            stat = cudaEventDestroy(timers->start_pl_h2d[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
            stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");

            stat = cudaStreamDestroy(nb->stream[i]);
            CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");

            stat = cudaEventDestroy(timers->start_nb_h2d[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
            stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");

            stat = cudaEventDestroy(timers->start_nb_d2h[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
            stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
        }
    }

#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
    /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
    if (use_texobj(nb->dev_info))
    {
        stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
        CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
    }
    else
#endif
    {
        stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
        CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
    }
    cu_free_buffered(nbparam->nbfp);

    if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
    {
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
        /* Only devices with CC >= 3.0 (Kepler and later) support texture objects */
        if (use_texobj(nb->dev_info))
        {
            stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
            CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
        }
        else
#endif
        {
            stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
            CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
        }
        cu_free_buffered(nbparam->nbfp_comb);
    }

    stat = cudaFree(atdat->shift_vec);
    CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
    stat = cudaFree(atdat->fshift);
    CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");

    stat = cudaFree(atdat->e_lj);
    CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
    stat = cudaFree(atdat->e_el);
    CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");

    cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
    cu_free_buffered(atdat->xq);
    cu_free_buffered(atdat->atom_types, &atdat->ntypes);

    cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
    cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
    cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
    if (nb->bUseTwoStreams)
    {
        cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
        cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
        cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist_nl->excl_nalloc);
    }

    sfree(atdat);
    sfree(nbparam);
    sfree(plist);
    if (nb->bUseTwoStreams)
    {
        sfree(plist_nl);
    }
    sfree(timers);
    sfree(nb->timings);
    sfree(nb);

    if (debug)
    {
        fprintf(debug, "Cleaned up CUDA data structures.\n");
    }
}

void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
{
    cudaError_t  stat;
    cudaStream_t stream = nb->stream[iloc];

    stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
    CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
}
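
/* This enqueues a cross-stream dependency: work subsequently submitted to
 * stream[iloc] will not start on the device until the atomdata H2D copy,
 * recorded as the stop_atdat event on the local stream, has completed.
 */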

gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
{
    return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
}

void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    /* The NVPROF_ID environment variable is set by nvprof and indicates that
       mdrun is executed in the CUDA profiler.
       If nvprof was run with "--profile-from-start off", the profiler will
       be started here. This way we can avoid tracing the CUDA events from the
       first part of the run. Starting the profiler again does nothing.
     */
    if (getenv("NVPROF_ID") != NULL)
    {
        cudaError_t stat;
        stat = cudaProfilerStart();
        CU_RET_ERR(stat, "cudaProfilerStart failed");
    }

    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
{
    return nb != NULL ?
           gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
}

gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
{
    return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
            (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
}