2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
38 * \author Szilard Pall <pall.szilard@gmail.com>
49 #include <cuda_profiler_api.h>
51 #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
52 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
53 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
54 #include "gromacs/legacyheaders/gmx_detect_hardware.h"
55 #include "gromacs/legacyheaders/tables.h"
56 #include "gromacs/legacyheaders/typedefs.h"
57 #include "gromacs/legacyheaders/types/enums.h"
58 #include "gromacs/legacyheaders/types/force_flags.h"
59 #include "gromacs/legacyheaders/types/interaction_const.h"
60 #include "gromacs/mdlib/nb_verlet.h"
61 #include "gromacs/mdlib/nbnxn_consts.h"
62 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
63 #include "gromacs/pbcutil/ishift.h"
64 #include "gromacs/timing/gpu_timing.h"
65 #include "gromacs/utility/basedefinitions.h"
66 #include "gromacs/utility/cstringutil.h"
67 #include "gromacs/utility/fatalerror.h"
68 #include "gromacs/utility/smalloc.h"
70 #include "nbnxn_cuda_types.h"
72 static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
74 /* This is a heuristically determined parameter for the Fermi architecture for
75 * the minimum size of ci lists by multiplying this constant with the # of
76 * multiprocessors on the current device.
78 static unsigned int gpu_min_ci_balanced_factor = 40;
80 /* Functions from nbnxn_cuda.cu */
81 extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
82 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
83 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
84 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
86 /* We should actually be using md_print_warn in md_logging.c,
87 * but we can't include mpi.h in CUDA code.
89 static void md_print_warn(FILE *fplog,
96 /* We should only print to stderr on the master node,
97 * in most cases fplog is only set on the master node, so this works.
100 fprintf(stderr, "\n");
101 vfprintf(stderr, fmt, ap);
102 fprintf(stderr, "\n");
106 fprintf(fplog, "\n");
107 vfprintf(fplog, fmt, ap);
108 fprintf(fplog, "\n");
115 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
118 /*! Tabulates the Ewald Coulomb force and initializes the size/scale
119 and the table GPU array. If called with an already allocated table,
120 it just re-uploads the table.
122 static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp,
123 const gmx_device_info_t *dev_info)
125 float *ftmp, *coul_tab;
130 tabsize = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
131 /* Subtract 2 iso 1 to avoid access out of range due to rounding */
132 tabscale = (tabsize - 2) / sqrt(nbp->rcoulomb_sq);
134 pmalloc((void**)&ftmp, tabsize*sizeof(*ftmp));
136 table_spline3_fill_ewald_lr(ftmp, NULL, NULL, tabsize,
137 1/tabscale, nbp->ewald_beta, v_q_ewald_lr);
139 /* If the table pointer == NULL the table is generated the first time =>
140 the array pointer will be saved to nbparam and the texture is bound.
142 coul_tab = nbp->coulomb_tab;
143 if (coul_tab == NULL)
145 stat = cudaMalloc((void **)&coul_tab, tabsize*sizeof(*coul_tab));
146 CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
148 nbp->coulomb_tab = coul_tab;
150 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
151 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
152 if (dev_info->prop.major >= 3)
155 memset(&rd, 0, sizeof(rd));
156 rd.resType = cudaResourceTypeLinear;
157 rd.res.linear.devPtr = nbp->coulomb_tab;
158 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
159 rd.res.linear.desc.x = 32;
160 rd.res.linear.sizeInBytes = tabsize*sizeof(*coul_tab);
163 memset(&td, 0, sizeof(td));
164 td.readMode = cudaReadModeElementType;
165 stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
166 CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
169 #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
171 GMX_UNUSED_VALUE(dev_info);
172 cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
173 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
174 coul_tab, &cd, tabsize*sizeof(*coul_tab));
175 CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
179 cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab));
181 nbp->coulomb_tab_size = tabsize;
182 nbp->coulomb_tab_scale = tabscale;
188 /*! Initializes the atomdata structure first time, it only gets filled at
190 static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
195 stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
196 CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
197 ad->bShiftVecUploaded = false;
199 stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
200 CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
202 stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
203 CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
204 stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
205 CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
207 /* initialize to NULL poiters to data that is not allocated here and will
208 need reallocation in nbnxn_cuda_init_atomdata */
212 /* size -1 indicates that the respective array hasn't been initialized yet */
217 /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
218 earlier GPUs, single or twin cut-off. */
219 static int pick_ewald_kernel_type(bool bTwinCut,
220 const gmx_device_info_t *dev_info)
222 bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
225 /* Benchmarking/development environment variables to force the use of
226 analytical or tabulated Ewald kernel. */
227 bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
228 bForceTabulatedEwald = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
230 if (bForceAnalyticalEwald && bForceTabulatedEwald)
232 gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
233 "requested through environment variables.");
236 /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
237 if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
239 bUseAnalyticalEwald = true;
243 fprintf(debug, "Using analytical Ewald CUDA kernels\n");
248 bUseAnalyticalEwald = false;
252 fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
256 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
257 forces it (use it for debugging/benchmarking only). */
258 if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
260 kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
264 kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
270 /*! Copies all parameters related to the cut-off from ic to nbp */
271 static void set_cutoff_parameters(cu_nbparam_t *nbp,
272 const interaction_const_t *ic)
274 nbp->ewald_beta = ic->ewaldcoeff_q;
275 nbp->sh_ewald = ic->sh_ewald;
276 nbp->epsfac = ic->epsfac;
277 nbp->two_k_rf = 2.0 * ic->k_rf;
278 nbp->c_rf = ic->c_rf;
279 nbp->rvdw_sq = ic->rvdw * ic->rvdw;
280 nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
281 nbp->rlist_sq = ic->rlist * ic->rlist;
283 nbp->sh_lj_ewald = ic->sh_lj_ewald;
284 nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
286 nbp->rvdw_switch = ic->rvdw_switch;
287 nbp->dispersion_shift = ic->dispersion_shift;
288 nbp->repulsion_shift = ic->repulsion_shift;
289 nbp->vdw_switch = ic->vdw_switch;
292 /*! Initializes the nonbonded parameter data structure. */
293 static void init_nbparam(cu_nbparam_t *nbp,
294 const interaction_const_t *ic,
295 const nbnxn_atomdata_t *nbat,
296 const gmx_device_info_t *dev_info)
299 int ntypes, nnbfp, nnbfp_comb;
301 ntypes = nbat->ntype;
303 set_cutoff_parameters(nbp, ic);
305 if (ic->vdwtype == evdwCUT)
307 switch (ic->vdw_modifier)
310 case eintmodPOTSHIFT:
311 nbp->vdwtype = evdwCuCUT;
313 case eintmodFORCESWITCH:
314 nbp->vdwtype = evdwCuFSWITCH;
316 case eintmodPOTSWITCH:
317 nbp->vdwtype = evdwCuPSWITCH;
320 gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
324 else if (ic->vdwtype == evdwPME)
326 if (ic->ljpme_comb_rule == ljcrGEOM)
328 assert(nbat->comb_rule == ljcrGEOM);
329 nbp->vdwtype = evdwCuEWALDGEOM;
333 assert(nbat->comb_rule == ljcrLB);
334 nbp->vdwtype = evdwCuEWALDLB;
339 gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
342 if (ic->eeltype == eelCUT)
344 nbp->eeltype = eelCuCUT;
346 else if (EEL_RF(ic->eeltype))
348 nbp->eeltype = eelCuRF;
350 else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
352 /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
353 nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
357 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
358 gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
361 /* generate table for PME */
362 nbp->coulomb_tab = NULL;
363 if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
365 init_ewald_coulomb_force_table(nbp, dev_info);
368 nnbfp = 2*ntypes*ntypes;
369 nnbfp_comb = 2*ntypes;
371 stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
372 CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
373 cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
376 if (ic->vdwtype == evdwPME)
378 stat = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
379 CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
380 cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
383 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
384 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
385 if (dev_info->prop.major >= 3)
390 memset(&rd, 0, sizeof(rd));
391 rd.resType = cudaResourceTypeLinear;
392 rd.res.linear.devPtr = nbp->nbfp;
393 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
394 rd.res.linear.desc.x = 32;
395 rd.res.linear.sizeInBytes = nnbfp*sizeof(*nbp->nbfp);
397 memset(&td, 0, sizeof(td));
398 td.readMode = cudaReadModeElementType;
399 stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
400 CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
402 if (ic->vdwtype == evdwPME)
404 memset(&rd, 0, sizeof(rd));
405 rd.resType = cudaResourceTypeLinear;
406 rd.res.linear.devPtr = nbp->nbfp_comb;
407 rd.res.linear.desc.f = cudaChannelFormatKindFloat;
408 rd.res.linear.desc.x = 32;
409 rd.res.linear.sizeInBytes = nnbfp_comb*sizeof(*nbp->nbfp_comb);
411 memset(&td, 0, sizeof(td));
412 td.readMode = cudaReadModeElementType;
413 stat = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
414 CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
418 #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */
420 cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
421 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
422 nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
423 CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
425 if (ic->vdwtype == evdwPME)
427 stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
428 nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
429 CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
434 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
435 * electrostatic type switching to twin cut-off (or back) if needed. */
436 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
437 const interaction_const_t *ic)
439 if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
443 gmx_nbnxn_cuda_t *nb = nbv->gpu_nbv;
444 cu_nbparam_t *nbp = nb->nbparam;
446 set_cutoff_parameters(nbp, ic);
448 nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
451 init_ewald_coulomb_force_table(nb->nbparam, nb->dev_info);
454 /*! Initializes the pair list data structure. */
455 static void init_plist(cu_plist_t *pl)
457 /* initialize to NULL pointers to data that is not allocated here and will
458 need reallocation in nbnxn_gpu_init_pairlist */
463 /* size -1 indicates that the respective array hasn't been initialized yet */
470 pl->excl_nalloc = -1;
471 pl->bDoPrune = false;
474 /*! Initializes the timer data structure. */
475 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
478 int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
480 stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
481 CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
482 stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
483 CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
485 /* The non-local counters/stream (second in the array) are needed only with DD. */
486 for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
488 stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
489 CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
490 stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
491 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
494 stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
495 CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
496 stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
497 CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
499 stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
500 CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
501 stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
502 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
504 stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
505 CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
506 stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
507 CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
511 /*! Initializes the timings data structure. */
512 static void init_timings(gmx_wallclock_gpu_t *t)
521 for (i = 0; i < 2; i++)
523 for (j = 0; j < 2; j++)
525 t->ktime[i][j].t = 0.0;
526 t->ktime[i][j].c = 0;
531 void nbnxn_gpu_init(FILE *fplog,
532 gmx_nbnxn_cuda_t **p_nb,
533 const gmx_gpu_info_t *gpu_info,
534 const gmx_gpu_opt_t *gpu_opt,
536 gmx_bool bLocalAndNonlocal)
539 gmx_nbnxn_cuda_t *nb;
541 bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
553 snew(nb->nbparam, 1);
554 snew(nb->plist[eintLocal], 1);
555 if (bLocalAndNonlocal)
557 snew(nb->plist[eintNonlocal], 1);
560 nb->bUseTwoStreams = bLocalAndNonlocal;
563 snew(nb->timings, 1);
566 pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
567 pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
568 pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
570 init_plist(nb->plist[eintLocal]);
572 /* set device info, just point it to the right GPU among the detected ones */
573 nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
575 /* local/non-local GPU streams */
576 stat = cudaStreamCreate(&nb->stream[eintLocal]);
577 CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
578 if (nb->bUseTwoStreams)
580 init_plist(nb->plist[eintNonlocal]);
582 /* CUDA stream priority available in the CUDA RT 5.5 API.
583 * Note that the device we're running on does not have to support
584 * priorities, because we are querying the priority range which in this
585 * case will be a single value.
587 #if GMX_CUDA_VERSION >= 5050
589 int highest_priority;
590 stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
591 CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
593 stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
596 CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
599 stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
600 CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
604 /* init events for sychronization (timing disabled for performance reasons!) */
605 stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
606 CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
607 stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
608 CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
610 /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
611 * (which increases with shorter time/step) caused by a known CUDA driver bug.
612 * To work around the issue we'll use an (admittedly fragile) memory polling
613 * waiting to preserve performance. This requires support for atomic
614 * operations and only works on x86/x86_64.
615 * With polling wait event-timing also needs to be disabled.
617 * The overhead is greatly reduced in API v5.0 drivers and the improvement
618 * is independent of runtime version. Hence, with API v5.0 drivers and later
619 * we won't switch to polling.
621 * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI,
622 * ranks so we will also disable it in that case.
625 bStreamSync = getenv("GMX_CUDA_STREAMSYNC") != NULL;
626 bNoStreamSync = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
631 bTMPIAtomics = false;
634 #ifdef GMX_TARGET_X86
640 if (bStreamSync && bNoStreamSync)
642 gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
645 stat = cudaDriverGetVersion(&cuda_drv_ver);
646 CU_RET_ERR(stat, "cudaDriverGetVersion failed");
648 bOldDriver = (cuda_drv_ver < 5000);
650 if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
652 /* Polling wait should be used instead of cudaStreamSynchronize only if:
653 * - ECC is ON & driver is old (checked above),
654 * - we're on x86/x86_64,
655 * - atomics are available, and
656 * - GPUs are not being shared.
658 bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
659 (gmx_count_gpu_dev_shared(gpu_opt) < 1));
663 nb->bUseStreamSync = true;
665 /* only warn if polling should be used */
666 if (bShouldUsePollSync)
669 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
670 " cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
675 nb->bUseStreamSync = !bShouldUsePollSync;
677 if (bShouldUsePollSync)
680 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
681 " cause performance loss. Switching to the alternative polling GPU wait.\n"
682 " If you encounter issues, switch back to standard GPU waiting by setting\n"
683 " the GMX_CUDA_STREAMSYNC environment variable.\n");
687 /* Tell the user that the ECC+old driver combination can be bad */
689 "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
690 " A known bug in this driver version can cause performance loss.\n"
691 " However, the polling wait workaround can not be used because\n%s\n"
692 " Consider updating the driver or turning ECC off.",
693 (bX86 && bTMPIAtomics) ?
694 " GPU(s) are being oversubscribed." :
695 " atomic operations are not supported by the platform/CPU+compiler.");
696 md_print_warn(fplog, sbuf);
704 nb->bUseStreamSync = false;
707 "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
711 /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
712 nb->bUseStreamSync = true;
716 /* CUDA timing disabled as event timers don't work:
717 - with multiple streams = domain-decomposition;
718 - with the polling waiting hack (without cudaStreamSynchronize);
719 - when turned off by GMX_DISABLE_CUDA_TIMING.
721 nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
722 (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
726 init_timers(nb->timers, nb->bUseTwoStreams);
727 init_timings(nb->timings);
730 /* set the kernel type for the current GPU */
731 /* pick L1 cache configuration */
732 nbnxn_cuda_set_cacheconfig(nb->dev_info);
738 fprintf(debug, "Initialized CUDA data structures.\n");
742 void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t *nb,
743 const interaction_const_t *ic,
744 const nonbonded_verlet_group_t *nbv_group)
746 init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
747 init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
749 /* clear energy and shift force outputs */
750 nbnxn_cuda_clear_e_fshift(nb);
753 void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb,
754 const nbnxn_pairlist_t *h_plist,
759 bool bDoTime = nb->bDoTime;
760 cudaStream_t stream = nb->stream[iloc];
761 cu_plist_t *d_plist = nb->plist[iloc];
763 if (d_plist->na_c < 0)
765 d_plist->na_c = h_plist->na_ci;
769 if (d_plist->na_c != h_plist->na_ci)
771 sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
772 d_plist->na_c, h_plist->na_ci);
779 stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
780 CU_RET_ERR(stat, "cudaEventRecord failed");
783 cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
784 &d_plist->nsci, &d_plist->sci_nalloc,
788 cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
789 &d_plist->ncj4, &d_plist->cj4_nalloc,
793 cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
794 &d_plist->nexcl, &d_plist->excl_nalloc,
800 stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
801 CU_RET_ERR(stat, "cudaEventRecord failed");
804 /* need to prune the pair list during the next step */
805 d_plist->bDoPrune = true;
808 void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb,
809 const nbnxn_atomdata_t *nbatom)
811 cu_atomdata_t *adat = nb->atdat;
812 cudaStream_t ls = nb->stream[eintLocal];
814 /* only if we have a dynamic box */
815 if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
817 cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
818 SHIFTS * sizeof(*adat->shift_vec), ls);
819 adat->bShiftVecUploaded = true;
823 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
824 static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
827 cu_atomdata_t *adat = nb->atdat;
828 cudaStream_t ls = nb->stream[eintLocal];
830 stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
831 CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
834 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
835 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
838 cu_atomdata_t *adat = nb->atdat;
839 cudaStream_t ls = nb->stream[eintLocal];
841 stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
842 CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
843 stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
844 CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
845 stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
846 CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
849 void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
851 nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
852 /* clear shift force array and energies if the outputs were
853 used in the current step */
854 if (flags & GMX_FORCE_VIRIAL)
856 nbnxn_cuda_clear_e_fshift(nb);
860 void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb,
861 const struct nbnxn_atomdata_t *nbat)
866 bool bDoTime = nb->bDoTime;
867 cu_timers_t *timers = nb->timers;
868 cu_atomdata_t *d_atdat = nb->atdat;
869 cudaStream_t ls = nb->stream[eintLocal];
871 natoms = nbat->natoms;
876 /* time async copy */
877 stat = cudaEventRecord(timers->start_atdat, ls);
878 CU_RET_ERR(stat, "cudaEventRecord failed");
881 /* need to reallocate if we have to copy more atoms than the amount of space
882 available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
883 if (natoms > d_atdat->nalloc)
885 nalloc = over_alloc_small(natoms);
887 /* free up first if the arrays have already been initialized */
888 if (d_atdat->nalloc != -1)
890 cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
891 cu_free_buffered(d_atdat->xq);
892 cu_free_buffered(d_atdat->atom_types);
895 stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
896 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
897 stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
898 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
900 stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
901 CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
903 d_atdat->nalloc = nalloc;
907 d_atdat->natoms = natoms;
908 d_atdat->natoms_local = nbat->natoms_local;
910 /* need to clear GPU f output if realloc happened */
913 nbnxn_cuda_clear_f(nb, nalloc);
916 cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
917 natoms*sizeof(*d_atdat->atom_types), ls);
921 stat = cudaEventRecord(timers->stop_atdat, ls);
922 CU_RET_ERR(stat, "cudaEventRecord failed");
926 void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
929 cu_atomdata_t *atdat;
930 cu_nbparam_t *nbparam;
931 cu_plist_t *plist, *plist_nl;
934 /* Stopping the nvidia profiler here allows us to eliminate the subsequent
935 uninitialization API calls from the trace. */
936 if (getenv("NVPROF_ID") != NULL)
938 stat = cudaProfilerStop();
939 CU_RET_ERR(stat, "cudaProfilerStop failed");
948 nbparam = nb->nbparam;
949 plist = nb->plist[eintLocal];
950 plist_nl = nb->plist[eintNonlocal];
953 if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
956 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
957 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
958 if (nb->dev_info->prop.major >= 3)
960 stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
961 CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
966 stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
967 CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
969 cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
972 stat = cudaEventDestroy(nb->nonlocal_done);
973 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
974 stat = cudaEventDestroy(nb->misc_ops_done);
975 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
979 stat = cudaEventDestroy(timers->start_atdat);
980 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
981 stat = cudaEventDestroy(timers->stop_atdat);
982 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
984 /* The non-local counters/stream (second in the array) are needed only with DD. */
985 for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
987 stat = cudaEventDestroy(timers->start_nb_k[i]);
988 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
989 stat = cudaEventDestroy(timers->stop_nb_k[i]);
990 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
992 stat = cudaEventDestroy(timers->start_pl_h2d[i]);
993 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
994 stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
995 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
997 stat = cudaStreamDestroy(nb->stream[i]);
998 CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
1000 stat = cudaEventDestroy(timers->start_nb_h2d[i]);
1001 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
1002 stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
1003 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
1005 stat = cudaEventDestroy(timers->start_nb_d2h[i]);
1006 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
1007 stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
1008 CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
1012 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1013 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1014 if (nb->dev_info->prop.major >= 3)
1016 stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
1017 CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
1022 stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
1023 CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
1025 cu_free_buffered(nbparam->nbfp);
1027 if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
1029 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
1030 /* Only device CC >= 3.0 (Kepler and later) support texture objects */
1031 if (nb->dev_info->prop.major >= 3)
1033 stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
1034 CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
1039 stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
1040 CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
1042 cu_free_buffered(nbparam->nbfp_comb);
1045 stat = cudaFree(atdat->shift_vec);
1046 CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
1047 stat = cudaFree(atdat->fshift);
1048 CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
1050 stat = cudaFree(atdat->e_lj);
1051 CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
1052 stat = cudaFree(atdat->e_el);
1053 CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
1055 cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
1056 cu_free_buffered(atdat->xq);
1057 cu_free_buffered(atdat->atom_types, &atdat->ntypes);
1059 cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
1060 cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
1061 cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
1062 if (nb->bUseTwoStreams)
1064 cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
1065 cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
1066 cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist->excl_nalloc);
1072 if (nb->bUseTwoStreams)
1082 fprintf(debug, "Cleaned up CUDA data structures.\n");
1086 void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
1089 cudaStream_t stream = nb->stream[iloc];
1091 stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
1092 CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
1095 gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
1097 return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
1100 void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
1102 /* The NVPROF_ID environment variable is set by nvprof and indicates that
1103 mdrun is executed in the CUDA profiler.
1104 If nvprof was run is with "--profile-from-start off", the profiler will
1105 be started here. This way we can avoid tracing the CUDA events from the
1106 first part of the run. Starting the profiler again does nothing.
1108 if (getenv("NVPROF_ID") != NULL)
1111 stat = cudaProfilerStart();
1112 CU_RET_ERR(stat, "cudaProfilerStart failed");
1115 if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
1117 init_timings(nbv->gpu_nbv->timings);
1121 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
1124 gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
1128 gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
1130 return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
1131 (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));