With the 304.xx (API v5.0) drivers the cudaStreamSynchronize overhead
has been substantially reduced, so with these there is no need to switch
to polling. Additionally, with GPUs oversubscribed the polling has been
shown to be unstable, so this case is now detected and switch to polling
is not done.
Also corrected some warning/note printing layout.
Change-Id: I32a7ecda42de31f825a6341c81f04ca0b97c0d4a
typedef struct
{
gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */
typedef struct
{
gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */
+ gmx_bool bDevShare; /* true if any of the devices is shared by
+ (t)MPI ranks, with auto-detection always FALSE */
int ncuda_dev_use; /* number of devices selected to be used */
int *cuda_dev_use; /* index of the devices selected to be used */
int ncuda_dev_use; /* number of devices selected to be used */
int *cuda_dev_use; /* index of the devices selected to be used */
md_print_warn(cr,fplog,
"NOTE: potentially sub-optimal launch configuration, %s started with less\n"
" PP %s%s%s than GPU%s available.\n"
md_print_warn(cr,fplog,
"NOTE: potentially sub-optimal launch configuration, %s started with less\n"
" PP %s%s%s than GPU%s available.\n"
- " Each PP %s can only use one GPU, so only %d GPU%s%s will be used.",
+ " Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
ShortProgram(),
th_or_proc, th_or_proc_plural, pernode, gpu_plural,
th_or_proc, npppn, gpu_plural, pernode);
ShortProgram(),
th_or_proc, th_or_proc_plural, pernode, gpu_plural,
th_or_proc, npppn, gpu_plural, pernode);
+ hwinfo->gpu_info.bDevShare = FALSE;
if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
{
int i, j, same_count;
gmx_bool bSomeSame, bAllDifferent;
if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
{
int i, j, same_count;
gmx_bool bSomeSame, bAllDifferent;
+ same_count = 0; /* number of GPUs shared among ranks */
bSomeSame = FALSE;
bAllDifferent = TRUE;
bSomeSame = FALSE;
bAllDifferent = TRUE;
+ /* store the number of shared/oversubscribed GPUs */
+ hwinfo->gpu_info.bDevShare = bSomeSame;
+
if (btMPI && !bAllDifferent)
{
gmx_fatal(FARGS,
if (btMPI && !bAllDifferent)
{
gmx_fatal(FARGS,
{
md_print_warn(cr,fplog,
"NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
{
md_print_warn(cr,fplog,
"NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
- " multiple %s%s; this should be avoided as it generally\n"
- " causes performance loss.",
+ " multiple %s%s; this should be avoided as it can cause\n"
+ " performance loss.\n",
same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
}
}
same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
}
}
extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref();
extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref();
extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref();
extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref();
+/* We should actually be using md_print_warn in md_logging.c,
+ * but we can't include mpi.h in CUDA code.
+ */
+static void md_print_warn(FILE *fplog, const char *buf)
+{
+ if (fplog != NULL)
+ {
+ /* We should only print to stderr on the master node,
+ * in most cases fplog is only set on the master node, so this works.
+ */
+ fprintf(stderr, "\n%s\n", buf);
+ fprintf(fplog, "\n%s\n", buf);
+ }
+}
+
/* Fw. decl. */
static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
/* Fw. decl. */
static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
cudaError_t stat;
nbnxn_cuda_ptr_t nb;
char sbuf[STRLEN];
cudaError_t stat;
nbnxn_cuda_ptr_t nb;
char sbuf[STRLEN];
- bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86;
+ bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
+ int cuda_drv_ver;
* waiting to preserve performance. This requires support for atomic
* operations and only works on x86/x86_64.
* With polling wait event-timing also needs to be disabled.
* waiting to preserve performance. This requires support for atomic
* operations and only works on x86/x86_64.
* With polling wait event-timing also needs to be disabled.
+ *
+ * The overhead is greatly reduced in 304.xx drivers (independent of runtime ver).
+ * The corresponding driver API version (which is what we can query) should
+ * be at least 5.0. Hence we will not switch to polling when >=5.0 is returned.
+ *
+ * NOTE: Unfortunately, this is knonw to fail when GPUs are shared by (t)MPI,
+ * ranks so we will also disable it in that case.
*/
bStreamSync = getenv("GMX_CUDA_STREAMSYNC") != NULL;
*/
bStreamSync = getenv("GMX_CUDA_STREAMSYNC") != NULL;
gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
}
gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
}
+ stat = cudaDriverGetVersion(&cuda_drv_ver);
+ CU_RET_ERR(stat, "cudaDriverGetVersion failed");
+ bOldDriver = (cuda_drv_ver < 5000);
+
if (nb->dev_info->prop.ECCEnabled == 1)
{
if (bStreamSync)
{
nb->bUseStreamSync = true;
if (nb->dev_info->prop.ECCEnabled == 1)
{
if (bStreamSync)
{
nb->bUseStreamSync = true;
- sprintf(sbuf,
- "NOTE: Using a GPU with ECC enabled, but cudaStreamSynchronize-based waiting is\n"
- " forced by the GMX_CUDA_STREAMSYNC env. var. Due to a CUDA bug, this \n"
- " combination causes performance loss.");
- fprintf(stderr, "\n%s\n", sbuf);
- if (fplog)
+ /* only warn if polling should be used */
+ if (bOldDriver && !gpu_info->bDevShare)
- fprintf(fplog, "\n%s\n", sbuf);
+ md_print_warn(fplog,
+ "NOTE: Using a GPU with ECC enabled and a driver older than 5.0, but\n"
+ " cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
- /* can use polling wait only on x86/x86_64 *if* atomics are available */
- nb->bUseStreamSync = ((bX86 && bTMPIAtomics) == false);
-
- if (!bX86)
+ /* Can/should turn of cudaStreamSynchronize wait only if
+ * - we're on x86/x86_64
+ * - atomics are available
+ * - GPUs are not being shared
+ * - and driver is old. */
+ nb->bUseStreamSync =
+ (bX86 && bTMPIAtomics && !gpu_info->bDevShare && bOldDriver) ?
+ true : false;
+
+ if (nb->bUseStreamSync)
- sprintf(sbuf,
- "Using a GPU with ECC on; the standard cudaStreamSynchronize waiting, due to a\n"
- " CUDA bug, causes performance loss when used in combination with ECC.\n"
- " However, the polling waiting workaround can not be used as it is only\n"
- " supported on x86/x86_64, but not on the current architecture.");
- gmx_warning("%s\n", sbuf);
- if (fplog)
- {
- fprintf(fplog, "\n%s\n", sbuf);
- }
-
+ md_print_warn(fplog,
+ "NOTE: Using a GPU with ECC enabled and CUDA driver version <5.0, will switch to\n"
+ " polling wait to avoid performance loss. If you encounter issues, set the\n"
+ " GMX_CUDA_STREAMSYNC env. var. to switch back to standard GPU waiting.\n");
- else if (bTMPIAtomics)
- {
- if (fplog)
- {
- fprintf(fplog,
- "NOTE: Using a GPU with ECC enabled; will use polling waiting.\n");
- }
- }
- else
+ /* Tell the user that the ECC+old driver combination can be bad */
- "Using a GPU with ECC on; the standard cudaStreamSynchronize waiting, due to a\n"
- " CUDA bug, causes performance loss when used in combination with ECC.\n"
- " However, the polling waiting workaround can not be used as atomic\n"
- " operations are not supported by the current CPU+compiler combination.");
- gmx_warning("%s\n", sbuf);
- if (fplog)
- {
- fprintf(fplog, "\n%s\n", sbuf);
- }
+ "NOTE: Using a GPU with ECC enabled and driver version <5.0. A bug in this\n"
+ " driver can cause performance loss.\n"
+ " However, the polling waiting workaround can not be used because\n%s\n"
+ " Consider updating the driver or turning ECC off.",
+ (!bX86 || !bTMPIAtomics) ?
+ " atomic operations are not supported by the platform/CPU+compiler." :
+ " GPU(s) are being oversubscribed.");
+ md_print_warn(fplog, sbuf);
{
nb->bUseStreamSync = false;
{
nb->bUseStreamSync = false;
- sprintf(sbuf,
- "NOTE: Using a GPU with no/disabled ECC, but cudaStreamSynchronize-based waiting\n"
- " is turned off and polling turned on by the GMX_NO_CUDA_STREAMSYNC env. var.");
- fprintf(stderr, "\n%s\n", sbuf);
- if (fplog)
- {
- fprintf(fplog, "\n%s\n", sbuf);
- }
+ md_print_warn(fplog,
+ "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");