From 904d4645ca712bd58e0a22fcdebead0291ec19d3 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Tue, 19 Nov 2013 03:00:24 +0100 Subject: [PATCH] Fix DD load balancing bug with GPU sharing The recent DD load balancing fix (ba8232e9), which solved the issue of an incorrect imbalance measure with GPU sharing, used an incorrect index to look up GPUs. This caused out-of-bounds indexing in the GPU ID query function. The query function also had a bug in its error checking which allowed the incorrect indexing to go undetected. Additionally, mdrun -nb cpu -gpu_id ... is now allowed; previously it gave a fatal error. This commit addresses both issues; fixes #1385 Change-Id: I2800f610b873da92afe78bbfd869258f378ba2d7 --- src/gmxlib/gpu_utils/gpu_utils.cu | 5 +---- src/kernel/runner.c | 5 +++++ src/mdlib/domdec.c | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/gmxlib/gpu_utils/gpu_utils.cu b/src/gmxlib/gpu_utils/gpu_utils.cu index ee3d5e10d6..24fc7557a6 100644 --- a/src/gmxlib/gpu_utils/gpu_utils.cu +++ b/src/gmxlib/gpu_utils/gpu_utils.cu @@ -860,10 +860,7 @@ int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, { assert(gpu_info); assert(gpu_opt); - if (idx < 0 && idx >= gpu_opt->ncuda_dev_use) - { - return -1; - } + assert(idx >= 0 && idx < gpu_opt->ncuda_dev_use); return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id; } diff --git a/src/kernel/runner.c b/src/kernel/runner.c index 17b5f351ad..68ea884a0b 100644 --- a/src/kernel/runner.c +++ b/src/kernel/runner.c @@ -1481,6 +1481,11 @@ int mdrunner(gmx_hw_opt_t *hw_opt, gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU, &hw_opt->gpu_opt); } + else + { + /* Ignore (potentially) manually selected GPUs */ + hw_opt->gpu_opt.ncuda_dev_use = 0; + } /* check consistency of CPU acceleration and number of GPUs selected */ gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU); diff --git a/src/mdlib/domdec.c b/src/mdlib/domdec.c index d488b0bdf1..92fa8c1640 100644 --- 
a/src/mdlib/domdec.c +++ b/src/mdlib/domdec.c @@ -5697,7 +5697,7 @@ void dd_setup_dlb_resource_sharing(t_commrec *cr, physicalnode_id_hash = gmx_physicalnode_id_hash(); - gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->nodeid); + gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode); dd = cr->dd; -- 2.22.0