From: Szilárd Páll
Date: Tue, 19 Nov 2013 02:00:24 +0000 (+0100)
Subject: Fix DD load balancing bug with GPU sharing
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=904d4645ca712bd58e0a22fcdebead0291ec19d3;p=alexxy%2Fgromacs.git

Fix DD load balancing bug with GPU sharing

The recent DD load balancing fix which solved the issue of incorrect
imbalance measure with GPU sharing (ba8232e9) addressed GPUs with
incorrect indexing. This caused out of bounds indexing in the GPU ID
query function. The query function also had a bug in the error checking
which allowed the incorrect indexing.

Now also mdrun -nb cpu -gpu_id ... is allowed, which before would give
a fatal error.

This commit addresses both issues; fixes #1385

Change-Id: I2800f610b873da92afe78bbfd869258f378ba2d7
---

diff --git a/src/gmxlib/gpu_utils/gpu_utils.cu b/src/gmxlib/gpu_utils/gpu_utils.cu
index ee3d5e10d6..24fc7557a6 100644
--- a/src/gmxlib/gpu_utils/gpu_utils.cu
+++ b/src/gmxlib/gpu_utils/gpu_utils.cu
@@ -860,10 +860,7 @@ int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
 {
     assert(gpu_info);
     assert(gpu_opt);
-    if (idx < 0 && idx >= gpu_opt->ncuda_dev_use)
-    {
-        return -1;
-    }
+    assert(idx >= 0 && idx < gpu_opt->ncuda_dev_use);
 
     return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
 }
diff --git a/src/kernel/runner.c b/src/kernel/runner.c
index 17b5f351ad..68ea884a0b 100644
--- a/src/kernel/runner.c
+++ b/src/kernel/runner.c
@@ -1481,6 +1481,11 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
                            &hw_opt->gpu_opt);
     }
+    else
+    {
+        /* Ignore (potentially) manually selected GPUs */
+        hw_opt->gpu_opt.ncuda_dev_use = 0;
+    }
 
     /* check consistency of CPU acceleration and number of GPUs selected */
     gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
diff --git a/src/mdlib/domdec.c b/src/mdlib/domdec.c
index d488b0bdf1..92fa8c1640 100644
--- a/src/mdlib/domdec.c
+++ b/src/mdlib/domdec.c
@@ -5697,7 +5697,7 @@ void dd_setup_dlb_resource_sharing(t_commrec *cr,
 
     physicalnode_id_hash = gmx_physicalnode_id_hash();
 
-    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->nodeid);
+    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
 
     dd = cr->dd;
 