Fix deviceIdsAssigned() to return a unique list of IDs
author Szilárd Páll <pall.szilard@gmail.com>
Tue, 23 Mar 2021 13:44:45 +0000 (13:44 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Tue, 23 Mar 2021 13:44:45 +0000 (13:44 +0000)
The method originally returned the rank's task-to-device mapping
rather than the unique list of devices that have been assigned tasks.
When #ranks > #devices this list contains duplicate device IDs.

Due to the changes made in e2a2fe80, the vector returned by
deviceIdsAssigned() was passed to setupGpuDevicePeerAccess(),
which resulted in redundant, repeated attempts to enable peer access for
the same device pairs.

Refs #3980

src/gromacs/taskassignment/taskassignment.cpp
src/gromacs/taskassignment/taskassignment.h

index a9256ba27c97162ed6202bef6b0743eeb4d5ba5c..34acebbd268dadc9f5155820cba1a3af6d855363 100644 (file)
@@ -265,7 +265,7 @@ GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const gmx::ArrayRef<const in
 
     std::exception_ptr             exceptionPtr;
     std::vector<GpuTaskAssignment> taskAssignmentOnRanksOfThisNode;
-    std::vector<int>               deviceIdsAssigned;
+    std::vector<int>               deviceIdAssignment;
     try
     {
         // Use the GPU IDs from the user if they supplied
@@ -320,7 +320,7 @@ GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const gmx::ArrayRef<const in
                         host,
                         availableDevices.size())));
             }
-            deviceIdsAssigned = generatedGpuIds;
+            deviceIdAssignment = generatedGpuIds;
         }
         else
         {
@@ -342,10 +342,10 @@ GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const gmx::ArrayRef<const in
             // Did the user choose compatible GPUs?
             checkUserGpuIds(hardwareInfo.deviceInfoList, availableDevices, userGpuTaskAssignment);
 
-            deviceIdsAssigned = gmx::copyOf(userGpuTaskAssignment);
+            deviceIdAssignment = gmx::copyOf(userGpuTaskAssignment);
         }
         taskAssignmentOnRanksOfThisNode =
-                buildTaskAssignment(gpuTasksOnRanksOfThisNode, deviceIdsAssigned);
+                buildTaskAssignment(gpuTasksOnRanksOfThisNode, deviceIdAssignment);
     }
     catch (...)
     {
@@ -392,7 +392,11 @@ GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const gmx::ArrayRef<const in
     gpuTaskAssignments.indexOfThisRank_                 = physicalNodeComm.rank_;
     gpuTaskAssignments.numGpuTasksOnThisNode_           = numGpuTasksOnThisNode;
     gpuTaskAssignments.numRanksOnThisNode_              = numRanksOnThisNode;
-    gpuTaskAssignments.deviceIdsAssigned_               = deviceIdsAssigned;
+    gpuTaskAssignments.deviceIdsAssigned_               = deviceIdAssignment;
+    std::sort(gpuTaskAssignments.deviceIdsAssigned_.begin(), gpuTaskAssignments.deviceIdsAssigned_.end());
+    gpuTaskAssignments.deviceIdsAssigned_.erase(unique(gpuTaskAssignments.deviceIdsAssigned_.begin(),
+                                                       gpuTaskAssignments.deviceIdsAssigned_.end()),
+                                                gpuTaskAssignments.deviceIdsAssigned_.end());
     return gpuTaskAssignments;
 }
 
index 63cf9c54ff35c243df80a86e5f619f0147fa6a70..1267de148af9ee052d5d3bb972acd3e797667d56 100644 (file)
@@ -256,7 +256,7 @@ public:
     bool thisRankHasPmeGpuTask() const;
     //! Return whether this rank has any task running on a GPU
     bool thisRankHasAnyGpuTask() const;
-    //! Get the list of devices assigned to this node
+    //! Get the list of unique devices that have been assigned tasks on this physical node
     std::vector<int> deviceIdsAssigned() { return deviceIdsAssigned_; }
 };