* \param[in] box The box matrix
* \param[in] stepWork Step schedule flags
* \param[in] pmeFlags PME flags
- * \param[in] useGpuForceReduction True if GPU-based force reduction is active this step
* \param[in] wcycle The wallcycle structure
*/
static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
const matrix box,
const StepWorkload &stepWork,
int pmeFlags,
- bool useGpuForceReduction,
gmx_wallcycle_t wcycle)
{
- pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
+ pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, stepWork.useGpuPmeFReduction); // the GPU force-reduction flag is now read from stepWork rather than passed in as a separate argument
pme_gpu_launch_spread(pmedata, wcycle);
}
*
* \param[in] legacyFlags Force bitmask flags used to construct the new flags
* \param[in] isNonbondedOn Global override, if false forces to turn off all nonbonded calculation.
+ * \param[in] simulationWork Simulation workload description.
+ * \param[in] rankHasPmeDuty If this rank computes PME.
+ *
* \returns New Stepworkload description.
*/
static StepWorkload
-setupStepWorkload(const int legacyFlags,
- const bool isNonbondedOn)
+setupStepWorkload(const int legacyFlags,
+ const bool isNonbondedOn,
+ const SimulationWorkload &simulationWork,
+ const bool rankHasPmeDuty)
{
StepWorkload flags;
flags.stateChanged = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
flags.computeListedForces = ((legacyFlags & GMX_FORCE_LISTED) != 0);
flags.computeNonbondedForces = ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && isNonbondedOn;
flags.computeDhdl = ((legacyFlags & GMX_FORCE_DHDL) != 0);
+
+ if (simulationWork.useGpuBufferOps)
+ {
+ GMX_ASSERT(simulationWork.useGpuNonbonded, "Can only offload buffer ops if nonbonded computation is also offloaded");
+ }
+ flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
+ // GPU force buffer ops are switched off on virial steps, so the CPU reduction path is taken there.
+ // TODO: stop disabling them on energy steps as well (drop flags.computeEnergy), ref #3128
+ flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !(flags.computeVirial || flags.computeEnergy); // NOTE(review): computeVirial/computeEnergy are not assigned in the lines visible here - confirm they are set before this point
+ flags.useGpuPmeFReduction = flags.useGpuFBufferOps && (simulationWork.usePmeGpu && rankHasPmeDuty); // requires GPU force buffer ops plus GPU PME on a rank that has PME duty
+
return flags;
}
legacyFlags &= ~GMX_FORCE_NONBONDED;
}
- runScheduleWork->stepWork = setupStepWorkload(legacyFlags, fr->bNonbonded);
- const StepWorkload &stepWork = runScheduleWork->stepWork;
-
const SimulationWorkload &simulationWork = runScheduleWork->simulationWork;
- const bool useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
- const int pmeFlags = makePmeFlags(stepWork);
+
+ runScheduleWork->stepWork = setupStepWorkload(legacyFlags, fr->bNonbonded,
+ simulationWork, thisRankHasDuty(cr, DUTY_PME));
+ const StepWorkload &stepWork = runScheduleWork->stepWork;
+
+
+ const bool useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
+ const int pmeFlags = makePmeFlags(stepWork);
// Switches on whether to use GPU for position and force buffer operations
// TODO consider all possible combinations of triggers, and how to combine optimally in each case.
- const BufferOpsUseGpu useGpuXBufOps = (simulationWork.useGpuBufferOps &&
- simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA)) ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;;
+ const BufferOpsUseGpu useGpuXBufOps = stepWork.useGpuXBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
// GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
- const BufferOpsUseGpu useGpuFBufOps = ((simulationWork.useGpuBufferOps &&
- simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA)) &&
- !(stepWork.computeVirial || stepWork.computeEnergy)) ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
- // TODO: move / add this flag to the internal PME GPU data structures
- const bool useGpuPmeFReduction = (useGpuFBufOps == BufferOpsUseGpu::True) &&
- useGpuPmeOnThisRank; // only supported if this rank is perfoming PME on the GPU
+ const BufferOpsUseGpu useGpuFBufOps = stepWork.useGpuFBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
/* At a search step we need to start the first balancing region
* somewhere early inside the step after communication during domain
if (useGpuPmeOnThisRank)
{
- launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
+ launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, wcycle);
}
/* do gridding for pair search */
}
const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
- const bool useCpuPmeFReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+ const bool useCpuPmeFReduction = thisRankHasDuty(cr, DUTY_PME) && !stepWork.useGpuPmeFReduction;
// TODO: move this into DomainLifetimeWorkload, including the second part of the condition
const bool haveCpuLocalForces = (domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork || useCpuPmeFReduction ||
(fr->efep != efepNO));
{
gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
- if (useGpuPmeFReduction)
+ if (stepWork.useGpuPmeFReduction)
{
dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata));
}
stateGpu->getForces(),
pme_gpu_get_device_f(fr->pmedata),
dependencyList,
- useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
+ stepWork.useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
// This function call synchronizes the local stream
nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
* Note that some development features overrides are applied already here:
* the GPU communication flags are set to false in non-tMPI and non-CUDA builds.
*
- * \param[in] mdlog Logger object.
- * \returns The object populated with development feature flags.
+ * \param[in] mdlog Logger object.
+ * \param[in] useGpuForNonbonded True if the nonbonded task is offloaded in this run.
+ * \returns The object populated with development feature flags.
*/
-static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
+static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger &mdlog,
+ const bool useGpuForNonbonded)
{
DevelopmentFeatureFlags devFlags;
- devFlags.enableGpuBufferOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+ devFlags.enableGpuBufferOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr) && (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded;
devFlags.useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
devFlags.enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
devFlags.enableGpuPmePPComm = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
gmx::MDLogger mdlog(logOwner.logger());
- // report any development features that may be enabled by environment variables
- const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog);
-
// TODO The thread-MPI master rank makes a working
// PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
// after the threads have been launched. This works because no use
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+ // Initialize the development feature flags that are enabled by environment variables
+ // and report those features that are enabled.
+ const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog, useGpuForNonbonded);
+
// Build restraints.
// TODO: hide restraint implementation details from Mdrunner.
// There is nothing unique about restraints at this point as far as the