{
DevelopmentFeatureFlags devFlags;
- devFlags.enableGpuBufferOps =
- GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+ devFlags.enableGpuBufferOps = (GMX_GPU_CUDA || GMX_GPU_SYCL) && useGpuForNonbonded
+ && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
devFlags.enableGpuHaloExchange = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr;
devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
devFlags.enableGpuPmePPComm = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
hw_opt.nthreads_tmpi);
useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(useGpuForNonbonded,
pmeTarget,
+ pmeFftTarget,
numAvailableDevices,
userGpuTaskAssignment,
*hwinfo_,
// master and spawned threads join at the end of this block.
}
- GMX_RELEASE_ASSERT(ms || simulationCommunicator != MPI_COMM_NULL,
+ GMX_RELEASE_ASSERT(!GMX_MPI || ms || simulationCommunicator != MPI_COMM_NULL,
"Must have valid communicator unless running a multi-simulation");
CommrecHandle crHandle = init_commrec(simulationCommunicator);
t_commrec* cr = crHandle.get();
GMX_RELEASE_ASSERT(inputrec != nullptr, "All ranks should have a valid inputrec now");
partialDeserializedTpr.reset(nullptr);
- GMX_RELEASE_ASSERT(
- !inputrec->useConstantAcceleration,
- "Linear acceleration has been removed in GROMACS 2022, and was broken for many years "
- "before that. Use GROMACS 4.5 or earlier if you need this feature.");
-
- // Now we decide whether to use the domain decomposition machinery.
- // Note that this does not necessarily imply actually using multiple domains.
// Now the number of ranks is known to all ranks, and each knows
// the inputrec read by the master rank. The ranks can now all run
// the task-deciding functions and will agree on the result
gpusWereDetected);
useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded,
pmeTarget,
+ pmeFftTarget,
userGpuTaskAssignment,
*hwinfo_,
*inputrec,
int deviceId = -1;
DeviceInformation* deviceInfo = gpuTaskAssignments.initDevice(&deviceId);
- // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?)
- bool useTiming = true;
-
- if (GMX_GPU_CUDA)
- {
- /* WARNING: CUDA timings are incorrect with multiple streams.
- * This is the main reason why they are disabled by default.
- */
- // TODO: Consider turning on by default when we can detect nr of streams.
- useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
- }
- else if (GMX_GPU_OPENCL)
- {
- useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
- }
-
// TODO Currently this is always built, yet DD partition code
// checks if it is built before using it. Probably it should
// become an MDModule that is made only when another module
{
dd_setup_dlb_resource_sharing(cr, deviceId);
}
- deviceStreamManager = std::make_unique<DeviceStreamManager>(
- *deviceInfo, havePPDomainDecomposition(cr), runScheduleWork.simulationWork, useTiming);
+ const bool useGpuTiming = decideGpuTimingsUsage();
+ deviceStreamManager = std::make_unique<DeviceStreamManager>(
+ *deviceInfo, havePPDomainDecomposition(cr), runScheduleWork.simulationWork, useGpuTiming);
}
// If the user chose a task assignment, give them some hints
makeBondedLinks(cr->dd, mtop, fr->atomInfoForEachMoleculeBlock);
}
- if (runScheduleWork.simulationWork.useGpuBufferOps)
+ if (runScheduleWork.simulationWork.useGpuFBufferOps)
{
fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
if (gpusWereDetected
&& ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME))
- || runScheduleWork.simulationWork.useGpuBufferOps))
+ || runScheduleWork.simulationWork.useGpuXBufferOps))
{
GpuApiCallBehavior transferKind =
(inputrec->eI == IntegrationAlgorithm::MD && !doRerun && !useModularSimulator)