#include "gromacs/domdec/partition.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/listed_forces/manage_threading.h"
return bCutoffAllowed;
}
-void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
- const t_commrec& cr,
- const DeviceContext& deviceContext,
- const DeviceStream& streamLocal,
- const DeviceStream& streamNonLocal)
+void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
+ const t_commrec& cr,
+ const gmx::DeviceStreamManager& deviceStreamManager)
{
-
+ // Both non-bonded streams are passed to every GpuHaloExchange constructed below,
+ // so verify up front that the manager actually provides them.
+ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+ "Local non-bonded stream should be valid when using "
+ "GPU halo exchange.");
+ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+ "Non-local non-bonded stream should be valid when using "
+ "GPU halo exchange.");
int gpuHaloExchangeSize = 0;
int pulseStart = 0;
if (cr.dd->gpuHaloExchange.empty())
for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
{
cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
- cr.dd, cr.mpi_comm_mysim, deviceContext, streamLocal, streamNonLocal, pulse));
+ cr.dd, cr.mpi_comm_mysim, deviceStreamManager.context(),
+ deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
+ deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse));
}
}
}
struct t_nrnb;
struct gmx_wallcycle;
enum class PbcType : int;
-class DeviceStream;
class t_state;
class DeviceContext;
class GpuEventSynchronizer;
namespace gmx
{
+class DeviceStreamManager;
class ForceWithShiftForces;
class MDLogger;
class RangePartitioning;
real* r_2b,
real* r_mb);
-/*! \brief Construct the GPU halo exchange object(s)
- * \param[in] mdlog The logger object
- * \param[in] cr The commrec object
- * \param[in] deviceContext GPU device context
- * \param[in] streamLocal The local GPU stream
- * \param[in] streamNonLocal The non-local GPU stream
+/*! \brief Construct the GPU halo exchange object(s).
+ *
+ * \param[in] mdlog The logger object.
+ * \param[in] cr The commrec object.
+ * \param[in] deviceStreamManager Manager of the GPU context and streams.
*/
-void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
- const t_commrec& cr,
- const DeviceContext& deviceContext,
- const DeviceStream& streamLocal,
- const DeviceStream& streamNonLocal);
+void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
+ const t_commrec& cr,
+ const gmx::DeviceStreamManager& deviceStreamManager);
/*! \brief
* (Re-) Initialization for GPU halo exchange
return (enumerator + denominator - 1) / denominator;
}
-gmx_pme_t* gmx_pme_init(const t_commrec* cr,
- const NumPmeDomains& numPmeDomains,
- const t_inputrec* ir,
- gmx_bool bFreeEnergy_q,
- gmx_bool bFreeEnergy_lj,
- gmx_bool bReproducible,
- real ewaldcoeff_q,
- real ewaldcoeff_lj,
- int nthread,
- PmeRunMode runMode,
- PmeGpu* pmeGpu,
- const DeviceInformation* deviceInfo,
- const PmeGpuProgram* pmeGpuProgram,
+gmx_pme_t* gmx_pme_init(const t_commrec* cr,
+ const NumPmeDomains& numPmeDomains,
+ const t_inputrec* ir,
+ gmx_bool bFreeEnergy_q,
+ gmx_bool bFreeEnergy_lj,
+ gmx_bool bReproducible,
+ real ewaldcoeff_q,
+ real ewaldcoeff_lj,
+ int nthread,
+ PmeRunMode runMode,
+ PmeGpu* pmeGpu,
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream,
+ const PmeGpuProgram* pmeGpuProgram,
const gmx::MDLogger& /*mdlog*/)
{
int use_threads, sum_use_threads, i;
{
GMX_THROW(gmx::NotImplementedError(errorString));
}
+ pme_gpu_reinit(pme.get(), deviceContext, deviceStream, pmeGpuProgram);
}
- pme_gpu_reinit(pme.get(), deviceInfo, pmeGpuProgram);
+ else
+ {
+ GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object when PME is on a CPU.");
+ }
+
pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
NumPmeDomains numPmeDomains = { pme_src->nnodes_major, pme_src->nnodes_minor };
*pmedata = gmx_pme_init(cr, numPmeDomains, &irc, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE,
ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread, pme_src->runMode,
- pme_src->gpu, nullptr, nullptr, dummyLogger);
+ pme_src->gpu, nullptr, nullptr, nullptr, dummyLogger);
/* When running PME on the CPU not using domain decomposition,
* the atom data is allocated once only in gmx_pme_(re)init().
*/
struct t_nrnb;
struct PmeGpu;
struct gmx_wallclock_gpu_pme_t;
-struct DeviceInformation;
struct gmx_enerdata_t;
struct gmx_mtop_t;
struct gmx_pme_t;
* related things whose lifetime can/should exceed that of a task (or
* perhaps task manager). See Redmine #2522.
*/
-gmx_pme_t* gmx_pme_init(const t_commrec* cr,
- const NumPmeDomains& numPmeDomains,
- const t_inputrec* ir,
- gmx_bool bFreeEnergy_q,
- gmx_bool bFreeEnergy_lj,
- gmx_bool bReproducible,
- real ewaldcoeff_q,
- real ewaldcoeff_lj,
- int nthread,
- PmeRunMode runMode,
- PmeGpu* pmeGpu,
- const DeviceInformation* deviceInfo,
- const PmeGpuProgram* pmeGpuProgram,
- const gmx::MDLogger& mdlog);
+gmx_pme_t* gmx_pme_init(const t_commrec* cr,
+ const NumPmeDomains& numPmeDomains,
+ const t_inputrec* ir,
+ gmx_bool bFreeEnergy_q,
+ gmx_bool bFreeEnergy_lj,
+ gmx_bool bReproducible,
+ real ewaldcoeff_q,
+ real ewaldcoeff_lj,
+ int nthread,
+ PmeRunMode runMode,
+ PmeGpu* pmeGpu,
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream,
+ const PmeGpuProgram* pmeGpuProgram,
+ const gmx::MDLogger& mdlog);
/*! \brief As gmx_pme_init, but takes most settings, except the grid/Ewald coefficients, from
* pme_src. This is only called when the PME cut-off/grid size changes.
GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
GPU_FUNC_TERM_WITH_RETURN(nullptr);
-/*! \brief Returns the pointer to the GPU stream.
- * \param[in] pme The PME data structure.
- * \returns Pointer to GPU stream object.
- */
-GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
- GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
/*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
* \param[in] pme The PME data structure.
* \returns Pointer to synchronizer
pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x);
}
-const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* pme)
-{
- if (!pme || !pme_gpu_active(pme))
- {
- return nullptr;
- }
- return pme_gpu_get_stream(pme->gpu);
-}
-
GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* pme)
{
if (!pme || !pme_gpu_active(pme))
#include <string>
#include "gromacs/ewald/ewald_utils.h"
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/math/units.h"
pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
}
-void pme_gpu_init_internal(PmeGpu* pmeGpu)
+/*! \brief Internal GPU initialization for PME.
+ *
+ * \param[in] pmeGpu GPU PME data.
+ * \param[in] deviceContext GPU context.
+ * \param[in] deviceStream GPU stream.
+ */
+static void pme_gpu_init_internal(PmeGpu* pmeGpu, const DeviceContext& deviceContext, const DeviceStream& deviceStream)
{
#if GMX_GPU == GMX_GPU_CUDA
// Prepare to use the device that this PME task was assigned earlier.
// Other entities, such as CUDA timing events, are known to implicitly use the device context.
- CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
+ CU_RET_ERR(cudaSetDevice(deviceContext.deviceInfo().id), "Switching to PME CUDA device");
#endif
/* Allocate the target-specific structures */
- pmeGpu->archSpecific.reset(new PmeGpuSpecific(pmeGpu->programHandle_->impl_->deviceContext_));
+ pmeGpu->archSpecific.reset(new PmeGpuSpecific(deviceContext, deviceStream));
pmeGpu->kernelParams.reset(new PmeGpuKernelParams());
pmeGpu->archSpecific->performOutOfPlaceFFT = true;
* TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
*/
- // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
- if (GMX_GPU == GMX_GPU_CUDA)
- {
- /* WARNING: CUDA timings are incorrect with multiple streams.
- * This is the main reason why they are disabled by default.
- */
- // TODO: Consider turning on by default when we can detect nr of streams.
- pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
- }
- else if (GMX_GPU == GMX_GPU_OPENCL)
- {
- pmeGpu->archSpecific->useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
- }
-
#if GMX_GPU == GMX_GPU_CUDA
- pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
+ pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
#elif GMX_GPU == GMX_GPU_OPENCL
pmeGpu->maxGridWidthX = INT32_MAX / 2;
// TODO: is there really no global work size limit in OpenCL?
#endif
-
- /* Creating a PME GPU stream:
- * - default high priority with CUDA
- * - no priorities implemented yet with OpenCL; see #2532
- */
- pmeGpu->archSpecific->pmeStream_.init(pmeGpu->archSpecific->deviceContext_,
- DeviceStreamPriority::High, pmeGpu->archSpecific->useTiming);
}
void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
* TODO: this should become PmeGpu::PmeGpu()
*
* \param[in,out] pme The PME structure.
- * \param[in,out] deviceInfo The GPU device information structure.
- * \param[in] pmeGpuProgram The handle to the program/kernel data created outside (e.g. in unit tests/runner)
+ * \param[in] deviceContext The GPU context.
+ * \param[in] deviceStream The GPU stream.
+ * \param[in] pmeGpuProgram The handle to the program/kernel data created outside (e.g. in unit tests/runner)
*/
-static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
+static void pme_gpu_init(gmx_pme_t* pme,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream,
+ const PmeGpuProgram* pmeGpuProgram)
{
- GMX_ASSERT(deviceInfo != nullptr,
- "Device information can not be nullptr when GPU is used for PME.");
pme->gpu = new PmeGpu();
PmeGpu* pmeGpu = pme->gpu;
changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy());
pme_gpu_set_testing(pmeGpu, false);
- pmeGpu->deviceInfo = deviceInfo;
GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
pmeGpu->programHandle_ = pmeGpuProgram;
pmeGpu->initializedClfftLibrary_ = std::make_unique<gmx::ClfftInitializer>();
- pme_gpu_init_internal(pmeGpu);
+ pme_gpu_init_internal(pmeGpu, deviceContext, deviceStream);
pme_gpu_alloc_energy_virial(pmeGpu);
pme_gpu_copy_common_data_from(pme);
}
}
-void pme_gpu_reinit(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
+void pme_gpu_reinit(gmx_pme_t* pme,
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream,
+ const PmeGpuProgram* pmeGpuProgram)
{
GMX_ASSERT(pme != nullptr, "Need valid PME object");
- if (pme->runMode == PmeRunMode::CPU)
- {
- GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object");
- return;
- }
if (!pme->gpu)
{
+ GMX_RELEASE_ASSERT(deviceContext != nullptr,
+ "Device context can not be nullptr when setting up PME on GPU.");
+ GMX_RELEASE_ASSERT(deviceStream != nullptr,
+ "Device stream can not be nullptr when setting up PME on GPU.");
/* First-time initialization */
- pme_gpu_init(pme, deviceInfo, pmeGpuProgram);
+ pme_gpu_init(pme, *deviceContext, *deviceStream, pmeGpuProgram);
}
else
{
pmeGpu->kernelParams->atoms.d_coordinates = d_x;
}
-const DeviceStream* pme_gpu_get_stream(const PmeGpu* pmeGpu)
-{
- if (pmeGpu)
- {
- return &pmeGpu->archSpecific->pmeStream_;
- }
- else
- {
- return nullptr;
- }
-}
-
GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
{
if (pmeGpu && pmeGpu->kernelParams)
#include "pme_gpu_types_host.h"
#include "pme_output.h"
-class GpuEventSynchronizer;
+class DeviceContext;
struct DeviceInformation;
+class DeviceStream;
+class GpuEventSynchronizer;
struct gmx_hw_info_t;
struct gmx_gpu_opt_t;
struct gmx_pme_t; // only used in pme_gpu_reinit
namespace gmx
{
class MDLogger;
-}
+} // namespace gmx
//! Type of spline data
enum class PmeSplineDataType
*/
void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu);
-/*! \libinternal \brief
- * Does the one-time GPU-framework specific PME initialization.
- * For CUDA, the PME stream is created with the highest priority.
- *
- * \param[in] pmeGpu The PME GPU structure.
- */
-void pme_gpu_init_internal(PmeGpu* pmeGpu);
-
/*! \libinternal \brief
* Initializes the CUDA FFT structures.
*
GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
GPU_FUNC_TERM_WITH_RETURN(nullptr);
-/*! \brief Return pointer to GPU stream.
- * \param[in] pmeGpu The PME GPU structure.
- * \returns Pointer to stream object.
- */
-GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
- GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
/*! \brief Return pointer to the sync object triggered after the PME force calculation completion
* \param[in] pmeGpu The PME GPU structure.
* \returns Pointer to sync object
/*! \libinternal \brief
* (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
*
- * \param[in,out] pme The PME structure.
- * \param[in] deviceInfo The GPU device information structure.
- * \param[in] pmeGpuProgram The PME GPU program data
+ * \param[in,out] pme The PME structure.
+ * \param[in] deviceContext The GPU context.
+ * \param[in] deviceStream The GPU stream.
+ * \param[in] pmeGpuProgram The handle to the program/kernel data created outside (e.g. in unit tests/runner)
+ *
* \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
- const DeviceInformation* GPU_FUNC_ARGUMENT(deviceInfo),
+GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
+ const DeviceContext* GPU_FUNC_ARGUMENT(deviceContext),
+ const DeviceStream* GPU_FUNC_ARGUMENT(deviceStream),
const PmeGpuProgram* GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM;
/*! \libinternal \brief
*/
int nAtomsAlloc;
- /*! \brief A pointer to the device used during the execution. */
- const DeviceInformation* deviceInfo;
-
/*! \brief Kernel scheduling grid width limit in X - derived from deviceinfo compute capability in CUDA.
* Declared as very large int to make it useful in computations with type promotion, to avoid overflows.
* OpenCL seems to not have readily available global work size limit, so we just assign a large arbitrary constant to this instead.
{
/*! \brief Constructor
*
- * \param[in] deviceContext GPU device context.
+ * \param[in] deviceContext GPU device context.
+ * \param[in] pmeStream GPU PME stream.
*/
- PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {}
+ PmeGpuSpecific(const DeviceContext& deviceContext, const DeviceStream& pmeStream) :
+ deviceContext_(deviceContext),
+ pmeStream_(pmeStream)
+ {
+ }
/*! \brief
* A handle to the GPU context.
const DeviceContext& deviceContext_;
/*! \brief The GPU stream where everything related to the PME happens. */
- DeviceStream pmeStream_;
+ const DeviceStream& pmeStream_;
/* Synchronization events */
/*! \brief Triggered after the PME Force Calculations have been completed */
#include "gromacs/fileio/pdbio.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/hostallocator.h"
#include "gromacs/math/gmxcomplex.h"
#include "gromacs/math/units.h"
#endif
}
-int gmx_pmeonly(struct gmx_pme_t* pme,
- const t_commrec* cr,
- t_nrnb* mynrnb,
- gmx_wallcycle* wcycle,
- gmx_walltime_accounting_t walltime_accounting,
- t_inputrec* ir,
- PmeRunMode runMode,
- const DeviceContext* deviceContext)
+int gmx_pmeonly(struct gmx_pme_t* pme,
+ const t_commrec* cr,
+ t_nrnb* mynrnb,
+ gmx_wallcycle* wcycle,
+ gmx_walltime_accounting_t walltime_accounting,
+ t_inputrec* ir,
+ PmeRunMode runMode,
+ const gmx::DeviceStreamManager* deviceStreamManager)
{
int ret;
int natoms = 0;
const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
if (useGpuForPme)
{
- const DeviceStream& deviceStream = *pme_gpu_get_device_stream(pme);
-
+ GMX_RELEASE_ASSERT(
+ deviceStreamManager != nullptr,
+ "Device stream manager can not be nullptr when using GPU in PME-only rank.");
+ GMX_RELEASE_ASSERT(deviceStreamManager->streamIsValid(gmx::DeviceStreamType::Pme),
+ "Device stream can not be nullptr when using GPU in PME-only rank");
changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
if (c_enableGpuPmePpComms)
{
pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
- deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+ deviceStreamManager->stream(gmx::DeviceStreamType::Pme), pme_pp->mpi_comm_mysim,
+ pme_pp->ppRanks);
pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
- deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+ deviceStreamManager->stream(gmx::DeviceStreamType::Pme), pme_pp->mpi_comm_mysim,
+ pme_pp->ppRanks);
}
- GMX_RELEASE_ASSERT(
- deviceContext != nullptr,
- "Device context can not be nullptr when building GPU propagator data object.");
// TODO: Special PME-only constructor is used here. There is no mechanism to prevent the other constructor from being used here.
// This should be made safer.
- stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(&deviceStream, *deviceContext,
- GpuApiCallBehavior::Async,
- pme_gpu_get_block_size(pme), wcycle);
+ stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
+ &deviceStreamManager->stream(gmx::DeviceStreamType::Pme), deviceStreamManager->context(),
+ GpuApiCallBehavior::Async, pme_gpu_get_block_size(pme), wcycle);
}
clear_nrnb(mynrnb);
struct gmx_pme_t;
struct gmx_wallcycle;
-class DeviceContext;
enum class PmeRunMode;
+namespace gmx
+{
+class DeviceStreamManager;
+}
/*! \brief Called on the nodes that do PME exclusively */
-int gmx_pmeonly(gmx_pme_t* pme,
- const t_commrec* cr,
- t_nrnb* mynrnb,
- gmx_wallcycle* wcycle,
- gmx_walltime_accounting_t walltime_accounting,
- t_inputrec* ir,
- PmeRunMode runMode,
- const DeviceContext* deviceContext);
+int gmx_pmeonly(gmx_pme_t* pme,
+ const t_commrec* cr,
+ t_nrnb* mynrnb,
+ gmx_wallcycle* wcycle,
+ gmx_walltime_accounting_t walltime_accounting,
+ t_inputrec* ir,
+ PmeRunMode runMode,
+ const gmx::DeviceStreamManager* deviceStreamManager);
#endif
#include "gromacs/utility/gmxmpi.h"
class DeviceContext;
+class DeviceStream;
class GpuEventSynchronizer;
namespace gmx
{
+class DeviceStreamManager;
+
/*! \libinternal
* \brief Manages communication related to GPU buffers between this
* \param[in] comm Communicator used for simulation
* \param[in] pmeRank Rank of PME task
* \param[in] deviceContext GPU context.
+ * \param[in] deviceStream GPU stream.
*/
- PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext);
+ PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
~PmePpCommGpu();
/*! \brief Perform steps required when buffer size changes
};
/*!\brief Constructor stub. */
-PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */, int /* pmeRank */, const DeviceContext& /* deviceContext */) :
+PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */,
+ int /* pmeRank */,
+ const DeviceContext& /* deviceContext */,
+ const DeviceStream& /* deviceStream */) :
impl_(nullptr)
{
GMX_ASSERT(false,
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/utility/gmxmpi.h"
namespace gmx
{
-PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext) :
+PmePpCommGpu::Impl::Impl(MPI_Comm comm,
+ int pmeRank,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream) :
deviceContext_(deviceContext),
+ pmePpCommStream_(deviceStream),
comm_(comm),
pmeRank_(pmeRank)
{
GMX_RELEASE_ASSERT(
GMX_THREAD_MPI,
"PME-PP GPU Communication is currently only supported with thread-MPI enabled");
-
- // In CUDA we only need priority to create stream.
- // (note that this will be moved from here in the follow-up patch)
- pmePpCommStream_.init(deviceContext, DeviceStreamPriority::Normal, false);
}
PmePpCommGpu::Impl::~Impl() = default;
return static_cast<void*>(&forcesReadySynchronizer_);
}
-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext) :
- impl_(new Impl(comm, pmeRank, deviceContext))
+PmePpCommGpu::PmePpCommGpu(MPI_Comm comm,
+ int pmeRank,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream) :
+ impl_(new Impl(comm, pmeRank, deviceContext, deviceStream))
{
}
public:
/*! \brief Creates PME-PP GPU communication object.
+ *
* \param[in] comm Communicator used for simulation
* \param[in] pmeRank Rank of PME task
* \param[in] deviceContext GPU context.
+ * \param[in] deviceStream GPU stream.
*/
- Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext);
+ Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
~Impl();
/*! \brief Perform steps required when buffer size changes
void* getForcesReadySynchronizer();
private:
- //! Device context object
+ //! GPU context handle (not used in CUDA)
const DeviceContext& deviceContext_;
- //! CUDA stream used for the communication operations in this class
- DeviceStream pmePpCommStream_;
+ //! Handle for CUDA stream used for the communication operations in this class
+ const DeviceStream& pmePpCommStream_;
//! Remote location of PME coordinate data buffer
void* remotePmeXBuffer_ = nullptr;
//! Remote location of PME force data buffer
testhardwarecontexts.cpp
GPU_CPP_SOURCE_FILES
pmetestcommon.cpp
- )
+)
+
+gmx_add_libgromacs_sources(
+ testhardwarecontext.cpp
+)
+if (GMX_USE_CUDA)
+gmx_compile_cpp_as_cuda(
+ testhardwarecontext.cpp
+)
+endif()
TestReferenceData refData;
for (const auto& context : getPmeTestEnv()->getHardwareContexts())
{
- CodePath codePath = context->getCodePath();
+ CodePath codePath = context->codePath();
const bool supportedInput =
pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
if (!supportedInput)
{
/* Testing the failure for the unsupported input */
- EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, box),
+ EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, nullptr, box),
NotImplementedError);
continue;
}
SCOPED_TRACE(
formatString("Testing force gathering with %s %sfor PME grid size %d %d %d"
", order %d, %zu atoms",
- codePathToString(codePath), context->getDescription().c_str(),
+ codePathToString(codePath), context->description().c_str(),
gridSize[XX], gridSize[YY], gridSize[ZZ], pmeOrder, atomCount));
- PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(),
- context->getPmeGpuProgram(), box);
+ PmeSafePointer pmeSafe =
+ pmeInitWrapper(&inputRec, codePath, context->deviceContext(),
+ context->deviceStream(), context->pmeGpuProgram(), box);
std::unique_ptr<StatePropagatorDataGpu> stateGpu =
(codePath == CodePath::GPU)
- ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext())
+ ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext(),
+ context->deviceStream())
: nullptr;
pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, inputAtomData.coordinates,
TestReferenceData refData;
for (const auto& context : getPmeTestEnv()->getHardwareContexts())
{
- CodePath codePath = context->getCodePath();
+ CodePath codePath = context->codePath();
const bool supportedInput =
pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
if (!supportedInput)
{
/* Testing the failure for the unsupported input */
- EXPECT_THROW_GMX(pmeInitEmpty(&inputRec, codePath, nullptr, nullptr, box,
+ EXPECT_THROW_GMX(pmeInitEmpty(&inputRec, codePath, nullptr, nullptr, nullptr, box,
ewaldCoeff_q, ewaldCoeff_lj),
NotImplementedError);
continue;
"size %d %d %d, Ewald coefficients %g %g",
(method == PmeSolveAlgorithm::LennardJones) ? "Lennard-Jones" : "Coulomb",
gridOrdering.second.c_str(), computeEnergyAndVirial ? "with" : "without",
- codePathToString(codePath), context->getDescription().c_str(),
+ codePathToString(codePath), context->description().c_str(),
gridSize[XX], gridSize[YY], gridSize[ZZ], ewaldCoeff_q, ewaldCoeff_lj));
/* Running the test */
- PmeSafePointer pmeSafe =
- pmeInitEmpty(&inputRec, codePath, context->getDeviceInfo(),
- context->getPmeGpuProgram(), box, ewaldCoeff_q, ewaldCoeff_lj);
+ PmeSafePointer pmeSafe = pmeInitEmpty(
+ &inputRec, codePath, context->deviceContext(), context->deviceStream(),
+ context->pmeGpuProgram(), box, ewaldCoeff_q, ewaldCoeff_lj);
pmeSetComplexGrid(pmeSafe.get(), codePath, gridOrdering.first, nonZeroGridValues);
const real cellVolume = box[0] * box[4] * box[8];
// FIXME - this is box[XX][XX] * box[YY][YY] * box[ZZ][ZZ], should be stored in the PME structure
for (const auto& context : getPmeTestEnv()->getHardwareContexts())
{
- CodePath codePath = context->getCodePath();
+ CodePath codePath = context->codePath();
const bool supportedInput =
pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
if (!supportedInput)
{
/* Testing the failure for the unsupported input */
- EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, box),
+ EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, nullptr, box),
NotImplementedError);
continue;
}
{
/* Describing the test uniquely in case it fails */
- SCOPED_TRACE(
- formatString("Testing %s with %s %sfor PME grid size %d %d %d"
- ", order %d, %zu atoms",
- option.second.c_str(), codePathToString(codePath),
- context->getDescription().c_str(), gridSize[XX], gridSize[YY],
- gridSize[ZZ], pmeOrder, atomCount));
+ SCOPED_TRACE(formatString(
+ "Testing %s with %s %sfor PME grid size %d %d %d"
+ ", order %d, %zu atoms",
+ option.second.c_str(), codePathToString(codePath), context->description().c_str(),
+ gridSize[XX], gridSize[YY], gridSize[ZZ], pmeOrder, atomCount));
/* Running the test */
- PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(),
- context->getPmeGpuProgram(), box);
+ PmeSafePointer pmeSafe =
+ pmeInitWrapper(&inputRec, codePath, context->deviceContext(),
+ context->deviceStream(), context->pmeGpuProgram(), box);
std::unique_ptr<StatePropagatorDataGpu> stateGpu =
(codePath == CodePath::GPU)
- ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext())
+ ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext(),
+ context->deviceStream())
: nullptr;
pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, coordinates, charges);
#include "gromacs/ewald/pme_solve.h"
#include "gromacs/ewald/pme_spread.h"
#include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/mdtypes/commrec.h"
}
//! PME initialization
-PmeSafePointer pmeInitWrapper(const t_inputrec* inputRec,
- const CodePath mode,
- const DeviceInformation* deviceInfo,
- const PmeGpuProgram* pmeGpuProgram,
- const Matrix3x3& box,
- const real ewaldCoeff_q,
- const real ewaldCoeff_lj)
+PmeSafePointer pmeInitWrapper(const t_inputrec* inputRec,
+ const CodePath mode,
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream,
+ const PmeGpuProgram* pmeGpuProgram,
+ const Matrix3x3& box,
+ const real ewaldCoeff_q,
+ const real ewaldCoeff_lj)
{
const MDLogger dummyLogger;
const auto runMode = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::Mixed;
t_commrec dummyCommrec = { 0 };
NumPmeDomains numPmeDomains = { 1, 1 };
- gmx_pme_t* pmeDataRaw =
- gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, false, false, true, ewaldCoeff_q,
- ewaldCoeff_lj, 1, runMode, nullptr, deviceInfo, pmeGpuProgram, dummyLogger);
+ gmx_pme_t* pmeDataRaw = gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, false, false, true,
+ ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr,
+ deviceContext, deviceStream, pmeGpuProgram, dummyLogger);
PmeSafePointer pme(pmeDataRaw); // taking ownership
// TODO get rid of this with proper matrix type
}
//! Simple PME initialization based on input, no atom data
-PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec,
- const CodePath mode,
- const DeviceInformation* deviceInfo,
- const PmeGpuProgram* pmeGpuProgram,
- const Matrix3x3& box,
- const real ewaldCoeff_q,
- const real ewaldCoeff_lj)
-{
- return pmeInitWrapper(inputRec, mode, deviceInfo, pmeGpuProgram, box, ewaldCoeff_q, ewaldCoeff_lj);
+PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec,
+ const CodePath mode,
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream,
+ const PmeGpuProgram* pmeGpuProgram,
+ const Matrix3x3& box,
+ const real ewaldCoeff_q,
+ const real ewaldCoeff_lj)
+{
+ return pmeInitWrapper(inputRec, mode, deviceContext, deviceStream, pmeGpuProgram, box,
+ ewaldCoeff_q, ewaldCoeff_lj);
// hiding the fact that PME actually needs to know the number of atoms in advance
}
PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec)
{
const Matrix3x3 defaultBox = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } };
- return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
+ return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
}
//! Make a GPU state-propagator manager
std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t& pme,
- const DeviceContext& deviceContext)
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream)
{
// TODO: Pin the host buffer and use async memory copies
// TODO: Special constructor for PME-only rank / PME-tests is used here. There should be a mechanism to
// restrict one from using other constructor here.
- return std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(&pme), deviceContext,
- GpuApiCallBehavior::Sync,
+ return std::make_unique<StatePropagatorDataGpu>(deviceStream, *deviceContext, GpuApiCallBehavior::Sync,
pme_gpu_get_block_size(&pme), nullptr);
}
namespace gmx
{
+
+class DeviceStreamManager;
namespace test
{
// PME stages
//! PME initialization
-PmeSafePointer pmeInitWrapper(const t_inputrec* inputRec,
- CodePath mode,
- const DeviceInformation* deviceInfo,
- const PmeGpuProgram* pmeGpuProgram,
- const Matrix3x3& box,
- real ewaldCoeff_q = 1.0F,
- real ewaldCoeff_lj = 1.0F);
+PmeSafePointer pmeInitWrapper(const t_inputrec* inputRec,
+ CodePath mode,
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream,
+ const PmeGpuProgram* pmeGpuProgram,
+ const Matrix3x3& box,
+ real ewaldCoeff_q = 1.0F,
+ real ewaldCoeff_lj = 1.0F);
//! Simple PME initialization (no atom data)
-PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec,
- CodePath mode,
- const DeviceInformation* deviceInfo,
- const PmeGpuProgram* pmeGpuProgram,
- const Matrix3x3& box,
- real ewaldCoeff_q,
- real ewaldCoeff_lj);
+PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec,
+ CodePath mode,
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream,
+ const PmeGpuProgram* pmeGpuProgram,
+ const Matrix3x3& box,
+ real ewaldCoeff_q,
+ real ewaldCoeff_lj);
+
//! Simple PME initialization based on inputrec only
PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec);
+
//! Make a GPU state-propagator manager
std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t& pme,
- const DeviceContext& deviceContext);
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream);
//! PME initialization with atom data and system box
void pmeInitAtoms(gmx_pme_t* pme,
StatePropagatorDataGpu* stateGpu,
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Implements test environment class which performs hardware enumeration for unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "testhardwarecontext.h"
+
+#include <memory>
+
+#include "gromacs/ewald/pme.h"
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/utility/basenetwork.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/loggerbuilder.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
+
+namespace gmx
+{
+namespace test
+{
+
+/*! \brief Constructs a hardware context for the CPU code path.
+ *
+ * No device context or device stream is created here: both pointers keep the
+ * \c nullptr value given by their in-class member initializers.
+ *
+ * \param[in] codePath     Code path of this context, asserted to be CodePath::CPU.
+ * \param[in] description  Human-readable description of the context.
+ */
+TestHardwareContext::TestHardwareContext(CodePath codePath, const char* description) :
+    codePath_(codePath),
+    description_(description)
+{
+    GMX_RELEASE_ASSERT(codePath == CodePath::CPU,
+                       "A GPU code path should provide DeviceInformation to the "
+                       "TestHardwareContext constructor.");
+}
+
+/*! \brief Constructs a hardware context for the GPU code path.
+ *
+ * Creates the device context for \p deviceInfo, a device stream in that context
+ * and the PME GPU program built for that context.
+ *
+ * \param[in] codePath     Code path of this context, asserted to be CodePath::GPU.
+ * \param[in] description  Human-readable description of the context.
+ * \param[in] deviceInfo   Information on the device the context is created for.
+ */
+TestHardwareContext::TestHardwareContext(CodePath                 codePath,
+                                         const char*              description,
+                                         const DeviceInformation& deviceInfo) :
+    codePath_(codePath),
+    description_(description)
+{
+    GMX_RELEASE_ASSERT(codePath == CodePath::GPU,
+                       "TestHardwareContext tries to construct DeviceContext and PmeGpuProgram "
+                       "in CPU build.");
+    // Build everything in local owners first. The destructor does not run for a
+    // partially constructed object, so raw pointers stored in the members would
+    // leak if a later construction step threw.
+    auto context = std::make_unique<DeviceContext>(deviceInfo);
+    auto stream  = std::make_unique<DeviceStream>(*context, DeviceStreamPriority::Normal, false);
+    program_     = buildPmeGpuProgram(*context);
+    // Releasing a unique_ptr cannot throw, so ownership is handed over to the
+    // raw members only after all potentially throwing steps have succeeded.
+    deviceContext_ = context.release();
+    deviceStream_  = stream.release();
+}
+
+//! Frees the owned device stream and device context; deleting a null pointer is a no-op.
+TestHardwareContext::~TestHardwareContext()
+{
+    // The stream is constructed from the context, so it is released first.
+    delete deviceStream_;
+    delete deviceContext_;
+}
+
+/*! \brief Returns the device information of the owned device context.
+ *
+ * \returns Pointer to the device information, or \c nullptr when this context
+ *          has no device context (the CPU constructor leaves it unset).
+ */
+const DeviceInformation* TestHardwareContext::deviceInfo() const
+{
+    // Contexts built by the CPU constructor have no DeviceContext; dereferencing
+    // the null pointer would be undefined behavior, so report "no device" instead.
+    return (deviceContext_ != nullptr) ? &deviceContext_->deviceInfo() : nullptr;
+}
+
+//! Returns the stored device context pointer (left as \c nullptr by the CPU constructor).
+const DeviceContext* TestHardwareContext::deviceContext() const
+{
+    return deviceContext_;
+}
+//! Returns the stored device stream pointer (left as \c nullptr by the CPU constructor).
+const DeviceStream* TestHardwareContext::deviceStream() const
+{
+    return deviceStream_;
+}
+
+/*! \brief Maps \p codePath to a short human-readable name.
+ *
+ * Throws NotImplementedError for a value that has no name assigned here.
+ */
+const char* codePathToString(CodePath codePath)
+{
+    if (codePath == CodePath::CPU)
+    {
+        return "CPU";
+    }
+    if (codePath == CodePath::GPU)
+    {
+        return "GPU";
+    }
+    GMX_THROW(NotImplementedError("This CodePath should support codePathToString"));
+}
+
+} // namespace test
+} // namespace gmx
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_EWALD_TEST_HARDWARE_CONTEXT_H
+#define GMX_EWALD_TEST_HARDWARE_CONTEXT_H
+
+/*! \internal \file
+ * \brief
+ * Describes test environment class which performs hardware enumeration for unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "gromacs/ewald/pme_gpu_program.h"
+#include "gromacs/utility/gmxassert.h"
+
+class DeviceContext;
+struct DeviceInformation;
+class DeviceStream;
+
+namespace gmx
+{
+namespace test
+{
+//! Hardware code path being tested
+enum class CodePath
+{
+    //! The CPU code path
+    CPU,
+    //! The GPU code path
+    GPU
+};
+
+//! Return a string useful for human-readable messages describing a \c codePath.
+const char* codePathToString(CodePath codePath);
+
+/*! \internal \brief
+ * A structure to describe a hardware context that persists over the lifetime
+ * of the test binary - an abstraction over PmeGpuProgram with a human-readable string.
+ *
+ * The device context and device stream are held through raw pointers that the
+ * destructor deletes, hence copying is explicitly disabled.
+ */
+struct TestHardwareContext
+{
+    //! Hardware path for the code being tested.
+    CodePath codePath_;
+    //! Readable description
+    std::string description_;
+    //! Device context, deleted in the destructor
+    DeviceContext* deviceContext_ = nullptr;
+    //! Device stream, deleted in the destructor
+    DeviceStream* deviceStream_ = nullptr;
+    //! Persistent compiled GPU kernels for PME.
+    PmeGpuProgramStorage program_;
+
+public:
+    //! Returns the code path for this context.
+    CodePath codePath() const { return codePath_; }
+    //! Returns a human-readable context description line
+    std::string description() const { return description_; }
+    //! Returns the device info pointer
+    const DeviceInformation* deviceInfo() const;
+    //! Get the device context
+    const DeviceContext* deviceContext() const;
+    //! Get the device stream
+    const DeviceStream* deviceStream() const;
+    //! Returns the persistent PME GPU kernels
+    const PmeGpuProgram* pmeGpuProgram() const { return program_.get(); }
+    //! Constructs the context for CPU builds
+    TestHardwareContext(CodePath codePath, const char* description);
+    //! Constructs the context for GPU builds
+    TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation& deviceInfo);
+    //! Destructor
+    ~TestHardwareContext();
+    //! Copy construction is disabled: a copy would delete the same pointers twice
+    TestHardwareContext(const TestHardwareContext&) = delete;
+    //! Copy assignment is disabled: a copy would delete the same pointers twice
+    TestHardwareContext& operator=(const TestHardwareContext&) = delete;
+};
+
+} // namespace test
+} // namespace gmx
+#endif
namespace test
{
-TestHardwareContext::~TestHardwareContext() = default;
-
-const char* codePathToString(CodePath codePath)
-{
- switch (codePath)
- {
- case CodePath::CPU: return "CPU";
- case CodePath::GPU: return "GPU";
- default: GMX_THROW(NotImplementedError("This CodePath should support codePathToString"));
- }
-}
-
/* Implements the "construct on first use" idiom to avoid any static
* initialization order fiasco.
*
for (int gpuIndex : getCompatibleGpus(hardwareInfo_->gpu_info))
{
const DeviceInformation* deviceInfo = getDeviceInfo(hardwareInfo_->gpu_info, gpuIndex);
- GMX_RELEASE_ASSERT(deviceInfo != nullptr,
- "Device information should be provided for the GPU builds.");
init_gpu(deviceInfo);
char stmp[200] = {};
}
}
+//! Empties the container of hardware contexts, which destroys the contexts it holds.
+void PmeTestEnvironment::TearDown()
+{
+    hardwareContexts_.clear();
+}
+
} // namespace test
} // namespace gmx
#include <gtest/gtest.h>
#include "gromacs/ewald/pme_gpu_program.h"
-#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/hardware/gpu_hw_info.h"
#include "gromacs/utility/gmxassert.h"
+#include "testhardwarecontext.h"
+
struct gmx_hw_info_t;
namespace gmx
{
namespace test
{
-//! Hardware code path being tested
-enum class CodePath
-{
- CPU,
- GPU
-};
-
-//! Return a string useful for human-readable messages describing a \c codePath.
-const char* codePathToString(CodePath codePath);
-
-/*! \internal \brief
- * A structure to describe a hardware context that persists over the lifetime
- * of the test binary - an abstraction over PmeGpuProgram with a human-readable string.
- */
-struct TestHardwareContext
-{
- //! Hardware path for the code being tested.
- CodePath codePath_;
- //! Readable description
- std::string description_;
- //! Device information pointer
- const DeviceInformation* deviceInfo_;
- //! Local copy of the device context pointer
- std::unique_ptr<DeviceContext> deviceContext_;
- //! Persistent compiled GPU kernels for PME.
- PmeGpuProgramStorage program_;
-
-public:
- //! Retuns the code path for this context.
- CodePath getCodePath() const { return codePath_; }
- //! Returns a human-readable context description line
- std::string getDescription() const { return description_; }
- //! Getter for the DeviceContext
- const DeviceContext& deviceContext() const
- {
- GMX_RELEASE_ASSERT(deviceContext_ != nullptr,
- "Trying to get device context before it was initialized or in builds "
- "without GPU support.");
- return *deviceContext_;
- }
- //! Returns the device info pointer
- const DeviceInformation* getDeviceInfo() const { return deviceInfo_; }
- //! Returns the persistent PME GPU kernels
- const PmeGpuProgram* getPmeGpuProgram() const { return program_.get(); }
- //! Constructs the context for CPU builds
- TestHardwareContext(CodePath codePath, const char* description) :
- codePath_(codePath),
- description_(description)
- {
- GMX_RELEASE_ASSERT(codePath == CodePath::CPU,
- "A GPU code path should provide DeviceInformation to the "
- "TestHardwareContext constructor.");
- }
- //! Constructs the context for GPU builds
- TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation& deviceInfo) :
- codePath_(codePath),
- description_(description),
- deviceInfo_(&deviceInfo)
- {
- GMX_RELEASE_ASSERT(codePath == CodePath::GPU,
- "TestHardwareContext tries to construct DeviceContext and PmeGpuProgram "
- "in CPU build.");
- deviceContext_ = std::make_unique<DeviceContext>(deviceInfo);
- program_ = buildPmeGpuProgram(*deviceContext_);
- }
- ~TestHardwareContext();
-};
//! A container of handles to hardware contexts
typedef std::vector<std::unique_ptr<TestHardwareContext>> TestHardwareContexts;
public:
//! This is called by GTest framework once to query the hardware
void SetUp() override;
+ //! This is called by GTest framework once to release the hardware
+ void TearDown() override;
//! Get available hardware contexts.
const TestHardwareContexts& getHardwareContexts() const { return hardwareContexts_; }
//! Get available hardware information.
return false;
}
-void DeviceStream::synchronize() const {}
+// Empty body: this implementation of synchronize() performs no work.
+void DeviceStream::synchronize() const {}
return impl_->streams_[streamToGet];
}
+/* Selects the stream for GPU bonded forces: the non-bonded non-local stream
+ * when there is PP domain decomposition, the non-bonded local stream otherwise.
+ * The selected stream is asserted to be valid before it is returned.
+ */
+const DeviceStream& DeviceStreamManager::bondedStream(bool hasPPDomainDecomposition) const
+{
+    if (!hasPPDomainDecomposition)
+    {
+        const DeviceStream& localStream = stream(DeviceStreamType::NonBondedLocal);
+        GMX_RELEASE_ASSERT(localStream.isValid(),
+                           "GPU non-bonded local stream should be valid in order to use GPU "
+                           "version of bonded forces without domain decomposition.");
+        return localStream;
+    }
+
+    const DeviceStream& nonLocalStream = stream(DeviceStreamType::NonBondedNonLocal);
+    GMX_RELEASE_ASSERT(nonLocalStream.isValid(),
+                       "GPU non-bonded non-local stream should be valid in order to use GPU "
+                       "version of bonded forces with domain decomposition.");
+    return nonLocalStream;
+}
+
bool DeviceStreamManager::streamIsValid(DeviceStreamType streamToCheck) const
{
return impl_->streams_[streamToCheck].isValid();
*/
const DeviceStream& stream(DeviceStreamType streamToGet) const;
+ /*! \brief Returns a handle to the GPU stream to compute bonded forces in.
+ *
+ * \param[in] hasPPDomainDecomposition Whether there is a particle-particle domain decomposition.
+ */
+ const DeviceStream& bondedStream(bool hasPPDomainDecomposition) const;
+
/*! \brief Return whether the requested GPU stream is valid for use.
*
* \param[in] streamToCheck Which stream to check.
expectValidStreams(&manager, { DeviceStreamType::Pme, DeviceStreamType::NonBondedLocal,
DeviceStreamType::NonBondedNonLocal, DeviceStreamType::PmePpTransfer,
DeviceStreamType::UpdateAndConstraints });
- expectInvalidStreams(&manager, {});
}
{
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/typecasts.cuh"
#include "gromacs/mdtypes/enerdata.h"
deviceContext_(deviceContext),
deviceStream_(deviceStream)
{
+ GMX_RELEASE_ASSERT(deviceStream.isValid(),
+ "Can't run GPU version of bonded forces in stream that is not valid.");
+
wcycle_ = wcycle;
allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_);
deviceStream_, GpuApiCallBehavior::Sync, nullptr);
vTot_.resize(F_NRE);
allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_);
- clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream);
+ clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);
kernelParams_.d_forceParams = d_forceParams_;
kernelParams_.d_xq = d_xq_;
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
// Make sure that the forces are ready on device before proceeding with the update.
fReadyOnDevice->enqueueWaitEvent(deviceStream_);
- // The integrate should save a copy of the current coordinates in d_xp_ and write updated once
- // into d_x_. The d_xp_ is only needed by constraints.
+ // The integrate should save a copy of the current coordinates in d_xp_ and write updated
+ // once into d_x_. The d_xp_ is only needed by constraints.
integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat,
doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
// Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
#include "gromacs/fileio/trxio.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed_forces/manage_threading.h"
StatePropagatorDataGpu* stateGpu = fr->stateGpu;
+ // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
if (useGpuForUpdate)
{
GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
{
GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
}
-
- GMX_RELEASE_ASSERT(fr->deviceContext != nullptr,
- "GPU device context should be initialized to use GPU update.");
- GMX_RELEASE_ASSERT(stateGpu->getUpdateStream() != nullptr,
- "Update stream can not be nullptr when update is on a GPU.");
- integrator = std::make_unique<UpdateConstrainGpu>(*ir, *top_global, *fr->deviceContext,
- *stateGpu->getUpdateStream(),
- stateGpu->xUpdatedOnDevice());
+ GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
+ "Device stream manager should be initialized in order to use GPU "
+ "update-constraints.");
+ GMX_RELEASE_ASSERT(
+ fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
+ "Update stream should be initialized in order to use GPU "
+ "update-constraints.");
+ integrator = std::make_unique<UpdateConstrainGpu>(
+ *ir, *top_global, fr->deviceStreamManager->context(),
+ fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
+ stateGpu->xUpdatedOnDevice());
integrator->setPbc(PbcType::Xyz, state->box);
}
if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
&& useGpuForNonbonded && is1D(*cr->dd))
{
+ GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
+ "GPU device manager has to be initialized to use GPU "
+ "version of halo exchange.");
// TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
- const DeviceStream* localStream =
- Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
- const DeviceStream* nonLocalStream = Nbnxm::gpu_get_command_stream(
- fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
- GMX_RELEASE_ASSERT(
- fr->deviceContext != nullptr,
- "GPU device context should be initialized to use GPU halo exchange.");
- GMX_RELEASE_ASSERT(localStream != nullptr,
- "Local non-bonded stream can't be nullptr when using GPU "
- "halo exchange.");
- GMX_RELEASE_ASSERT(nonLocalStream != nullptr,
- "Non-local non-bonded stream can't be nullptr when using "
- "GPU halo exchange.");
- constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, *localStream, *nonLocalStream);
+ constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager);
}
}
}
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/cpuinfo.h"
#include "gromacs/hardware/detecthardware.h"
EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
// Get the device handles for the modules, nullptr when no task is assigned.
- int deviceId = -1;
- DeviceInformation* deviceInfo = gpuTaskAssignments.initDevice(&deviceId);
- std::unique_ptr<DeviceContext> deviceContext = nullptr;
- if (deviceInfo != nullptr)
+ int deviceId = -1;
+ DeviceInformation* deviceInfo = gpuTaskAssignments.initDevice(&deviceId);
+
+ // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?)
+ bool useTiming = true;
+ if (GMX_GPU == GMX_GPU_CUDA)
{
- if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
- {
- dd_setup_dlb_resource_sharing(cr, deviceId);
- }
- deviceContext = std::make_unique<DeviceContext>(*deviceInfo);
+ /* WARNING: CUDA timings are incorrect with multiple streams.
+ * This is the main reason why they are disabled by default.
+ */
+ // TODO: Consider turning on by default when we can detect nr of streams.
+ useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+ }
+ else if (GMX_GPU == GMX_GPU_OPENCL)
+ {
+ useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
}
-
- // TODO Initialize GPU streams here.
// TODO Currently this is always built, yet DD partition code
// checks if it is built before using it. Probably it should
const bool printHostName = (cr->nnodes > 1);
gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
+ std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
+
+ if (deviceInfo != nullptr)
+ {
+ if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
+ {
+ dd_setup_dlb_resource_sharing(cr, deviceId);
+ }
+ deviceStreamManager = std::make_unique<DeviceStreamManager>(
+ *deviceInfo, useGpuForPme, useGpuForNonbonded, havePPDomainDecomposition(cr),
+ useGpuForUpdate, useTiming);
+ }
+
// If the user chose a task assignment, give them some hints
// where appropriate.
if (!userGpuTaskAssignment.empty())
opt2fn("-tablep", filenames.size(), filenames.data()),
opt2fns("-tableb", filenames.size(), filenames.data()), pforce);
- fr->deviceContext = deviceContext.get();
+ // Save a handle to device stream manager to use elsewhere in the code
+ // TODO: Forcerec is not a correct place to store it.
+ fr->deviceStreamManager = deviceStreamManager.get();
if (devFlags.enableGpuPmePPComm && !thisRankHasDuty(cr, DUTY_PME))
{
GMX_RELEASE_ASSERT(
- deviceContext != nullptr,
- "Device context can not be nullptr when PME-PP direct communications object.");
+ deviceStreamManager != nullptr,
+ "GPU device stream manager should be valid in order to use PME-PP direct "
+ "communications.");
+ GMX_RELEASE_ASSERT(
+ deviceStreamManager->streamIsValid(DeviceStreamType::PmePpTransfer),
+ "GPU PP-PME stream should be valid in order to use GPU PME-PP direct "
+ "communications.");
fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
- cr->mpi_comm_mysim, cr->dd->pme_nodeid, *deviceContext);
+ cr->mpi_comm_mysim, cr->dd->pme_nodeid, deviceStreamManager->context(),
+ deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
}
- fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec, fr, cr, *hwinfo, deviceInfo,
- fr->deviceContext, &mtop, box, wcycle);
+ fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec, fr, cr, *hwinfo, useGpuForNonbonded,
+ deviceStreamManager.get(), &mtop, box, wcycle);
+ // TODO: Move the logic below to a GPU bonded builder
if (useGpuForBonded)
{
- auto stream = havePPDomainDecomposition(cr)
- ? Nbnxm::gpu_get_command_stream(
- fr->nbv->gpu_nbv, gmx::InteractionLocality::NonLocal)
- : Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv,
- gmx::InteractionLocality::Local);
- GMX_RELEASE_ASSERT(
- fr->deviceContext != nullptr,
- "Device context can not be nullptr when computing bonded interactions on GPU.");
- GMX_RELEASE_ASSERT(stream != nullptr,
- "Can'r run GPU version of bonded forces in nullptr stream.");
- gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, *stream, wcycle);
+ GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
+ "GPU device stream manager should be valid in order to use GPU "
+ "version of bonded forces.");
+ gpuBonded = std::make_unique<GpuBonded>(
+ mtop.ffparams, deviceStreamManager->context(),
+ deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)), wcycle);
fr->gpuBonded = gpuBonded.get();
}
if (thisRankHasPmeGpuTask)
{
GMX_RELEASE_ASSERT(
- deviceContext != nullptr,
- "Device context can not be nullptr when building PME GPU program object.");
- pmeGpuProgram = buildPmeGpuProgram(*deviceContext);
+ (deviceStreamManager != nullptr),
+ "GPU device stream manager should be initialized in order to use GPU for PME.");
+ GMX_RELEASE_ASSERT((deviceInfo != nullptr),
+ "GPU device should be initialized in order to use GPU for PME.");
+ pmeGpuProgram = buildPmeGpuProgram(deviceStreamManager->context());
}
/* Initiate PME if necessary,
{
try
{
+ // TODO: This should be in the builder.
+ GMX_RELEASE_ASSERT(!useGpuForPme || (deviceStreamManager != nullptr),
+ "Device stream manager should be valid in order to use GPU "
+ "version of PME.");
+ GMX_RELEASE_ASSERT(
+ !useGpuForPme || deviceStreamManager->streamIsValid(DeviceStreamType::Pme),
+ "GPU PME stream should be valid in order to use GPU version of PME.");
+
+ const DeviceContext* deviceContext =
+ useGpuForPme ? &deviceStreamManager->context() : nullptr;
+ const DeviceStream* pmeStream =
+ useGpuForPme ? &deviceStreamManager->stream(DeviceStreamType::Pme) : nullptr;
+
pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec, nChargePerturbed != 0,
nTypePerturbed != 0, mdrunOptions.reproducible, ewaldcoeff_q,
ewaldcoeff_lj, gmx_omp_nthreads_get(emntPME), pmeRunMode,
- nullptr, deviceInfo, pmeGpuProgram.get(), mdlog);
+ nullptr, deviceContext, pmeStream, pmeGpuProgram.get(), mdlog);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
}
&& ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME))
|| runScheduleWork.simulationWork.useGpuBufferOps))
{
- const DeviceStream* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
- const DeviceStream* localStream =
- fr->nbv->gpu_nbv != nullptr
- ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local)
- : nullptr;
- const DeviceStream* nonLocalStream =
- fr->nbv->gpu_nbv != nullptr
- ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
- : nullptr;
GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
? GpuApiCallBehavior::Async
: GpuApiCallBehavior::Sync;
- GMX_RELEASE_ASSERT(
- deviceContext != nullptr,
- "Device context can not be nullptr when building GPU propagator data object.");
+ GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
+ "GPU device stream manager should be initialized to use GPU.");
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
- pmeStream, localStream, nonLocalStream, *deviceContext, transferKind,
- pme_gpu_get_block_size(fr->pmedata), wcycle);
+ *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle);
fr->stateGpu = stateGpu.get();
}
/* do PME only */
walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode,
- deviceContext.get());
+ deviceStreamManager.get());
}
wallcycle_stop(wcycle, ewcRUN);
// clean up cycle counter
wallcycle_destroy(wcycle);
+ deviceStreamManager.reset(nullptr);
// Free PME data
if (pmedata)
{
}
free_gpu(deviceInfo);
- deviceContext.reset(nullptr);
sfree(fcd);
if (doMembed)
}
#endif
return rc;
-}
+}
Mdrunner::~Mdrunner()
{
namespace gmx
{
+class DeviceStreamManager;
class GpuBonded;
class ForceProviders;
class StatePropagatorDataGpu;
// TODO: This is not supposed to be here. StatePropagatorDataGpu should be a part of
// general StatePropagatorData object that is passed around
gmx::StatePropagatorDataGpu* stateGpu = nullptr;
+ // TODO: Should not be here. This is here only to pass the pointer around.
+ gmx::DeviceStreamManager* deviceStreamManager = nullptr;
//! GPU device context
DeviceContext* deviceContext = nullptr;
namespace gmx
{
+class DeviceStreamManager;
class StatePropagatorDataGpu
{
* ops are offloaded. This feature is currently not available in OpenCL and
* hence these streams are not set in these builds.
*
- * \note In CUDA, the update stream is created in the constructor as a temporary
- * solution, in place until the stream manager is introduced.
- * Note that this makes it impossible to construct this object in CUDA
- * builds executing on a host without any CUDA-capable device available.
- *
- * \note In CUDA, \p deviceContext is unused, hence always nullptr;
- * all stream arguments can also be nullptr in runs where the
- * respective streams are not required.
- * In OpenCL, \p deviceContext needs to be a valid device context.
- * In OpenCL runs StatePropagatorDataGpu is currently only used
- * with PME offload, and only on ranks with PME duty. Hence, the
- * \p pmeStream argument needs to be a valid OpenCL queue object
- * which must have been created in \p deviceContext.
- *
- * \param[in] pmeStream Device PME stream, nullptr allowed.
- * \param[in] localStream Device NBNXM local stream, nullptr allowed.
- * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
- * \param[in] deviceContext Device context, nullptr allowed.
- * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
+ * \param[in] deviceStreamManager Object that owns the DeviceContext and DeviceStreams.
+ * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
* \param[in] allocationBlockSizeDivisor Deterines padding size for coordinates buffer.
- * \param[in] wcycle Wall cycle counter data.
+ * \param[in] wcycle Wall cycle counter data.
*/
- StatePropagatorDataGpu(const DeviceStream* pmeStream,
- const DeviceStream* localStream,
- const DeviceStream* nonLocalStream,
- const DeviceContext& deviceContext,
- GpuApiCallBehavior transferKind,
- int allocationBlockSizeDivisor,
- gmx_wallcycle* wcycle);
+ StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
+ GpuApiCallBehavior transferKind,
+ int allocationBlockSizeDivisor,
+ gmx_wallcycle* wcycle);
/*! \brief Constructor to use in PME-only rank and in tests.
*
{
};
-StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream */,
- const DeviceStream* /* localStream */,
- const DeviceStream* /* nonLocalStream */,
- const DeviceContext& /* deviceContext */,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStreamManager& /* deviceStreamManager */,
GpuApiCallBehavior /* transferKind */,
int /* allocationBlockSizeDivisor */,
gmx_wallcycle* /* wcycle */) :
* ops are offloaded. This feature is currently not available in OpenCL and
* hence these streams are not set in these builds.
*
- * \note In CUDA, the update stream is created in the constructor as a temporary
- * solution, in place until the stream manager is introduced.
- * Note that this makes it impossible to construct this object in CUDA
- * builds executing on a host without any CUDA-capable device available.
- *
- * \note In CUDA, \p deviceContext is unused, hence always nullptr;
- * all stream arguments can also be nullptr in runs where the
- * respective streams are not required.
- * In OpenCL, \p deviceContext needs to be a valid device context.
- * In OpenCL runs StatePropagatorDataGpu is currently only used
- * with PME offload, and only on ranks with PME duty. Hence, the
- * \p pmeStream argument needs to be a valid OpenCL queue object
- * which must have been created in \p deviceContext.
- *
- * \param[in] pmeStream Device PME stream, nullptr allowed.
- * \param[in] localStream Device NBNXM local stream, nullptr allowed.
- * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
- * \param[in] deviceContext Device context, nullptr allowed.
- * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
+ * \param[in] deviceStreamManager Object that owns the DeviceContext and DeviceStreams.
+ * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
* \param[in] allocationBlockSizeDivisor Determines the padding size for coordinates buffer.
- * \param[in] wcycle Wall cycle counter data.
+ * \param[in] wcycle Wall cycle counter data.
*/
- Impl(const DeviceStream* pmeStream,
- const DeviceStream* localStream,
- const DeviceStream* nonLocalStream,
- const DeviceContext& deviceContext,
- GpuApiCallBehavior transferKind,
- int allocationBlockSizeDivisor,
- gmx_wallcycle* wcycle);
+ Impl(const DeviceStreamManager& deviceStreamManager,
+ GpuApiCallBehavior transferKind,
+ int allocationBlockSizeDivisor,
+ gmx_wallcycle* wcycle);
/*! \brief Constructor to use in PME-only rank and in tests.
*
//! GPU Update-constreaints stream.
const DeviceStream* updateStream_;
- //! An owning pointer to the update stream, in case we manage its lifetime here. Temporary.
- DeviceStream updateStreamOwn_;
-
// Streams to use for coordinates H2D and D2H copies (one event for each atom locality)
EnumerationArray<AtomLocality, const DeviceStream*> xCopyStreams_ = { { nullptr } };
// Streams to use for velocities H2D and D2H copies (one event for each atom locality)
#if GMX_GPU != GMX_GPU_NONE
-# if GMX_GPU == GMX_GPU_CUDA
-# include "gromacs/gpu_utils/cudautils.cuh"
-# endif
+# include "gromacs/gpu_utils/device_stream_manager.h"
# include "gromacs/gpu_utils/devicebuffer.h"
# include "gromacs/gpu_utils/gputraits.h"
-# if GMX_GPU == GMX_GPU_OPENCL
-# include "gromacs/gpu_utils/oclutils.h"
-# endif
# include "gromacs/math/vectypes.h"
# include "gromacs/mdtypes/state_propagator_data_gpu.h"
# include "gromacs/timing/wallcycle.h"
namespace gmx
{
-StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream,
- const DeviceStream* localStream,
- const DeviceStream* nonLocalStream,
- const DeviceContext& deviceContext,
- GpuApiCallBehavior transferKind,
- int allocationBlockSizeDivisor,
- gmx_wallcycle* wcycle) :
- deviceContext_(deviceContext),
+StatePropagatorDataGpu::Impl::Impl(const DeviceStreamManager& deviceStreamManager,
+ GpuApiCallBehavior transferKind,
+ int allocationBlockSizeDivisor,
+ gmx_wallcycle* wcycle) :
+ deviceContext_(deviceStreamManager.context()),
transferKind_(transferKind),
allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
wcycle_(wcycle)
{
- static_assert(GMX_GPU != GMX_GPU_NONE,
- "This object should only be constructed on the GPU code-paths.");
+ static_assert(
+ GMX_GPU != GMX_GPU_NONE,
+ "GPU state propagator data object should only be constructed on the GPU code-paths.");
- // TODO: Refactor when the StreamManager is introduced.
+ // We need to keep local copies for re-initialization.
+ pmeStream_ = &deviceStreamManager.stream(DeviceStreamType::Pme);
+ localStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal);
+ nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal);
+ // PME stream is used in OpenCL for H2D coordinate transfer
if (GMX_GPU == GMX_GPU_OPENCL)
{
- GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");
-
- // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context.
- pmeStream_ = pmeStream;
- updateStream_ = pmeStream;
- GMX_UNUSED_VALUE(localStream);
- GMX_UNUSED_VALUE(nonLocalStream);
+ updateStream_ = &deviceStreamManager.stream(DeviceStreamType::Pme);
}
-
- if (GMX_GPU == GMX_GPU_CUDA)
+ else
{
- if (pmeStream != nullptr)
- {
- pmeStream_ = pmeStream;
- }
- if (localStream != nullptr)
- {
- localStream_ = localStream;
- }
- if (nonLocalStream != nullptr)
- {
- nonLocalStream_ = nonLocalStream;
- }
-
- // TODO: The update stream should be created only when it is needed.
-# if (GMX_GPU == GMX_GPU_CUDA)
- // In CUDA we only need priority to create stream.
- // (note that this will be moved from here in the follow-up patch)
- updateStreamOwn_.init(deviceContext, DeviceStreamPriority::Normal, false);
- updateStream_ = &updateStreamOwn_;
-# endif
+ updateStream_ = &deviceStreamManager.stream(DeviceStreamType::UpdateAndConstraints);
}
// Map the atom locality to the stream that will be used for coordinates,
allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
wcycle_(wcycle)
{
- static_assert(GMX_GPU != GMX_GPU_NONE,
- "This object should only be constructed on the GPU code-paths.");
+ static_assert(
+ GMX_GPU != GMX_GPU_NONE,
+ "GPU state propagator data object should only be constructed on the GPU code-paths.");
- GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set.");
+ GMX_ASSERT(pmeStream->isValid(), "GPU PME stream should be valid.");
pmeStream_ = pmeStream;
localStream_ = pmeStream; // For clearing the force buffer
nonLocalStream_ = nullptr;
GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
- GMX_ASSERT(deviceStream.stream() != nullptr,
- "No stream is valid for copying with given atom locality.");
+ GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
- GMX_ASSERT(deviceStream.stream() != nullptr,
- "No stream is valid for copying with given atom locality.");
+ GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
}
-StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* pmeStream,
- const DeviceStream* localStream,
- const DeviceStream* nonLocalStream,
- const DeviceContext& deviceContext,
- GpuApiCallBehavior transferKind,
- int allocationBlockSizeDivisor,
- gmx_wallcycle* wcycle) :
- impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
+ GpuApiCallBehavior transferKind,
+ int allocationBlockSizeDivisor,
+ gmx_wallcycle* wcycle) :
+ impl_(new Impl(deviceStreamManager, transferKind, allocationBlockSizeDivisor, wcycle))
{
}
*/
void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
- const DeviceStream& deviceStream = nb->deviceStreams[interactionLocality];
+ const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
/* When we get here all misc operations issued in the local stream as well as
the local xq H2D are done,
cu_atomdata_t* adat = nb->atdat;
cu_plist_t* plist = nb->plist[iloc];
cu_timers_t* t = nb->timers;
- const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
cu_nbparam_t* nbp = nb->nbparam;
cu_plist_t* plist = nb->plist[iloc];
cu_timers_t* t = nb->timers;
- const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
cu_nbparam_t* nbp = nb->nbparam;
cu_plist_t* plist = nb->plist[iloc];
cu_timers_t* t = nb->timers;
- const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
cu_atomdata_t* adat = nb->atdat;
cu_timers_t* t = nb->timers;
bool bDoTime = nb->bDoTime;
- const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
/* don't launch non-local copy-back if there was no non-local work to do */
if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
const int numAtomsPerCell = grid.numAtomsPerCell();
Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality);
- const DeviceStream& deviceStream = nb->deviceStreams[interactionLoc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[interactionLoc];
int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
// avoid empty kernel launch, skip to inserting stream dependency
GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer");
const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
- const DeviceStream& deviceStream = nb->deviceStreams[iLocality];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iLocality];
cu_atomdata_t* adat = nb->atdat;
size_t gmx_used_in_debug numDependency = static_cast<size_t>((useGpuFPmeReduction == true))
// TODO Remove this comment when the above order issue is resolved
#include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
nbnxn_cuda_clear_e_fshift(nb);
}
-NbnxmGpu* gpu_init(const DeviceContext& deviceContext,
- const interaction_const_t* ic,
- const PairlistParams& listParams,
- const nbnxn_atomdata_t* nbat,
- bool bLocalAndNonlocal)
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+ const interaction_const_t* ic,
+ const PairlistParams& listParams,
+ const nbnxn_atomdata_t* nbat,
+ bool bLocalAndNonlocal)
{
cudaError_t stat;
auto nb = new NbnxmGpu();
- nb->deviceContext_ = &deviceContext;
+ nb->deviceContext_ = &deviceStreamManager.context();
snew(nb->atdat, 1);
snew(nb->nbparam, 1);
snew(nb->plist[InteractionLocality::Local], 1);
init_plist(nb->plist[InteractionLocality::Local]);
/* local/non-local GPU streams */
- nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
- DeviceStreamPriority::Normal, nb->bDoTime);
+ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+ "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+ nb->deviceStreams[InteractionLocality::Local] =
+ &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
if (nb->bUseTwoStreams)
{
init_plist(nb->plist[InteractionLocality::NonLocal]);
* priorities, because we are querying the priority range which in this
* case will be a single value.
*/
- nb->deviceStreams[InteractionLocality::NonLocal].init(
- *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
+ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+ "Non-local non-bonded stream should be initialized to use GPU for "
+ "non-bonded with domain decomposition.");
+ nb->deviceStreams[InteractionLocality::NonLocal] =
+ &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
+ ;
}
/* init events for synchronization (timing disabled for performance reasons!) */
{
char sbuf[STRLEN];
bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
- const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
cu_plist_t* d_plist = nb->plist[iloc];
if (d_plist->na_c < 0)
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
cu_atomdata_t* adat = nb->atdat;
- cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream();
+ cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local]->stream();
/* only if we have a dynamic box */
if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
{
cudaError_t stat;
cu_atomdata_t* adat = nb->atdat;
- cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream();
+ cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local]->stream();
stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
{
cudaError_t stat;
cu_atomdata_t* adat = nb->atdat;
- cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream();
+ cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local]->stream();
stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
bool bDoTime = nb->bDoTime;
cu_timers_t* timers = nb->timers;
cu_atomdata_t* d_atdat = nb->atdat;
- const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
+ const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
natoms = nbat->numAtoms();
realloced = false;
return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
}
-const DeviceStream* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
-{
- assert(nb);
-
- return &nb->deviceStreams[iloc];
-}
-
void* gpu_get_xq(NbnxmGpu* nb)
{
assert(nb);
/* TODO Remove explicit pinning from host arrays from here and manage in a more natural way*/
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
- const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local];
+ const DeviceStream& deviceStream = *gpu_nbv->deviceStreams[InteractionLocality::Local];
bool bDoTime = gpu_nbv->bDoTime;
const int maxNumColumns = gridSet.numColumnsMax();
GpuEventSynchronizer* const localReductionDone)
{
- const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local];
+ const DeviceStream& deviceStream = *gpu_nbv->deviceStreams[InteractionLocality::Local];
GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
gpu_nbv->localFReductionDone = localReductionDone;
/*! \brief staging area where fshift/energies get downloaded */
nb_staging_t nbst;
/*! \brief local and non-local GPU streams */
- gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
/*! \brief Events used for synchronization */
/*! \{ */
// GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
- if (!haveStreamTasksCompleted(nb->deviceStreams[iLocality]))
+ if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality]))
{
wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
}
else if (haveResultToWaitFor)
{
- nb->deviceStreams[iLocality].synchronize();
+ nb->deviceStreams[iLocality]->synchronize();
}
// TODO: this needs to be moved later because conditional wait could break timing
#include "gromacs/gpu_utils/gpu_macros.h"
#include "gromacs/mdtypes/locality.h"
-class DeviceContext;
-class DeviceStream;
-
struct NbnxmGpu;
struct gmx_gpu_info_t;
struct DeviceInformation;
struct PairlistParams;
struct interaction_const_t;
+class DeviceStream;
+
+namespace gmx
+{
+class DeviceStreamManager;
+}
+
namespace Nbnxm
{
/** Initializes the data structures related to GPU nonbonded calculations. */
GPU_FUNC_QUALIFIER
-NbnxmGpu* gpu_init(const DeviceContext gmx_unused& deviceContext,
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager gmx_unused& deviceStreamManager,
const interaction_const_t gmx_unused* ic,
const PairlistParams gmx_unused& listParams,
const nbnxn_atomdata_t gmx_unused* nbat,
#include "gromacs/utility/enumerationhelpers.h"
#include "gromacs/utility/real.h"
-class DeviceContext;
struct DeviceInformation;
struct gmx_domdec_zones_t;
struct gmx_enerdata_t;
namespace gmx
{
+class DeviceStreamManager;
class ForceWithShiftForces;
class GpuBonded;
template<typename>
{
/*! \brief Creates an Nbnxm object */
-std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger& mdlog,
- const t_inputrec* ir,
- const t_forcerec* fr,
- const t_commrec* cr,
- const gmx_hw_info_t& hardwareInfo,
- const DeviceInformation* deviceInfo,
- const DeviceContext* deviceContext,
- const gmx_mtop_t* mtop,
- matrix box,
- gmx_wallcycle* wcycle);
+std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger& mdlog,
+ const t_inputrec* ir,
+ const t_forcerec* fr,
+ const t_commrec* cr,
+ const gmx_hw_info_t& hardwareInfo,
+ bool useGpuForNonbonded,
+ const gmx::DeviceStreamManager* deviceStreamManager,
+ const gmx_mtop_t* mtop,
+ matrix box,
+ gmx_wallcycle* wcycle);
} // namespace Nbnxm
return minimumIlistCount;
}
-std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger& mdlog,
- const t_inputrec* ir,
- const t_forcerec* fr,
- const t_commrec* cr,
- const gmx_hw_info_t& hardwareInfo,
- const DeviceInformation* deviceInfo,
- const DeviceContext* deviceContext,
- const gmx_mtop_t* mtop,
- matrix box,
- gmx_wallcycle* wcycle)
+std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger& mdlog,
+ const t_inputrec* ir,
+ const t_forcerec* fr,
+ const t_commrec* cr,
+ const gmx_hw_info_t& hardwareInfo,
+ const bool useGpuForNonbonded,
+ const gmx::DeviceStreamManager* deviceStreamManager,
+ const gmx_mtop_t* mtop,
+ matrix box,
+ gmx_wallcycle* wcycle)
{
const bool emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
- const bool useGpu = deviceInfo != nullptr;
- GMX_RELEASE_ASSERT(!(emulateGpu && useGpu),
+ GMX_RELEASE_ASSERT(!(emulateGpu && useGpuForNonbonded),
"When GPU emulation is active, there cannot be a GPU assignment");
NonbondedResource nonbondedResource;
- if (useGpu)
+ if (useGpuForNonbonded)
{
nonbondedResource = NonbondedResource::Gpu;
}
enbnxninitcombrule = enbnxninitcombruleNONE;
}
- auto pinPolicy = (useGpu ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);
+ auto pinPolicy = (useGpuForNonbonded ? gmx::PinningPolicy::PinnedIfSupported
+ : gmx::PinningPolicy::CannotBePinned);
auto nbat = std::make_unique<nbnxn_atomdata_t>(pinPolicy);
}
nbnxn_atomdata_init(mdlog, nbat.get(), kernelSetup.kernelType, enbnxninitcombrule, fr->ntype,
fr->nbfp, mimimumNumEnergyGroupNonbonded,
- (useGpu || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded));
+ (useGpuForNonbonded || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded));
NbnxmGpu* gpu_nbv = nullptr;
int minimumIlistCountForGpuBalancing = 0;
- if (useGpu)
+ if (useGpuForNonbonded)
{
- GMX_RELEASE_ASSERT(
- deviceContext != nullptr,
- "Device context can not be nullptr when to use GPU for non-bonded forces.");
/* init the NxN GPU data; the last argument tells whether we'll have
* both local and non-local NB calculation on GPU */
- gpu_nbv = gpu_init(*deviceContext, fr->ic, pairlistParams, nbat.get(), haveMultipleDomains);
+ GMX_RELEASE_ASSERT(
+ (deviceStreamManager != nullptr),
+ "Device stream manager should be initialized in order to use GPU for non-bonded.");
+ gpu_nbv = gpu_init(*deviceStreamManager, fr->ic, pairlistParams, nbat.get(), haveMultipleDomains);
minimumIlistCountForGpuBalancing = getMinimumIlistCountForGpuBalancing(gpu_nbv);
}
cl_atomdata_t* adat = nb->atdat;
cl_plist_t* plist = nb->plist[iloc];
cl_timers_t* t = nb->timers;
- const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
cl_nbparam_t* nbp = nb->nbparam;
cl_plist_t* plist = nb->plist[iloc];
cl_timers_t* t = nb->timers;
- const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
cl_nbparam_t* nbp = nb->nbparam;
cl_plist_t* plist = nb->plist[iloc];
cl_timers_t* t = nb->timers;
- const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
if (plist->haveFreshList)
cl_atomdata_t* adat = nb->atdat;
cl_timers_t* t = nb->timers;
bool bDoTime = nb->bDoTime;
- const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
/* don't launch non-local copy-back if there was no non-local work to do */
if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
#include <cmath>
+#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/gpu_hw_info.h"
cl_int cl_error;
cl_atomdata_t* adat = nb->atdat;
- cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream();
+ cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local]->stream();
size_t local_work_size[3] = { 1, 1, 1 };
size_t global_work_size[3] = { 1, 1, 1 };
//! This function is documented in the header file
-NbnxmGpu* gpu_init(const DeviceContext& deviceContext,
- const interaction_const_t* ic,
- const PairlistParams& listParams,
- const nbnxn_atomdata_t* nbat,
- const bool bLocalAndNonlocal)
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+ const interaction_const_t* ic,
+ const PairlistParams& listParams,
+ const nbnxn_atomdata_t* nbat,
+ const bool bLocalAndNonlocal)
{
GMX_ASSERT(ic, "Need a valid interaction constants object");
auto nb = new NbnxmGpu();
- nb->deviceContext_ = &deviceContext;
+ nb->deviceContext_ = &deviceStreamManager.context();
snew(nb->atdat, 1);
snew(nb->nbparam, 1);
snew(nb->plist[InteractionLocality::Local], 1);
nb->timers = new cl_timers_t();
snew(nb->timings, 1);
+ /* set device info, just point it to the right GPU among the detected ones */
nb->dev_rundata = new gmx_device_runtime_data_t();
/* init nbst */
nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
/* local/non-local GPU streams */
- nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
- DeviceStreamPriority::Normal, nb->bDoTime);
+ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+ "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+ nb->deviceStreams[InteractionLocality::Local] =
+ &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
if (nb->bUseTwoStreams)
{
init_plist(nb->plist[InteractionLocality::NonLocal]);
- nb->deviceStreams[InteractionLocality::NonLocal].init(
- *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
+ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+ "Non-local non-bonded stream should be initialized to use GPU for "
+ "non-bonded with domain decomposition.");
+ nb->deviceStreams[InteractionLocality::NonLocal] =
+ &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
}
if (nb->bDoTime)
cl_int gmx_used_in_debug cl_error;
cl_atomdata_t* atomData = nb->atdat;
- cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream();
+ cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local]->stream();
cl_float value = 0.0F;
cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float), 0,
/* kick off buffer clearing kernel to ensure concurrency with constraints/update */
cl_int gmx_unused cl_error;
- cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local].stream());
+ cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local]->stream());
GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
}
// because getLastRangeTime() gets skipped with empty lists later
// which leads to the counter not being reset.
bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
- const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
cl_plist_t* d_plist = nb->plist[iloc];
if (d_plist->na_c < 0)
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
cl_atomdata_t* adat = nb->atdat;
- cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream();
+ cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local]->stream();
/* only if we have a dynamic box */
if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
bool bDoTime = nb->bDoTime;
cl_timers_t* timers = nb->timers;
cl_atomdata_t* d_atdat = nb->atdat;
- const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
+ const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
natoms = nbat->numAtoms();
realloced = false;
nb_staging_t nbst;
//! local and non-local GPU queues
- gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
/*! \brief Events used for synchronization */
/*! \{ */