* - default high priority with CUDA
* - no priorities implemented yet with OpenCL; see #2532
*/
- pmeGpu->archSpecific->pmeStream_.init(*pmeGpu->deviceInfo, pmeGpu->archSpecific->deviceContext_,
+ pmeGpu->archSpecific->pmeStream_.init(pmeGpu->archSpecific->deviceContext_,
DeviceStreamPriority::High, pmeGpu->archSpecific->useTiming);
}
#include "pme_gpu_program_impl.h"
-PmeGpuProgram::PmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext) :
- impl_(std::make_unique<PmeGpuProgramImpl>(deviceInfo, deviceContext))
+PmeGpuProgram::PmeGpuProgram(const DeviceContext& deviceContext) :
+ impl_(std::make_unique<PmeGpuProgramImpl>(deviceContext))
{
}
return impl_->warpSize();
}
-PmeGpuProgramStorage buildPmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext)
+PmeGpuProgramStorage buildPmeGpuProgram(const DeviceContext& deviceContext)
{
- return std::make_unique<PmeGpuProgram>(deviceInfo, deviceContext);
+ return std::make_unique<PmeGpuProgram>(deviceContext);
}
class PmeGpuProgram
{
public:
- //! Constructor
- explicit PmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext);
+ /*! \brief Construct a PME GPU program.
+ *
+ * \param[in] deviceContext GPU context.
+ */
+ explicit PmeGpuProgram(const DeviceContext& deviceContext);
+ //! Destructor
~PmeGpuProgram();
//! Return the warp size for which the kernels were compiled
/*! \brief
* Factory function used to build persistent PME GPU program for the device at once.
*/
-PmeGpuProgramStorage buildPmeGpuProgram(const DeviceInformation& /*deviceInfo*/,
- const DeviceContext& /* deviceContext */);
+PmeGpuProgramStorage buildPmeGpuProgram(const DeviceContext& /* deviceContext */);
#endif
#include "pme_gpu_program_impl.h"
-PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& /* deviceInfo */,
- const DeviceContext& deviceContext) :
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
deviceContext_(deviceContext),
warpSize_(0),
spreadWorkGroupSize(0),
extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
// clang-format on
-PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& /* deviceInfo */,
- const DeviceContext& deviceContext) :
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
deviceContext_(deviceContext)
{
// kernel parameters
PmeGpuProgramImpl() = delete;
//! Constructor for the given device
- explicit PmeGpuProgramImpl(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext);
+ explicit PmeGpuProgramImpl(const DeviceContext& deviceContext);
~PmeGpuProgramImpl();
GMX_DISALLOW_COPY_AND_ASSIGN(PmeGpuProgramImpl);
#include "pme_gpu_types_host.h"
#include "pme_grid.h"
-PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext) :
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
deviceContext_(deviceContext)
{
+ const DeviceInformation& deviceInfo = deviceContext.deviceInfo();
// kernel parameters
warpSize_ = gmx::ocl::getDeviceWarpSize(deviceContext_.context(), deviceInfo.oclDeviceId);
// TODO: for Intel ideally we'd want to set these based on the compiler warp size
// In CUDA we only need priority to create stream.
// (note that this will be moved from here in the follow-up patch)
- pmePpCommStream_.init(DeviceInformation(), DeviceContext(), DeviceStreamPriority::Normal, false);
+ pmePpCommStream_.init(deviceContext, DeviceStreamPriority::Normal, false);
}
PmePpCommGpu::Impl::~Impl() = default;
//! Device information pointer
const DeviceInformation* deviceInfo_;
//! Local copy of the device context pointer
- DeviceContext deviceContext_;
+ std::unique_ptr<DeviceContext> deviceContext_;
//! Persistent compiled GPU kernels for PME.
PmeGpuProgramStorage program_;
//! Returns a human-readable context description line
std::string getDescription() const { return description_; }
//! Getter for the DeviceContext
- const DeviceContext& deviceContext() const { return deviceContext_; }
+ const DeviceContext& deviceContext() const
+ {
+ GMX_RELEASE_ASSERT(deviceContext_ != nullptr,
+ "Trying to get device context before it was initialized or in builds "
+ "without GPU support.");
+ return *deviceContext_;
+ }
//! Returns the device info pointer
const DeviceInformation* getDeviceInfo() const { return deviceInfo_; }
//! Returns the persistent PME GPU kernels
{
GMX_RELEASE_ASSERT(codePath == CodePath::CPU,
"A GPU code path should provide DeviceInformation to the "
- "TestHerdwareContext constructor.");
+ "TestHardwareContext constructor.");
}
//! Constructs the context for GPU builds
TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation& deviceInfo) :
codePath_(codePath),
description_(description),
- deviceInfo_(&deviceInfo),
- deviceContext_(deviceInfo),
- program_(buildPmeGpuProgram(deviceInfo, deviceContext_))
+ deviceInfo_(&deviceInfo)
{
GMX_RELEASE_ASSERT(codePath == CodePath::GPU,
- "TestHerdwareContext tries to construct DeviceContext and PmeGpuProgram "
+ "TestHardwareContext tries to construct DeviceContext and PmeGpuProgram "
"in CPU build.");
+ deviceContext_ = std::make_unique<DeviceContext>(deviceInfo);
+ program_ = buildPmeGpuProgram(*deviceContext_);
}
~TestHardwareContext();
};
class DeviceContext
{
public:
- //! Default constructor.
- DeviceContext() {}
//! Constructor.
- DeviceContext(const DeviceInformation& /* deviceInfo */) {}
+ DeviceContext(const DeviceInformation& deviceInfo) : deviceInfo_(deviceInfo) {}
//! Destructor
~DeviceContext() = default;
+ //! Get the associated device information
+ const DeviceInformation& deviceInfo() const { return deviceInfo_; }
+
+private:
+ //! A reference to the device information used upon context creation
+ const DeviceInformation& deviceInfo_;
+
GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext);
};
#endif // GMX_GPU != GMX_GPU_OPENCL
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4
/**@}*/
-DeviceContext::DeviceContext(const DeviceInformation& deviceInfo)
+DeviceContext::DeviceContext(const DeviceInformation& deviceInfo) : deviceInfo_(deviceInfo)
{
cl_platform_id platformId = deviceInfo.oclPlatformId;
cl_device_id deviceId = deviceInfo.oclDeviceId;
class DeviceContext
{
public:
- //! Default constructor.
- DeviceContext() {}
/*! \brief Constructor that creates the \c cl_context
*
* \param[in] deviceInfo Platform-specific device information.
//! Destructor
~DeviceContext();
+ //! Get the associated device information
+ const DeviceInformation& deviceInfo() const { return deviceInfo_; }
//! Getter
cl_context context() const;
- GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext);
-
private:
+ //! A reference to the device information used upon context creation
+ const DeviceInformation& deviceInfo_;
//! OpenCL context object
cl_context context_ = nullptr;
+
+ GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext);
};
#endif // GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H
DeviceStream::DeviceStream() = default;
-void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
- const DeviceContext& /* deviceContext */,
+void DeviceStream::init(const DeviceContext& /* deviceContext */,
DeviceStreamPriority /* priority */,
const bool /* useTiming */)
{
stream_ = nullptr;
}
-void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
- const DeviceContext& /* deviceContext */,
+void DeviceStream::init(const DeviceContext& /* deviceContext */,
DeviceStreamPriority priority,
const bool /* useTiming */)
{
/*! \brief Initialize
*
- * \param[in] deviceInfo Platform-specific device information (only used in OpenCL).
* \param[in] deviceContext Device context (not used in CUDA).
* \param[in] priority Stream priority: high or normal.
* \param[in] useTiming If the timing should be enabled (not used in CUDA).
*/
- void init(const DeviceInformation& deviceInfo,
- const DeviceContext& deviceContext,
- DeviceStreamPriority priority,
- const bool useTiming);
+ void init(const DeviceContext& deviceContext, DeviceStreamPriority priority, const bool useTiming);
/*! \brief Construct and init.
*
- * \param[in] deviceInfo Platform-specific device information (only used in OpenCL).
* \param[in] deviceContext Device context (only used in OpenCL).
* \param[in] priority Stream priority: high or normal (only used in CUDA).
* \param[in] useTiming If the timing should be enabled (only used in OpenCL).
*/
- DeviceStream(const DeviceInformation& deviceInfo,
- const DeviceContext& deviceContext,
- DeviceStreamPriority priority,
- const bool useTiming)
+ DeviceStream(const DeviceContext& deviceContext, DeviceStreamPriority priority, const bool useTiming)
{
- init(deviceInfo, deviceContext, priority, useTiming);
+ init(deviceContext, priority, useTiming);
}
//! Synchronize the stream
stream_ = nullptr;
}
-void DeviceStream::init(const DeviceInformation& deviceInfo,
- const DeviceContext& deviceContext,
- DeviceStreamPriority /* priority */,
- const bool useTiming)
+void DeviceStream::init(const DeviceContext& deviceContext, DeviceStreamPriority /* priority */, const bool useTiming)
{
+ const DeviceInformation& deviceInfo = deviceContext.deviceInfo();
cl_command_queue_properties queueProperties = useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
cl_device_id deviceId = deviceInfo.oclDeviceId;
cl_int clError;
const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
DeviceInformation deviceInfo;
const DeviceContext deviceContext(deviceInfo);
- const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+ const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
}
catch (gmx::GromacsException& ex)
*/
struct gmx_device_runtime_data_t
{
- //! Constructor
- gmx_device_runtime_data_t(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {}
-
- //! OpenCL context
- const DeviceContext& deviceContext_;
//! OpenCL program
cl_program program;
};
{
DeviceInformation deviceInfo;
const DeviceContext deviceContext(deviceInfo);
- const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+ const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
const int numElements = h_rVecInput.size();
{
DeviceInformation deviceInfo;
const DeviceContext deviceContext(deviceInfo);
- const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+ const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
auto lincsGpu = std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder,
deviceContext, deviceStream);
{
DeviceInformation deviceInfo;
const DeviceContext deviceContext(deviceInfo);
- const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+ const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
int numAtoms = testData->numAtoms_;
DeviceInformation deviceInfo;
const DeviceContext deviceContext(deviceInfo);
- const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+ const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, deviceContext, deviceStream);
PmeGpuProgramStorage pmeGpuProgram;
if (thisRankHasPmeGpuTask)
{
- GMX_RELEASE_ASSERT(
- deviceInfo != nullptr,
- "Device information can not be nullptr when building PME GPU program object.");
GMX_RELEASE_ASSERT(
deviceContext != nullptr,
"Device context can not be nullptr when building PME GPU program object.");
- pmeGpuProgram = buildPmeGpuProgram(*deviceInfo, *deviceContext);
+ pmeGpuProgram = buildPmeGpuProgram(*deviceContext);
}
/* Initiate PME if necessary,
# if (GMX_GPU == GMX_GPU_CUDA)
// In CUDA we only need priority to create stream.
// (note that this will be moved from here in the follow-up patch)
- updateStreamOwn_.init(DeviceInformation(), DeviceContext(), DeviceStreamPriority::Normal, false);
+ updateStreamOwn_.init(deviceContext, DeviceStreamPriority::Normal, false);
updateStream_ = &updateStreamOwn_;
# endif
}
* - The 1D block-grid contains as many blocks as super-clusters.
*/
int num_threads_z = 1;
- if (nb->deviceInfo->prop.major == 3 && nb->deviceInfo->prop.minor == 7)
+ if (nb->deviceContext_->deviceInfo().prop.major == 3 && nb->deviceContext_->deviceInfo().prop.minor == 7)
{
num_threads_z = 2;
}
- int nblock = calc_nb_kernel_nblock(plist->nsci, nb->deviceInfo);
+ int nblock = calc_nb_kernel_nblock(plist->nsci, &nb->deviceContext_->deviceInfo());
KernelLaunchConfig config;
- config.blockSize[0] = c_clSize;
- config.blockSize[1] = c_clSize;
- config.blockSize[2] = num_threads_z;
- config.gridSize[0] = nblock;
- config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->deviceInfo, nbp);
+ config.blockSize[0] = c_clSize;
+ config.blockSize[1] = c_clSize;
+ config.blockSize[2] = num_threads_z;
+ config.gridSize[0] = nblock;
+ config.sharedMemorySize =
+ calc_shmem_required_nonbonded(num_threads_z, &nb->deviceContext_->deviceInfo(), nbp);
if (debug)
{
}
auto* timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
- const auto kernel = select_nbnxn_kernel(
- nbp->eeltype, nbp->vdwtype, stepWork.computeEnergy,
- (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune), nb->deviceInfo);
+ const auto kernel =
+ select_nbnxn_kernel(nbp->eeltype, nbp->vdwtype, stepWork.computeEnergy,
+ (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune),
+ &nb->deviceContext_->deviceInfo());
const auto kernelArgs =
prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &stepWork.computeVirial);
launchGpuKernel(kernel, config, deviceStream, timingEvent, "k_calc_nb", kernelArgs);
* and j-cluster concurrency, in x, y, and z, respectively.
* - The 1D block-grid contains as many blocks as super-clusters.
*/
- int num_threads_z = c_cudaPruneKernelJ4Concurrency;
- int nblock = calc_nb_kernel_nblock(numSciInPart, nb->deviceInfo);
+ int num_threads_z = c_cudaPruneKernelJ4Concurrency;
+ int nblock = calc_nb_kernel_nblock(numSciInPart, &nb->deviceContext_->deviceInfo());
KernelLaunchConfig config;
config.blockSize[0] = c_clSize;
config.blockSize[1] = c_clSize;
nbnxn_cuda_clear_e_fshift(nb);
}
-NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
- const DeviceContext& /* deviceContext */,
+NbnxmGpu* gpu_init(const DeviceContext& deviceContext,
const interaction_const_t* ic,
const PairlistParams& listParams,
const nbnxn_atomdata_t* nbat,
{
cudaError_t stat;
- auto nb = new NbnxmGpu;
+ auto nb = new NbnxmGpu();
+ nb->deviceContext_ = &deviceContext;
snew(nb->atdat, 1);
snew(nb->nbparam, 1);
snew(nb->plist[InteractionLocality::Local], 1);
init_plist(nb->plist[InteractionLocality::Local]);
- /* set device info, just point it to the right GPU among the detected ones */
- nb->deviceInfo = deviceInfo;
-
/* local/non-local GPU streams */
- nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceInfo, DeviceContext(),
+ nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
DeviceStreamPriority::Normal, nb->bDoTime);
if (nb->bUseTwoStreams)
{
* case will be a single value.
*/
nb->deviceStreams[InteractionLocality::NonLocal].init(
- *nb->deviceInfo, DeviceContext(), DeviceStreamPriority::High, nb->bDoTime);
+ *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
}
/* init events for synchronization (timing disabled for performance reasons!) */
iTimers.didPairlistH2D = true;
}
+ const DeviceContext& deviceContext = *nb->deviceContext_;
+
reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
- DeviceContext());
+ deviceContext);
copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
- DeviceContext());
+ deviceContext);
copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
- &d_plist->nimask, &d_plist->imask_nalloc, DeviceContext());
+ &d_plist->nimask, &d_plist->imask_nalloc, deviceContext);
reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
- &d_plist->excl_nalloc, DeviceContext());
+ &d_plist->excl_nalloc, deviceContext);
copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
int gpu_min_ci_balanced(NbnxmGpu* nb)
{
- return nb != nullptr ? gpu_min_ci_balanced_factor * nb->deviceInfo->prop.multiProcessorCount : 0;
+ return nb != nullptr ? gpu_min_ci_balanced_factor * nb->deviceContext_->deviceInfo().prop.multiProcessorCount
+ : 0;
}
gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
const int maxNumColumns = gridSet.numColumnsMax();
reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(),
- &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext());
+ &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, *gpu_nbv->deviceContext_);
reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns * gridSet.grids().size(),
- &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, DeviceContext());
+ &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, *gpu_nbv->deviceContext_);
for (unsigned int g = 0; g < gridSet.grids().size(); g++)
{
const int* cxy_ind = grid.cxy_ind().data();
reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize,
- &gpu_nbv->atomIndicesSize_alloc, DeviceContext());
+ &gpu_nbv->atomIndicesSize_alloc, *gpu_nbv->deviceContext_);
if (atomIndicesSize > 0)
{
if (natoms_total > 0)
{
reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc,
- DeviceContext());
+ *gpu_nbv->deviceContext_);
copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, deviceStream,
GpuApiCallBehavior::Async, nullptr);
}
*/
struct NbnxmGpu
{
- /*! \brief CUDA device information */
- const DeviceInformation* deviceInfo = nullptr;
+ /*! \brief GPU device context.
+ *
+ * \todo Make it constant reference, once NbnxmGpu is a proper class.
+ */
+ const DeviceContext* deviceContext_ = nullptr;
/*! \brief true if doing both local/non-local NB work on GPU */
bool bUseTwoStreams = false;
/*! \brief atom data */
/** Initializes the data structures related to GPU nonbonded calculations. */
GPU_FUNC_QUALIFIER
-NbnxmGpu* gpu_init(const DeviceInformation gmx_unused* deviceInfo,
- const DeviceContext gmx_unused& deviceContext,
+NbnxmGpu* gpu_init(const DeviceContext gmx_unused& deviceContext,
const interaction_const_t gmx_unused* ic,
const PairlistParams gmx_unused& listParams,
const nbnxn_atomdata_t gmx_unused* nbat,
"Device context can not be nullptr when to use GPU for non-bonded forces.");
/* init the NxN GPU data; the last argument tells whether we'll have
* both local and non-local NB calculation on GPU */
- gpu_nbv = gpu_init(deviceInfo, *deviceContext, fr->ic, pairlistParams, nbat.get(),
- haveMultipleDomains);
+ gpu_nbv = gpu_init(*deviceContext, fr->ic, pairlistParams, nbat.get(), haveMultipleDomains);
minimumIlistCountForGpuBalancing = getMinimumIlistCountForGpuBalancing(gpu_nbv);
}
config.blockSize[1] = c_clSize;
config.gridSize[0] = plist->nsci;
- validate_global_work_size(config, 3, nb->deviceInfo);
+ validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
if (debug)
{
config.blockSize[2] = num_threads_z;
config.gridSize[0] = numSciInPart;
- validate_global_work_size(config, 3, nb->deviceInfo);
+ validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
if (debug)
{
nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic);
GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
- init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, nb->dev_rundata->deviceContext_);
+ init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
}
/*! \brief Initializes the pair list data structure.
if (CL_SUCCESS != cl_error)
{
gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d", kernel_name,
- nb->deviceInfo->device_name, cl_error);
+ nb->deviceContext_->deviceInfo().device_name, cl_error);
}
return kernel;
//! This function is documented in the header file
-NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
- const DeviceContext& deviceContext,
+NbnxmGpu* gpu_init(const DeviceContext& deviceContext,
const interaction_const_t* ic,
const PairlistParams& listParams,
const nbnxn_atomdata_t* nbat,
{
GMX_ASSERT(ic, "Need a valid interaction constants object");
- auto nb = new NbnxmGpu;
+ auto nb = new NbnxmGpu();
+ nb->deviceContext_ = &deviceContext;
snew(nb->atdat, 1);
snew(nb->nbparam, 1);
snew(nb->plist[InteractionLocality::Local], 1);
nb->timers = new cl_timers_t();
snew(nb->timings, 1);
- /* set device info, just point it to the right GPU among the detected ones */
- nb->deviceInfo = deviceInfo;
- nb->dev_rundata = new gmx_device_runtime_data_t(deviceContext);
+ nb->dev_rundata = new gmx_device_runtime_data_t();
/* init nbst */
pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
/* local/non-local GPU streams */
- nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceInfo, nb->dev_rundata->deviceContext_,
+ nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
DeviceStreamPriority::Normal, nb->bDoTime);
if (nb->bUseTwoStreams)
init_plist(nb->plist[InteractionLocality::NonLocal]);
nb->deviceStreams[InteractionLocality::NonLocal].init(
- *nb->deviceInfo, nb->dev_rundata->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
+ *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
}
if (nb->bDoTime)
init_timings(nb->timings);
}
- nbnxn_ocl_init_const(nb->atdat, nb->nbparam, ic, listParams, nbat->params(),
- nb->dev_rundata->deviceContext_);
+ nbnxn_ocl_init_const(nb->atdat, nb->nbparam, ic, listParams, nbat->params(), *nb->deviceContext_);
/* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
* TODO: decide about NVIDIA
*/
nb->bPrefetchLjParam = (getenv("GMX_OCL_DISABLE_I_PREFETCH") == nullptr)
- && ((nb->deviceInfo->deviceVendor == DeviceVendor::Amd)
- || (nb->deviceInfo->deviceVendor == DeviceVendor::Intel)
+ && ((nb->deviceContext_->deviceInfo().deviceVendor == DeviceVendor::Amd)
+ || (nb->deviceContext_->deviceInfo().deviceVendor == DeviceVendor::Intel)
|| (getenv("GMX_OCL_ENABLE_I_PREFETCH") != nullptr));
/* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
}
// TODO most of this function is same in CUDA and OpenCL, move into the header
- const DeviceContext& deviceContext = nb->dev_rundata->deviceContext_;
+ const DeviceContext& deviceContext = *nb->deviceContext_;
reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
deviceContext);
freeDeviceBuffer(&d_atdat->atom_types);
}
- d_atdat->f = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
- CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ d_atdat->f = clCreateBuffer(nb->deviceContext_->context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
nalloc * DIM * sizeof(nbat->out[0].f[0]), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
- d_atdat->xq = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
- CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+ d_atdat->xq = clCreateBuffer(nb->deviceContext_->context(), CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
nalloc * sizeof(cl_float4), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
if (useLjCombRule(nb->nbparam->vdwtype))
{
- d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
+ d_atdat->lj_comb = clCreateBuffer(nb->deviceContext_->context(),
CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
nalloc * sizeof(cl_float2), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
}
else
{
- d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
+ d_atdat->atom_types = clCreateBuffer(nb->deviceContext_->context(),
CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
nalloc * sizeof(int), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
//! This function is documented in the header file
int gpu_min_ci_balanced(NbnxmGpu* nb)
{
- return nb != nullptr ? gpu_min_ci_balanced_factor * nb->deviceInfo->compute_units : 0;
+ return nb != nullptr ? gpu_min_ci_balanced_factor * nb->deviceContext_->deviceInfo().compute_units : 0;
}
//! This function is documented in the header file
the log output here should be written there */
program = gmx::ocl::compileProgram(
stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl", extraDefines,
- nb->dev_rundata->deviceContext_.context(), nb->deviceInfo->oclDeviceId,
- nb->deviceInfo->deviceVendor);
+ nb->deviceContext_->context(), nb->deviceContext_->deviceInfo().oclDeviceId,
+ nb->deviceContext_->deviceInfo().deviceVendor);
}
catch (gmx::GromacsException& e)
{
e.prependContext(gmx::formatString("Failed to compile NBNXN kernels for GPU #%s\n",
- nb->deviceInfo->device_name));
+ nb->deviceContext_->deviceInfo().device_name));
throw;
}
}
*/
struct NbnxmGpu
{
- //! OpenCL device information
- const DeviceInformation* deviceInfo = nullptr;
/*! \brief OpenCL device context
+ *
+ * \todo Make it constant reference, once NbnxmGpu is a proper class.
+ */
const DeviceContext* deviceContext_ = nullptr;
//! OpenCL runtime data (context, kernels)
struct gmx_device_runtime_data_t* dev_rundata = nullptr;