// reallocate on device only if needed
if (newSize > maxPackedBufferSize_)
{
- reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, nullptr);
- reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, nullptr);
- reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, nullptr);
+ reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, deviceContext_);
+ reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, deviceContext_);
+ reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, deviceContext_);
maxPackedBufferSize_ = newSize;
}
changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported);
- allocateDeviceBuffer(&d_fShift_, 1, nullptr);
+ allocateDeviceBuffer(&d_fShift_, 1, deviceContext_);
}
GpuHaloExchange::Impl::~Impl()
#define GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H
#include "gromacs/domdec/gpuhaloexchange.h"
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/hostallocator.h"
#include "gromacs/utility/gmxmpi.h"
GpuEventSynchronizer* haloDataTransferLaunched_ = nullptr;
//! MPI communicator used for simulation
MPI_Comm mpi_comm_mysim_;
+ //! Dummy GPU context object
+ const DeviceContext deviceContext_;
//! CUDA stream for local non-bonded calculations
cudaStream_t localStream_ = nullptr;
//! CUDA stream for non-local non-bonded calculations
struct gmx_wallcycle;
struct NumPmeDomains;
+class DeviceContext;
enum class GpuTaskCompletion;
class PmeGpuProgram;
class GpuEventSynchronizer;
* \param[in] pme The PME data structure.
* \returns Pointer to GPU context object.
*/
-GPU_FUNC_QUALIFIER void* pme_gpu_get_device_context(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+GPU_FUNC_QUALIFIER const DeviceContext* pme_gpu_get_device_context(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
GPU_FUNC_TERM_WITH_RETURN(nullptr);
/*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
return pme_gpu_get_stream(pme->gpu);
}
-void* pme_gpu_get_device_context(const gmx_pme_t* pme)
+const DeviceContext* pme_gpu_get_device_context(const gmx_pme_t* pme)
{
- if (!pme || !pme_gpu_active(pme))
- {
- return nullptr;
- }
+ GMX_RELEASE_ASSERT(pme, "GPU context requested from PME before PME was constructed.");
+ GMX_RELEASE_ASSERT(pme_gpu_active(pme),
+ "GPU context requested from PME, but PME is running on the CPU.");
return pme_gpu_get_context(pme->gpu);
}
== kernelParamsPtr->grid.complexGridSize[i],
"Complex padding not implemented");
}
- cl_context context = pmeGpu->archSpecific->context;
+ cl_context context = pmeGpu->archSpecific->deviceContext_.context();
commandStreams_.push_back(pmeGpu->archSpecific->pmeStream);
realGrid_ = kernelParamsPtr->grid.d_realGrid;
complexGrid_ = kernelParamsPtr->grid.d_fourierGrid;
{
const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount,
- pmeGpu->archSpecific->context);
+ pmeGpu->archSpecific->deviceContext_);
pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize);
}
const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
&pmeGpu->archSpecific->splineValuesSize,
- &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->context);
+ &pmeGpu->archSpecific->splineValuesSizeAlloc,
+ pmeGpu->archSpecific->deviceContext_);
if (shouldRealloc)
{
/* Reallocate the host buffer */
const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
- &pmeGpu->archSpecific->forcesSize,
- &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->context);
+ &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc,
+ pmeGpu->archSpecific->deviceContext_);
pmeGpu->staging.h_forces.reserveWithPadding(pmeGpu->nAtomsAlloc);
pmeGpu->staging.h_forces.resizeWithPadding(pmeGpu->kernelParams->atoms.nAtoms);
}
GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
&pmeGpu->archSpecific->coefficientsSize,
- &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->context);
+ &pmeGpu->archSpecific->coefficientsSizeAlloc,
+ pmeGpu->archSpecific->deviceContext_);
copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
const bool shouldRealloc = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
int currentSizeTemp = pmeGpu->archSpecific->splineDataSize;
int currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
- reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
- ¤tSizeTemp, ¤tSizeTempAlloc, pmeGpu->archSpecific->context);
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize, ¤tSizeTemp,
+ ¤tSizeTempAlloc, pmeGpu->archSpecific->deviceContext_);
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
- &pmeGpu->archSpecific->splineDataSize,
- &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->context);
+ &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc,
+ pmeGpu->archSpecific->deviceContext_);
// the host side reallocation
if (shouldRealloc)
{
GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
&pmeGpu->archSpecific->gridlineIndicesSize,
- &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context);
+ &pmeGpu->archSpecific->gridlineIndicesSizeAlloc,
+ pmeGpu->archSpecific->deviceContext_);
pfree(pmeGpu->staging.h_gridlineIndices);
pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_gridlineIndices), newIndicesSize * sizeof(int));
}
/* 2 separate grids */
reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
&pmeGpu->archSpecific->complexGridSize,
- &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->context);
- reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
- &pmeGpu->archSpecific->realGridSize,
- &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
+ &pmeGpu->archSpecific->complexGridSizeAlloc,
+ pmeGpu->archSpecific->deviceContext_);
+ reallocateDeviceBuffer(
+ &kernelParamsPtr->grid.d_realGrid, newRealGridSize, &pmeGpu->archSpecific->realGridSize,
+ &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_);
}
else
{
const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
reallocateDeviceBuffer(
&kernelParamsPtr->grid.d_realGrid, newGridsSize, &pmeGpu->archSpecific->realGridSize,
- &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
+ &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_);
kernelParamsPtr->grid.d_fourierGrid = kernelParamsPtr->grid.d_realGrid;
pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
// the size might get used later for copying the grid
#elif GMX_GPU == GMX_GPU_OPENCL
// No dedicated texture routines....
allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize,
- pmeGpu->archSpecific->context);
+ pmeGpu->archSpecific->deviceContext_);
allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize,
- pmeGpu->archSpecific->context);
+ pmeGpu->archSpecific->deviceContext_);
copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0,
newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
GpuApiCallBehavior::Async, nullptr);
#endif
/* Allocate the target-specific structures */
- pmeGpu->archSpecific.reset(new PmeGpuSpecific());
+ pmeGpu->archSpecific.reset(new PmeGpuSpecific(pmeGpu->programHandle_->impl_->deviceContext_));
pmeGpu->kernelParams.reset(new PmeGpuKernelParams());
pmeGpu->archSpecific->performOutOfPlaceFFT = true;
* TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
*/
- // TODO: this is just a convenient reuse because programHandle_ currently is in charge of creating context
- pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context;
-
// timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
if (GMX_GPU == GMX_GPU_CUDA)
{
pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId;
cl_int clError;
- pmeGpu->archSpecific->pmeStream =
- clCreateCommandQueue(pmeGpu->archSpecific->context, device_id, queueProperties, &clError);
+ pmeGpu->archSpecific->pmeStream = clCreateCommandQueue(
+ pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError);
if (clError != CL_SUCCESS)
{
GMX_THROW(gmx::InternalError("Failed to create PME command queue"));
}
}
-void* pme_gpu_get_context(const PmeGpu* pmeGpu)
+const DeviceContext* pme_gpu_get_context(const PmeGpu* pmeGpu)
{
- if (pmeGpu)
- {
- return static_cast<void*>(&pmeGpu->archSpecific->context);
- }
- else
- {
- return nullptr;
- }
+ GMX_RELEASE_ASSERT(
+ pmeGpu,
+ "GPU context object was requested, but PME GPU object was not (yet) initialized.");
+ return &pmeGpu->archSpecific->deviceContext_;
}
GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
* \param[in] pmeGpu The PME GPU structure.
* \returns Pointer to context object.
*/
-GPU_FUNC_QUALIFIER void* pme_gpu_get_context(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
+GPU_FUNC_QUALIFIER const DeviceContext* pme_gpu_get_context(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
GPU_FUNC_TERM_WITH_RETURN(nullptr);
/*! \brief Return pointer to the sync object triggered after the PME force calculation completion
#include "config.h"
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/gputraits.h"
#include "gromacs/utility/classhelpers.h"
* TODO: Later we want to be able to own the context at a higher level and not here,
* but this class would still need the non-owning context handle to build the kernels.
*/
- DeviceContext context;
+ DeviceContext deviceContext_;
//! Conveniently all the PME kernels use the same single argument type
#if GMX_GPU == GMX_GPU_CUDA
contextProperties[2] = 0; /* Terminates the list of properties */
cl_int clError;
- context = clCreateContext(contextProperties, 1, &deviceId, nullptr, nullptr, &clError);
+ deviceContext_.setContext(clCreateContext(contextProperties, 1, &deviceId, nullptr, nullptr, &clError));
if (clError != CL_SUCCESS)
{
const std::string errorString = gmx::formatString(
}
// kernel parameters
- warpSize = gmx::ocl::getDeviceWarpSize(context, deviceId);
+ warpSize = gmx::ocl::getDeviceWarpSize(deviceContext_.context(), deviceId);
// TODO: for Intel ideally we'd want to set these based on the compiler warp size
// but given that we've done no tuning for Intel iGPU, this is as good as anything.
spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize, deviceInfo->maxWorkGroupSize);
stat |= clReleaseKernel(solveXYZEnergyKernel);
stat |= clReleaseKernel(solveYZXKernel);
stat |= clReleaseKernel(solveYZXEnergyKernel);
- stat |= clReleaseContext(context);
GMX_ASSERT(stat == CL_SUCCESS,
gmx::formatString("Failed to release PME OpenCL resources %d: %s", stat,
ocl_get_error_string(stat).c_str())
/* TODO when we have a proper MPI-aware logging module,
the log output here should be written there */
program = gmx::ocl::compileProgram(stderr, "gromacs/ewald", "pme_program.cl",
- commonDefines, context, deviceInfo->oclDeviceId,
- deviceInfo->deviceVendor);
+ commonDefines, deviceContext_.context(),
+ deviceInfo->oclDeviceId, deviceInfo->deviceVendor);
}
catch (gmx::GromacsException& e)
{
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/timing/gpu_timing.h" // for gtPME_EVENT_COUNT
+#include "pme_gpu_3dfft.h"
+
class GpuParallel3dFft;
/*! \internal \brief
*/
struct PmeGpuSpecific
{
+ /*! \brief Constructor
+ *
+ * \param[in] deviceContext GPU device context.
+ */
+ PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {}
/*! \brief The GPU stream where everything related to the PME happens. */
CommandStream pmeStream;
* but should be a constructor parameter to PmeGpu, as well as PmeGpuProgram,
* managed by high-level code.
*/
- DeviceContext context;
+ const DeviceContext& deviceContext_;
/* Synchronization events */
/*! \brief Triggered after the PME Force Calculations have been completed */
/* Settings which are set at the start of the run */
/*! \brief A boolean which tells whether the complex and real grids for cu/clFFT are different or same. Currenty true. */
- bool performOutOfPlaceFFT;
+ bool performOutOfPlaceFFT = false;
/*! \brief A boolean which tells if the GPU timing events are enabled.
* False by default, can be enabled by setting the environment variable GMX_ENABLE_GPU_TIMING.
* Note: will not be reliable when multiple GPU tasks are running concurrently on the same
* device context, as CUDA events on multiple streams are untrustworthy.
*/
- bool useTiming;
+ bool useTiming = false;
//! Vector of FFT setups
std::vector<std::unique_ptr<GpuParallel3dFft>> fftSetup;
* TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well.
*/
/*! \brief The kernelParams.atoms.coordinates float element count (actual)*/
- int coordinatesSize;
+ int coordinatesSize = 0;
/*! \brief The kernelParams.atoms.coordinates float element count (reserved) */
- int coordinatesSizeAlloc;
+ int coordinatesSizeAlloc = 0;
/*! \brief The kernelParams.atoms.forces float element count (actual) */
- int forcesSize;
+ int forcesSize = 0;
/*! \brief The kernelParams.atoms.forces float element count (reserved) */
- int forcesSizeAlloc;
+ int forcesSizeAlloc = 0;
/*! \brief The kernelParams.atoms.gridlineIndices int element count (actual) */
- int gridlineIndicesSize;
+ int gridlineIndicesSize = 0;
/*! \brief The kernelParams.atoms.gridlineIndices int element count (reserved) */
- int gridlineIndicesSizeAlloc;
+ int gridlineIndicesSizeAlloc = 0;
/*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (actual) */
- int splineDataSize;
+ int splineDataSize = 0;
/*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */
- int splineDataSizeAlloc;
+ int splineDataSizeAlloc = 0;
/*! \brief The kernelParams.atoms.coefficients float element count (actual) */
- int coefficientsSize;
+ int coefficientsSize = 0;
/*! \brief The kernelParams.atoms.coefficients float element count (reserved) */
- int coefficientsSizeAlloc;
+ int coefficientsSizeAlloc = 0;
/*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */
- int splineValuesSize;
+ int splineValuesSize = 0;
/*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */
- int splineValuesSizeAlloc;
+ int splineValuesSizeAlloc = 0;
/*! \brief The kernelParams.grid.realGrid float element count (actual) */
- int realGridSize;
+ int realGridSize = 0;
/*! \brief The kernelParams.grid.realGrid float element count (reserved) */
- int realGridSizeAlloc;
+ int realGridSizeAlloc = 0;
/*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */
- int complexGridSize;
+ int complexGridSize = 0;
/*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (reserved) */
- int complexGridSizeAlloc;
+ int complexGridSizeAlloc = 0;
};
#endif
const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
if (useGpuForPme)
{
- const void* commandStream = pme_gpu_get_device_stream(pme);
- const void* deviceContext = pme_gpu_get_device_context(pme);
+ const void* commandStream = pme_gpu_get_device_stream(pme);
+ const DeviceContext& deviceContext = *pme_gpu_get_device_context(pme);
changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
MPI_Recv(&remotePmeFBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
// Reallocate buffer used for staging PME force on GPU
- reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, nullptr);
+ reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_,
+ DeviceContext());
#else
GMX_UNUSED_VALUE(size);
#endif
// TODO: Special constructor for PME-only rank / PME-tests is used here. There should be a mechanism to
// restrict one from using other constructor here.
return std::make_unique<StatePropagatorDataGpu>(
- pme_gpu_get_device_stream(&pme), pme_gpu_get_device_context(&pme),
+ pme_gpu_get_device_stream(&pme), *pme_gpu_get_device_context(&pme),
GpuApiCallBehavior::Sync, pme_gpu_get_padding_size(&pme), nullptr);
}
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team, led by
+# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
)
if(GMX_USE_OPENCL)
gmx_add_libgromacs_sources(
+ device_context_ocl.cpp
gpu_utils_ocl.cpp
ocl_compiler.cpp
ocl_caching.cpp
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_CONTEXT_H
+#define GMX_GPU_UTILS_DEVICE_CONTEXT_H
+
+/*! \libinternal \file
+ *
+ * \brief Declarations for DeviceContext class.
+ *
+ * OpenCL builds get the full implementation; other platforms get a stub class.
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ * \inlibraryapi
+ */
+
+#include "config.h"
+
+#if GMX_GPU == GMX_GPU_OPENCL
+# include "gromacs/gpu_utils/device_context_ocl.h"
+#else
+# include "gromacs/utility/classhelpers.h"
+
+struct DeviceInformation;
+
+// Stub DeviceContext for non-OpenCL builds: same interface as the OpenCL
+// implementation, but every operation is a no-op.
+class DeviceContext
+{
+public:
+    //! Default constructor; the stub holds no platform context.
+    DeviceContext() {}
+    /*! \brief Second stage of construction. Creates the \c cl_context in OpenCL, does nothing here.
+     *
+     * \param[in] deviceInfo Platform-specific device information.
+     */
+    void init(const DeviceInformation& /*deviceInfo*/) {}
+    /*! \brief Construct the object and call \c init(...) .
+     *
+     * \param[in] deviceInfo Platform-specific device information.
+     */
+    DeviceContext(const DeviceInformation& deviceInfo) { init(deviceInfo); }
+    //! Destructor
+    ~DeviceContext() = default;
+
+    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext);
+};
+#endif // GMX_GPU == GMX_GPU_OPENCL
+
+#endif // GMX_GPU_UTILS_DEVICE_CONTEXT_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceContext for OpenCL
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "device_context_ocl.h"
+
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+/*! \brief Copies of values from cl_driver_diagnostics_intel.h,
+ * which isn't guaranteed to be available. */
+/**@{*/
+#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4
+/**@}*/
+
+//! Default constructor; no OpenCL context is created until init() or setContext().
+DeviceContext::DeviceContext()
+{
+    context_ = nullptr;
+}
+
+/*! \brief Creates the OpenCL context for \p deviceInfo.
+ *
+ * Honors the GMX_OCL_SHOW_DIAGNOSTICS environment variable by adding the
+ * Intel driver-diagnostics context properties (values copied above because
+ * the vendor header is not guaranteed to be available).
+ *
+ * \param[in] deviceInfo Platform-specific device information.
+ *
+ * \throws gmx::InternalError if clCreateContext fails.
+ */
+void DeviceContext::init(const DeviceInformation& deviceInfo)
+{
+    cl_platform_id platformId = deviceInfo.oclPlatformId;
+    cl_device_id   deviceId   = deviceInfo.oclDeviceId;
+    std::vector<cl_context_properties> contextProperties;
+
+    // The property list is (key, value) pairs terminated by a single 0.
+    contextProperties.emplace_back(CL_CONTEXT_PLATFORM);
+    contextProperties.emplace_back(reinterpret_cast<cl_context_properties>(platformId));
+
+    if (getenv("GMX_OCL_SHOW_DIAGNOSTICS"))
+    {
+        contextProperties.emplace_back(CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL);
+        contextProperties.emplace_back(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL
+                                       | CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL);
+    }
+    contextProperties.emplace_back(0); // Terminates the property list
+
+    cl_int clError;
+    context_ = clCreateContext(contextProperties.data(), 1, &deviceId, nullptr, nullptr, &clError);
+    if (clError != CL_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError(gmx::formatString(
+                "Failed to create OpenCL context on device %s (OpenCL error ID %d).",
+                deviceInfo.device_name, clError)));
+    }
+}
+
+//! Constructs the object and immediately creates the OpenCL context via init().
+DeviceContext::DeviceContext(const DeviceInformation& deviceInfo)
+{
+    init(deviceInfo);
+}
+
+/*! \brief Releases the OpenCL context, if one is held.
+ *
+ * Asserts (also in release builds) that clReleaseContext succeeds.
+ */
+DeviceContext::~DeviceContext()
+{
+    cl_int clError;
+
+    if (context_)
+    {
+        clError = clReleaseContext(context_);
+        GMX_RELEASE_ASSERT(
+                clError == CL_SUCCESS,
+                gmx::formatString("Failed to release OpenCL context (OpenCL error ID %d).", clError).c_str());
+        context_ = nullptr;
+    }
+}
+
+//! Returns the raw \c cl_context handle (\c nullptr if no context was created).
+cl_context DeviceContext::context() const
+{
+    return context_;
+}
+
+/*! \brief Transition-time setter: stores an externally created \c cl_context.
+ *
+ * The destructor will release the stored handle.
+ * NOTE(review): overwrites \c context_ without releasing any previously held
+ * context — callers must ensure the object does not already own one.
+ */
+void DeviceContext::setContext(cl_context context)
+{
+    context_ = context;
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H
+#define GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H
+
+/*! \libinternal \file
+ *
+ * \brief Declarations for DeviceContext class.
+ *
+ * Only needed for OpenCL builds. Other platforms will be given a stub class.
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ * \inlibraryapi
+ */
+
+#include "gromacs/gpu_utils/gmxopencl.h"
+#include "gromacs/utility/classhelpers.h"
+
+struct DeviceInformation;
+
+// OpenCL device context class
+class DeviceContext
+{
+public:
+    //! Default constructor. Sets \c context_ to \c nullptr.
+    DeviceContext();
+    /*! \brief Second stage of construction. Creates the \c cl_context.
+     *
+     * \param[in] deviceInfo Platform-specific device information.
+     *
+     * \throws InternalError if context creation failed.
+     */
+    void init(const DeviceInformation& deviceInfo);
+    /*! \brief Construct the object and call \c init(...) .
+     *
+     * NOTE(review): not \c explicit, so a DeviceInformation converts
+     * implicitly to a DeviceContext — confirm this is intended.
+     *
+     * \param[in] deviceInfo Platform-specific device information.
+     *
+     * \throws InternalError if context creation failed.
+     */
+    DeviceContext(const DeviceInformation& deviceInfo);
+    //! Destructor; releases \c context_ if one is held.
+    ~DeviceContext();
+
+    //! Returns the managed \c cl_context handle.
+    cl_context context() const;
+
+    //! Transition time setter - will be removed; stores an externally created context.
+    void setContext(cl_context context);
+
+    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext);
+
+private:
+    //! OpenCL context object
+    cl_context context_ = nullptr;
+};
+
+#endif // GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H
* \inlibraryapi
*/
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
#include "gromacs/gpu_utils/gputraits.cuh"
* \param[in] deviceContext The buffer's dummy device context - not managed explicitly in CUDA RT.
*/
template<typename ValueType>
-void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, DeviceContext /* deviceContext */)
+void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, const DeviceContext& /* deviceContext */)
{
GMX_ASSERT(buffer, "needs a buffer pointer");
cudaError_t stat = cudaMalloc((void**)buffer, numValues * sizeof(ValueType));
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
size_t numValues,
int* currentNumValues,
int* currentMaxNumValues,
- DeviceContext deviceContext)
+ const DeviceContext& deviceContext)
{
GMX_ASSERT(buffer, "needs a buffer pointer");
GMX_ASSERT(currentNumValues, "needs a size pointer");
* \inlibraryapi
*/
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
#include "gromacs/gpu_utils/gputraits_ocl.h"
* \param[in] deviceContext The buffer's device context-to-be.
*/
template<typename ValueType>
-void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, DeviceContext deviceContext)
+void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, const DeviceContext& deviceContext)
{
GMX_ASSERT(buffer, "needs a buffer pointer");
void* hostPtr = nullptr;
cl_int clError;
- *buffer = clCreateBuffer(deviceContext, CL_MEM_READ_WRITE, numValues * sizeof(ValueType),
- hostPtr, &clError);
+ *buffer = clCreateBuffer(deviceContext.context(), CL_MEM_READ_WRITE,
+ numValues * sizeof(ValueType), hostPtr, &clError);
GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
gmx::formatString("clCreateBuffer failure (OpenCL error %d: %s)", clError,
ocl_get_error_string(clError).c_str())
/*! \libinternal \file
* \brief Declares the CUDA type traits.
+ *
* \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
*
* \inlibraryapi
* \ingroup module_gpu_utils
using CommandStream = cudaStream_t;
//! \brief Single GPU call timing event - meaningless in CUDA
using CommandEvent = void;
-//! \brief Context used explicitly in OpenCL, does nothing in CUDA
-using DeviceContext = void*;
/*! \internal \brief
* GPU kernels scheduling description. This is same in OpenCL/CUDA.
#define GMX_GPU_UTILS_GPUTRAITS_H
/*! \libinternal \file
- * \brief Declares the GPU type traits for non-GPU builds
+ * \brief Declares the GPU type traits for non-GPU builds.
+ *
* \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
*
* \inlibraryapi
* \ingroup module_gpu_utils
using CommandStream = void*;
//! \brief Single GPU call timing event
using CommandEvent = void*;
-//! \brief GPU context
-using DeviceContext = void*;
#endif // GMX_GPU
/*! \libinternal \file
* \brief Declares the OpenCL type traits.
+ *
* \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
*
* \inlibraryapi
* \ingroup module_gpu_utils
using CommandStream = cl_command_queue;
//! \brief Single GPU call timing event
using CommandEvent = cl_event;
-//! \brief Context used explicitly in OpenCL
-using DeviceContext = cl_context;
/*! \internal \brief
* GPU kernels scheduling description. This is same in OpenCL/CUDA.
#include <string>
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/gmxopencl.h"
#include "gromacs/gpu_utils/gputraits_ocl.h"
#include "gromacs/utility/exceptions.h"
*/
struct gmx_device_runtime_data_t
{
- cl_context context; /**< OpenCL context */
- cl_program program; /**< OpenCL program */
+ //! OpenCL context
+ DeviceContext deviceContext;
+ //! OpenCL program
+ cl_program program;
};
/*! \brief Launches synchronous or asynchronous device to host memory copy.
void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& h_rVecOutput, const std::vector<gmx::RVec>& h_rVecInput)
{
+ DeviceInformation deviceInfo;
+ const DeviceContext deviceContext(deviceInfo);
+
const int numElements = h_rVecInput.size();
DeviceBuffer<RVec> d_rVecInput;
- allocateDeviceBuffer(&d_rVecInput, numElements, nullptr);
+ allocateDeviceBuffer(&d_rVecInput, numElements, deviceContext);
copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, nullptr,
GpuApiCallBehavior::Sync, nullptr);
DeviceBuffer<float3> d_float3Output;
- allocateDeviceBuffer(&d_float3Output, numElements * DIM, nullptr);
+ allocateDeviceBuffer(&d_float3Output, numElements * DIM, deviceContext);
std::vector<float3> h_float3Output(numElements);
stream_ = *static_cast<CommandStream*>(streamPtr);
wcycle_ = wcycle;
- allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), nullptr);
+ allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_);
// This could be an async transfer (if the source is pinned), so
// long as it uses the same stream as the kernels and we are happy
// to consume additional pinned pages.
copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(), stream_,
GpuApiCallBehavior::Sync, nullptr);
vTot_.resize(F_NRE);
- allocateDeviceBuffer(&d_vTot_, F_NRE, nullptr);
+ allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_);
clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
kernelParams_.d_forceParams = d_forceParams_;
{
t_ilist& d_iList = d_iLists_[fType];
- reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, nullptr);
+ reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc,
+ deviceContext_);
copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), stream_,
GpuApiCallBehavior::Async, nullptr);
#ifndef GMX_LISTED_FORCES_GPUBONDED_IMPL_H
#define GMX_LISTED_FORCES_GPUBONDED_IMPL_H
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/gpu_utils/hostallocator.h"
#include "gromacs/listed_forces/gpubonded.h"
//! \brief Device-side total virial
float* d_vTot_ = nullptr;
+ //! Dummy GPU context object
+ const DeviceContext deviceContext_;
//! \brief Bonded GPU stream, not owned by this module
CommandStream stream_;
return;
}
-LeapFrogGpu::LeapFrogGpu(CommandStream commandStream) : commandStream_(commandStream)
+LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream) :
+ deviceContext_(deviceContext),
+ commandStream_(commandStream)
{
numAtoms_ = 0;
numTempScaleValues_ = numTempScaleValues;
reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_,
- &numInverseMassesAlloc_, nullptr);
+ &numInverseMassesAlloc_, deviceContext_);
copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, commandStream_,
GpuApiCallBehavior::Sync, nullptr);
if (numTempScaleValues > 1)
{
reallocateDeviceBuffer(&d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_,
- &numTempScaleGroupsAlloc_, nullptr);
+ &numTempScaleGroupsAlloc_, deviceContext_);
copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, commandStream_,
GpuApiCallBehavior::Sync, nullptr);
}
if (numTempScaleValues_ > 0)
{
h_lambdas_.resize(numTempScaleValues);
- reallocateDeviceBuffer(&d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, nullptr);
+ reallocateDeviceBuffer(&d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_,
+ deviceContext_);
}
}
#ifndef GMX_MDLIB_LEAPFROG_GPU_CUH
#define GMX_MDLIB_LEAPFROG_GPU_CUH
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/gpu_utils/hostallocator.h"
#include "gromacs/mdtypes/group.h"
public:
/*! \brief Constructor.
*
+ * \param[in] deviceContext Device context (dummy in CUDA).
* \param[in] commandStream Device command stream to use.
*/
- LeapFrogGpu(CommandStream commandStream);
+ LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream);
~LeapFrogGpu();
/*! \brief Integrate
class Impl;
private:
+ //! Dummy GPU context object
+ const DeviceContext& deviceContext_;
//! GPU stream
CommandStream commandStream_;
//! GPU kernel launch config
return;
}
-LincsGpu::LincsGpu(int numIterations, int expansionOrder, CommandStream commandStream) :
+LincsGpu::LincsGpu(int numIterations,
+ int expansionOrder,
+ const DeviceContext& deviceContext,
+ CommandStream commandStream) :
+ deviceContext_(deviceContext),
commandStream_(commandStream)
{
kernelParams_.numIterations = numIterations;
c_threadsPerBlock > 0 && ((c_threadsPerBlock & (c_threadsPerBlock - 1)) == 0),
"Number of threads per block should be a power of two in order for reduction to work.");
- allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, nullptr);
+ allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, deviceContext_);
h_virialScaled_.resize(6);
// The data arrays should be expanded/reallocated on first call of set() function.
numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads;
- allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, nullptr);
+ allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads,
+ deviceContext_);
allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
- kernelParams_.numConstraintsThreads, nullptr);
+ kernelParams_.numConstraintsThreads, deviceContext_);
allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
- kernelParams_.numConstraintsThreads, nullptr);
+ kernelParams_.numConstraintsThreads, deviceContext_);
allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
- maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
+ maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_);
allocateDeviceBuffer(&kernelParams_.d_massFactors,
- maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
+ maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_);
allocateDeviceBuffer(&kernelParams_.d_matrixA,
- maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
+ maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_);
}
// (Re)allocate the memory, if the number of atoms has increased.
freeDeviceBuffer(&kernelParams_.d_inverseMasses);
}
numAtomsAlloc_ = numAtoms;
- allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, nullptr);
+ allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, deviceContext_);
}
// Copy data to GPU.
#ifndef GMX_MDLIB_LINCS_GPU_CUH
#define GMX_MDLIB_LINCS_GPU_CUH
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdtypes/mdatom.h"
*
* \param[in] numIterations Number of iteration for the correction of the projection.
* \param[in] expansionOrder Order of the matrix inversion algorithm.
+ * \param[in] deviceContext Device context (dummy in CUDA).
* \param[in] commandStream Device command stream.
*/
- LincsGpu(int numIterations, int expansionOrder, CommandStream commandStream);
+ LincsGpu(int numIterations, int expansionOrder, const DeviceContext& deviceContext, CommandStream commandStream);
/*! \brief Destructor.*/
~LincsGpu();
static bool isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop);
private:
+ //! Dummy GPU context object
+ const DeviceContext& deviceContext_;
//! GPU stream
CommandStream commandStream_;
return;
}
-SettleGpu::SettleGpu(const gmx_mtop_t& mtop, CommandStream commandStream) :
+SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream) :
+ deviceContext_(deviceContext),
commandStream_(commandStream)
{
static_assert(sizeof(real) == sizeof(float),
initSettleParameters(&settleParameters_, mO, mH, dOH, dHH);
- allocateDeviceBuffer(&d_virialScaled_, 6, nullptr);
+ allocateDeviceBuffer(&d_virialScaled_, 6, deviceContext_);
h_virialScaled_.resize(6);
}
ArrayRef<const int> iatoms = il_settle.iatoms;
numSettles_ = il_settle.size() / nral1;
- reallocateDeviceBuffer(&d_atomIds_, numSettles_, &numAtomIds_, &numAtomIdsAlloc_, nullptr);
+ reallocateDeviceBuffer(&d_atomIds_, numSettles_, &numAtomIds_, &numAtomIdsAlloc_, deviceContext_);
h_atomIds_.resize(numSettles_);
for (int i = 0; i < numSettles_; i++)
{
#include "gmxpre.h"
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/math/functions.h"
#include "gromacs/math/invertmatrix.h"
 * \param[in] mtop           Topology of the system to get the masses for O and H atoms and
* target O-H and H-H distances. These values are also checked for
* consistency.
+ * \param[in] deviceContext Device context (dummy in CUDA).
* \param[in] commandStream Device stream to use.
*/
- SettleGpu(const gmx_mtop_t& mtop, CommandStream commandStream);
+ SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream);
~SettleGpu();
void set(const InteractionDefinitions& idef, const t_mdatoms& md);
private:
+ //! Dummy GPU context object
+ const DeviceContext& deviceContext_;
//! GPU stream
CommandStream commandStream_;
*/
void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc)
{
- auto lincsGpu =
- std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder, nullptr);
+ DeviceInformation deviceInfo;
+ const DeviceContext deviceContext(deviceInfo);
+
+ auto lincsGpu = std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder,
+ deviceContext, nullptr);
bool updateVelocities = true;
int numAtoms = testData->numAtoms_;
PbcAiuc pbcAiuc;
setPbcAiuc(pbc.ndim_ePBC, pbc.box, &pbcAiuc);
- allocateDeviceBuffer(&d_x, numAtoms, nullptr);
- allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
- allocateDeviceBuffer(&d_v, numAtoms, nullptr);
+ allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+ allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+ allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, nullptr,
GpuApiCallBehavior::Sync, nullptr);
void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps)
{
+ DeviceInformation deviceInfo;
+ const DeviceContext deviceContext(deviceInfo);
+
int numAtoms = testData->numAtoms_;
float3* h_x = reinterpret_cast<float3*>(testData->x_.data());
float3 *d_x, *d_xp, *d_v, *d_f;
- allocateDeviceBuffer(&d_x, numAtoms, nullptr);
- allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
- allocateDeviceBuffer(&d_v, numAtoms, nullptr);
- allocateDeviceBuffer(&d_f, numAtoms, nullptr);
+ allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+ allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+ allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
+ allocateDeviceBuffer(&d_f, numAtoms, deviceContext);
copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
- auto integrator = std::make_unique<LeapFrogGpu>(nullptr);
+ auto integrator = std::make_unique<LeapFrogGpu>(deviceContext, nullptr);
integrator->set(testData->mdAtoms_, testData->numTCoupleGroups_, testData->mdAtoms_.cTC);
// TODO: Here we should check that at least 1 suitable GPU is available
GMX_RELEASE_ASSERT(canPerformGpuDetection(), "Can't detect CUDA-capable GPUs.");
- auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, nullptr);
+ DeviceInformation deviceInfo;
+ const DeviceContext deviceContext(deviceInfo);
+
+ auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, deviceContext, nullptr);
settleGpu->set(*testData->idef_, testData->mdatoms_);
PbcAiuc pbcAiuc;
float3* h_xp = (float3*)(as_rvec_array(testData->xPrime_.data()));
float3* h_v = (float3*)(as_rvec_array(testData->v_.data()));
- allocateDeviceBuffer(&d_x, numAtoms, nullptr);
- allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
- allocateDeviceBuffer(&d_v, numAtoms, nullptr);
+ allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+ allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+ allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
: commandStream_ = nullptr;
- integrator_ = std::make_unique<LeapFrogGpu>(commandStream_);
- lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, commandStream_);
- settleGpu_ = std::make_unique<SettleGpu>(mtop, commandStream_);
+ integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, commandStream_);
+ lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, commandStream_);
+ settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, commandStream_);
coordinateScalingKernelLaunchConfig_.blockSize[0] = c_threadsPerBlock;
coordinateScalingKernelLaunchConfig_.blockSize[1] = 1;
numAtoms_ = md.nr;
- reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, nullptr);
+ reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, deviceContext_);
reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_,
- &numInverseMassesAlloc_, nullptr);
+ &numInverseMassesAlloc_, deviceContext_);
// Integrator should also update something, but it does not even have a method yet
integrator_->set(md, numTempScaleValues, md.cTC);
static bool isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop);
private:
+ //! Dummy GPU context object
+ const DeviceContext deviceContext_;
//! GPU stream
CommandStream commandStream_ = nullptr;
//! GPU kernel launch config
fr->nbv->gpu_nbv != nullptr
? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
: nullptr;
- const void* deviceContext = pme_gpu_get_device_context(fr->pmedata);
- const int paddingSize = pme_gpu_get_padding_size(fr->pmedata);
+ const DeviceContext& deviceContext = *pme_gpu_get_device_context(fr->pmedata);
+ const int paddingSize = pme_gpu_get_padding_size(fr->pmedata);
GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
? GpuApiCallBehavior::Async
: GpuApiCallBehavior::Sync;
#include "locality.h"
+class DeviceContext;
class GpuEventSynchronizer;
struct gmx_wallcycle;
*
* \todo Make a \p CommandStream visible in the CPU parts of the code so we
* will not have to pass a void*.
- * \todo Make a \p DeviceContext object visible in CPU parts of the code so we
- * will not have to pass a void*.
*
* \param[in] pmeStream Device PME stream, nullptr allowed.
* \param[in] localStream Device NBNXM local stream, nullptr allowed.
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
- StatePropagatorDataGpu(const void* pmeStream,
- const void* localStream,
- const void* nonLocalStream,
- const void* deviceContext,
- GpuApiCallBehavior transferKind,
- int paddingSize,
- gmx_wallcycle* wcycle);
+ StatePropagatorDataGpu(const void* pmeStream,
+ const void* localStream,
+ const void* nonLocalStream,
+ const DeviceContext& deviceContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle);
/*! \brief Constructor to use in PME-only rank and in tests.
*
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
- StatePropagatorDataGpu(const void* pmeStream,
- const void* deviceContext,
- GpuApiCallBehavior transferKind,
- int paddingSize,
- gmx_wallcycle* wcycle);
+ StatePropagatorDataGpu(const void* pmeStream,
+ const DeviceContext& deviceContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle);
//! Move constructor
StatePropagatorDataGpu(StatePropagatorDataGpu&& other) noexcept;
StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */,
const void* /* localStream */,
const void* /* nonLocalStream */,
- const void* /* deviceContext */,
+ const DeviceContext& /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
int /* paddingSize */,
gmx_wallcycle* /* wcycle */) :
}
StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */,
- const void* /* deviceContext */,
+ const DeviceContext& /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
int /* paddingSize */,
gmx_wallcycle* /* wcycle */) :
*
* \todo Make a \p CommandStream visible in the CPU parts of the code so we
* will not have to pass a void*.
- * \todo Make a \p DeviceContext object visible in CPU parts of the code so we
- * will not have to pass a void*.
*
* \param[in] pmeStream Device PME stream, nullptr allowed.
* \param[in] localStream Device NBNXM local stream, nullptr allowed.
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
- Impl(const void* pmeStream,
- const void* localStream,
- const void* nonLocalStream,
- const void* deviceContext,
- GpuApiCallBehavior transferKind,
- int paddingSize,
- gmx_wallcycle* wcycle);
+ Impl(const void* pmeStream,
+ const void* localStream,
+ const void* nonLocalStream,
+ const DeviceContext& deviceContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle);
/*! \brief Constructor to use in PME-only rank and in tests.
*
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
- Impl(const void* pmeStream,
- const void* deviceContext,
- GpuApiCallBehavior transferKind,
- int paddingSize,
- gmx_wallcycle* wcycle);
+ Impl(const void* pmeStream,
+ const DeviceContext& deviceContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle);
~Impl();
//! An array of events that indicate D2H copy of forces is complete (one event for each atom locality)
EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnHost_;
- /*! \brief GPU context (for OpenCL builds)
- * \todo Make a Context class usable in CPU code
- */
- DeviceContext deviceContext_ = nullptr;
+ //! GPU context (for OpenCL builds)
+ const DeviceContext& deviceContext_;
//! Default GPU calls behavior
GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async;
//! Padding size for the coordinates buffer
# include "gromacs/gpu_utils/cudautils.cuh"
# endif
# include "gromacs/gpu_utils/devicebuffer.h"
+# include "gromacs/gpu_utils/gputraits.h"
# if GMX_GPU == GMX_GPU_OPENCL
# include "gromacs/gpu_utils/oclutils.h"
# endif
namespace gmx
{
-StatePropagatorDataGpu::Impl::Impl(const void* pmeStream,
- const void* localStream,
- const void* nonLocalStream,
- const void* deviceContext,
- GpuApiCallBehavior transferKind,
- int paddingSize,
- gmx_wallcycle* wcycle) :
+StatePropagatorDataGpu::Impl::Impl(const void* pmeStream,
+ const void* localStream,
+ const void* nonLocalStream,
+ const DeviceContext& deviceContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle) :
+ deviceContext_(deviceContext),
transferKind_(transferKind),
paddingSize_(paddingSize),
wcycle_(wcycle)
// TODO: Refactor when the StreamManager is introduced.
if (GMX_GPU == GMX_GPU_OPENCL)
{
- GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds.");
GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");
// The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context.
- pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
- updateStream_ = *static_cast<const CommandStream*>(pmeStream);
- deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
+ pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
+ updateStream_ = *static_cast<const CommandStream*>(pmeStream);
GMX_UNUSED_VALUE(localStream);
GMX_UNUSED_VALUE(nonLocalStream);
}
stat = cudaStreamCreate(&updateStream_);
CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
# endif
- GMX_UNUSED_VALUE(deviceContext);
}
// Map the atom locality to the stream that will be used for coordinates,
fCopyStreams_[AtomLocality::All] = updateStream_;
}
-StatePropagatorDataGpu::Impl::Impl(const void* pmeStream,
- const void* deviceContext,
- GpuApiCallBehavior transferKind,
- int paddingSize,
- gmx_wallcycle* wcycle) :
+StatePropagatorDataGpu::Impl::Impl(const void* pmeStream,
+ const DeviceContext& deviceContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle) :
+ deviceContext_(deviceContext),
transferKind_(transferKind),
paddingSize_(paddingSize),
wcycle_(wcycle)
static_assert(GMX_GPU != GMX_GPU_NONE,
"This object should only be constructed on the GPU code-paths.");
- if (GMX_GPU == GMX_GPU_OPENCL)
- {
- GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds.");
- deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
- }
-
GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set.");
pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
}
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream,
- const void* localStream,
- const void* nonLocalStream,
- const void* deviceContext,
- GpuApiCallBehavior transferKind,
- int paddingSize,
- gmx_wallcycle* wcycle) :
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream,
+ const void* localStream,
+ const void* nonLocalStream,
+ const DeviceContext& deviceContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle) :
impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle))
{
}
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream,
- const void* deviceContext,
- GpuApiCallBehavior transferKind,
- int paddingSize,
- gmx_wallcycle* wcycle) :
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream,
+ const DeviceContext& deviceContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle) :
impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle))
{
}
// TODO Remove this comment when the above order issue is resolved
#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
iTimers.didPairlistH2D = true;
}
- DeviceContext context = nullptr;
-
- reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, context);
+ reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
+ DeviceContext());
copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
- reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context);
+ reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
+ DeviceContext());
copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
- &d_plist->nimask, &d_plist->imask_nalloc, context);
+ &d_plist->nimask, &d_plist->imask_nalloc, DeviceContext());
reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
- &d_plist->excl_nalloc, context);
+ &d_plist->excl_nalloc, DeviceContext());
copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
const int maxNumColumns = gridSet.numColumnsMax();
reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(),
- &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, nullptr);
+ &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext());
reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns * gridSet.grids().size(),
- &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, nullptr);
+ &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, DeviceContext());
for (unsigned int g = 0; g < gridSet.grids().size(); g++)
{
const int* cxy_ind = grid.cxy_ind().data();
reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize,
- &gpu_nbv->atomIndicesSize_alloc, nullptr);
+ &gpu_nbv->atomIndicesSize_alloc, DeviceContext());
if (atomIndicesSize > 0)
{
if (natoms_total > 0)
{
- reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
+ reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc,
+ DeviceContext());
copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr);
}
#include "thread_mpi/atomic.h"
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/gputraits_ocl.h"
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/hw_info.h"
array_format.image_channel_data_type = CL_FLOAT;
array_format.image_channel_order = CL_R;
- coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
- &array_format, tabsize, 1, 0, ftmp, &cl_error);
+ coul_tab = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_WRITE |
+ CL_MEM_COPY_HOST_PTR, &array_format, tabsize, 1, 0, ftmp, &cl_error);
*/
- coul_tab = clCreateBuffer(
- runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
- tables.tableF.size() * sizeof(cl_float), const_cast<real*>(tables.tableF.data()), &cl_error);
+ coul_tab = clCreateBuffer(runData->deviceContext.context(),
+ CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+ tables.tableF.size() * sizeof(cl_float),
+ const_cast<real*>(tables.tableF.data()), &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
ad->ntypes = ntypes;
- ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
- SHIFTS * sizeof(nbnxn_atomdata_t::shift_vec[0]), nullptr, &cl_error);
+ ad->shift_vec =
+ clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+ SHIFTS * sizeof(nbnxn_atomdata_t::shift_vec[0]), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
ad->bShiftVecUploaded = CL_FALSE;
- ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ ad->fshift = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
SHIFTS * sizeof(nb_staging_t::fshift[0]), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
- ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ ad->e_lj = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
sizeof(float), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
- ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ ad->e_el = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
sizeof(float), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
array_format.image_channel_data_type = CL_FLOAT;
array_format.image_channel_order = CL_R;
- nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
- &array_format, 1, 1, 0, nullptr, &cl_error);
+ nbp->coulomb_tab_climg2d = clCreateImage2D(runData->deviceContext.context(),
+ CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error);
*/
- nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY,
+ nbp->coulomb_tab_climg2d = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY,
sizeof(cl_float), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
array_format.image_channel_data_type = CL_FLOAT;
array_format.image_channel_order = CL_R;
- nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY |
+ nbp->nbfp_climg2d = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
*/
nbp->nbfp_climg2d = clCreateBuffer(
- runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+ runData->deviceContext.context(),
+ CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
nnbfp * sizeof(cl_float), const_cast<float*>(nbatParams.nbfp.data()), &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
{
/* Switched from using textures to using buffers */
// TODO: decide which alternative is most efficient - textures or buffers.
- /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE |
+ /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
- nbp->nbfp_comb_climg2d = clCreateBuffer(
- runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
- nnbfp_comb * sizeof(cl_float), const_cast<float*>(nbatParams.nbfp_comb.data()),
- &cl_error);
+ nbp->nbfp_comb_climg2d =
+ clCreateBuffer(runData->deviceContext.context(),
+ CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+ nnbfp_comb * sizeof(cl_float),
+ const_cast<float*>(nbatParams.nbfp_comb.data()), &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
}
// don't accept nullptr values for image2D parameters.
/* Switched from using textures to using buffers */
// TODO: decide which alternative is most efficient - textures or buffers.
- /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
- &array_format, 1, 1, 0, nullptr, &cl_error);*/
- nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY,
+ /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->deviceContext.context(),
+ CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error);*/
+ nbp->nbfp_comb_climg2d = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY,
sizeof(cl_float), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
rank, deviceInfo->device_name, cl_error, ocl_get_error_string(cl_error).c_str());
}
- runtimeData->context = context;
+ runtimeData->deviceContext.setContext(context);
}
/*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
snew(nb->timings, 1);
/* set device info, just point it to the right GPU among the detected ones */
- nb->deviceInfo = deviceInfo;
- snew(nb->dev_rundata, 1);
+ nb->deviceInfo = deviceInfo;
+ nb->dev_rundata = new gmx_device_runtime_data_t();
/* init nbst */
pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
nbnxn_gpu_create_context(nb->dev_rundata, nb->deviceInfo, rank);
/* local/non-local GPU streams */
- nb->stream[InteractionLocality::Local] = clCreateCommandQueue(
- nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
+ nb->stream[InteractionLocality::Local] =
+ clCreateCommandQueue(nb->dev_rundata->deviceContext.context(),
+ nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
if (CL_SUCCESS != cl_error)
{
gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank,
{
init_plist(nb->plist[InteractionLocality::NonLocal]);
- nb->stream[InteractionLocality::NonLocal] = clCreateCommandQueue(
- nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
+ nb->stream[InteractionLocality::NonLocal] =
+ clCreateCommandQueue(nb->dev_rundata->deviceContext.context(),
+ nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
if (CL_SUCCESS != cl_error)
{
gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
}
// TODO most of this function is same in CUDA and OpenCL, move into the header
- DeviceContext context = nb->dev_rundata->context;
+ const DeviceContext& deviceContext = nb->dev_rundata->deviceContext;
- reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, context);
+ reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
+ deviceContext);
copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
- reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context);
+ reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
+ deviceContext);
copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
- &d_plist->nimask, &d_plist->imask_nalloc, context);
+ &d_plist->nimask, &d_plist->imask_nalloc, deviceContext);
reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
- &d_plist->excl_nalloc, context);
+ &d_plist->excl_nalloc, deviceContext);
copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
freeDeviceBuffer(&d_atdat->atom_types);
}
- d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ d_atdat->f = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
+ CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
nalloc * DIM * sizeof(nbat->out[0].f[0]), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
- d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+ d_atdat->xq = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
+ CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
nalloc * sizeof(cl_float4), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
if (useLjCombRule(nb->nbparam->vdwtype))
{
- d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context,
+ d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
nalloc * sizeof(cl_float2), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
}
else
{
- d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context,
+ d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
nalloc * sizeof(int), nullptr, &cl_error);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
return;
}
- cl_int gmx_unused cl_error;
-
- if (runData->context)
- {
- cl_error = clReleaseContext(runData->context);
- GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
- ("clReleaseContext failed: " + ocl_get_error_string(cl_error)).c_str());
- runData->context = nullptr;
- }
-
if (runData->program)
{
- cl_error = clReleaseProgram(runData->program);
+ cl_int cl_error = clReleaseProgram(runData->program);
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clReleaseProgram failed: " + ocl_get_error_string(cl_error)).c_str());
runData->program = nullptr;
}
free_gpu_device_runtime_data(nb->dev_rundata);
- sfree(nb->dev_rundata);
+ delete nb->dev_rundata;
/* Free timers and timings */
delete nb->timers;
{
/* TODO when we have a proper MPI-aware logging module,
the log output here should be written there */
- program = gmx::ocl::compileProgram(
- stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl", extraDefines,
- nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, nb->deviceInfo->deviceVendor);
+ program =
+ gmx::ocl::compileProgram(stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl",
+ extraDefines, nb->dev_rundata->deviceContext.context(),
+ nb->deviceInfo->oclDeviceId, nb->deviceInfo->deviceVendor);
}
catch (gmx::GromacsException& e)
{