From: Artem Zhmurov
Date: Wed, 29 Jan 2020 15:59:33 +0000 (+0100)
Subject: Make DeviceContext into a proper class
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=345378d48c1bcc8fb1707d7a3987b37189ee7d83;p=alexxy%2Fgromacs.git

Make DeviceContext into a proper class

Having DeviceContext as a platform-agnostic class makes it easy to create a context, which is useful for setting up the GPU stream/context manager and for tests of GPU functionality.

Refs. #3315.

Change-Id: I496d225b58c96d7642830c8e9552139750891849
---
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index 2ac4228d6f..92a1d9f3d5 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -147,9 +147,9 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo // reallocate on device only if needed if (newSize > maxPackedBufferSize_) { - reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, nullptr); - reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, nullptr); - reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, nullptr); + reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, deviceContext_); + reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, deviceContext_); + reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, deviceContext_); maxPackedBufferSize_ = newSize; } @@ -448,7 +448,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported); - allocateDeviceBuffer(&d_fShift_, 1, nullptr); + allocateDeviceBuffer(&d_fShift_, 1, deviceContext_); } GpuHaloExchange::Impl::~Impl() diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh index b139a9b491..a8d2f9204c 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh @@ -47,6 +47,7 @@ #define GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H #include "gromacs/domdec/gpuhaloexchange.h" +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/utility/gmxmpi.h" @@ -175,6 +176,8 @@ private: GpuEventSynchronizer* haloDataTransferLaunched_ = nullptr; //! MPI communicator used for simulation MPI_Comm mpi_comm_mysim_; + //! Dummy GPU context object + const DeviceContext deviceContext_; //! CUDA stream for local non-bonded calculations cudaStream_t localStream_ = nullptr; //! CUDA stream for non-local non-bonded calculations diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index ef89a5bb79..40a34682c0 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -71,6 +71,7 @@ struct gmx_pme_t; struct gmx_wallcycle; struct NumPmeDomains; +class DeviceContext; enum class GpuTaskCompletion; class PmeGpuProgram; class GpuEventSynchronizer; @@ -439,7 +440,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARG * \param[in] pme The PME data structure. * \returns Pointer to GPU context object. */ -GPU_FUNC_QUALIFIER void* pme_gpu_get_device_context(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) +GPU_FUNC_QUALIFIER const DeviceContext* pme_gpu_get_device_context(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); /*!
\brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 7f092304b5..b4cec47135 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -442,12 +442,11 @@ void* pme_gpu_get_device_stream(const gmx_pme_t* pme) return pme_gpu_get_stream(pme->gpu); } -void* pme_gpu_get_device_context(const gmx_pme_t* pme) +const DeviceContext* pme_gpu_get_device_context(const gmx_pme_t* pme) { - if (!pme || !pme_gpu_active(pme)) - { - return nullptr; - } + GMX_RELEASE_ASSERT(pme, "GPU context requested from PME before PME was constructed."); + GMX_RELEASE_ASSERT(pme_gpu_active(pme), + "GPU context requested from PME, but PME is running on the CPU."); return pme_gpu_get_context(pme->gpu); } diff --git a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp index 2b14dc7567..c6e1b6448c 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp @@ -80,7 +80,7 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) == kernelParamsPtr->grid.complexGridSize[i], "Complex padding not implemented"); } - cl_context context = pmeGpu->archSpecific->context; + cl_context context = pmeGpu->archSpecific->deviceContext_.context(); commandStreams_.push_back(pmeGpu->archSpecific->pmeStream); realGrid_ = kernelParamsPtr->grid.d_realGrid; complexGrid_ = kernelParamsPtr->grid.d_fourierGrid; diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index f9932eb36b..17dd6805c0 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -142,7 +142,7 @@ void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu) { const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float); allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount, - pmeGpu->archSpecific->context); + pmeGpu->archSpecific->deviceContext_); pmalloc(reinterpret_cast(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize); } @@ -172,7 +172,8 @@ void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu) const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize); reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize, &pmeGpu->archSpecific->splineValuesSize, - &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->splineValuesSizeAlloc, + pmeGpu->archSpecific->deviceContext_); if (shouldRealloc) { /* Reallocate the host buffer */ @@ -202,8 +203,8 @@ void pme_gpu_realloc_forces(PmeGpu* pmeGpu) const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM; GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU"); reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize, - &pmeGpu->archSpecific->forcesSize, - &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, + pmeGpu->archSpecific->deviceContext_); pmeGpu->staging.h_forces.reserveWithPadding(pmeGpu->nAtomsAlloc); pmeGpu->staging.h_forces.resizeWithPadding(pmeGpu->kernelParams->atoms.nAtoms); } @@ -238,7 +239,8 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_ GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU"); reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize, 
&pmeGpu->archSpecific->coefficientsSize, - &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->coefficientsSizeAlloc, + pmeGpu->archSpecific->deviceContext_); copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, const_cast(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); @@ -270,11 +272,11 @@ void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu) const bool shouldRealloc = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize); int currentSizeTemp = pmeGpu->archSpecific->splineDataSize; int currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc; - reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize, - ¤tSizeTemp, ¤tSizeTempAlloc, pmeGpu->archSpecific->context); + reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize, ¤tSizeTemp, + ¤tSizeTempAlloc, pmeGpu->archSpecific->deviceContext_); reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize, - &pmeGpu->archSpecific->splineDataSize, - &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, + pmeGpu->archSpecific->deviceContext_); // the host side reallocation if (shouldRealloc) { @@ -300,7 +302,8 @@ void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu) GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU"); reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSize, - &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, + pmeGpu->archSpecific->deviceContext_); pfree(pmeGpu->staging.h_gridlineIndices); pmalloc(reinterpret_cast(&pmeGpu->staging.h_gridlineIndices), newIndicesSize * sizeof(int)); } @@ -326,10 +329,11 @@ void pme_gpu_realloc_grids(PmeGpu* pmeGpu) /* 2 separate grids */ reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize, &pmeGpu->archSpecific->complexGridSize, - &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->context); - reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize, - &pmeGpu->archSpecific->realGridSize, - &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->complexGridSizeAlloc, + pmeGpu->archSpecific->deviceContext_); + reallocateDeviceBuffer( + &kernelParamsPtr->grid.d_realGrid, newRealGridSize, &pmeGpu->archSpecific->realGridSize, + &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_); } else { @@ -337,7 +341,7 @@ void pme_gpu_realloc_grids(PmeGpu* pmeGpu) const int newGridsSize = std::max(newRealGridSize, newComplexGridSize); reallocateDeviceBuffer( &kernelParamsPtr->grid.d_realGrid, newGridsSize, &pmeGpu->archSpecific->realGridSize, - &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context); + &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_); kernelParamsPtr->grid.d_fourierGrid = kernelParamsPtr->grid.d_realGrid; pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize; // the size might get used later for copying the grid @@ -385,9 +389,9 @@ void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu) #elif GMX_GPU == GMX_GPU_OPENCL // No dedicated texture routines.... 
allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize, - pmeGpu->archSpecific->context); + pmeGpu->archSpecific->deviceContext_); allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize, - pmeGpu->archSpecific->context); + pmeGpu->archSpecific->deviceContext_); copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0, newFractShiftsSize, pmeGpu->archSpecific->pmeStream, GpuApiCallBehavior::Async, nullptr); @@ -486,7 +490,7 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu) #endif /* Allocate the target-specific structures */ - pmeGpu->archSpecific.reset(new PmeGpuSpecific()); + pmeGpu->archSpecific.reset(new PmeGpuSpecific(pmeGpu->programHandle_->impl_->deviceContext_)); pmeGpu->kernelParams.reset(new PmeGpuKernelParams()); pmeGpu->archSpecific->performOutOfPlaceFFT = true; @@ -495,9 +499,6 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu) * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7). */ - // TODO: this is just a convenient reuse because programHandle_ currently is in charge of creating context - pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context; - // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB if (GMX_GPU == GMX_GPU_CUDA) { @@ -537,8 +538,8 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu) pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0; cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId; cl_int clError; - pmeGpu->archSpecific->pmeStream = - clCreateCommandQueue(pmeGpu->archSpecific->context, device_id, queueProperties, &clError); + pmeGpu->archSpecific->pmeStream = clCreateCommandQueue( + pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError); if (clError != CL_SUCCESS) { GMX_THROW(gmx::InternalError("Failed to create PME command queue")); @@ -1526,16 +1527,12 @@ void* pme_gpu_get_stream(const PmeGpu* pmeGpu) } } -void* pme_gpu_get_context(const PmeGpu* pmeGpu) +const DeviceContext* pme_gpu_get_context(const PmeGpu* pmeGpu) { - if (pmeGpu) - { - return static_cast(&pmeGpu->archSpecific->context); - } - else - { - return nullptr; - } + GMX_RELEASE_ASSERT( + pmeGpu, + "GPU context object was requested, but PME GPU object was not (yet) initialized."); + return &pmeGpu->archSpecific->deviceContext_; } GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu) diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index 0c5b53ab0e..a9dc9677ce 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -412,7 +412,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeG * \param[in] pmeGpu The PME GPU structure. * \returns Pointer to context object. */ -GPU_FUNC_QUALIFIER void* pme_gpu_get_context(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) +GPU_FUNC_QUALIFIER const DeviceContext* pme_gpu_get_context(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); /*! 
\brief Return pointer to the sync object triggered after the PME force calculation completion diff --git a/src/gromacs/ewald/pme_gpu_program_impl.h b/src/gromacs/ewald/pme_gpu_program_impl.h index 2c79f20a71..0b007c3966 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl.h +++ b/src/gromacs/ewald/pme_gpu_program_impl.h @@ -44,6 +44,7 @@ #include "config.h" +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits.h" #include "gromacs/utility/classhelpers.h" @@ -77,7 +78,7 @@ struct PmeGpuProgramImpl * TODO: Later we want to be able to own the context at a higher level and not here, * but this class would still need the non-owning context handle to build the kernels. */ - DeviceContext context; + DeviceContext deviceContext_; //! Conveniently all the PME kernels use the same single argument type #if GMX_GPU == GMX_GPU_CUDA diff --git a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp index ec220c7c2a..1353a99ed2 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp @@ -64,7 +64,7 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation* deviceInfo) contextProperties[2] = 0; /* Terminates the list of properties */ cl_int clError; - context = clCreateContext(contextProperties, 1, &deviceId, nullptr, nullptr, &clError); + deviceContext_.setContext(clCreateContext(contextProperties, 1, &deviceId, nullptr, nullptr, &clError)); if (clError != CL_SUCCESS) { const std::string errorString = gmx::formatString( @@ -74,7 +74,7 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation* deviceInfo) } // kernel parameters - warpSize = gmx::ocl::getDeviceWarpSize(context, deviceId); + warpSize = gmx::ocl::getDeviceWarpSize(deviceContext_.context(), deviceId); // TODO: for Intel ideally we'd want to set these based on the compiler warp size // but given that we've done no tuning for Intel iGPU, this is as good as anything. spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize, deviceInfo->maxWorkGroupSize); @@ -96,7 +96,6 @@ PmeGpuProgramImpl::~PmeGpuProgramImpl() stat |= clReleaseKernel(solveXYZEnergyKernel); stat |= clReleaseKernel(solveYZXKernel); stat |= clReleaseKernel(solveYZXEnergyKernel); - stat |= clReleaseContext(context); GMX_ASSERT(stat == CL_SUCCESS, gmx::formatString("Failed to release PME OpenCL resources %d: %s", stat, ocl_get_error_string(stat).c_str()) @@ -165,8 +164,8 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation* deviceInfo) /* TODO when we have a proper MPI-aware logging module, the log output here should be written there */ program = gmx::ocl::compileProgram(stderr, "gromacs/ewald", "pme_program.cl", - commonDefines, context, deviceInfo->oclDeviceId, - deviceInfo->deviceVendor); + commonDefines, deviceContext_.context(), + deviceInfo->oclDeviceId, deviceInfo->deviceVendor); } catch (gmx::GromacsException& e) { diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h index be4f986957..44ca3fd3c3 100644 --- a/src/gromacs/ewald/pme_gpu_types_host_impl.h +++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
* - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -60,6 +60,8 @@ #include "gromacs/timing/gpu_timing.h" // for gtPME_EVENT_COUNT +#include "pme_gpu_3dfft.h" + class GpuParallel3dFft; /*! \internal \brief @@ -67,6 +69,11 @@ class GpuParallel3dFft; */ struct PmeGpuSpecific { + /*! \brief Constructor + * + * \param[in] deviceContext GPU device context. + */ + PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {} /*! \brief The GPU stream where everything related to the PME happens. */ CommandStream pmeStream; @@ -76,7 +83,7 @@ struct PmeGpuSpecific * but should be a constructor parameter to PmeGpu, as well as PmeGpuProgram, * managed by high-level code. */ - DeviceContext context; + const DeviceContext& deviceContext_; /* Synchronization events */ /*! \brief Triggered after the PME Force Calculations have been completed */ @@ -86,13 +93,13 @@ struct PmeGpuSpecific /* Settings which are set at the start of the run */ /*! \brief A boolean which tells whether the complex and real grids for cu/clFFT are different or same. Currenty true. */ - bool performOutOfPlaceFFT; + bool performOutOfPlaceFFT = false; /*! \brief A boolean which tells if the GPU timing events are enabled. * False by default, can be enabled by setting the environment variable GMX_ENABLE_GPU_TIMING. * Note: will not be reliable when multiple GPU tasks are running concurrently on the same * device context, as CUDA events on multiple streams are untrustworthy. */ - bool useTiming; + bool useTiming = false; //! Vector of FFT setups std::vector> fftSetup; @@ -112,37 +119,37 @@ struct PmeGpuSpecific * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well. */ /*! \brief The kernelParams.atoms.coordinates float element count (actual)*/ - int coordinatesSize; + int coordinatesSize = 0; /*! \brief The kernelParams.atoms.coordinates float element count (reserved) */ - int coordinatesSizeAlloc; + int coordinatesSizeAlloc = 0; /*! \brief The kernelParams.atoms.forces float element count (actual) */ - int forcesSize; + int forcesSize = 0; /*! \brief The kernelParams.atoms.forces float element count (reserved) */ - int forcesSizeAlloc; + int forcesSizeAlloc = 0; /*! \brief The kernelParams.atoms.gridlineIndices int element count (actual) */ - int gridlineIndicesSize; + int gridlineIndicesSize = 0; /*! \brief The kernelParams.atoms.gridlineIndices int element count (reserved) */ - int gridlineIndicesSizeAlloc; + int gridlineIndicesSizeAlloc = 0; /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (actual) */ - int splineDataSize; + int splineDataSize = 0; /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */ - int splineDataSizeAlloc; + int splineDataSizeAlloc = 0; /*! \brief The kernelParams.atoms.coefficients float element count (actual) */ - int coefficientsSize; + int coefficientsSize = 0; /*! \brief The kernelParams.atoms.coefficients float element count (reserved) */ - int coefficientsSizeAlloc; + int coefficientsSizeAlloc = 0; /*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */ - int splineValuesSize; + int splineValuesSize = 0; /*! 
\brief The kernelParams.grid.splineValuesArray float element count (reserved) */ - int splineValuesSizeAlloc; + int splineValuesSizeAlloc = 0; /*! \brief The kernelParams.grid.realGrid float element count (actual) */ - int realGridSize; + int realGridSize = 0; /*! \brief The kernelParams.grid.realGrid float element count (reserved) */ - int realGridSizeAlloc; + int realGridSizeAlloc = 0; /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */ - int complexGridSize; + int complexGridSize = 0; /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (reserved) */ - int complexGridSizeAlloc; + int complexGridSizeAlloc = 0; }; #endif diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index eed93cd2e3..2ee17b3267 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -628,8 +628,8 @@ int gmx_pmeonly(struct gmx_pme_t* pme, const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed); if (useGpuForPme) { - const void* commandStream = pme_gpu_get_device_stream(pme); - const void* deviceContext = pme_gpu_get_device_context(pme); + const void* commandStream = pme_gpu_get_device_stream(pme); + const DeviceContext& deviceContext = *pme_gpu_get_device_context(pme); changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy()); changePinningPolicy(&pme_pp->x, pme_get_pinning_policy()); diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index 827a1bab34..29cb73d0ca 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019, by the GROMACS development team, led by + * Copyright (c) 2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -73,7 +73,8 @@ void PmePpCommGpu::Impl::reinit(int size) MPI_Recv(&remotePmeFBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE); // Reallocate buffer used for staging PME force on GPU - reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, nullptr); + reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, + DeviceContext()); #else GMX_UNUSED_VALUE(size); #endif diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index c5b93d94c2..81edf195fe 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -166,7 +166,7 @@ std::unique_ptr makeStatePropagatorDataGpu(const gmx_pme // TODO: Special constructor for PME-only rank / PME-tests is used here. There should be a mechanism to // restrict one from using other constructor here. return std::make_unique( - pme_gpu_get_device_stream(&pme), pme_gpu_get_device_context(&pme), + pme_gpu_get_device_stream(&pme), *pme_gpu_get_device_context(&pme), GpuApiCallBehavior::Sync, pme_gpu_get_padding_size(&pme), nullptr); } diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt index 942141d288..8672e450ca 100644 --- a/src/gromacs/gpu_utils/CMakeLists.txt +++ b/src/gromacs/gpu_utils/CMakeLists.txt @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. 
# -# Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team, led by +# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -44,6 +44,7 @@ gmx_add_libgromacs_sources( ) if(GMX_USE_OPENCL) gmx_add_libgromacs_sources( + device_context_ocl.cpp gpu_utils_ocl.cpp ocl_compiler.cpp ocl_caching.cpp diff --git a/src/gromacs/gpu_utils/device_context.h b/src/gromacs/gpu_utils/device_context.h new file mode 100644 index 0000000000..d192b5543f --- /dev/null +++ b/src/gromacs/gpu_utils/device_context.h @@ -0,0 +1,83 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#ifndef GMX_GPU_UTILS_DEVICE_CONTEXT_H +#define GMX_GPU_UTILS_DEVICE_CONTEXT_H + +/*! \libinternal \file + * + * \brief Declarations for DeviceContext class. + * + * Only needed for OpenCL builds. Other platforms will be given a stub class. + * + * \author Mark Abraham + * \author Artem Zhmurov + * + * \ingroup module_gpu_utils + * \inlibraryapi + */ + +#include "config.h" + +#if GMX_GPU == GMX_GPU_OPENCL +# include "gromacs/gpu_utils/device_context_ocl.h" +#else +# include "gromacs/utility/classhelpers.h" + +struct DeviceInformation; + +// Stub for device context +class DeviceContext +{ +public: + //! Default constructor. In OpenCL leaves context \c nullptr. + DeviceContext() {} + /*! \brief Second stage of construction. Creates the \c cl_context in OpenCL, does nothing in CUDA. + * + * \param[in] deviceInfo Platform-specific device information. + */ + void init(const DeviceInformation& /*deviceInfo*/) {} + /*! \brief Construct the object and call \c init(...) . + * + * \param[in] deviceInfo Platform-specific device information. 
+ */ + DeviceContext(const DeviceInformation& deviceInfo) { init(deviceInfo); } + //! Destructor + ~DeviceContext() = default; + + GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext); +}; +#endif // GMX_GPU != GMX_GPU_OPENCL + +#endif // GMX_GPU_UTILS_DEVICE_CONTEXT_H diff --git a/src/gromacs/gpu_utils/device_context_ocl.cpp b/src/gromacs/gpu_utils/device_context_ocl.cpp new file mode 100644 index 0000000000..e5dfd5939d --- /dev/null +++ b/src/gromacs/gpu_utils/device_context_ocl.cpp @@ -0,0 +1,122 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements the DeviceContext for OpenCL + * + * \author Mark Abraham + * \author Artem Zhmurov + * + * \ingroup module_gpu_utils + */ +#include "gmxpre.h" + +#include "device_context_ocl.h" + +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/stringutil.h" + +/*! \brief Copies of values from cl_driver_diagnostics_intel.h, + * which isn't guaranteed to be available. 
*/ +/**@{*/ +#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2 +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4 +/**@}*/ + +DeviceContext::DeviceContext() +{ + context_ = nullptr; +} + +void DeviceContext::init(const DeviceInformation& deviceInfo) +{ + cl_platform_id platformId = deviceInfo.oclPlatformId; + cl_device_id deviceId = deviceInfo.oclDeviceId; + std::vector contextProperties; + + contextProperties.emplace_back(CL_CONTEXT_PLATFORM); + contextProperties.emplace_back(reinterpret_cast(platformId)); + + if (getenv("GMX_OCL_SHOW_DIAGNOSTICS")) + { + contextProperties.emplace_back(CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL); + contextProperties.emplace_back(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL + | CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL); + } + contextProperties.emplace_back(0); + + cl_int clError; + context_ = clCreateContext(contextProperties.data(), 1, &deviceId, nullptr, nullptr, &clError); + if (clError != CL_SUCCESS) + { + GMX_THROW(gmx::InternalError(gmx::formatString( + "Failed to create OpenCL context on device %s (OpenCL error ID %d).", + deviceInfo.device_name, clError))); + } +} + +DeviceContext::DeviceContext(const DeviceInformation& deviceInfo) +{ + init(deviceInfo); +} + +DeviceContext::~DeviceContext() +{ + cl_int clError; + + if (context_) + { + clError = clReleaseContext(context_); + GMX_RELEASE_ASSERT( + clError == CL_SUCCESS, + gmx::formatString("Failed to release OpenCL context (OpenCL error ID %d).", clError).c_str()); + context_ = nullptr; + } +} + +cl_context DeviceContext::context() const +{ + return context_; +} + +void DeviceContext::setContext(cl_context context) +{ + context_ = context; +} diff --git a/src/gromacs/gpu_utils/device_context_ocl.h b/src/gromacs/gpu_utils/device_context_ocl.h new file mode 100644 index 0000000000..58bb75595e --- /dev/null +++ b/src/gromacs/gpu_utils/device_context_ocl.h @@ -0,0 +1,92 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. 
Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#ifndef GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H +#define GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H + +/*! \libinternal \file + * + * \brief Declarations for DeviceContext class. + * + * Only needed for OpenCL builds. Other platforms will be given a stub class. + * + * \author Mark Abraham + * \author Artem Zhmurov + * + * \ingroup module_gpu_utils + * \inlibraryapi + */ + +#include "gromacs/gpu_utils/gmxopencl.h" +#include "gromacs/utility/classhelpers.h" + +struct DeviceInformation; + +// OpenCL device context class +class DeviceContext +{ +public: + //! Default constructor. Sets \c context_ to \c nullptr. + DeviceContext(); + /*! \brief Second stage of construction. Creates the \c cl_context. + * + * \param[in] deviceInfo Platform-specific device information. + * + * \throws InternalError if context creation failed. + */ + void init(const DeviceInformation& deviceInfo); + /*! \brief Construct the object and call \c init(...) . + * + * \param[in] deviceInfo Platform-specific device information. + * + * \throws InternalError if context creation failed. + */ + DeviceContext(const DeviceInformation& deviceInfo); + //! Destructor + ~DeviceContext(); + + //! Getter + cl_context context() const; + + //! Transition time setter - will be removed + void setContext(cl_context context); + + GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext); + +private: + //! OpenCL context object + cl_context context_ = nullptr; +}; + +#endif // GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh index 3a1bd3d195..59255bfa93 100644 --- a/src/gromacs/gpu_utils/devicebuffer.cuh +++ b/src/gromacs/gpu_utils/devicebuffer.cuh @@ -45,6 +45,7 @@ * \inlibraryapi */ +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior #include "gromacs/gpu_utils/gputraits.cuh" @@ -60,7 +61,7 @@ * \param[in] deviceContext The buffer's dummy device context - not managed explicitly in CUDA RT. */ template -void allocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, DeviceContext /* deviceContext */) +void allocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, const DeviceContext& /* deviceContext */) { GMX_ASSERT(buffer, "needs a buffer pointer"); cudaError_t stat = cudaMalloc((void**)buffer, numValues * sizeof(ValueType)); diff --git a/src/gromacs/gpu_utils/devicebuffer.h b/src/gromacs/gpu_utils/devicebuffer.h index 4c28d49613..c0cdfec329 100644 --- a/src/gromacs/gpu_utils/devicebuffer.h +++ b/src/gromacs/gpu_utils/devicebuffer.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -80,7 +80,7 @@ void reallocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, int* currentNumValues, int* currentMaxNumValues, - DeviceContext deviceContext) + const DeviceContext& deviceContext) { GMX_ASSERT(buffer, "needs a buffer pointer"); GMX_ASSERT(currentNumValues, "needs a size pointer"); diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h index 40f1e12941..ee1adc1cce 100644 --- a/src/gromacs/gpu_utils/devicebuffer_ocl.h +++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h @@ -45,6 +45,7 @@ * \inlibraryapi */ +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior #include "gromacs/gpu_utils/gputraits_ocl.h" @@ -62,13 +63,13 @@ * \param[in] deviceContext The buffer's device context-to-be. */ template -void allocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, DeviceContext deviceContext) +void allocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, const DeviceContext& deviceContext) { GMX_ASSERT(buffer, "needs a buffer pointer"); void* hostPtr = nullptr; cl_int clError; - *buffer = clCreateBuffer(deviceContext, CL_MEM_READ_WRITE, numValues * sizeof(ValueType), - hostPtr, &clError); + *buffer = clCreateBuffer(deviceContext.context(), CL_MEM_READ_WRITE, + numValues * sizeof(ValueType), hostPtr, &clError); GMX_RELEASE_ASSERT(clError == CL_SUCCESS, gmx::formatString("clCreateBuffer failure (OpenCL error %d: %s)", clError, ocl_get_error_string(clError).c_str()) diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh index 8a4936dabc..b477cdcb4c 100644 --- a/src/gromacs/gpu_utils/gputraits.cuh +++ b/src/gromacs/gpu_utils/gputraits.cuh @@ -37,7 +37,9 @@ /*! \libinternal \file * \brief Declares the CUDA type traits. + * * \author Aleksei Iupinov + * \author Artem Zhmurov * * \inlibraryapi * \ingroup module_gpu_utils @@ -63,8 +65,6 @@ struct DeviceInformation using CommandStream = cudaStream_t; //! \brief Single GPU call timing event - meaningless in CUDA using CommandEvent = void; -//! \brief Context used explicitly in OpenCL, does nothing in CUDA -using DeviceContext = void*; /*! \internal \brief * GPU kernels scheduling description. This is same in OpenCL/CUDA. diff --git a/src/gromacs/gpu_utils/gputraits.h b/src/gromacs/gpu_utils/gputraits.h index 0229ea443c..a36a5cc3bc 100644 --- a/src/gromacs/gpu_utils/gputraits.h +++ b/src/gromacs/gpu_utils/gputraits.h @@ -36,8 +36,10 @@ #define GMX_GPU_UTILS_GPUTRAITS_H /*! \libinternal \file - * \brief Declares the GPU type traits for non-GPU builds + * \brief Declares the GPU type traits for non-GPU builds. + * * \author Mark Abraham + * \author Artem Zhmurov * * \inlibraryapi * \ingroup module_gpu_utils @@ -65,8 +67,6 @@ struct DeviceInformation using CommandStream = void*; //! \brief Single GPU call timing event using CommandEvent = void*; -//! \brief GPU context -using DeviceContext = void*; #endif // GMX_GPU diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h index 00d9cba90d..caf837552a 100644 --- a/src/gromacs/gpu_utils/gputraits_ocl.h +++ b/src/gromacs/gpu_utils/gputraits_ocl.h @@ -37,7 +37,9 @@ /*! \libinternal \file * \brief Declares the OpenCL type traits. + * * \author Aleksei Iupinov + * \author Artem Zhmurov * * \inlibraryapi * \ingroup module_gpu_utils @@ -81,8 +83,6 @@ struct DeviceInformation using CommandStream = cl_command_queue; //! 
\brief Single GPU call timing event using CommandEvent = cl_event; -//! \brief Context used explicitly in OpenCL -using DeviceContext = cl_context; /*! \internal \brief * GPU kernels scheduling description. This is same in OpenCL/CUDA. diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h index 91b6059d27..230b3ff94e 100644 --- a/src/gromacs/gpu_utils/oclutils.h +++ b/src/gromacs/gpu_utils/oclutils.h @@ -44,6 +44,7 @@ #include +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gmxopencl.h" #include "gromacs/gpu_utils/gputraits_ocl.h" #include "gromacs/utility/exceptions.h" @@ -63,8 +64,10 @@ enum class GpuApiCallBehavior; */ struct gmx_device_runtime_data_t { - cl_context context; /**< OpenCL context */ - cl_program program; /**< OpenCL program */ + //! OpenCL context + DeviceContext deviceContext; + //! OpenCL program + cl_program program; }; /*! \brief Launches synchronous or asynchronous device to host memory copy. diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.cu b/src/gromacs/gpu_utils/tests/typecasts_runner.cu index 221959fe79..e16dd8ebf5 100644 --- a/src/gromacs/gpu_utils/tests/typecasts_runner.cu +++ b/src/gromacs/gpu_utils/tests/typecasts_runner.cu @@ -110,15 +110,18 @@ static __global__ void convertRVecToFloat3OnDevice_kernel(DeviceBuffer g void convertRVecToFloat3OnDevice(std::vector& h_rVecOutput, const std::vector& h_rVecInput) { + DeviceInformation deviceInfo; + const DeviceContext deviceContext(deviceInfo); + const int numElements = h_rVecInput.size(); DeviceBuffer d_rVecInput; - allocateDeviceBuffer(&d_rVecInput, numElements, nullptr); + allocateDeviceBuffer(&d_rVecInput, numElements, deviceContext); copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, nullptr, GpuApiCallBehavior::Sync, nullptr); DeviceBuffer d_float3Output; - allocateDeviceBuffer(&d_float3Output, numElements * DIM, nullptr); + allocateDeviceBuffer(&d_float3Output, numElements * DIM, deviceContext); std::vector h_float3Output(numElements); diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu index bc10e76066..ff7092f40c 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.cu +++ b/src/gromacs/listed_forces/gpubonded_impl.cu @@ -68,14 +68,14 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams, void* streamPtr, gmx_wallc stream_ = *static_cast(streamPtr); wcycle_ = wcycle; - allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), nullptr); + allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_); // This could be an async transfer (if the source is pinned), so // long as it uses the same stream as the kernels and we are happy // to consume additional pinned pages. 
copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(), stream_, GpuApiCallBehavior::Sync, nullptr); vTot_.resize(F_NRE); - allocateDeviceBuffer(&d_vTot_, F_NRE, nullptr); + allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_); clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_); kernelParams_.d_forceParams = d_forceParams_; @@ -204,7 +204,8 @@ void GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef { t_ilist& d_iList = d_iLists_[fType]; - reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, nullptr); + reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, + deviceContext_); copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), stream_, GpuApiCallBehavior::Async, nullptr); diff --git a/src/gromacs/listed_forces/gpubonded_impl.h b/src/gromacs/listed_forces/gpubonded_impl.h index a785b16dbb..0532b40315 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.h +++ b/src/gromacs/listed_forces/gpubonded_impl.h @@ -48,6 +48,7 @@ #ifndef GMX_LISTED_FORCES_GPUBONDED_IMPL_H #define GMX_LISTED_FORCES_GPUBONDED_IMPL_H +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/listed_forces/gpubonded.h" @@ -179,6 +180,8 @@ private: //! \brief Device-side total virial float* d_vTot_ = nullptr; + //! Dummy GPU context object + const DeviceContext deviceContext_; //! \brief Bonded GPU stream, not owned by this module CommandStream stream_; diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu.cu index 61bc231e36..b77162c1af 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.cu +++ b/src/gromacs/mdlib/leapfrog_gpu.cu @@ -316,7 +316,9 @@ void LeapFrogGpu::integrate(const float3* d_x, return; } -LeapFrogGpu::LeapFrogGpu(CommandStream commandStream) : commandStream_(commandStream) +LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream) : + deviceContext_(deviceContext), + commandStream_(commandStream) { numAtoms_ = 0; @@ -342,7 +344,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u numTempScaleValues_ = numTempScaleValues; reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_, - &numInverseMassesAlloc_, nullptr); + &numInverseMassesAlloc_, deviceContext_); copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr); @@ -350,7 +352,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u if (numTempScaleValues > 1) { reallocateDeviceBuffer(&d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, - &numTempScaleGroupsAlloc_, nullptr); + &numTempScaleGroupsAlloc_, deviceContext_); copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr); } @@ -359,7 +361,8 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u if (numTempScaleValues_ > 0) { h_lambdas_.resize(numTempScaleValues); - reallocateDeviceBuffer(&d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, nullptr); + reallocateDeviceBuffer(&d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, + deviceContext_); } } diff --git a/src/gromacs/mdlib/leapfrog_gpu.cuh b/src/gromacs/mdlib/leapfrog_gpu.cuh index ba6d1be3f2..98703c05b9 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.cuh +++ b/src/gromacs/mdlib/leapfrog_gpu.cuh @@ -44,6 +44,7 @@ 
#ifndef GMX_MDLIB_LEAPFROG_GPU_CUH #define GMX_MDLIB_LEAPFROG_GPU_CUH +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/mdtypes/group.h" @@ -62,9 +63,10 @@ class LeapFrogGpu public: /*! \brief Constructor. * + * \param[in] deviceContext Device context (dummy in CUDA). * \param[in] commandStream Device command stream to use. */ - LeapFrogGpu(CommandStream commandStream); + LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream); ~LeapFrogGpu(); /*! \brief Integrate @@ -110,6 +112,8 @@ public: class Impl; private: + //! Dummy GPU context object + const DeviceContext& deviceContext_; //! GPU stream CommandStream commandStream_; //! GPU kernel launch config diff --git a/src/gromacs/mdlib/lincs_gpu.cu b/src/gromacs/mdlib/lincs_gpu.cu index edf3e9c58a..03c1bd1d15 100644 --- a/src/gromacs/mdlib/lincs_gpu.cu +++ b/src/gromacs/mdlib/lincs_gpu.cu @@ -507,7 +507,11 @@ void LincsGpu::apply(const float3* d_x, return; } -LincsGpu::LincsGpu(int numIterations, int expansionOrder, CommandStream commandStream) : +LincsGpu::LincsGpu(int numIterations, + int expansionOrder, + const DeviceContext& deviceContext, + CommandStream commandStream) : + deviceContext_(deviceContext), commandStream_(commandStream) { kernelParams_.numIterations = numIterations; @@ -519,7 +523,7 @@ LincsGpu::LincsGpu(int numIterations, int expansionOrder, CommandStream commandS c_threadsPerBlock > 0 && ((c_threadsPerBlock & (c_threadsPerBlock - 1)) == 0), "Number of threads per block should be a power of two in order for reduction to work."); - allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, nullptr); + allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, deviceContext_); h_virialScaled_.resize(6); // The data arrays should be expanded/reallocated on first call of set() function. @@ -911,18 +915,19 @@ void LincsGpu::set(const InteractionDefinitions& idef, const t_mdatoms& md) numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads; - allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, nullptr); + allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, + deviceContext_); allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, - kernelParams_.numConstraintsThreads, nullptr); + kernelParams_.numConstraintsThreads, deviceContext_); allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, - kernelParams_.numConstraintsThreads, nullptr); + kernelParams_.numConstraintsThreads, deviceContext_); allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, - maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr); + maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_); allocateDeviceBuffer(&kernelParams_.d_massFactors, - maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr); + maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_); allocateDeviceBuffer(&kernelParams_.d_matrixA, - maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr); + maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_); } // (Re)allocate the memory, if the number of atoms has increased. 
@@ -933,7 +938,7 @@ void LincsGpu::set(const InteractionDefinitions& idef, const t_mdatoms& md) freeDeviceBuffer(&kernelParams_.d_inverseMasses); } numAtomsAlloc_ = numAtoms; - allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, nullptr); + allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, deviceContext_); } // Copy data to GPU. diff --git a/src/gromacs/mdlib/lincs_gpu.cuh b/src/gromacs/mdlib/lincs_gpu.cuh index 0fbebcf67f..77423dc323 100644 --- a/src/gromacs/mdlib/lincs_gpu.cuh +++ b/src/gromacs/mdlib/lincs_gpu.cuh @@ -44,6 +44,7 @@ #ifndef GMX_MDLIB_LINCS_GPU_CUH #define GMX_MDLIB_LINCS_GPU_CUH +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/mdlib/constr.h" #include "gromacs/mdtypes/mdatom.h" @@ -103,9 +104,10 @@ public: * * \param[in] numIterations Number of iteration for the correction of the projection. * \param[in] expansionOrder Order of the matrix inversion algorithm. + * \param[in] deviceContext Device context (dummy in CUDA). * \param[in] commandStream Device command stream. */ - LincsGpu(int numIterations, int expansionOrder, CommandStream commandStream); + LincsGpu(int numIterations, int expansionOrder, const DeviceContext& deviceContext, CommandStream commandStream); /*! \brief Destructor.*/ ~LincsGpu(); @@ -167,6 +169,8 @@ public: static bool isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop); private: + //! Dummy GPU context object + const DeviceContext& deviceContext_; //! GPU stream CommandStream commandStream_; diff --git a/src/gromacs/mdlib/settle_gpu.cu b/src/gromacs/mdlib/settle_gpu.cu index d1e8f508d5..20933baf96 100644 --- a/src/gromacs/mdlib/settle_gpu.cu +++ b/src/gromacs/mdlib/settle_gpu.cu @@ -485,7 +485,8 @@ void SettleGpu::apply(const float3* d_x, return; } -SettleGpu::SettleGpu(const gmx_mtop_t& mtop, CommandStream commandStream) : +SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream) : + deviceContext_(deviceContext), commandStream_(commandStream) { static_assert(sizeof(real) == sizeof(float), @@ -586,7 +587,7 @@ SettleGpu::SettleGpu(const gmx_mtop_t& mtop, CommandStream commandStream) : initSettleParameters(&settleParameters_, mO, mH, dOH, dHH); - allocateDeviceBuffer(&d_virialScaled_, 6, nullptr); + allocateDeviceBuffer(&d_virialScaled_, 6, deviceContext_); h_virialScaled_.resize(6); } @@ -611,7 +612,7 @@ void SettleGpu::set(const InteractionDefinitions& idef, const t_mdatoms gmx_unus ArrayRef iatoms = il_settle.iatoms; numSettles_ = il_settle.size() / nral1; - reallocateDeviceBuffer(&d_atomIds_, numSettles_, &numAtomIds_, &numAtomIdsAlloc_, nullptr); + reallocateDeviceBuffer(&d_atomIds_, numSettles_, &numAtomIds_, &numAtomIdsAlloc_, deviceContext_); h_atomIds_.resize(numSettles_); for (int i = 0; i < numSettles_; i++) { diff --git a/src/gromacs/mdlib/settle_gpu.cuh b/src/gromacs/mdlib/settle_gpu.cuh index 3816579d63..f07af017e3 100644 --- a/src/gromacs/mdlib/settle_gpu.cuh +++ b/src/gromacs/mdlib/settle_gpu.cuh @@ -45,6 +45,7 @@ #include "gmxpre.h" +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/math/functions.h" #include "gromacs/math/invertmatrix.h" @@ -200,9 +201,10 @@ public: * \param[in] mtop Topology of the system to gen the masses for O and H atoms and * target O-H and H-H distances. These values are also checked for * consistency. + * \param[in] deviceContext Device context (dummy in CUDA). * \param[in] commandStream Device stream to use. 
 */
-    SettleGpu(const gmx_mtop_t& mtop, CommandStream commandStream);
+    SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream);
     ~SettleGpu();
@@ -250,6 +252,8 @@ public:
     void set(const InteractionDefinitions& idef, const t_mdatoms& md);
 private:
+    //! Dummy GPU context object
+    const DeviceContext& deviceContext_;
     //! GPU stream
     CommandStream commandStream_;
diff --git a/src/gromacs/mdlib/tests/constrtestrunners.cu b/src/gromacs/mdlib/tests/constrtestrunners.cu
index 322c3beec9..5c0a007ee4 100644
--- a/src/gromacs/mdlib/tests/constrtestrunners.cu
+++ b/src/gromacs/mdlib/tests/constrtestrunners.cu
@@ -70,8 +70,11 @@ namespace test
  */
 void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc)
 {
-    auto lincsGpu =
-            std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder, nullptr);
+    DeviceInformation deviceInfo;
+    const DeviceContext deviceContext(deviceInfo);
+
+    auto lincsGpu = std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder,
+                                               deviceContext, nullptr);
     bool updateVelocities = true;
     int numAtoms = testData->numAtoms_;
@@ -81,9 +84,9 @@ void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc)
     PbcAiuc pbcAiuc;
     setPbcAiuc(pbc.ndim_ePBC, pbc.box, &pbcAiuc);
-    allocateDeviceBuffer(&d_x, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_v, numAtoms, nullptr);
+    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
     copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, nullptr,
                        GpuApiCallBehavior::Sync, nullptr);
diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu
index 1c3afdd1b0..b794149ddb 100644
--- a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu
+++ b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu
@@ -66,6 +66,9 @@ namespace test
 void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps)
 {
+    DeviceInformation deviceInfo;
+    const DeviceContext deviceContext(deviceInfo);
+
     int numAtoms = testData->numAtoms_;
     float3* h_x = reinterpret_cast<float3*>(testData->x_.data());
@@ -75,17 +78,17 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps)
     float3 *d_x, *d_xp, *d_v, *d_f;
-    allocateDeviceBuffer(&d_x, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_v, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_f, numAtoms, nullptr);
+    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_f, numAtoms, deviceContext);
     copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
-    auto integrator = std::make_unique<LeapFrogGpu>(nullptr);
+    auto integrator = std::make_unique<LeapFrogGpu>(deviceContext, nullptr);
     integrator->set(testData->mdAtoms_, testData->numTCoupleGroups_, testData->mdAtoms_.cTC);
diff --git a/src/gromacs/mdlib/tests/settletestrunners.cu b/src/gromacs/mdlib/tests/settletestrunners.cu
index a14b47e819..6ebc6688da 100644
--- a/src/gromacs/mdlib/tests/settletestrunners.cu
+++ b/src/gromacs/mdlib/tests/settletestrunners.cu
@@ -86,7 +86,10 @@ void applySettleGpu(SettleTestData* testData,
     // TODO: Here we should check that at least 1 suitable GPU is available
     GMX_RELEASE_ASSERT(canPerformGpuDetection(), "Can't detect CUDA-capable GPUs.");
-    auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, nullptr);
+    DeviceInformation deviceInfo;
+    const DeviceContext deviceContext(deviceInfo);
+
+    auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, deviceContext, nullptr);
     settleGpu->set(*testData->idef_, testData->mdatoms_);
     PbcAiuc pbcAiuc;
@@ -100,9 +103,9 @@ void applySettleGpu(SettleTestData* testData,
     float3* h_xp = (float3*)(as_rvec_array(testData->xPrime_.data()));
     float3* h_v = (float3*)(as_rvec_array(testData->v_.data()));
-    allocateDeviceBuffer(&d_x, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_v, numAtoms, nullptr);
+    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
     copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index c77e1924ed..a8e5a94cc6 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -175,9 +175,9 @@ UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir,
             : commandStream_ = nullptr;
-    integrator_ = std::make_unique<LeapFrogGpu>(commandStream_);
-    lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, commandStream_);
-    settleGpu_ = std::make_unique<SettleGpu>(mtop, commandStream_);
+    integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, commandStream_);
+    lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, commandStream_);
+    settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, commandStream_);
     coordinateScalingKernelLaunchConfig_.blockSize[0] = c_threadsPerBlock;
     coordinateScalingKernelLaunchConfig_.blockSize[1] = 1;
@@ -205,10 +205,10 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x,
     numAtoms_ = md.nr;
-    reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, nullptr);
+    reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, deviceContext_);
     reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_,
-                           &numInverseMassesAlloc_, nullptr);
+                           &numInverseMassesAlloc_, deviceContext_);
     // Integrator should also update something, but it does not even have a method yet
     integrator_->set(md, numTempScaleValues, md.cTC);
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index b835c7cf02..75b6814de0 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -163,6 +163,8 @@ public:
     static bool isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop);
 private:
+    //! Dummy GPU context object
+    const DeviceContext deviceContext_;
     //! GPU stream
     CommandStream commandStream_ = nullptr;
     //! GPU kernel launch config
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 9b31070072..b233b0737c 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1566,8 +1566,8 @@ int Mdrunner::mdrunner()
             fr->nbv->gpu_nbv != nullptr ?
Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal) : nullptr; - const void* deviceContext = pme_gpu_get_device_context(fr->pmedata); - const int paddingSize = pme_gpu_get_padding_size(fr->pmedata); + const DeviceContext& deviceContext = *pme_gpu_get_device_context(fr->pmedata); + const int paddingSize = pme_gpu_get_padding_size(fr->pmedata); GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index d75cd78ea1..034e7eb604 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -59,6 +59,7 @@ #include "locality.h" +class DeviceContext; class GpuEventSynchronizer; struct gmx_wallcycle; @@ -100,8 +101,6 @@ public: * * \todo Make a \p CommandStream visible in the CPU parts of the code so we * will not have to pass a void*. - * \todo Make a \p DeviceContext object visible in CPU parts of the code so we - * will not have to pass a void*. * * \param[in] pmeStream Device PME stream, nullptr allowed. * \param[in] localStream Device NBNXM local stream, nullptr allowed. @@ -111,13 +110,13 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - StatePropagatorDataGpu(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle); + StatePropagatorDataGpu(const void* pmeStream, + const void* localStream, + const void* nonLocalStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle); /*! \brief Constructor to use in PME-only rank and in tests. * @@ -135,11 +134,11 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - StatePropagatorDataGpu(const void* pmeStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle); + StatePropagatorDataGpu(const void* pmeStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle); //! 
Move constructor StatePropagatorDataGpu(StatePropagatorDataGpu&& other) noexcept; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index 15f054eafa..1029dd220f 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -57,7 +57,7 @@ class StatePropagatorDataGpu::Impl StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */, const void* /* localStream */, const void* /* nonLocalStream */, - const void* /* deviceContext */, + const DeviceContext& /* deviceContext */, GpuApiCallBehavior /* transferKind */, int /* paddingSize */, gmx_wallcycle* /* wcycle */) : @@ -66,7 +66,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */ } StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */, - const void* /* deviceContext */, + const DeviceContext& /* deviceContext */, GpuApiCallBehavior /* transferKind */, int /* paddingSize */, gmx_wallcycle* /* wcycle */) : diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index a138bcbb62..679bf2544a 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -101,8 +101,6 @@ public: * * \todo Make a \p CommandStream visible in the CPU parts of the code so we * will not have to pass a void*. - * \todo Make a \p DeviceContext object visible in CPU parts of the code so we - * will not have to pass a void*. * * \param[in] pmeStream Device PME stream, nullptr allowed. * \param[in] localStream Device NBNXM local stream, nullptr allowed. @@ -112,13 +110,13 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - Impl(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle); + Impl(const void* pmeStream, + const void* localStream, + const void* nonLocalStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle); /*! \brief Constructor to use in PME-only rank and in tests. * @@ -136,11 +134,11 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - Impl(const void* pmeStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle); + Impl(const void* pmeStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle); ~Impl(); @@ -380,10 +378,8 @@ private: //! An array of events that indicate D2H copy of forces is complete (one event for each atom locality) EnumerationArray fReadyOnHost_; - /*! \brief GPU context (for OpenCL builds) - * \todo Make a Context class usable in CPU code - */ - DeviceContext deviceContext_ = nullptr; + //! GPU context (for OpenCL builds) + const DeviceContext& deviceContext_; //! Default GPU calls behavior GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async; //! 
Padding size for the coordinates buffer diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 4b385a5a7b..d88f469711 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -50,6 +50,7 @@ # include "gromacs/gpu_utils/cudautils.cuh" # endif # include "gromacs/gpu_utils/devicebuffer.h" +# include "gromacs/gpu_utils/gputraits.h" # if GMX_GPU == GMX_GPU_OPENCL # include "gromacs/gpu_utils/oclutils.h" # endif @@ -64,13 +65,14 @@ namespace gmx { -StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle) : +StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, + const void* localStream, + const void* nonLocalStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle) : + deviceContext_(deviceContext), transferKind_(transferKind), paddingSize_(paddingSize), wcycle_(wcycle) @@ -81,13 +83,11 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, // TODO: Refactor when the StreamManager is introduced. if (GMX_GPU == GMX_GPU_OPENCL) { - GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds."); GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds."); // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context. - pmeStream_ = *static_cast(pmeStream); - updateStream_ = *static_cast(pmeStream); - deviceContext_ = *static_cast(deviceContext); + pmeStream_ = *static_cast(pmeStream); + updateStream_ = *static_cast(pmeStream); GMX_UNUSED_VALUE(localStream); GMX_UNUSED_VALUE(nonLocalStream); } @@ -113,7 +113,6 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, stat = cudaStreamCreate(&updateStream_); CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu"); # endif - GMX_UNUSED_VALUE(deviceContext); } // Map the atom locality to the stream that will be used for coordinates, @@ -132,11 +131,12 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, fCopyStreams_[AtomLocality::All] = updateStream_; } -StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle) : +StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle) : + deviceContext_(deviceContext), transferKind_(transferKind), paddingSize_(paddingSize), wcycle_(wcycle) @@ -144,12 +144,6 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths."); - if (GMX_GPU == GMX_GPU_OPENCL) - { - GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds."); - deviceContext_ = *static_cast(deviceContext); - } - GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set."); pmeStream_ = *static_cast(pmeStream); @@ -551,22 +545,22 @@ int StatePropagatorDataGpu::Impl::numAtomsAll() } -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int 
paddingSize, - gmx_wallcycle* wcycle) : +StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, + const void* localStream, + const void* nonLocalStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle) : impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle)) { } -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, - const void* deviceContext, - GpuApiCallBehavior transferKind, - int paddingSize, - gmx_wallcycle* wcycle) : +StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, + const DeviceContext& deviceContext, + GpuApiCallBehavior transferKind, + int paddingSize, + gmx_wallcycle* wcycle) : impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle)) { } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index fa290e6473..7467f95b69 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -51,6 +51,7 @@ // TODO Remove this comment when the above order issue is resolved #include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/pmalloc_cuda.h" @@ -537,21 +538,21 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte iTimers.didPairlistH2D = true; } - DeviceContext context = nullptr; - - reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, context); + reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, + DeviceContext()); copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); - reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context); + reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, + DeviceContext()); copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit, - &d_plist->nimask, &d_plist->imask_nalloc, context); + &d_plist->nimask, &d_plist->imask_nalloc, DeviceContext()); reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, - &d_plist->excl_nalloc, context); + &d_plist->excl_nalloc, DeviceContext()); copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, GpuApiCallBehavior::Async, bDoTime ? 
iTimers.pl_h2d.fetchNextEvent() : nullptr); @@ -857,9 +858,9 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv const int maxNumColumns = gridSet.numColumnsMax(); reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(), - &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, nullptr); + &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext()); reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns * gridSet.grids().size(), - &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, nullptr); + &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, DeviceContext()); for (unsigned int g = 0; g < gridSet.grids().size(); g++) { @@ -873,7 +874,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv const int* cxy_ind = grid.cxy_ind().data(); reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, - &gpu_nbv->atomIndicesSize_alloc, nullptr); + &gpu_nbv->atomIndicesSize_alloc, DeviceContext()); if (atomIndicesSize > 0) { @@ -948,7 +949,8 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int* cell, if (natoms_total > 0) { - reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr); + reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, + DeviceContext()); copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr); } diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 05a270b7f0..013bd093a0 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -70,6 +70,7 @@ #include "thread_mpi/atomic.h" +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gputraits_ocl.h" #include "gromacs/gpu_utils/oclutils.h" #include "gromacs/hardware/hw_info.h" diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index 59ee706c4f..5056924224 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -134,13 +134,14 @@ static void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tabl array_format.image_channel_data_type = CL_FLOAT; array_format.image_channel_order = CL_R; - coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - &array_format, tabsize, 1, 0, ftmp, &cl_error); + coul_tab = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_WRITE | + CL_MEM_COPY_HOST_PTR, &array_format, tabsize, 1, 0, ftmp, &cl_error); */ - coul_tab = clCreateBuffer( - runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, - tables.tableF.size() * sizeof(cl_float), const_cast(tables.tableF.data()), &cl_error); + coul_tab = clCreateBuffer(runData->deviceContext.context(), + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + tables.tableF.size() * sizeof(cl_float), + const_cast(tables.tableF.data()), &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -158,23 +159,24 @@ static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, gmx_device_runtim ad->ntypes = ntypes; - ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, - SHIFTS * sizeof(nbnxn_atomdata_t::shift_vec[0]), nullptr, &cl_error); + ad->shift_vec = + clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY | 
CL_MEM_HOST_WRITE_ONLY, + SHIFTS * sizeof(nbnxn_atomdata_t::shift_vec[0]), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); ad->bShiftVecUploaded = CL_FALSE; - ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, + ad->fshift = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, SHIFTS * sizeof(nb_staging_t::fshift[0]), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); - ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, + ad->e_lj = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, sizeof(float), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); - ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, + ad->e_el = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, sizeof(float), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -330,11 +332,11 @@ static void init_nbparam(cl_nbparam_t* nbp, array_format.image_channel_data_type = CL_FLOAT; array_format.image_channel_order = CL_R; - nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE, - &array_format, 1, 1, 0, nullptr, &cl_error); + nbp->coulomb_tab_climg2d = clCreateImage2D(runData->deviceContext.context(), + CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error); */ - nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, + nbp->coulomb_tab_climg2d = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -352,12 +354,13 @@ static void init_nbparam(cl_nbparam_t* nbp, array_format.image_channel_data_type = CL_FLOAT; array_format.image_channel_order = CL_R; - nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY | + nbp->nbfp_climg2d = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error); */ nbp->nbfp_climg2d = clCreateBuffer( - runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + runData->deviceContext.context(), + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp * sizeof(cl_float), const_cast(nbatParams.nbfp.data()), &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -366,12 +369,13 @@ static void init_nbparam(cl_nbparam_t* nbp, { /* Switched from using textures to using buffers */ // TODO: decide which alternative is most efficient - textures or buffers. 
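For reference, the OpenCL call sites touched above all follow the same accessor pattern: the raw cl_context is no longer read from a bare struct field but obtained through DeviceContext::context(). A minimal sketch of that pattern is shown below; makeReadOnlyBuffer is a hypothetical helper used only for illustration, and the includes are assumed to be the ones already present in nbnxm_ocl_data_mgmt.cpp.

    // Sketch of the new accessor pattern: the cl_context comes from the
    // DeviceContext accessor instead of a raw runData->context field.
    static cl_mem makeReadOnlyBuffer(const DeviceContext& deviceContext, size_t numBytes)
    {
        cl_int cl_error;
        cl_mem buffer = clCreateBuffer(deviceContext.context(), CL_MEM_READ_ONLY, numBytes,
                                       nullptr, &cl_error);
        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                           ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
        return buffer;
    }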
- /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | + /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/ - nbp->nbfp_comb_climg2d = clCreateBuffer( - runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, - nnbfp_comb * sizeof(cl_float), const_cast(nbatParams.nbfp_comb.data()), - &cl_error); + nbp->nbfp_comb_climg2d = + clCreateBuffer(runData->deviceContext.context(), + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + nnbfp_comb * sizeof(cl_float), + const_cast(nbatParams.nbfp_comb.data()), &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); } @@ -382,9 +386,9 @@ static void init_nbparam(cl_nbparam_t* nbp, // don't accept nullptr values for image2D parameters. /* Switched from using textures to using buffers */ // TODO: decide which alternative is most efficient - textures or buffers. - /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE, - &array_format, 1, 1, 0, nullptr, &cl_error);*/ - nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, + /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->deviceContext.context(), + CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error);*/ + nbp->nbfp_comb_climg2d = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -515,7 +519,7 @@ static void nbnxn_gpu_create_context(gmx_device_runtime_data_t* runtimeData, rank, deviceInfo->device_name, cl_error, ocl_get_error_string(cl_error).c_str()); } - runtimeData->context = context; + runtimeData->deviceContext.setContext(context); } /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. 
*/ @@ -636,8 +640,8 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, snew(nb->timings, 1); /* set device info, just point it to the right GPU among the detected ones */ - nb->deviceInfo = deviceInfo; - snew(nb->dev_rundata, 1); + nb->deviceInfo = deviceInfo; + nb->dev_rundata = new gmx_device_runtime_data_t(); /* init nbst */ pmalloc(reinterpret_cast(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj)); @@ -662,8 +666,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, nbnxn_gpu_create_context(nb->dev_rundata, nb->deviceInfo, rank); /* local/non-local GPU streams */ - nb->stream[InteractionLocality::Local] = clCreateCommandQueue( - nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); + nb->stream[InteractionLocality::Local] = + clCreateCommandQueue(nb->dev_rundata->deviceContext.context(), + nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); if (CL_SUCCESS != cl_error) { gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank, @@ -674,8 +679,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, { init_plist(nb->plist[InteractionLocality::NonLocal]); - nb->stream[InteractionLocality::NonLocal] = clCreateCommandQueue( - nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); + nb->stream[InteractionLocality::NonLocal] = + clCreateCommandQueue(nb->dev_rundata->deviceContext.context(), + nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); if (CL_SUCCESS != cl_error) { gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", @@ -788,21 +794,23 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte } // TODO most of this function is same in CUDA and OpenCL, move into the header - DeviceContext context = nb->dev_rundata->context; + const DeviceContext& deviceContext = nb->dev_rundata->deviceContext; - reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, context); + reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, + deviceContext); copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); - reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context); + reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, + deviceContext); copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit, - &d_plist->nimask, &d_plist->imask_nalloc, context); + &d_plist->nimask, &d_plist->imask_nalloc, deviceContext); reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, - &d_plist->excl_nalloc, context); + &d_plist->excl_nalloc, deviceContext); copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, GpuApiCallBehavior::Async, bDoTime ? 
iTimers.pl_h2d.fetchNextEvent() : nullptr); @@ -865,19 +873,21 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) freeDeviceBuffer(&d_atdat->atom_types); } - d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, + d_atdat->f = clCreateBuffer(nb->dev_rundata->deviceContext.context(), + CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, nalloc * DIM * sizeof(nbat->out[0].f[0]), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); - d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, + d_atdat->xq = clCreateBuffer(nb->dev_rundata->deviceContext.context(), + CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, nalloc * sizeof(cl_float4), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); if (useLjCombRule(nb->nbparam->vdwtype)) { - d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, + d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->deviceContext.context(), CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, nalloc * sizeof(cl_float2), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, @@ -885,7 +895,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) } else { - d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, + d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->deviceContext.context(), CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, nalloc * sizeof(int), nullptr, &cl_error); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, @@ -969,19 +979,9 @@ static void free_gpu_device_runtime_data(gmx_device_runtime_data_t* runData) return; } - cl_int gmx_unused cl_error; - - if (runData->context) - { - cl_error = clReleaseContext(runData->context); - GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, - ("clReleaseContext failed: " + ocl_get_error_string(cl_error)).c_str()); - runData->context = nullptr; - } - if (runData->program) { - cl_error = clReleaseProgram(runData->program); + cl_int cl_error = clReleaseProgram(runData->program); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseProgram failed: " + ocl_get_error_string(cl_error)).c_str()); runData->program = nullptr; @@ -1076,7 +1076,7 @@ void gpu_free(NbnxmGpu* nb) } free_gpu_device_runtime_data(nb->dev_rundata); - sfree(nb->dev_rundata); + delete nb->dev_rundata; /* Free timers and timings */ delete nb->timers; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp index 8a4e217d84..0ba3345780 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp @@ -200,9 +200,10 @@ void nbnxn_gpu_compile_kernels(NbnxmGpu* nb) { /* TODO when we have a proper MPI-aware logging module, the log output here should be written there */ - program = gmx::ocl::compileProgram( - stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl", extraDefines, - nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, nb->deviceInfo->deviceVendor); + program = + gmx::ocl::compileProgram(stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl", + extraDefines, nb->dev_rundata->deviceContext.context(), + nb->deviceInfo->oclDeviceId, nb->deviceInfo->deviceVendor); } catch (gmx::GromacsException& e) {
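Taken together, the new calling convention on the CUDA-only code paths and in the tests is roughly the following. This is a minimal sketch using only calls that appear in this change; the numeric arguments are placeholder values, the nullptr stands for the default stream as in the test runners, and the usual GROMACS headers are assumed.

    // Construct a (dummy) context from device information, then pass it by
    // const reference wherever a GPU object or device buffer is created.
    DeviceInformation deviceInfo;
    const DeviceContext deviceContext(deviceInfo);

    float3* d_x = nullptr;
    const int numAtoms = 128; // placeholder value
    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);

    // Placeholder iteration count and expansion order, default stream.
    auto lincsGpu = std::make_unique<LincsGpu>(1, 4, deviceContext, nullptr);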