From 3d3b09a81638135f3025883c46a87faca03fa417 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov
Date: Sat, 13 Mar 2021 16:13:14 +0300
Subject: [PATCH] Unify gpu_init_atomdata(...) function

Refs #2608
---
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        |  89 -------------
 src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp     | 134 +++++++++++++++---
 .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp      | 100 --------------
 .../nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp       |  72 ----------
 4 files changed, 119 insertions(+), 276 deletions(-)

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index ac77badc41..e6eb1c2d51 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -319,95 +319,6 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
     }
 }
 
-void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
-{
-    int                  nalloc, natoms;
-    bool                 realloced;
-    bool                 bDoTime       = nb->bDoTime;
-    Nbnxm::GpuTimers*    timers        = nb->timers;
-    NBAtomData*          d_atdat       = nb->atdat;
-    const DeviceContext& deviceContext = *nb->deviceContext_;
-    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
-
-    natoms    = nbat->numAtoms();
-    realloced = false;
-
-    if (bDoTime)
-    {
-        /* time async copy */
-        timers->atdat.openTimingRegion(localStream);
-    }
-
-    /* need to reallocate if we have to copy more atoms than the amount of space
-       available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
-    if (natoms > d_atdat->numAtomsAlloc)
-    {
-        nalloc = over_alloc_small(natoms);
-
-        /* free up first if the arrays have already been initialized */
-        if (d_atdat->numAtomsAlloc != -1)
-        {
-            freeDeviceBuffer(&d_atdat->f);
-            freeDeviceBuffer(&d_atdat->xq);
-            freeDeviceBuffer(&d_atdat->atomTypes);
-            freeDeviceBuffer(&d_atdat->ljComb);
-        }
-
-        allocateDeviceBuffer(&d_atdat->f, nalloc, deviceContext);
-        allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext);
-        if (useLjCombRule(nb->nbparam->vdwType))
-        {
-            allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext);
-        }
-        else
-        {
-            allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext);
-        }
-
-        d_atdat->numAtomsAlloc = nalloc;
-        realloced              = true;
-    }
-
-    d_atdat->numAtoms      = natoms;
-    d_atdat->numAtomsLocal = nbat->natoms_local;
-
-    /* need to clear GPU f output if realloc happened */
-    if (realloced)
-    {
-        nbnxn_cuda_clear_f(nb, nalloc);
-    }
-
-    if (useLjCombRule(nb->nbparam->vdwType))
-    {
-        static_assert(sizeof(d_atdat->ljComb[0]) == sizeof(Float2),
-                      "Size of the LJ parameters element should be equal to the size of float2.");
-        copyToDeviceBuffer(&d_atdat->ljComb,
-                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
-                           0,
-                           natoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           nullptr);
-    }
-    else
-    {
-        static_assert(sizeof(d_atdat->atomTypes[0]) == sizeof(nbat->params().type[0]),
-                      "Sizes of host- and device-side atom types should be the same.");
-        copyToDeviceBuffer(&d_atdat->atomTypes,
-                           nbat->params().type.data(),
-                           0,
-                           natoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           nullptr);
-    }
-
-    if (bDoTime)
-    {
-        timers->atdat.closeTimingRegion(localStream);
-    }
-}
-
 void gpu_free(NbnxmGpu* nb)
 {
     if (nb == nullptr)
diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
index bd056bcee8..f92f636e19 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
@@ -77,6 +77,30 @@
 
 namespace Nbnxm
 {
 
+/*! \brief Flush the command queue of \p deviceStream in OpenCL builds.
+ *
+ * In non-OpenCL builds this function does nothing.
+ *
+ * \param[in] deviceStream  Stream whose enqueued commands should be submitted.
+ * \throws gmx::InternalError if clFlush() reports an error.
+ */
+inline void issueClFlushInStream(const DeviceStream& deviceStream)
+{
+#if GMX_GPU_OPENCL
+    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
+     * in the stream after marking an event in it in order to be able to sync with
+     * the event from another stream.
+     */
+    cl_int cl_error = clFlush(deviceStream.stream());
+    if (cl_error != CL_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
+    }
+#else
+    GMX_UNUSED_VALUE(deviceStream);
+#endif
+}
+
 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                     NBParamGpu*                  nbp,
                                     const DeviceContext&         deviceContext)
@@ -316,6 +340,101 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
     d_plist->haveFreshList = true;
 }
 
+//! This function is documented in the header file
+void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
+{
+    bool                 bDoTime       = nb->bDoTime;
+    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
+    NBAtomData*          atdat         = nb->atdat;
+    const DeviceContext& deviceContext = *nb->deviceContext_;
+    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
+
+    int  numAtoms  = nbat->numAtoms();
+    bool realloced = false;
+
+    if (bDoTime)
+    {
+        /* time async copy */
+        timers->atdat.openTimingRegion(localStream);
+    }
+
+    /* (Re)allocate when more atoms must be copied than there is space for;
+       numAtomsAlloc == -1 means the buffers have never been allocated. */
+    if (numAtoms > atdat->numAtomsAlloc)
+    {
+        int numAlloc = over_alloc_small(numAtoms);
+
+        /* free up first if the arrays have already been initialized */
+        if (atdat->numAtomsAlloc != -1)
+        {
+            freeDeviceBuffer(&atdat->f);
+            freeDeviceBuffer(&atdat->xq);
+            freeDeviceBuffer(&atdat->ljComb);
+            freeDeviceBuffer(&atdat->atomTypes);
+        }
+
+
+        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
+        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
+
+        if (useLjCombRule(nb->nbparam->vdwType))
+        {
+            // Two Lennard-Jones parameters per atom
+            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
+        }
+        else
+        {
+            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
+        }
+
+        atdat->numAtomsAlloc = numAlloc;
+        realloced            = true;
+    }
+
+    atdat->numAtoms      = numAtoms;
+    atdat->numAtomsLocal = nbat->natoms_local;
+
+    /* need to clear GPU f output if realloc happened */
+    if (realloced)
+    {
+        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
+    }
+
+    if (useLjCombRule(nb->nbparam->vdwType))
+    {
+        static_assert(
+                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
+                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
+        copyToDeviceBuffer(&atdat->ljComb,
+                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
+                           0,
+                           numAtoms,
+                           localStream,
+                           GpuApiCallBehavior::Async,
+                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+    }
+    else
+    {
+        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
+                      "Sizes of host- and device-side atom types should be the same.");
+        copyToDeviceBuffer(&atdat->atomTypes,
+                           nbat->params().type.data(),
+                           0,
+                           numAtoms,
+                           localStream,
+                           GpuApiCallBehavior::Async,
+                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+    }
+
+    if (bDoTime)
+    {
+        timers->atdat.closeTimingRegion(localStream);
+    }
+
+    /* kick off the tasks enqueued above to ensure concurrency with the search */
+    issueClFlushInStream(localStream);
+}
+
 //! This function is documented in the header file
 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
 {
@@ -430,21 +549,6 @@ bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality
     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 }
 
-inline void issueClFlushInStream(const DeviceStream& gmx_unused deviceStream)
-{
-#if GMX_GPU_OPENCL
-    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
-     * in the stream after marking an event in it in order to be able to sync with
-     * the event from another stream.
-     */
-    cl_int cl_error = clFlush(deviceStream.stream());
-    if (cl_error != CL_SUCCESS)
-    {
-        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
-    }
-#endif
-}
-
 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
 {
     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
index b927bd3196..36e538d22b 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -426,106 +426,6 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
     }
 }
 
-//! This function is documented in the header file
-void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
-{
-    cl_int               cl_error;
-    int                  nalloc, natoms;
-    bool                 realloced;
-    bool                 bDoTime       = nb->bDoTime;
-    Nbnxm::GpuTimers*    timers        = nb->timers;
-    NBAtomData*          d_atdat       = nb->atdat;
-    const DeviceContext& deviceContext = *nb->deviceContext_;
-    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
-
-    natoms    = nbat->numAtoms();
-    realloced = false;
-
-    if (bDoTime)
-    {
-        /* time async copy */
-        timers->atdat.openTimingRegion(localStream);
-    }
-
-    /* need to reallocate if we have to copy more atoms than the amount of space
-       available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
-    if (natoms > d_atdat->numAtomsAlloc)
-    {
-        nalloc = over_alloc_small(natoms);
-
-        /* free up first if the arrays have already been initialized */
-        if (d_atdat->numAtomsAlloc != -1)
-        {
-            freeDeviceBuffer(&d_atdat->f);
-            freeDeviceBuffer(&d_atdat->xq);
-            freeDeviceBuffer(&d_atdat->ljComb);
-            freeDeviceBuffer(&d_atdat->atomTypes);
-        }
-
-
-        allocateDeviceBuffer(&d_atdat->f, nalloc, deviceContext);
-        allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext);
-
-        if (useLjCombRule(nb->nbparam->vdwType))
-        {
-            // Two Lennard-Jones parameters per atom
-            allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext);
-        }
-        else
-        {
-            allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext);
-        }
-
-        d_atdat->numAtomsAlloc = nalloc;
-        realloced              = true;
-    }
-
-    d_atdat->numAtoms      = natoms;
-    d_atdat->numAtomsLocal = nbat->natoms_local;
-
-    /* need to clear GPU f output if realloc happened */
-    if (realloced)
-    {
-        nbnxn_ocl_clear_f(nb, nalloc);
-    }
-
-    if (useLjCombRule(nb->nbparam->vdwType))
-    {
-        static_assert(
-                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
-                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
-        copyToDeviceBuffer(&d_atdat->ljComb,
-                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
-                           0,
-                           natoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
-    }
-    else
-    {
-        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
-                      "Sizes of host- and device-side atom types should be the same.");
-        copyToDeviceBuffer(&d_atdat->atomTypes,
-                           nbat->params().type.data(),
-                           0,
-                           natoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
-    }
-
-    if (bDoTime)
-    {
-        timers->atdat.closeTimingRegion(localStream);
-    }
-
-    /* kick off the tasks enqueued above to ensure concurrency with the search */
-    cl_error = clFlush(localStream.stream());
-    GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
-                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-}
-
 /*! \brief Releases an OpenCL kernel pointer */
 static void free_kernel(cl_kernel* kernel_ptr)
 {
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
index e5a1a4e344..cc4f9f3a6b 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
@@ -226,78 +226,6 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
     }
 }
 
-void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
-{
-    GMX_ASSERT(!nb->bDoTime, "Timing on SYCL not supported yet");
-    NBAtomData*          atdat         = nb->atdat;
-    const DeviceContext& deviceContext = *nb->deviceContext_;
-    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
-
-    int  numAtoms    = nbat->numAtoms();
-    bool reallocated = false;
-    if (numAtoms > atdat->numAtomsAlloc)
-    {
-        int numAlloc = over_alloc_small(numAtoms);
-
-        /* free up first if the arrays have already been initialized */
-        if (atdat->numAtomsAlloc != -1)
-        {
-            freeDeviceBuffer(&atdat->f);
-            freeDeviceBuffer(&atdat->xq);
-            freeDeviceBuffer(&atdat->atomTypes);
-            freeDeviceBuffer(&atdat->ljComb);
-        }
-
-        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
-        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
-        if (useLjCombRule(nb->nbparam->vdwType))
-        {
-            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
-        }
-        else
-        {
-            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
-        }
-
-        atdat->numAtomsAlloc = numAlloc;
-        reallocated          = true;
-    }
-
-    atdat->numAtoms      = numAtoms;
-    atdat->numAtomsLocal = nbat->natoms_local;
-
-    /* need to clear GPU f output if realloc happened */
-    if (reallocated)
-    {
-        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
-    }
-
-    if (useLjCombRule(nb->nbparam->vdwType))
-    {
-        GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(Float2),
-                   "Size of the LJ parameters element should be equal to the size of float2.");
-        copyToDeviceBuffer(&atdat->ljComb,
-                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
-                           0,
-                           numAtoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           nullptr);
-    }
-    else
-    {
-        GMX_ASSERT(atdat->atomTypes.elementSize() == sizeof(nbat->params().type[0]),
-                   "Sizes of host- and device-side atom types should be the same.");
-        copyToDeviceBuffer(&atdat->atomTypes,
-                           nbat->params().type.data(),
-                           0,
-                           numAtoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           nullptr);
-    }
-}
-
 void gpu_free(NbnxmGpu* nb)
 {
     if (nb == nullptr)
-- 
2.22.0