From a6416394629f5349172082f18d4183ea5b73996c Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Tue, 16 Mar 2021 15:53:21 +0300 Subject: [PATCH] Unify gpu_init_atomdata(...) function Refs #2608 --- .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 66 +++-------- src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp | 17 +++ .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp | 109 +++--------------- src/gromacs/nbnxm/opencl/nbnxm_ocl_kernels.cl | 19 +-- src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h | 7 +- .../nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp | 31 ++--- 6 files changed, 58 insertions(+), 191 deletions(-) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index e6eb1c2d51..804a8ea180 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -92,14 +92,14 @@ namespace Nbnxm */ static unsigned int gpu_min_ci_balanced_factor = 44; -/* Fw. decl. */ -static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb); - /*! Initializes the atomdata structure first time, it only gets filled at pair-search. */ -static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext) +static void init_atomdata_first(NBAtomData* ad, + int nTypes, + const DeviceContext& deviceContext, + const DeviceStream& localStream) { - ad->numTypes = ntypes; + ad->numTypes = nTypes; allocateDeviceBuffer(&ad->shiftVec, SHIFTS, deviceContext); ad->shiftVecUploaded = false; @@ -107,6 +107,10 @@ static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& allocateDeviceBuffer(&ad->eLJ, 1, deviceContext); allocateDeviceBuffer(&ad->eElec, 1, deviceContext); + clearDeviceBufferAsync(&ad->fShift, 0, SHIFTS, localStream); + clearDeviceBufferAsync(&ad->eElec, 0, 1, localStream); + clearDeviceBufferAsync(&ad->eLJ, 0, 1, localStream); + /* initialize to nullptr poiters to data that is not allocated here and will need reallocation in nbnxn_cuda_init_atomdata */ ad->xq = nullptr; @@ -174,19 +178,6 @@ static void init_nbparam(NBParamGpu* nbp, } } -/*! Initializes simulation constant data. */ -static void cuda_init_const(NbnxmGpu* nb, - const interaction_const_t* ic, - const PairlistParams& listParams, - const nbnxn_atomdata_t::Params& nbatParams) -{ - init_atomdata_first(nb->atdat, nbatParams.numTypes, *nb->deviceContext_); - init_nbparam(nb->nbparam, ic, listParams, nbatParams, *nb->deviceContext_); - - /* clear energy and shift force outputs */ - nbnxn_cuda_clear_e_fshift(nb); -} - NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, const interaction_const_t* ic, const PairlistParams& listParams, @@ -218,8 +209,8 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, /* local/non-local GPU streams */ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal), "Local non-bonded stream should be initialized to use GPU for non-bonded."); - nb->deviceStreams[InteractionLocality::Local] = - &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal); + const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal); + nb->deviceStreams[InteractionLocality::Local] = &localStream; if (nb->bUseTwoStreams) { init_plist(nb->plist[InteractionLocality::NonLocal]); @@ -251,7 +242,10 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, /* pick L1 cache configuration */ cuda_set_cacheconfig(); - cuda_init_const(nb, ic, listParams, nbat->params()); + const nbnxn_atomdata_t::Params& nbatParams = nbat->params(); + const DeviceContext& deviceContext = *nb->deviceContext_; + init_atomdata_first(nb->atdat, nbatParams.numTypes, deviceContext, localStream); + init_nbparam(nb->nbparam, ic, listParams, nbatParams, deviceContext); nb->atomIndicesSize = 0; nb->atomIndicesSize_alloc = 0; @@ -289,36 +283,6 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) } } -/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */ -static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear) -{ - NBAtomData* adat = nb->atdat; - const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; - clearDeviceBufferAsync(&adat->f, 0, natoms_clear, localStream); -} - -/*! Clears nonbonded shift force output array and energy outputs on the GPU. */ -static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb) -{ - NBAtomData* adat = nb->atdat; - const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; - - clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream); - clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream); - clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream); -} - -void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) -{ - nbnxn_cuda_clear_f(nb, nb->atdat->numAtoms); - /* clear shift force array and energies if the outputs were - used in the current step */ - if (computeVirial) - { - nbnxn_cuda_clear_e_fshift(nb); - } -} - void gpu_free(NbnxmGpu* nb) { if (nb == nullptr) diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp index 50519ced6d..b86b785b94 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp +++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp @@ -69,6 +69,7 @@ #include "gromacs/nbnxm/gpu_data_mgmt.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/timing/gpu_timing.h" +#include "gromacs/pbcutil/ishift.h" #include "gromacs/utility/cstringutil.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/fatalerror.h" @@ -429,6 +430,22 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) issueClFlushInStream(localStream); } +void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) +{ + NBAtomData* adat = nb->atdat; + const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; + // Clear forces + clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream); + // Clear shift force array and energies if the outputs were used in the current step + if (computeVirial) + { + clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream); + clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream); + clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream); + } + issueClFlushInStream(localStream); +} + //! This function is documented in the header file gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb) { diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index 36e538d22b..f666910d12 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -103,7 +103,10 @@ static unsigned int gpu_min_ci_balanced_factor = 50; /*! \brief Initializes the atomdata structure first time, it only gets filled at pair-search. */ -static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext) +static void init_atomdata_first(NBAtomData* ad, + int ntypes, + const DeviceContext& deviceContext, + const DeviceStream& localStream) { ad->numTypes = ntypes; @@ -114,6 +117,10 @@ static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& allocateDeviceBuffer(&ad->eLJ, 1, deviceContext); allocateDeviceBuffer(&ad->eElec, 1, deviceContext); + clearDeviceBufferAsync(&ad->fShift, 0, SHIFTS, localStream); + clearDeviceBufferAsync(&ad->eElec, 0, 1, localStream); + clearDeviceBufferAsync(&ad->eLJ, 0, 1, localStream); + /* initialize to nullptr pointers to data that is not allocated here and will need reallocation in nbnxn_gpu_init_atomdata */ ad->xq = nullptr; @@ -209,40 +216,6 @@ static cl_kernel nbnxn_gpu_create_kernel(NbnxmGpu* nb, const char* kernel_name) return kernel; } -/*! \brief Clears nonbonded shift force output array and energy outputs on the GPU. - */ -static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb) -{ - - cl_int cl_error; - NBAtomData* adat = nb->atdat; - cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local]->stream(); - - size_t local_work_size[3] = { 1, 1, 1 }; - size_t global_work_size[3] = { 1, 1, 1 }; - - cl_int shifts = SHIFTS * 3; - - cl_int arg_no; - - cl_kernel zero_e_fshift = nb->kernel_zero_e_fshift; - - local_work_size[0] = 64; - // Round the total number of threads up from the array size - global_work_size[0] = ((shifts + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0]; - - arg_no = 0; - cl_error = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fShift)); - cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->eLJ)); - cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->eElec)); - cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts); - GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str()); - - cl_error = clEnqueueNDRangeKernel( - ls, zero_e_fshift, 3, nullptr, global_work_size, local_work_size, 0, nullptr, nullptr); - GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str()); -} - /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */ static void nbnxn_gpu_init_kernels(NbnxmGpu* nb) { @@ -263,28 +236,8 @@ static void nbnxn_gpu_init_kernels(NbnxmGpu* nb) nb->kernel_pruneonly[epruneFirst] = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_opencl"); nb->kernel_pruneonly[epruneRolling] = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_rolling_opencl"); - - /* Init auxiliary kernels */ - nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift"); -} - -/*! \brief Initializes simulation constant data. - * - * Initializes members of the atomdata and nbparam structs and - * clears e/fshift output buffers. - */ -static void nbnxn_ocl_init_const(NBAtomData* atomData, - NBParamGpu* nbParams, - const interaction_const_t* ic, - const PairlistParams& listParams, - const nbnxn_atomdata_t::Params& nbatParams, - const DeviceContext& deviceContext) -{ - init_atomdata_first(atomData, nbatParams.numTypes, deviceContext); - init_nbparam(nbParams, ic, listParams, nbatParams, deviceContext); } - //! This function is documented in the header file NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, const interaction_const_t* ic, @@ -325,8 +278,8 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, /* local/non-local GPU streams */ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal), "Local non-bonded stream should be initialized to use GPU for non-bonded."); - nb->deviceStreams[InteractionLocality::Local] = - &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal); + const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal); + nb->deviceStreams[InteractionLocality::Local] = &localStream; if (nb->bUseTwoStreams) { @@ -344,7 +297,10 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, init_timings(nb->timings); } - nbnxn_ocl_init_const(nb->atdat, nb->nbparam, ic, listParams, nbat->params(), *nb->deviceContext_); + const nbnxn_atomdata_t::Params& nbatParams = nbat->params(); + const DeviceContext& deviceContext = *nb->deviceContext_; + init_atomdata_first(nb->atdat, nbatParams.numTypes, deviceContext, localStream); + init_nbparam(nb->nbparam, ic, listParams, nbatParams, deviceContext); /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var. * TODO: decide about NVIDIA @@ -361,9 +317,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, nbnxn_gpu_compile_kernels(nb); nbnxn_gpu_init_kernels(nb); - /* clear energy and shift force outputs */ - nbnxn_ocl_clear_e_fshift(nb); - if (debug) { fprintf(debug, "Initialized OpenCL data structures.\n"); @@ -372,38 +325,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, return nb; } -/*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array. - */ -static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear) -{ - if (natoms_clear == 0) - { - return; - } - - NBAtomData* atomData = nb->atdat; - const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; - - clearDeviceBufferAsync(&atomData->f, 0, natoms_clear, localStream); -} - -//! This function is documented in the header file -void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) -{ - nbnxn_ocl_clear_f(nb, nb->atdat->numAtoms); - /* clear shift force array and energies if the outputs were - used in the current step */ - if (computeVirial) - { - nbnxn_ocl_clear_e_fshift(nb); - } - - /* kick off buffer clearing kernel to ensure concurrency with constraints/update */ - cl_int gmx_unused cl_error; - cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local]->stream()); - GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); -} - //! This function is documented in the header file void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { @@ -497,8 +418,6 @@ void gpu_free(NbnxmGpu* nb) kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]); free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count); - free_kernel(&(nb->kernel_zero_e_fshift)); - /* Free atdat */ freeDeviceBuffer(&(nb->atdat->xq)); freeDeviceBuffer(&(nb->atdat->f)); diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernels.cl b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernels.cl index 8aa7f45120..1f6cabf615 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernels.cl +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernels.cl @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -32,23 +32,6 @@ * To help us fund GROMACS development, we humbly ask that you cite * the research papers on the package. Check out http://www.gromacs.org. */ -/* Auxiliary kernels */ - -/* Very few data */ -__kernel void zero_e_fshift(__global float* fshift, __global float* e_lj, __global float* e_el, const unsigned int Nbuf) -{ - unsigned int tidx = get_global_id(0); - if (tidx < Nbuf) - { - fshift[tidx] = 0.0F; - } - if (tidx == 0) - { - *e_lj = 0.0F; - *e_el = 0.0F; - } -} - /* Generate pruning kernels. */ #define HAVE_FRESH_LIST 1 #include "nbnxm_ocl_kernel_pruneonly.clh" diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index c802c4199f..925f94b117 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -156,10 +156,9 @@ struct NbnxmGpu /**< auxiliary kernels implementing memset-like functions */ ///@{ - cl_kernel kernel_memset_f = nullptr; - cl_kernel kernel_memset_f2 = nullptr; - cl_kernel kernel_memset_f3 = nullptr; - cl_kernel kernel_zero_e_fshift = nullptr; + cl_kernel kernel_memset_f = nullptr; + cl_kernel kernel_memset_f2 = nullptr; + cl_kernel kernel_memset_f3 = nullptr; ///@} //! true if doing both local/non-local NB work on GPU diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp index cc4f9f3a6b..2f37a0c011 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp @@ -57,28 +57,13 @@ namespace Nbnxm { -//! This function is documented in the header file -void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) -{ - NBAtomData* adat = nb->atdat; - const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; - // Clear forces - clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream); - // Clear shift force array and energies if the outputs were used in the current step - if (computeVirial) - { - clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream); - clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream); - clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream); - } -} - /*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */ -static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& deviceContext) +static void initAtomdataFirst(NBAtomData* atomdata, + int numTypes, + const DeviceContext& deviceContext, + const DeviceStream& localStream) { - const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; - NBAtomData* atomdata = nb->atdat; - atomdata->numTypes = numTypes; + atomdata->numTypes = numTypes; allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext); atomdata->shiftVecUploaded = false; @@ -179,8 +164,8 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, /* local/non-local GPU streams */ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal), "Local non-bonded stream should be initialized to use GPU for non-bonded."); - nb->deviceStreams[InteractionLocality::Local] = - &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal); + const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal); + nb->deviceStreams[InteractionLocality::Local] = &localStream; // In general, it's not strictly necessary to use 2 streams for SYCL, since they are // out-of-order. But for the time being, it will be less disruptive to keep them. if (nb->bUseTwoStreams) @@ -200,7 +185,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, const DeviceContext& deviceContext = *nb->deviceContext_; initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext); - initAtomdataFirst(nb, nbatParams.numTypes, deviceContext); + initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream); return nb; } -- 2.22.0