Unify gpu_init_atomdata(...) function
authorArtem Zhmurov <zhmurov@gmail.com>
Tue, 16 Mar 2021 12:53:21 +0000 (15:53 +0300)
committerPaul Bauer <paul.bauer.q@gmail.com>
Tue, 16 Mar 2021 16:44:11 +0000 (16:44 +0000)
Refs #2608

src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_kernels.cl
src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp

index e6eb1c2d51a7cc00ba31df83998d1119569eb6c3..804a8ea18066e61b104872e660be1ef056a95530 100644 (file)
@@ -92,14 +92,14 @@ namespace Nbnxm
  */
 static unsigned int gpu_min_ci_balanced_factor = 44;
 
-/* Fw. decl. */
-static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb);
-
 /*! Initializes the atomdata structure first time, it only gets filled at
     pair-search. */
-static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext)
+static void init_atomdata_first(NBAtomData*          ad,
+                                int                  nTypes,
+                                const DeviceContext& deviceContext,
+                                const DeviceStream&  localStream)
 {
-    ad->numTypes = ntypes;
+    ad->numTypes = nTypes;
     allocateDeviceBuffer(&ad->shiftVec, SHIFTS, deviceContext);
     ad->shiftVecUploaded = false;
 
@@ -107,6 +107,10 @@ static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext&
     allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
     allocateDeviceBuffer(&ad->eElec, 1, deviceContext);
 
+    clearDeviceBufferAsync(&ad->fShift, 0, SHIFTS, localStream);
+    clearDeviceBufferAsync(&ad->eElec, 0, 1, localStream);
+    clearDeviceBufferAsync(&ad->eLJ, 0, 1, localStream);
+
     /* initialize to nullptr poiters to data that is not allocated here and will
        need reallocation in nbnxn_cuda_init_atomdata */
     ad->xq = nullptr;
@@ -174,19 +178,6 @@ static void init_nbparam(NBParamGpu*                     nbp,
     }
 }
 
-/*! Initializes simulation constant data. */
-static void cuda_init_const(NbnxmGpu*                       nb,
-                            const interaction_const_t*      ic,
-                            const PairlistParams&           listParams,
-                            const nbnxn_atomdata_t::Params& nbatParams)
-{
-    init_atomdata_first(nb->atdat, nbatParams.numTypes, *nb->deviceContext_);
-    init_nbparam(nb->nbparam, ic, listParams, nbatParams, *nb->deviceContext_);
-
-    /* clear energy and shift force outputs */
-    nbnxn_cuda_clear_e_fshift(nb);
-}
-
 NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                    const interaction_const_t*      ic,
                    const PairlistParams&           listParams,
@@ -218,8 +209,8 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
     /* local/non-local GPU streams */
     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                        "Local non-bonded stream should be initialized to use GPU for non-bonded.");
-    nb->deviceStreams[InteractionLocality::Local] =
-            &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+    nb->deviceStreams[InteractionLocality::Local] = &localStream;
     if (nb->bUseTwoStreams)
     {
         init_plist(nb->plist[InteractionLocality::NonLocal]);
@@ -251,7 +242,10 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
     /* pick L1 cache configuration */
     cuda_set_cacheconfig();
 
-    cuda_init_const(nb, ic, listParams, nbat->params());
+    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
+    const DeviceContext&            deviceContext = *nb->deviceContext_;
+    init_atomdata_first(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
+    init_nbparam(nb->nbparam, ic, listParams, nbatParams, deviceContext);
 
     nb->atomIndicesSize       = 0;
     nb->atomIndicesSize_alloc = 0;
@@ -289,36 +283,6 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
     }
 }
 
-/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
-static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear)
-{
-    NBAtomData*         adat        = nb->atdat;
-    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
-    clearDeviceBufferAsync(&adat->f, 0, natoms_clear, localStream);
-}
-
-/*! Clears nonbonded shift force output array and energy outputs on the GPU. */
-static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb)
-{
-    NBAtomData*         adat        = nb->atdat;
-    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
-
-    clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
-    clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
-    clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
-}
-
-void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
-{
-    nbnxn_cuda_clear_f(nb, nb->atdat->numAtoms);
-    /* clear shift force array and energies if the outputs were
-       used in the current step */
-    if (computeVirial)
-    {
-        nbnxn_cuda_clear_e_fshift(nb);
-    }
-}
-
 void gpu_free(NbnxmGpu* nb)
 {
     if (nb == nullptr)
index 50519ced6d63d38f7cabfc4cce7c13be23834a6c..b86b785b94238e764c042adad5dfbf4e459dc30a 100644 (file)
@@ -69,6 +69,7 @@
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
+#include "gromacs/pbcutil/ishift.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
@@ -429,6 +430,22 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     issueClFlushInStream(localStream);
 }
 
+void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
+{
+    NBAtomData*         adat        = nb->atdat;
+    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+    // Clear forces
+    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
+    // Clear shift force array and energies if the outputs were used in the current step
+    if (computeVirial)
+    {
+        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
+        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
+        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
+    }
+    issueClFlushInStream(localStream);
+}
+
 //! This function is documented in the header file
 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
 {
index 36e538d22bedfb93c9d106233f6884120d35186c..f666910d12425c9990b48a57233204c72d2d6fdb 100644 (file)
@@ -103,7 +103,10 @@ static unsigned int gpu_min_ci_balanced_factor = 50;
 /*! \brief Initializes the atomdata structure first time, it only gets filled at
     pair-search.
  */
-static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext)
+static void init_atomdata_first(NBAtomData*          ad,
+                                int                  ntypes,
+                                const DeviceContext& deviceContext,
+                                const DeviceStream&  localStream)
 {
     ad->numTypes = ntypes;
 
@@ -114,6 +117,10 @@ static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext&
     allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
     allocateDeviceBuffer(&ad->eElec, 1, deviceContext);
 
+    clearDeviceBufferAsync(&ad->fShift, 0, SHIFTS, localStream);
+    clearDeviceBufferAsync(&ad->eElec, 0, 1, localStream);
+    clearDeviceBufferAsync(&ad->eLJ, 0, 1, localStream);
+
     /* initialize to nullptr pointers to data that is not allocated here and will
        need reallocation in nbnxn_gpu_init_atomdata */
     ad->xq = nullptr;
@@ -209,40 +216,6 @@ static cl_kernel nbnxn_gpu_create_kernel(NbnxmGpu* nb, const char* kernel_name)
     return kernel;
 }
 
-/*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
- */
-static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb)
-{
-
-    cl_int           cl_error;
-    NBAtomData*      adat = nb->atdat;
-    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
-
-    size_t local_work_size[3]  = { 1, 1, 1 };
-    size_t global_work_size[3] = { 1, 1, 1 };
-
-    cl_int shifts = SHIFTS * 3;
-
-    cl_int arg_no;
-
-    cl_kernel zero_e_fshift = nb->kernel_zero_e_fshift;
-
-    local_work_size[0] = 64;
-    // Round the total number of threads up from the array size
-    global_work_size[0] = ((shifts + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0];
-
-    arg_no   = 0;
-    cl_error = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fShift));
-    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->eLJ));
-    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->eElec));
-    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
-    GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
-
-    cl_error = clEnqueueNDRangeKernel(
-            ls, zero_e_fshift, 3, nullptr, global_work_size, local_work_size, 0, nullptr, nullptr);
-    GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
-}
-
 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 static void nbnxn_gpu_init_kernels(NbnxmGpu* nb)
 {
@@ -263,28 +236,8 @@ static void nbnxn_gpu_init_kernels(NbnxmGpu* nb)
     nb->kernel_pruneonly[epruneFirst] = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_opencl");
     nb->kernel_pruneonly[epruneRolling] =
             nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_rolling_opencl");
-
-    /* Init auxiliary kernels */
-    nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
-}
-
-/*! \brief Initializes simulation constant data.
- *
- *  Initializes members of the atomdata and nbparam structs and
- *  clears e/fshift output buffers.
- */
-static void nbnxn_ocl_init_const(NBAtomData*                     atomData,
-                                 NBParamGpu*                     nbParams,
-                                 const interaction_const_t*      ic,
-                                 const PairlistParams&           listParams,
-                                 const nbnxn_atomdata_t::Params& nbatParams,
-                                 const DeviceContext&            deviceContext)
-{
-    init_atomdata_first(atomData, nbatParams.numTypes, deviceContext);
-    init_nbparam(nbParams, ic, listParams, nbatParams, deviceContext);
 }
 
-
 //! This function is documented in the header file
 NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                    const interaction_const_t*      ic,
@@ -325,8 +278,8 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
     /* local/non-local GPU streams */
     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                        "Local non-bonded stream should be initialized to use GPU for non-bonded.");
-    nb->deviceStreams[InteractionLocality::Local] =
-            &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+    nb->deviceStreams[InteractionLocality::Local] = &localStream;
 
     if (nb->bUseTwoStreams)
     {
@@ -344,7 +297,10 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
         init_timings(nb->timings);
     }
 
-    nbnxn_ocl_init_const(nb->atdat, nb->nbparam, ic, listParams, nbat->params(), *nb->deviceContext_);
+    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
+    const DeviceContext&            deviceContext = *nb->deviceContext_;
+    init_atomdata_first(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
+    init_nbparam(nb->nbparam, ic, listParams, nbatParams, deviceContext);
 
     /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
      * TODO: decide about NVIDIA
@@ -361,9 +317,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
     nbnxn_gpu_compile_kernels(nb);
     nbnxn_gpu_init_kernels(nb);
 
-    /* clear energy and shift force outputs */
-    nbnxn_ocl_clear_e_fshift(nb);
-
     if (debug)
     {
         fprintf(debug, "Initialized OpenCL data structures.\n");
@@ -372,38 +325,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
     return nb;
 }
 
-/*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
- */
-static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear)
-{
-    if (natoms_clear == 0)
-    {
-        return;
-    }
-
-    NBAtomData*         atomData    = nb->atdat;
-    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
-
-    clearDeviceBufferAsync(&atomData->f, 0, natoms_clear, localStream);
-}
-
-//! This function is documented in the header file
-void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
-{
-    nbnxn_ocl_clear_f(nb, nb->atdat->numAtoms);
-    /* clear shift force array and energies if the outputs were
-       used in the current step */
-    if (computeVirial)
-    {
-        nbnxn_ocl_clear_e_fshift(nb);
-    }
-
-    /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
-    cl_int gmx_unused cl_error;
-    cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local]->stream());
-    GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-}
-
 //! This function is documented in the header file
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
@@ -497,8 +418,6 @@ void gpu_free(NbnxmGpu* nb)
     kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
     free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count);
 
-    free_kernel(&(nb->kernel_zero_e_fshift));
-
     /* Free atdat */
     freeDeviceBuffer(&(nb->atdat->xq));
     freeDeviceBuffer(&(nb->atdat->f));
index 8aa7f451204069fc62bbca8953c548566f067fe1..1f6cabf615b20677b3cea9c681449ce25ca1e568 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-/* Auxiliary kernels */
-
-/* Very few data */
-__kernel void zero_e_fshift(__global float* fshift, __global float* e_lj, __global float* e_el, const unsigned int Nbuf)
-{
-    unsigned int tidx = get_global_id(0);
-    if (tidx < Nbuf)
-    {
-        fshift[tidx] = 0.0F;
-    }
-    if (tidx == 0)
-    {
-        *e_lj = 0.0F;
-        *e_el = 0.0F;
-    }
-}
-
 /* Generate pruning kernels. */
 #define HAVE_FRESH_LIST 1
 #include "nbnxm_ocl_kernel_pruneonly.clh"
index c802c4199fd865f6bdbcf8b8aec940035d5187c9..925f94b117419340c1440d657f62c2a1381a8f23 100644 (file)
@@ -156,10 +156,9 @@ struct NbnxmGpu
 
     /**< auxiliary kernels implementing memset-like functions */
     ///@{
-    cl_kernel kernel_memset_f      = nullptr;
-    cl_kernel kernel_memset_f2     = nullptr;
-    cl_kernel kernel_memset_f3     = nullptr;
-    cl_kernel kernel_zero_e_fshift = nullptr;
+    cl_kernel kernel_memset_f  = nullptr;
+    cl_kernel kernel_memset_f2 = nullptr;
+    cl_kernel kernel_memset_f3 = nullptr;
     ///@}
 
     //! true if doing both local/non-local NB work on GPU
index cc4f9f3a6bfbb62637bb97bcdef05609d97f1e68..2f37a0c011c90f8a9b7e7575b62d06ed1b92bad1 100644 (file)
 namespace Nbnxm
 {
 
-//! This function is documented in the header file
-void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
-{
-    NBAtomData*         adat        = nb->atdat;
-    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
-    // Clear forces
-    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
-    // Clear shift force array and energies if the outputs were used in the current step
-    if (computeVirial)
-    {
-        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
-        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
-        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
-    }
-}
-
 /*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
-static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& deviceContext)
+static void initAtomdataFirst(NBAtomData*          atomdata,
+                              int                  numTypes,
+                              const DeviceContext& deviceContext,
+                              const DeviceStream&  localStream)
 {
-    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
-    NBAtomData*         atomdata    = nb->atdat;
-    atomdata->numTypes              = numTypes;
+    atomdata->numTypes = numTypes;
     allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
     atomdata->shiftVecUploaded = false;
 
@@ -179,8 +164,8 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
     /* local/non-local GPU streams */
     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                        "Local non-bonded stream should be initialized to use GPU for non-bonded.");
-    nb->deviceStreams[InteractionLocality::Local] =
-            &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+    nb->deviceStreams[InteractionLocality::Local] = &localStream;
     // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
     // out-of-order. But for the time being, it will be less disruptive to keep them.
     if (nb->bUseTwoStreams)
@@ -200,7 +185,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
     const DeviceContext&            deviceContext = *nb->deviceContext_;
 
     initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
-    initAtomdataFirst(nb, nbatParams.numTypes, deviceContext);
+    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
 
     return nb;
 }