*/
static unsigned int gpu_min_ci_balanced_factor = 44;
-/* Fw. decl. */
-static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb);
-
/*! Initializes the atomdata structure first time, it only gets filled at
pair-search. */
-static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext)
+static void init_atomdata_first(NBAtomData* ad,
+ int nTypes,
+ const DeviceContext& deviceContext,
+ const DeviceStream& localStream)
{
- ad->numTypes = ntypes;
+ ad->numTypes = nTypes;
allocateDeviceBuffer(&ad->shiftVec, SHIFTS, deviceContext);
ad->shiftVecUploaded = false;
allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
allocateDeviceBuffer(&ad->eElec, 1, deviceContext);
+ clearDeviceBufferAsync(&ad->fShift, 0, SHIFTS, localStream);
+ clearDeviceBufferAsync(&ad->eElec, 0, 1, localStream);
+ clearDeviceBufferAsync(&ad->eLJ, 0, 1, localStream);
+
/* initialize to nullptr poiters to data that is not allocated here and will
need reallocation in nbnxn_cuda_init_atomdata */
ad->xq = nullptr;
}
}
-/*! Initializes simulation constant data. */
-static void cuda_init_const(NbnxmGpu* nb,
- const interaction_const_t* ic,
- const PairlistParams& listParams,
- const nbnxn_atomdata_t::Params& nbatParams)
-{
- init_atomdata_first(nb->atdat, nbatParams.numTypes, *nb->deviceContext_);
- init_nbparam(nb->nbparam, ic, listParams, nbatParams, *nb->deviceContext_);
-
- /* clear energy and shift force outputs */
- nbnxn_cuda_clear_e_fshift(nb);
-}
-
NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
const interaction_const_t* ic,
const PairlistParams& listParams,
/* local/non-local GPU streams */
GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
"Local non-bonded stream should be initialized to use GPU for non-bonded.");
- nb->deviceStreams[InteractionLocality::Local] =
- &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+ const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+ nb->deviceStreams[InteractionLocality::Local] = &localStream;
if (nb->bUseTwoStreams)
{
init_plist(nb->plist[InteractionLocality::NonLocal]);
/* pick L1 cache configuration */
cuda_set_cacheconfig();
- cuda_init_const(nb, ic, listParams, nbat->params());
+ const nbnxn_atomdata_t::Params& nbatParams = nbat->params();
+ const DeviceContext& deviceContext = *nb->deviceContext_;
+ init_atomdata_first(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
+ init_nbparam(nb->nbparam, ic, listParams, nbatParams, deviceContext);
nb->atomIndicesSize = 0;
nb->atomIndicesSize_alloc = 0;
}
}
-/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
-static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear)
-{
- NBAtomData* adat = nb->atdat;
- const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
- clearDeviceBufferAsync(&adat->f, 0, natoms_clear, localStream);
-}
-
-/*! Clears nonbonded shift force output array and energy outputs on the GPU. */
-static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb)
-{
- NBAtomData* adat = nb->atdat;
- const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
-
- clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
- clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
- clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
-}
-
-void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
-{
- nbnxn_cuda_clear_f(nb, nb->atdat->numAtoms);
- /* clear shift force array and energies if the outputs were
- used in the current step */
- if (computeVirial)
- {
- nbnxn_cuda_clear_e_fshift(nb);
- }
-}
-
void gpu_free(NbnxmGpu* nb)
{
if (nb == nullptr)
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
issueClFlushInStream(localStream);
}
+void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
+{
+ NBAtomData* adat = nb->atdat;
+ const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+ // Clear forces
+ clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
+ // Clear shift force array and energies if the outputs were used in the current step
+ if (computeVirial)
+ {
+ clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
+ clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
+ clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
+ }
+ issueClFlushInStream(localStream);
+}
+
//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
/*! \brief Initializes the atomdata structure first time, it only gets filled at
pair-search.
*/
-static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext)
+static void init_atomdata_first(NBAtomData* ad,
+ int ntypes,
+ const DeviceContext& deviceContext,
+ const DeviceStream& localStream)
{
ad->numTypes = ntypes;
allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
allocateDeviceBuffer(&ad->eElec, 1, deviceContext);
+ clearDeviceBufferAsync(&ad->fShift, 0, SHIFTS, localStream);
+ clearDeviceBufferAsync(&ad->eElec, 0, 1, localStream);
+ clearDeviceBufferAsync(&ad->eLJ, 0, 1, localStream);
+
/* initialize to nullptr pointers to data that is not allocated here and will
need reallocation in nbnxn_gpu_init_atomdata */
ad->xq = nullptr;
return kernel;
}
-/*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
- */
-static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb)
-{
-
- cl_int cl_error;
- NBAtomData* adat = nb->atdat;
- cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local]->stream();
-
- size_t local_work_size[3] = { 1, 1, 1 };
- size_t global_work_size[3] = { 1, 1, 1 };
-
- cl_int shifts = SHIFTS * 3;
-
- cl_int arg_no;
-
- cl_kernel zero_e_fshift = nb->kernel_zero_e_fshift;
-
- local_work_size[0] = 64;
- // Round the total number of threads up from the array size
- global_work_size[0] = ((shifts + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0];
-
- arg_no = 0;
- cl_error = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fShift));
- cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->eLJ));
- cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->eElec));
- cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
- GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
-
- cl_error = clEnqueueNDRangeKernel(
- ls, zero_e_fshift, 3, nullptr, global_work_size, local_work_size, 0, nullptr, nullptr);
- GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
-}
-
/*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
static void nbnxn_gpu_init_kernels(NbnxmGpu* nb)
{
nb->kernel_pruneonly[epruneFirst] = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_opencl");
nb->kernel_pruneonly[epruneRolling] =
nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_rolling_opencl");
-
- /* Init auxiliary kernels */
- nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
-}
-
-/*! \brief Initializes simulation constant data.
- *
- * Initializes members of the atomdata and nbparam structs and
- * clears e/fshift output buffers.
- */
-static void nbnxn_ocl_init_const(NBAtomData* atomData,
- NBParamGpu* nbParams,
- const interaction_const_t* ic,
- const PairlistParams& listParams,
- const nbnxn_atomdata_t::Params& nbatParams,
- const DeviceContext& deviceContext)
-{
- init_atomdata_first(atomData, nbatParams.numTypes, deviceContext);
- init_nbparam(nbParams, ic, listParams, nbatParams, deviceContext);
}
-
//! This function is documented in the header file
NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
const interaction_const_t* ic,
/* local/non-local GPU streams */
GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
"Local non-bonded stream should be initialized to use GPU for non-bonded.");
- nb->deviceStreams[InteractionLocality::Local] =
- &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+ const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+ nb->deviceStreams[InteractionLocality::Local] = &localStream;
if (nb->bUseTwoStreams)
{
init_timings(nb->timings);
}
- nbnxn_ocl_init_const(nb->atdat, nb->nbparam, ic, listParams, nbat->params(), *nb->deviceContext_);
+ const nbnxn_atomdata_t::Params& nbatParams = nbat->params();
+ const DeviceContext& deviceContext = *nb->deviceContext_;
+ init_atomdata_first(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
+ init_nbparam(nb->nbparam, ic, listParams, nbatParams, deviceContext);
/* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
* TODO: decide about NVIDIA
nbnxn_gpu_compile_kernels(nb);
nbnxn_gpu_init_kernels(nb);
- /* clear energy and shift force outputs */
- nbnxn_ocl_clear_e_fshift(nb);
-
if (debug)
{
fprintf(debug, "Initialized OpenCL data structures.\n");
return nb;
}
-/*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
- */
-static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear)
-{
- if (natoms_clear == 0)
- {
- return;
- }
-
- NBAtomData* atomData = nb->atdat;
- const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
-
- clearDeviceBufferAsync(&atomData->f, 0, natoms_clear, localStream);
-}
-
-//! This function is documented in the header file
-void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
-{
- nbnxn_ocl_clear_f(nb, nb->atdat->numAtoms);
- /* clear shift force array and energies if the outputs were
- used in the current step */
- if (computeVirial)
- {
- nbnxn_ocl_clear_e_fshift(nb);
- }
-
- /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
- cl_int gmx_unused cl_error;
- cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local]->stream());
- GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-}
-
//! This function is documented in the header file
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count);
- free_kernel(&(nb->kernel_zero_e_fshift));
-
/* Free atdat */
freeDeviceBuffer(&(nb->atdat->xq));
freeDeviceBuffer(&(nb->atdat->f));
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* Auxiliary kernels */
-
-/* Very few data */
-__kernel void zero_e_fshift(__global float* fshift, __global float* e_lj, __global float* e_el, const unsigned int Nbuf)
-{
- unsigned int tidx = get_global_id(0);
- if (tidx < Nbuf)
- {
- fshift[tidx] = 0.0F;
- }
- if (tidx == 0)
- {
- *e_lj = 0.0F;
- *e_el = 0.0F;
- }
-}
-
/* Generate pruning kernels. */
#define HAVE_FRESH_LIST 1
#include "nbnxm_ocl_kernel_pruneonly.clh"
/**< auxiliary kernels implementing memset-like functions */
///@{
- cl_kernel kernel_memset_f = nullptr;
- cl_kernel kernel_memset_f2 = nullptr;
- cl_kernel kernel_memset_f3 = nullptr;
- cl_kernel kernel_zero_e_fshift = nullptr;
+ cl_kernel kernel_memset_f = nullptr;
+ cl_kernel kernel_memset_f2 = nullptr;
+ cl_kernel kernel_memset_f3 = nullptr;
///@}
//! true if doing both local/non-local NB work on GPU
namespace Nbnxm
{
-//! This function is documented in the header file
-void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
-{
- NBAtomData* adat = nb->atdat;
- const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
- // Clear forces
- clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
- // Clear shift force array and energies if the outputs were used in the current step
- if (computeVirial)
- {
- clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
- clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
- clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
- }
-}
-
/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
-static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& deviceContext)
+static void initAtomdataFirst(NBAtomData* atomdata,
+ int numTypes,
+ const DeviceContext& deviceContext,
+ const DeviceStream& localStream)
{
- const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
- NBAtomData* atomdata = nb->atdat;
- atomdata->numTypes = numTypes;
+ atomdata->numTypes = numTypes;
allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
atomdata->shiftVecUploaded = false;
/* local/non-local GPU streams */
GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
"Local non-bonded stream should be initialized to use GPU for non-bonded.");
- nb->deviceStreams[InteractionLocality::Local] =
- &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+ const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+ nb->deviceStreams[InteractionLocality::Local] = &localStream;
// In general, it's not strictly necessary to use 2 streams for SYCL, since they are
// out-of-order. But for the time being, it will be less disruptive to keep them.
if (nb->bUseTwoStreams)
const DeviceContext& deviceContext = *nb->deviceContext_;
initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
- initAtomdataFirst(nb, nbatParams.numTypes, deviceContext);
+ initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
return nb;
}