return &dd->comm->zones;
}
+int dd_numAtomsZones(const gmx_domdec_t &dd)
+{
+ return dd.comm->atomRanges.end(DDAtomRanges::Type::Zones);
+}
+
int dd_numHomeAtoms(const gmx_domdec_t &dd)
{
return dd.comm->atomRanges.numHomeAtoms();
/*! \brief Returns a pointer to the gmx_domdec_zones_t struct */
struct gmx_domdec_zones_t *domdec_zones(struct gmx_domdec_t *dd);
+/*! \brief Returns the range for atoms in zones*/
+int dd_numAtomsZones(const gmx_domdec_t &dd);
+
/*! \brief Returns the number of home atoms */
int dd_numHomeAtoms(const gmx_domdec_t &dd);
#ifndef GMX_DOMDEC_GPUHALOEXCHANGE_H
#define GMX_DOMDEC_GPUHALOEXCHANGE_H
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/classhelpers.h"
* \param [in] d_coordinateBuffer pointer to coordinates buffer in GPU memory
* \param [in] d_forcesBuffer pointer to coordinates buffer in GPU memory
*/
- void reinitHalo(rvec *d_coordinateBuffer,
- rvec *d_forcesBuffer);
+ void reinitHalo(DeviceBuffer<float> d_coordinateBuffer,
+ DeviceBuffer<float> d_forcesBuffer);
+
/*! \brief GPU halo exchange of coordinates buffer.
*
GpuHaloExchange::~GpuHaloExchange() = default;
/*!\brief init halo exhange stub. */
-void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */,
- rvec * /* d_forcesBuffer */)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<float> /* d_coordinatesBuffer */,
+ DeviceBuffer<float> /* d_forcesBuffer */)
{
GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
}
GpuHaloExchange::~GpuHaloExchange() = default;
-void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer,
- rvec *d_forcesBuffer)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<float> d_coordinatesBuffer,
+ DeviceBuffer<float> d_forcesBuffer)
{
impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer), reinterpret_cast<float3*>(d_forcesBuffer));
}
bool GPU_FUNC_ARGUMENT(useGpuForceReduction)) GPU_FUNC_TERM;
/*! \brief
- * Launches H2D input transfers for PME on GPU.
- *
- * \param[in] pme The PME data structure.
- * \param[in] coordinatesHost The array of local atoms' coordinates.
- * \param[in] wcycle The wallclock counter.
- */
-GPU_FUNC_QUALIFIER void pme_gpu_copy_coordinates_to_gpu(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
- const rvec *GPU_FUNC_ARGUMENT(coordinatesHost),
- gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
-
-/*! \brief
- * Launches first stage of PME on GPU - spreading kernel, and D2H grid transfer if needed.
+ * Launches first stage of PME on GPU - spreading kernel.
*
* \param[in] pme The PME data structure.
* \param[in] wcycle The wallclock counter.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
- gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
/*! \brief
* Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
*/
GPU_FUNC_QUALIFIER DeviceBuffer<float> pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
+/*! \brief Set pointer to device copy of coordinate data.
+ * \param[in] pme The PME data structure.
+ * \param[in] d_x The pointer to the positions buffer to be set
+ */
+GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ DeviceBuffer<float> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+
/*! \brief Get pointer to device copy of force data.
* \param[in] pme The PME data structure.
* \returns Pointer to force data
*/
GPU_FUNC_QUALIFIER void *pme_gpu_get_device_stream(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+/*! \brief Returns the pointer to the GPU context.
+ * \param[in] pme The PME data structure.
+ * \returns Pointer to GPU context object.
+ */
+GPU_FUNC_QUALIFIER void *pme_gpu_get_device_context(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
/*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
* \param[in] pme The PME data structure.
* \returns Pointer to sychronizer
}
}
-void pme_gpu_copy_coordinates_to_gpu(gmx_pme_t *pme,
- const rvec *coordinatesHost,
- gmx_wallcycle *wcycle)
-{
- GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
-
- PmeGpu *pmeGpu = pme->gpu;
-
- // The only spot of PME GPU where LAUNCH_GPU counter increases call-count
- wallcycle_start(wcycle, ewcLAUNCH_GPU);
- // The only spot of PME GPU where ewcsLAUNCH_GPU_PME subcounter increases call-count
- wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME);
- pme_gpu_copy_input_coordinates(pmeGpu, coordinatesHost);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-}
-
void pme_gpu_launch_spread(gmx_pme_t *pme,
gmx_wallcycle *wcycle)
{
return pme_gpu_get_kernelparam_forces(pme->gpu);
}
+void pme_gpu_set_device_x(const gmx_pme_t *pme,
+ DeviceBuffer<float> d_x)
+{
+ GMX_ASSERT(pme != nullptr, "Null pointer is passed as a PME to the set coordinates function.");
+ GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+
+ pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x);
+}
+
void *pme_gpu_get_device_stream(const gmx_pme_t *pme)
{
if (!pme || !pme_gpu_active(pme))
return pme_gpu_get_stream(pme->gpu);
}
+void *pme_gpu_get_device_context(const gmx_pme_t *pme)
+{
+ if (!pme || !pme_gpu_active(pme))
+ {
+ return nullptr;
+ }
+ return pme_gpu_get_context(pme->gpu);
+}
+
GpuEventSynchronizer * pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *pme)
{
if (!pme || !pme_gpu_active(pme))
}
}
-void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordinates)
-{
- GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU");
-#if GMX_DOUBLE
- GMX_RELEASE_ASSERT(false, "Only single precision is supported");
- GMX_UNUSED_VALUE(h_coordinates);
-#else
- const float *h_coordinatesFloat = reinterpret_cast<const float *>(h_coordinates);
- copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, h_coordinatesFloat,
- 0, pmeGpu->kernelParams->atoms.nAtoms * DIM,
- pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
- // FIXME: sync required since the copied data will be used by PP stream when using single GPU for both
- // Remove after adding the required event-based sync between the above H2D and the transform kernel
- pme_gpu_synchronize(pmeGpu);
-#endif
-}
-
void pme_gpu_free_coordinates(const PmeGpu *pmeGpu)
{
freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates);
pme_gpu_free_energy_virial(pmeGpu);
pme_gpu_free_bspline_values(pmeGpu);
pme_gpu_free_forces(pmeGpu);
- pme_gpu_free_coordinates(pmeGpu);
pme_gpu_free_coefficients(pmeGpu);
pme_gpu_free_spline_data(pmeGpu);
pme_gpu_free_grid_indices(pmeGpu);
if (haveToRealloc)
{
- pme_gpu_realloc_coordinates(pmeGpu);
pme_gpu_realloc_forces(pmeGpu);
pme_gpu_realloc_spline_data(pmeGpu);
pme_gpu_realloc_grid_indices(pmeGpu);
}
}
+/*! \brief Check the validity of the device buffer.
+ *
+ * Checks if the buffer is not nullptr and, when possible, if it is big enough.
+ *
+ * \todo Split and move this function to gpu_utils.
+ *
+ * \param[in] buffer Device buffer to be checked.
+ * \param[in] requiredSize Number of elements that the buffer will have to accommodate.
+ *
+ * \returns If the device buffer can be set.
+ */
+template<typename T>
+static bool checkDeviceBuffer(gmx_unused DeviceBuffer<T> buffer, gmx_unused int requiredSize)
+{
+#if GMX_GPU == GMX_GPU_CUDA
+ GMX_ASSERT(buffer != nullptr, "The device pointer is nullptr");
+ return buffer != nullptr;
+#elif GMX_GPU == GMX_GPU_OPENCL
+ size_t size;
+ int retval = clGetMemObjectInfo(buffer, CL_MEM_SIZE, sizeof(size), &size, NULL);
+ GMX_ASSERT(retval == CL_SUCCESS, gmx::formatString("clGetMemObjectInfo failed with error code #%d", retval).c_str());
+ GMX_ASSERT(static_cast<int>(size) >= requiredSize, "Number of atoms in device buffer is smaller then required size.");
+ return retval == CL_SUCCESS && static_cast<int>(size) >= requiredSize;
+#elif GMX_GPU == GMX_GPU_NONE
+ GMX_ASSERT(false, "Setter for device-side coordinates was called in non-GPU build.");
+ return false;
+#endif
+}
+
+void pme_gpu_set_kernelparam_coordinates(const PmeGpu *pmeGpu, DeviceBuffer<float> d_x)
+{
+ GMX_ASSERT(pmeGpu && pmeGpu->kernelParams, "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was initialized.");
+
+ GMX_ASSERT(checkDeviceBuffer(d_x, pmeGpu->kernelParams->atoms.nAtoms), "The device-side buffer can not be set.");
+
+ pmeGpu->kernelParams->atoms.d_coordinates = d_x;
+}
+
void * pme_gpu_get_stream(const PmeGpu *pmeGpu)
{
if (pmeGpu)
}
}
+void * pme_gpu_get_context(const PmeGpu *pmeGpu)
+{
+ if (pmeGpu)
+ {
+ return static_cast<void *>(&pmeGpu->archSpecific->context);
+ }
+ else
+ {
+ return nullptr;
+ }
+}
+
GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *pmeGpu)
{
if (pmeGpu && pmeGpu->kernelParams)
*/
void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu);
-/*! \libinternal \brief
- * Copies the input coordinates from the CPU buffer onto the GPU.
- *
- * \param[in] pmeGpu The PME GPU structure.
- * \param[in] h_coordinates Input coordinates (XYZ rvec array).
- *
- * Needs to be called for every PME computation. The coordinates are then used in the spline calculation.
- */
-GPU_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
- const rvec *GPU_FUNC_ARGUMENT(h_coordinates)) GPU_FUNC_TERM;
-
/*! \libinternal \brief
* Frees the coordinates on the GPU.
*
*/
GPU_FUNC_QUALIFIER DeviceBuffer<float> pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
+/*! \brief Sets the device pointer to coordinate data
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] d_x Pointer to coordinate data
+ */
+GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ DeviceBuffer<float> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+
/*! \brief Return pointer to device copy of force data.
* \param[in] pmeGpu The PME GPU structure.
* \returns Pointer to force data
*/
GPU_FUNC_QUALIFIER void * pme_gpu_get_stream(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+/*! \brief Return pointer to GPU context (for OpenCL builds).
+ * \param[in] pmeGpu The PME GPU structure.
+ * \returns Pointer to context object.
+ */
+GPU_FUNC_QUALIFIER void * pme_gpu_get_context(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
/*! \brief Return pointer to the sync object triggered after the PME force calculation completion
* \param[in] pmeGpu The PME GPU structure.
* \returns Pointer to sync object
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/fatalerror.h"
std::vector<gmx_pme_t *> pmedata;
pmedata.push_back(pme);
- auto pme_pp = gmx_pme_pp_init(cr);
+ auto pme_pp = gmx_pme_pp_init(cr);
//TODO the variable below should be queried from the task assignment info
- const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
+ const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
+ const void *commandStream = useGpuForPme ? pme_gpu_get_device_context(pme) : nullptr;
+ const void *gpuContext = useGpuForPme ? pme_gpu_get_device_stream(pme) : nullptr;
+ const int paddingSize = pme_gpu_get_padding_size(pme);
if (useGpuForPme)
{
changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
}
+ // Unconditionally initialize the StatePropagatorDataGpu object to get more verbose message if it is used from CPU builds
+ auto stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(commandStream, gpuContext, GpuApiCallBehavior::Sync, paddingSize);
+
clear_nrnb(mynrnb);
count = 0;
if (atomSetChanged)
{
gmx_pme_reinit_atoms(pme, natoms, pme_pp->chargeA.data());
+ if (useGpuForPme)
+ {
+ stateGpu->reinit(natoms, natoms);
+ pme_gpu_set_device_x(pme, stateGpu->getCoordinates());
+ }
}
if (ret == pmerecvqxRESETCOUNTERS)
//TODO this should be set properly by gmx_pme_recv_coeffs_coords,
// or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested!
pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags, useGpuPmeForceReduction);
- pme_gpu_copy_coordinates_to_gpu(pme, as_rvec_array(pme_pp->x.data()), wcycle);
+ stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x), gmx::StatePropagatorDataGpu::AtomLocality::All);
+
pme_gpu_launch_spread(pme, wcycle);
pme_gpu_launch_complex_transforms(pme, wcycle);
pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
TestReferenceData refData;
for (const auto &context : getPmeTestEnv()->getHardwareContexts())
{
+ std::shared_ptr<StatePropagatorDataGpu> stateGpu;
CodePath codePath = context->getCodePath();
const bool supportedInput = pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
if (!supportedInput)
{
/* Testing the failure for the unsupported input */
- EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, inputAtomData.coordinates, inputAtomData.charges, box), NotImplementedError);
+ EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, inputAtomData.coordinates, inputAtomData.charges, box, stateGpu), NotImplementedError);
continue;
}
));
PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
- context->getPmeGpuProgram(), inputAtomData.coordinates, inputAtomData.charges, box);
+ context->getPmeGpuProgram(), inputAtomData.coordinates, inputAtomData.charges, box, stateGpu);
/* Setting some more inputs */
pmeSetRealGrid(pmeSafe.get(), codePath, nonZeroGridValues);
for (const auto &context : getPmeTestEnv()->getHardwareContexts())
{
+ std::shared_ptr<StatePropagatorDataGpu> stateGpu;
CodePath codePath = context->getCodePath();
const bool supportedInput = pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
if (!supportedInput)
{
/* Testing the failure for the unsupported input */
- EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, coordinates, charges, box), NotImplementedError);
+ EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, coordinates, charges, box, stateGpu), NotImplementedError);
continue;
}
/* Running the test */
PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
- context->getPmeGpuProgram(), coordinates, charges, box);
+ context->getPmeGpuProgram(), coordinates, charges, box, stateGpu);
const bool computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
const bool spreadCharges = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
}
//! PME initialization with atom data
-PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec,
- CodePath mode,
- const gmx_device_info_t *gpuInfo,
- PmeGpuProgramHandle pmeGpuProgram,
- const CoordinatesVector &coordinates,
- const ChargesVector &charges,
- const Matrix3x3 &box
+PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec,
+ CodePath mode,
+ const gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram,
+ const CoordinatesVector &coordinates,
+ const ChargesVector &charges,
+ const Matrix3x3 &box,
+ std::shared_ptr<StatePropagatorDataGpu> stateGpu
)
{
const index atomCount = coordinates.size();
// We need to set atc->n for passing the size in the tests
atc->setNumAtoms(atomCount);
gmx_pme_reinit_atoms(pmeSafe.get(), atomCount, charges.data());
- pme_gpu_copy_input_coordinates(pmeSafe->gpu, as_rvec_array(coordinates.data()));
+
+ // TODO: Pin the host buffer and use async memory copies
+ stateGpu = std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(pmeSafe.get()),
+ pme_gpu_get_device_context(pmeSafe.get()),
+ GpuApiCallBehavior::Sync,
+ pme_gpu_get_padding_size(pmeSafe.get()));
+ stateGpu->reinit(atomCount, atomCount);
+ stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()), gmx::StatePropagatorDataGpu::AtomLocality::All);
+ pme_gpu_set_kernelparam_coordinates(pmeSafe->gpu, stateGpu->getCoordinates());
+
break;
default:
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_gpu_internal.h"
#include "gromacs/math/gmxcomplex.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/unique_cptr.h"
const Matrix3x3 &box = {{1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F}},
real ewaldCoeff_q = 0.0F, real ewaldCoeff_lj = 0.0F);
//! PME initialization with atom data and system box
-PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec,
- CodePath mode,
- const gmx_device_info_t *gpuInfo,
- PmeGpuProgramHandle pmeGpuProgram,
- const CoordinatesVector &coordinates,
- const ChargesVector &charges,
- const Matrix3x3 &box
+PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec,
+ CodePath mode,
+ const gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram,
+ const CoordinatesVector &coordinates,
+ const ChargesVector &charges,
+ const Matrix3x3 &box,
+ std::shared_ptr<StatePropagatorDataGpu> stateGpu
);
//! PME spline computation and charge spreading
void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode,
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/mdtypes/state.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
*
* \param[in] pmedata The PME structure
* \param[in] box The box matrix
- * \param[in] x Coordinate array
* \param[in] stepWork Step schedule flags
* \param[in] pmeFlags PME flags
* \param[in] useGpuForceReduction True if GPU-based force reduction is active this step
*/
static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
const matrix box,
- const rvec x[],
const StepWorkload &stepWork,
int pmeFlags,
bool useGpuForceReduction,
gmx_wallcycle_t wcycle)
{
pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
- pme_gpu_copy_coordinates_to_gpu(pmedata, x, wcycle);
pme_gpu_launch_spread(pmedata, wcycle);
}
int legacyFlags,
const DDBalanceRegionHandler &ddBalanceRegionHandler)
{
- int i, j;
- double mu[2*DIM];
- gmx_bool bFillGrid, bCalcCGCM;
- gmx_bool bUseGPU, bUseOrEmulGPU;
- nonbonded_verlet_t *nbv = fr->nbv.get();
- interaction_const_t *ic = fr->ic;
+ int i, j;
+ double mu[2*DIM];
+ gmx_bool bFillGrid, bCalcCGCM;
+ gmx_bool bUseGPU, bUseOrEmulGPU;
+ nonbonded_verlet_t *nbv = fr->nbv.get();
+ interaction_const_t *ic = fr->ic;
+ gmx::StatePropagatorDataGpu *stateGpu = fr->stateGpu;
// TODO remove the code below when the legacy flags are not in use anymore
/* modify force flag if not doing nonbonded */
}
#endif /* GMX_MPI */
+ // Coordinates on the device are needed if PME or BufferOps are offloaded.
+ // The local coordinates can be copied right away.
+ // NOTE: Consider moving this copy to right after they are updated and constrained,
+ // if the later is not offloaded.
+ if (useGpuPme || useGpuXBufOps == BufferOpsUseGpu::True)
+ {
+ if (stepWork.doNeighborSearch)
+ {
+ stateGpu->reinit(mdatoms->homenr, cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
+ if (useGpuPme)
+ {
+ // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
+ pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
+ }
+ }
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
+ }
+
if (useGpuPme)
{
- launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
+ launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
}
/* do gridding for pair search */
{
if (useGpuXBufOps == BufferOpsUseGpu::True)
{
- // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
- if (!useGpuPme)
- {
- nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::Local, false,
- x.unpaddedArrayRef());
- }
nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
- useGpuPme ? pme_gpu_get_device_x(fr->pmedata) : nbv->getDeviceCoordinates());
+ stateGpu->getCoordinates());
}
else
{
wallcycle_stop(wcycle, ewcNS);
if (ddUsesGpuDirectCommunication)
{
- rvec* d_x = static_cast<rvec *> (nbv->get_gpu_xrvec());
- rvec* d_f = static_cast<rvec *> (nbv->get_gpu_frvec());
- gpuHaloExchange->reinitHalo(d_x, d_f);
+ gpuHaloExchange->reinitHalo(stateGpu->getCoordinates(), stateGpu->getForces());
}
}
else
if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
{
//non-local part of coordinate buffer must be copied back to host for CPU work
- nbv->launch_copy_x_from_gpu(as_rvec_array(x.unpaddedArrayRef().data()), Nbnxm::AtomLocality::NonLocal);
+ stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
}
}
else
// The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
if (!useGpuPme && !ddUsesGpuDirectCommunication)
{
- nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::NonLocal, false,
- x.unpaddedArrayRef());
+ stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
}
nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::NonLocal, false,
- useGpuPme ? pme_gpu_get_device_x(fr->pmedata) : nbv->getDeviceCoordinates());
+ stateGpu->getCoordinates());
}
else
{
// which are a dependency for the GPU force reduction.
bool haveNonLocalForceContribInCpuBuffer = domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
- rvec *f = as_rvec_array(forceWithShiftForces.force().data());
if (haveNonLocalForceContribInCpuBuffer)
{
- nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::NonLocal);
+ stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
}
nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
- nbv->getDeviceForces(),
+ stateGpu->getForces(),
pme_gpu_get_device_f(fr->pmedata),
pme_gpu_get_f_ready_synchronizer(fr->pmedata),
useGpuPmeFReduction, haveNonLocalForceContribInCpuBuffer);
- nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::NonLocal);
+ stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
}
else
{
if (stepWork.computeForces)
{
- gmx::ArrayRef<gmx::RVec> force = forceOut.forceWithShiftForces().force();
- rvec *f = as_rvec_array(force.data());
if (useGpuForcesHaloExchange)
{
if (haveCpuLocalForces)
{
- nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
+ stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
}
- bool accumulateHaloForces = haveCpuLocalForces;
- gpuHaloExchange->communicateHaloForces(accumulateHaloForces);
+ gpuHaloExchange->communicateHaloForces(haveCpuLocalForces);
}
else
{
// - copy is not perfomed if GPU force halo exchange is active, because it would overwrite the result
// of the halo exchange. In that case the copy is instead performed above, before the exchange.
// These should be unified.
- rvec *f = as_rvec_array(forceWithShift.data());
if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
{
- nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
+ stateGpu->copyForcesToGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
}
if (useGpuForcesHaloExchange)
{
nbv->stream_local_wait_for_nonlocal();
}
nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
- nbv->getDeviceForces(),
+ stateGpu->getForces(),
pme_gpu_get_device_f(fr->pmedata),
pme_gpu_get_f_ready_synchronizer(fr->pmedata),
useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
- nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::Local);
+ // This function call synchronizes the local stream
nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
+ stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
}
else
{
#ifndef GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_H
#define GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_H
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/mdtypes/group.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/classhelpers.h"
* This will extract temperature scaling factors from tcstat, transform them into the plain
* array and call the normal integrate method.
*
- * \param[in] dt Timestep
+ * \param[in] dt Timestep.
* \param[in] updateVelocities If the velocities should be constrained.
* \param[in] computeVirial If virial should be updated.
* \param[out] virial Place to save virial tensor.
float dtPressureCouple,
const matrix velocityScalingMatrix);
- /*! \brief
- * Update data-structures (e.g. after NB search step).
+ /*! \brief Set the pointers and update data-structures (e.g. after NB search step).
*
- * \param[in] idef System topology
- * \param[in] md Atoms data.
- * \param[in] numTempScaleValues Number of temperature scaling groups. Zero for no temperature scaling.
+ * \param[in,out] d_x Device buffer with coordinates.
+ * \param[in,out] d_v Device buffer with velocities.
+ * \param[in] d_f Device buffer with forces.
+ * \param[in] idef System topology
+ * \param[in] md Atoms data.
+ * \param[in] numTempScaleValues Number of temperature scaling groups. Zero for no temperature scaling.
*/
- void set(const t_idef &idef,
- const t_mdatoms &md,
- int numTempScaleValues);
+ void set(DeviceBuffer<float> d_x,
+ DeviceBuffer<float> d_v,
+ DeviceBuffer<float> d_f,
+ const t_idef &idef,
+ const t_mdatoms &md,
+ int numTempScaleValues);
/*! \brief
* Update PBC data.
*/
void setPbc(const t_pbc *pbc);
- /*! \brief
- * Copy coordinates from CPU to GPU.
- *
- * The data are assumed to be in float3/fvec format (single precision).
- *
- * \param[in] h_x CPU pointer where coordinates should be copied from.
- */
- void copyCoordinatesToGpu(const rvec *h_x);
-
- /*! \brief
- * Copy velocities from CPU to GPU.
- *
- * The data are assumed to be in float3/fvec format (single precision).
- *
- * \param[in] h_v CPU pointer where velocities should be copied from.
- */
- void copyVelocitiesToGpu(const rvec *h_v);
-
- /*! \brief
- * Copy forces from CPU to GPU.
- *
- * The data are assumed to be in float3/fvec format (single precision).
- *
- * \param[in] h_f CPU pointer where forces should be copied from.
- */
- void copyForcesToGpu(const rvec *h_f);
-
- /*! \brief
- * Copy coordinates from GPU to CPU.
- *
- * The data are assumed to be in float3/fvec format (single precision).
- *
- * \param[out] h_xp CPU pointer where coordinates should be copied to.
- */
- void copyCoordinatesFromGpu(rvec *h_xp);
-
- /*! \brief
- * Copy velocities from GPU to CPU.
- *
- * The velocities are assumed to be in float3/fvec format (single precision).
- *
- * \param[in] h_v Pointer to velocities data.
- */
- void copyVelocitiesFromGpu(rvec *h_v);
-
- /*! \brief
- * Copy forces from GPU to CPU.
- *
- * The forces are assumed to be in float3/fvec format (single precision).
- *
- * \param[in] h_f Pointer to forces data.
- */
- void copyForcesFromGpu(rvec *h_f);
-
- /*! \brief
- * Set the internal GPU-memory d_x, d_xp and d_v pointers.
- *
- * Data is not copied. The data are assumed to be in float3/fvec format
- * (float3 is used internally, but the data layout should be identical).
- *
- * \param[in] d_x Pointer to the coordinates for the input (on GPU)
- * \param[in] d_xp Pointer to the coordinates for the output (on GPU)
- * \param[in] d_v Pointer to the velocities (on GPU)
- * \param[in] d_f Pointer to the forces (on GPU)
- */
- void setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f);
private:
class Impl;
GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
}
-void UpdateConstrainCuda::set(gmx_unused const t_idef &idef,
- gmx_unused const t_mdatoms &md,
- gmx_unused const int numTempScaleValues)
+void UpdateConstrainCuda::set(gmx_unused DeviceBuffer<float> d_x,
+ gmx_unused DeviceBuffer<float> d_v,
+ gmx_unused const DeviceBuffer<float> d_f,
+ gmx_unused const t_idef &idef,
+ gmx_unused const t_mdatoms &md,
+ gmx_unused const int numTempScaleValues)
{
GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
}
GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
}
-void UpdateConstrainCuda::copyCoordinatesToGpu(gmx_unused const rvec *h_x)
-{
- GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyVelocitiesToGpu(gmx_unused const rvec *h_v)
-{
- GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyForcesToGpu(gmx_unused const rvec *h_f)
-{
- GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyCoordinatesFromGpu(gmx_unused rvec *h_xp)
-{
- GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyVelocitiesFromGpu(gmx_unused rvec *h_v)
-{
- GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyForcesFromGpu(gmx_unused rvec *h_f)
-{
- GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::setXVFPointers(gmx_unused rvec *d_x,
- gmx_unused rvec *d_xp,
- gmx_unused rvec *d_v,
- gmx_unused rvec *d_f)
-{
- GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
} // namespace gmx
#endif /* GMX_GPU != GMX_GPU_CUDA */
}
}
+ // TODO: This should be eliminated
+ cudaMemcpy(d_x_, d_xp_, numAtoms_*sizeof(float3), cudaMemcpyDeviceToDevice);
+
return;
}
{
}
-void UpdateConstrainCuda::Impl::set(const t_idef &idef,
- const t_mdatoms &md,
- const int numTempScaleValues)
+void UpdateConstrainCuda::Impl::set(DeviceBuffer<float> d_x,
+ DeviceBuffer<float> d_v,
+ const DeviceBuffer<float> d_f,
+ const t_idef &idef,
+ const t_mdatoms &md,
+ const int numTempScaleValues)
{
+ GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
+ GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
+ GMX_ASSERT(d_f != nullptr, "Forces device buffer should not be null.");
+
+ d_x_ = reinterpret_cast<float3*>(d_x);
+ d_v_ = reinterpret_cast<float3*>(d_v);
+ d_f_ = reinterpret_cast<float3*>(d_f);
+
numAtoms_ = md.nr;
- reallocateDeviceBuffer(&d_x_, numAtoms_, &numX_, &numXAlloc_, nullptr);
reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, nullptr);
- reallocateDeviceBuffer(&d_v_, numAtoms_, &numV_, &numVAlloc_, nullptr);
- reallocateDeviceBuffer(&d_f_, numAtoms_, &numF_, &numFAlloc_, nullptr);
reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_,
&numInverseMasses_, &numInverseMassesAlloc_, nullptr);
settleCuda_->setPbc(pbc);
}
-void UpdateConstrainCuda::Impl::copyCoordinatesToGpu(const rvec *h_x)
-{
- copyToDeviceBuffer(&d_x_, (float3*)h_x, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyVelocitiesToGpu(const rvec *h_v)
-{
- copyToDeviceBuffer(&d_v_, (float3*)h_v, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyForcesToGpu(const rvec *h_f)
-{
- copyToDeviceBuffer(&d_f_, (float3*)h_f, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyCoordinatesFromGpu(rvec *h_xp)
-{
- copyFromDeviceBuffer((float3*)h_xp, &d_xp_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyVelocitiesFromGpu(rvec *h_v)
-{
- copyFromDeviceBuffer((float3*)h_v, &d_v_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyForcesFromGpu(rvec *h_f)
-{
- copyFromDeviceBuffer((float3*)h_f, &d_f_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f)
-{
- d_x_ = (float3*)d_x;
- d_xp_ = (float3*)d_xp;
- d_v_ = (float3*)d_v;
- d_f_ = (float3*)d_f;
-}
-
UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec &ir,
const gmx_mtop_t &mtop,
const void *commandStream)
gmx::ArrayRef<const t_grp_tcstat> tcstat,
const bool doPressureCouple,
const float dtPressureCouple,
- const matrix pRVScalingMatrix)
+ const matrix velocityScalingMatrix)
{
impl_->integrate(dt, updateVelocities, computeVirial, virialScaled,
doTempCouple, tcstat,
- doPressureCouple, dtPressureCouple, pRVScalingMatrix);
+ doPressureCouple, dtPressureCouple, velocityScalingMatrix);
}
-void UpdateConstrainCuda::set(const t_idef &idef,
- const t_mdatoms &md,
- const int numTempScaleValues)
+void UpdateConstrainCuda::set(DeviceBuffer<float> d_x,
+ DeviceBuffer<float> d_v,
+ const DeviceBuffer<float> d_f,
+ const t_idef &idef,
+ const t_mdatoms &md,
+ const int numTempScaleValues)
{
- impl_->set(idef, md, numTempScaleValues);
+ impl_->set(d_x, d_v, d_f, idef, md, numTempScaleValues);
}
void UpdateConstrainCuda::setPbc(const t_pbc *pbc)
impl_->setPbc(pbc);
}
-void UpdateConstrainCuda::copyCoordinatesToGpu(const rvec *h_x)
-{
- impl_->copyCoordinatesToGpu(h_x);
-}
-
-void UpdateConstrainCuda::copyVelocitiesToGpu(const rvec *h_v)
-{
- impl_->copyVelocitiesToGpu(h_v);
-}
-
-void UpdateConstrainCuda::copyForcesToGpu(const rvec *h_f)
-{
- impl_->copyForcesToGpu(h_f);
-}
-
-void UpdateConstrainCuda::copyCoordinatesFromGpu(rvec *h_xp)
-{
- impl_->copyCoordinatesFromGpu(h_xp);
-}
-
-void UpdateConstrainCuda::copyVelocitiesFromGpu(rvec *h_v)
-{
- impl_->copyVelocitiesFromGpu(h_v);
-}
-
-void UpdateConstrainCuda::copyForcesFromGpu(rvec *h_f)
-{
- impl_->copyForcesFromGpu(h_f);
-}
-
-void UpdateConstrainCuda::setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f)
-{
- impl_->setXVFPointers(d_x, d_xp, d_v, d_f);
-}
-
} //namespace gmx
* 2. This is the temperature coupling step.
* Parameters virial/lambdas can be nullptr if computeVirial/doTempCouple are false.
*
- * \param[in] dt Timestep
+ * \param[in] dt Timestep.
* \param[in] updateVelocities If the velocities should be constrained.
* \param[in] computeVirial If virial should be updated.
* \param[out] virial Place to save virial tensor.
* \param[in] dtPressureCouple Period between pressure coupling steps
* \param[in] velocityScalingMatrix Parrinello-Rahman velocity scaling matrix
*/
- void integrate(const real dt,
- const bool updateVelocities,
- const bool computeVirial,
+ void integrate(real dt,
+ bool updateVelocities,
+ bool computeVirial,
tensor virial,
- const bool doTempCouple,
+ bool doTempCouple,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- const bool doPressureCouple,
- const float dtPressureCouple,
+ bool doPressureCouple,
+ float dtPressureCouple,
const matrix velocityScalingMatrix);
- /*! \brief
- * Update data-structures (e.g. after NB search step).
+ /*! \brief Set the pointers and update data-structures (e.g. after NB search step).
*
- * \param[in] idef System topology
- * \param[in] md Atoms data.
- * \param[in] numTempScaleValues Number of temperature scaling groups. Set zero for no temperature coupling.
+ * \param[in,out] d_x Device buffer with coordinates.
+ * \param[in,out] d_v Device buffer with velocities.
+ * \param[in] d_f Device buffer with forces.
+ * \param[in] idef System topology
+ * \param[in] md Atoms data.
+ * \param[in] numTempScaleValues Number of temperature scaling groups. Set zero for no temperature coupling.
*/
- void set(const t_idef &idef,
- const t_mdatoms &md,
- const int numTempScaleValues);
+ void set(DeviceBuffer<float> d_x,
+ DeviceBuffer<float> d_v,
+ const DeviceBuffer<float> d_f,
+ const t_idef &idef,
+ const t_mdatoms &md,
+ const int numTempScaleValues);
/*! \brief
* Update PBC data.
*/
void setPbc(const t_pbc *pbc);
- /*! \brief
- * Copy coordinates from CPU to GPU.
- *
- * The data are assumed to be in float3/fvec format (single precision).
- *
- * \param[in] h_x CPU pointer where coordinates should be copied from.
- */
- void copyCoordinatesToGpu(const rvec *h_x);
-
- /*! \brief
- * Copy velocities from CPU to GPU.
- *
- * The data are assumed to be in float3/fvec format (single precision).
- *
- * \param[in] h_v CPU pointer where velocities should be copied from.
- */
- void copyVelocitiesToGpu(const rvec *h_v);
-
- /*! \brief
- * Copy forces from CPU to GPU.
- *
- * The data are assumed to be in float3/fvec format (single precision).
- *
- * \param[in] h_f CPU pointer where forces should be copied from.
- */
- void copyForcesToGpu(const rvec *h_f);
-
- /*! \brief
- * Copy coordinates from GPU to CPU.
- *
- * The data are assumed to be in float3/fvec format (single precision).
- *
- * \param[out] h_xp CPU pointer where coordinates should be copied to.
- */
- void copyCoordinatesFromGpu(rvec *h_xp);
-
- /*! \brief
- * Copy velocities from GPU to CPU.
- *
- * The velocities are assumed to be in float3/fvec format (single precision).
- *
- * \param[in] h_v Pointer to velocities data.
- */
- void copyVelocitiesFromGpu(rvec *h_v);
-
- /*! \brief
- * Copy forces from GPU to CPU.
- *
- * The forces are assumed to be in float3/fvec format (single precision).
- *
- * \param[in] h_f Pointer to forces data.
- */
- void copyForcesFromGpu(rvec *h_f);
-
- /*! \brief
- * Set the internal GPU-memory x, xprime and v pointers.
- *
- * Data is not copied. The data are assumed to be in float3/fvec format
- * (float3 is used internally, but the data layout should be identical).
- *
- * \param[in] d_x Pointer to the coordinates for the input (on GPU)
- * \param[in] d_xp Pointer to the coordinates for the output (on GPU)
- * \param[in] d_v Pointer to the velocities (on GPU)
- * \param[in] d_f Pointer to the forces (on GPU)
- */
- void setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f);
-
private:
//! CUDA stream
//! Number of atoms
int numAtoms_;
- //! Coordinates before the timestep (on GPU)
+ //! Local copy of the pointer to the device positions buffer
float3 *d_x_;
- //! Number of elements in coordinates buffer
- int numX_ = -1;
- //! Allocation size for the coordinates buffer
- int numXAlloc_ = -1;
+ //! Local copy of the pointer to the device velocities buffer
+ float3 *d_v_;
+ //! Local copy of the pointer to the device forces buffer
+ float3 *d_f_;
- //! Coordinates after the timestep (on GPU).
+ //! Device buffer for intermediate positions (maintained internally)
float3 *d_xp_;
//! Number of elements in shifted coordinates buffer
int numXp_ = -1;
//! Allocation size for the shifted coordinates buffer
int numXpAlloc_ = -1;
- //! Velocities of atoms (on GPU)
- float3 *d_v_;
- //! Number of elements in velocities buffer
- int numV_ = -1;
- //! Allocation size for the velocities buffer
- int numVAlloc_ = -1;
-
- //! Forces, exerted by atoms (on GPU)
- float3 *d_f_;
- //! Number of elements in forces buffer
- int numF_ = -1;
- //! Allocation size for the forces buffer
- int numFAlloc_ = -1;
//! 1/mass for all atoms (GPU)
real *d_inverseMasses_;
#include "gromacs/mdtypes/observableshistory.h"
#include "gromacs/mdtypes/pullhistory.h"
#include "gromacs/mdtypes/state.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/modularsimulator/energyelement.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
upd.setNumAtoms(state->natoms);
}
+/*****************************************************************************************/
+// TODO: The following block of code should be refactored, once:
+// 1. We have the useGpuForBufferOps variable set and available here and in do_force(...)
+// 2. The proper GPU syncronization is introduced, so that the H2D and D2H data copies can be performed in the separate
+// stream owned by the StatePropagatorDataGpu
bool useGpuForPme = (fr->pmedata != nullptr) && (pme_run_mode(fr->pmedata) != PmeRunMode::CPU);
bool useGpuForNonbonded = fr->nbv->useGpu();
+ // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
+ bool useGpuForBufferOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
if (useGpuForUpdate)
{
integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, nullptr);
}
- if (fr->nbv->useGpu())
+ if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
{
- changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
+ changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
}
+ if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
+ {
+ changePinningPolicy(&f, PinningPolicy::PinnedIfSupported);
+ }
+ if (useGpuForUpdate)
+ {
+ changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
+ }
+/*****************************************************************************************/
// NOTE: The global state is no longer used at this point.
// But state_global is still used as temporary storage space for writing
if (useGpuForUpdate)
{
+ StatePropagatorDataGpu *stateGpu = fr->stateGpu;
if (bNS)
{
- integrator->set(top.idef, *mdatoms, ekind->ngtc);
+ integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), stateGpu->getForces(),
+ top.idef, *mdatoms, ekind->ngtc);
t_pbc pbc;
set_pbc(&pbc, epbcXYZ, state->box);
integrator->setPbc(&pbc);
}
- integrator->copyCoordinatesToGpu(state->x.rvec_array());
- integrator->copyVelocitiesToGpu(state->v.rvec_array());
- integrator->copyForcesToGpu(as_rvec_array(f.data()));
+
+ stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
+ stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
+ stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
bool doTempCouple = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
bool doPressureCouple = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
integrator->integrate(ir->delta_t, true, bCalcVir, shake_vir,
doTempCouple, ekind->tcstat,
doPressureCouple, ir->nstpcouple*ir->delta_t, M);
-
- integrator->copyCoordinatesFromGpu(state->x.rvec_array());
- integrator->copyVelocitiesFromGpu(state->v.rvec_array());
+ stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
+ stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
+ stateGpu->synchronizeStream();
}
else
{
#include "gromacs/mdtypes/observableshistory.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/mdtypes/state.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist_tuning.h"
fcd->orires.nr != 0,
fcd->disres.nsystems != 0);
+ const void *commandStream = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_stream(fr->pmedata) : nullptr;
+ const void *gpuContext = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_context(fr->pmedata) : nullptr;
+ const int paddingSize = pme_gpu_get_padding_size(fr->pmedata);
+
+ const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
+ false,
+ inputrec, doRerun, vsite.get(), ms, replExParams,
+ fcd, static_cast<int>(filenames.size()), filenames.data(),
+ &observablesHistory, membed);
+
+ const bool useModularSimulator = inputIsCompatibleWithModularSimulator && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr);
+ GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync;
+
+ // We initialize GPU state even for the CPU runs so we will have a more verbose
+ // error if someone will try accessing it from the CPU codepath
+ gmx::StatePropagatorDataGpu stateGpu(commandStream,
+ gpuContext,
+ transferKind,
+ paddingSize);
+ fr->stateGpu = &stateGpu;
+
// TODO This is not the right place to manage the lifetime of
// this data structure, but currently it's the easiest way to
// make it work.
SimulatorBuilder simulatorBuilder;
// build and run simulator object based on user-input
- const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
- false,
- inputrec, doRerun, vsite.get(), ms, replExParams,
- fcd, static_cast<int>(filenames.size()), filenames.data(),
- &observablesHistory, membed);
auto simulator = simulatorBuilder.build(
inputIsCompatibleWithModularSimulator,
fplog, cr, ms, mdlog, static_cast<int>(filenames.size()), filenames.data(),
# To help us fund GROMACS development, we humbly ask that you cite
# the research papers on the package. Check out http://www.gromacs.org.
-file(GLOB MDTYPES_SOURCES *.cpp)
+file(GLOB MDTYPES_SOURCES
+ df_history.cpp
+ group.cpp
+ iforceprovider.cpp
+ inputrec.cpp
+ md_enums.cpp
+ observableshistory.cpp
+ state.cpp)
+
+if(GMX_USE_CUDA OR GMX_USE_OPENCL)
+ gmx_add_libgromacs_sources(
+ state_propagator_data_gpu_impl_gpu.cpp
+ )
+ if(GMX_USE_CUDA)
+ gmx_compile_cpp_as_cuda(
+ state_propagator_data_gpu_impl_gpu.cpp
+ )
+ endif()
+else()
+ gmx_add_libgromacs_sources(
+ state_propagator_data_gpu_impl.cpp
+ )
+endif()
+
+
set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${MDTYPES_SOURCES} PARENT_SCOPE)
if(GMX_INSTALL_LEGACY_API)
{
class GpuBonded;
class ForceProviders;
+class StatePropagatorDataGpu;
}
/* macros for the cginfo data in forcerec
struct ewald_corr_thread_t *ewc_t = nullptr;
gmx::ForceProviders *forceProviders = nullptr;
+
+ // The stateGpu object is created in runner, forcerec just keeps the copy of the pointer.
+ // TODO: This is not supposed to be here. StatePropagatorDataGpu should be a part of
+ // general StatePropagatorData object that is passed around
+ gmx::StatePropagatorDataGpu *stateGpu = nullptr;
};
/* Important: Starting with Gromacs-4.6, the values of c6 and c12 in the nbfp array have
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declaration of interfaces for GPU state data propagator object.
+ *
+ * This object stores and manages positions, velocities and forces for
+ * all particles in the system on the GPU.
+ *
+ * \todo Add cycle counters.
+ * \todo Add synchronization points.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \inlibraryapi
+ * \ingroup module_mdtypes
+ */
+#ifndef GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
+#define GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
+
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/classhelpers.h"
+
+namespace gmx
+{
+
+class StatePropagatorDataGpu
+{
+ public:
+
+ /*! \brief Atom locality indicator: local, non-local, all.
+ *
+ * \todo This should be managed by a separate object, since the localities
+ * are used here and in buffer ops.
+ */
+ enum class AtomLocality : int
+ {
+ Local = 0, //!< Local atoms
+ NonLocal = 1, //!< Non-local atoms
+ All = 2, //!< Both local and non-local atoms
+ Count = 3 //!< The number of atom locality types
+ };
+
+ /*! \brief Constructor
+ *
+ * The buffers are reallocated only at the reinit call, the padding is
+ * used there for the coordinates buffer. It is needed for PME and added at
+ * the end of the buffer. It is assumed that if the rank has PME duties on the
+ * GPU, all coordinates are copied to the GPU and hence, for this rank, the
+ * coordinates buffer is not split into local and non-local ranges. For other
+ * ranks, the padding size is zero. This works because only one rank ever does
+ * PME work on the GPU, and if that rank also does PP work that is the only
+ * rank. So all coordinates are always transferred.
+ *
+ * \note \p commandStream and \p gpuContext are allowed to be nullptr if
+ * StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME
+ * does not run on the GPU).
+ *
+ * \todo Make \p CommandStream visible in the CPU parts of the code so we
+ * will not have to pass a void*.
+ * \todo Make \p Context visible in CPU parts of the code so we will not
+ * have to pass a void*.
+ *
+ * \param[in] commandStream GPU stream, nullptr allowed.
+ * \param[in] gpuContext GPU context, nullptr allowed.
+ * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
+ * \param[in] paddingSize Padding size for coordinates buffer.
+ */
+ StatePropagatorDataGpu(const void *commandStream,
+ const void *gpuContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize);
+
+ ~StatePropagatorDataGpu();
+
+ /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+ *
+ * The coordinates buffer is reallocated with the padding added at the end. The
+ * size of padding is set by the constructor.
+ *
+ * \param[in] numAtomsLocal Number of atoms in local domain.
+ * \param[in] numAtomsAll Total number of atoms to handle.
+ */
+ void reinit(int numAtomsLocal, int numAtomsAll);
+
+ /*! \brief Returns the range of atoms to be copied based on the copy type (all, local or non-local).
+ *
+ * \todo There are at least three versions of the function with this functionality in the code:
+ * this one and two more in NBNXM. These should be unified in a shape of a general function
+ * in DD.
+ *
+ * \param[in] atomLocality If all, local or non-local ranges are needed.
+ *
+ * \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
+ */
+ std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+
+
+ /*! \brief Get the positions buffer on the GPU.
+ *
+ * \returns GPU positions buffer.
+ */
+ DeviceBuffer<float> getCoordinates();
+
+ /*! \brief Copy positions to the GPU memory.
+ *
+ * \param[in] h_x Positions in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec> h_x,
+ AtomLocality atomLocality);
+
+ /*! \brief Copy positions from the GPU memory.
+ *
+ * \param[in] h_x Positions buffer in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+ AtomLocality atomLocality);
+
+
+ /*! \brief Get the velocities buffer on the GPU.
+ *
+ * \returns GPU velocities buffer.
+ */
+ DeviceBuffer<float> getVelocities();
+
+ /*! \brief Copy velocities to the GPU memory.
+ *
+ * \param[in] h_v Velocities in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec> h_v,
+ AtomLocality atomLocality);
+
+ /*! \brief Copy velocities from the GPU memory.
+ *
+ * \param[in] h_v Velocities buffer in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v,
+ AtomLocality atomLocality);
+
+
+ /*! \brief Get the force buffer on the GPU.
+ *
+ * \returns GPU force buffer.
+ */
+ DeviceBuffer<float> getForces();
+
+ /*! \brief Copy forces to the GPU memory.
+ *
+ * \param[in] h_f Forces in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f,
+ AtomLocality atomLocality);
+
+ /*! \brief Copy forces from the GPU memory.
+ *
+ * \param[in] h_f Forces buffer in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f,
+ AtomLocality atomLocality);
+ /*! \brief Synchronize the underlying GPU stream
+ */
+ void synchronizeStream();
+
+ /*! \brief Getter for the number of local atoms.
+ *
+ * \returns The number of local atoms.
+ */
+ int numAtomsLocal();
+
+ /*! \brief Getter for the total number of atoms.
+ *
+ * \returns The total number of atoms.
+ */
+ int numAtomsAll();
+
+ private:
+ class Impl;
+ gmx::PrivateImplPointer<Impl> impl_;
+
+};
+
+} // namespace gmx
+
+#endif // GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief The CPU stub for the state propagator data class.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdtypes
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
+
+#if GMX_GPU == GMX_GPU_NONE
+namespace gmx
+{
+
+class StatePropagatorDataGpu::Impl
+{
+};
+
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void * /* commandStream */,
+ const void * /* gpuContext */,
+ GpuApiCallBehavior /* transferKind */,
+ int /* paddingSize */)
+ : impl_(nullptr)
+{
+}
+
+StatePropagatorDataGpu::~StatePropagatorDataGpu()
+{
+}
+
+void StatePropagatorDataGpu::reinit(int /* numAtomsLocal */,
+ int /* numAtomsAll */)
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ return std::make_tuple(0, 0);
+}
+
+DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ return DeviceBuffer<float> {};
+}
+
+void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_x */,
+ AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> /* h_x */,
+ AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ return DeviceBuffer<float> {};
+}
+
+void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_v */,
+ AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> /* h_v */,
+ AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ return DeviceBuffer<float> {};
+}
+
+void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_f */,
+ AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> /* h_f */,
+ AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::synchronizeStream()
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+int StatePropagatorDataGpu::numAtomsLocal()
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ return 0;
+}
+
+int StatePropagatorDataGpu::numAtomsAll()
+{
+ GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+ return 0;
+}
+
+} // namespace gmx
+
+#endif // GMX_GPU == GMX_GPU_NONE
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declaration of low-level functions and fields of GPU state propagator object.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdtypes
+ */
+#ifndef GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H
+#define GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H
+
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
+#include "gromacs/utility/classhelpers.h"
+
+namespace gmx
+{
+
+class StatePropagatorDataGpu::Impl
+{
+ public:
+
+ Impl();
+
+
+ /*! \brief Constructor
+ *
+ * The buffers are reallocated only at the reinit call, the padding is
+ * used there for the coordinates buffer. It is needed for PME and added at
+ * the end of the buffer. It is assumed that if the rank has PME duties on the
+ * GPU, all coordinates are copied to the GPU and hence, for this rank, the
+ * coordinates buffer is not split into local and non-local ranges. For other
+ * ranks, the padding size is zero. This works because only one rank ever does
+ * PME work on the GPU, and if that rank also does PP work that is the only
+ * rank. So all coordinates are always transferred.
+ *
+ * \note \p commandStream and \p gpuContext are allowed to be nullptr if
+ * StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME
+ * does not run on the GPU).
+ *
+ * \todo Make CommandStream visible in the CPU parts of the code so we
+ * will not have to pass a void*.
+ * \todo Make a Context object visible in CPU parts of the code so we
+ * will not have to pass a void*.
+ *
+ * \param[in] commandStream GPU stream, nullptr allowed.
+ * \param[in] gpuContext GPU context, nullptr allowed.
+ * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
+ * \param[in] paddingSize Padding size for coordinates buffer.
+ */
+ Impl(const void *commandStream,
+ const void *gpuContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize);
+
+ ~Impl();
+
+
+ /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+ *
+ * The coordinates buffer is reallocated with the padding added at the end. The
+ * size of padding is set by the constructor.
+ *
+ * \param[in] numAtomsLocal Number of atoms in local domain.
+ * \param[in] numAtomsAll Total number of atoms to handle.
+ */
+ void reinit(int numAtomsLocal, int numAtomsAll);
+
+ /*! \brief Returns the range of atoms to be copied based on the copy type (all, local or non-local).
+ *
+ * \todo There are at least three versions of the function with this functionality in the code:
+ * this one and two more in NBNXM. These should be unified in a shape of a general function
+ * in DD.
+ *
+ * \param[in] atomLocality If all, local or non-local ranges are needed.
+ *
+ * \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
+ */
+ std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+
+
+ /*! \brief Get the positions buffer on the GPU.
+ *
+ * \returns GPU positions buffer.
+ */
+ DeviceBuffer<float> getCoordinates();
+
+ /*! \brief Copy positions to the GPU memory.
+ *
+ * \param[in] h_x Positions in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec> h_x,
+ AtomLocality atomLocality);
+
+ /*! \brief Copy positions from the GPU memory.
+ *
+ * \param[in] h_x Positions buffer in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+ AtomLocality atomLocality);
+
+
+ /*! \brief Get the velocities buffer on the GPU.
+ *
+ * \returns GPU velocities buffer.
+ */
+ DeviceBuffer<float> getVelocities();
+
+ /*! \brief Copy velocities to the GPU memory.
+ *
+ * \param[in] h_v Velocities in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec> h_v,
+ AtomLocality atomLocality);
+
+ /*! \brief Copy velocities from the GPU memory.
+ *
+ * \param[in] h_v Velocities buffer in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v,
+ AtomLocality atomLocality);
+
+
+ /*! \brief Get the force buffer on the GPU.
+ *
+ * \returns GPU force buffer.
+ */
+ DeviceBuffer<float> getForces();
+
+ /*! \brief Copy forces to the GPU memory.
+ *
+ * \param[in] h_f Forces in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f,
+ AtomLocality atomLocality);
+
+ /*! \brief Copy forces from the GPU memory.
+ *
+ * \param[in] h_f Forces buffer in the host memory.
+ * \param[in] atomLocality Locality of the particles to copy.
+ */
+ void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f,
+ AtomLocality atomLocality);
+
+ /*! \brief Synchronize the underlying GPU stream
+ */
+ void synchronizeStream();
+
+ /*! \brief Getter for the number of local atoms.
+ *
+ * \returns The number of local atoms.
+ */
+ int numAtomsLocal();
+
+ /*! \brief Getter for the total number of atoms.
+ *
+ * \returns The total number of atoms.
+ */
+ int numAtomsAll();
+
+ private:
+
+ /*! \brief GPU stream.
+ * \todo The stream should be set to non-nullptr once the synchronization points are restored
+ */
+ CommandStream commandStream_ = nullptr;
+ /*! \brief GPU context (for OpenCL builds)
+ * \todo Make a Context class usable in CPU code
+ */
+ Context gpuContext_ = nullptr;
+ //! Default GPU calls behavior
+ GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async;
+ //! Padding size for the coordinates buffer
+ int paddingSize_ = 0;
+
+ //! Number of local atoms
+ int numAtomsLocal_ = -1;
+ //! Total number of atoms
+ int numAtomsAll_ = -1;
+
+ //! Device positions buffer
+ DeviceBuffer<float> d_x_;
+ //! Number of particles saved in the positions buffer
+ int d_xSize_ = -1;
+ //! Allocation size for the positions buffer
+ int d_xCapacity_ = -1;
+
+ //! Device velocities buffer
+ DeviceBuffer<float> d_v_;
+ //! Number of particles saved in the velocities buffer
+ int d_vSize_ = -1;
+ //! Allocation size for the velocities buffer
+ int d_vCapacity_ = -1;
+
+ //! Device force buffer
+ DeviceBuffer<float> d_f_;
+ //! Number of particles saved in the force buffer
+ int d_fSize_ = -1;
+ //! Allocation size for the force buffer
+ int d_fCapacity_ = -1;
+
+ /*! \brief Performs the copy of data from host to device buffer.
+ *
+ * \todo Template on locality.
+ *
+ * \param[in,out] d_data Device-side buffer.
+ * \param[in,out] h_data Host-side buffer.
+ * \param[in] dataSize Device-side data allocation size.
+ * \param[in] atomLocality If all, local or non-local ranges should be copied.
+ */
+ void copyToDevice(DeviceBuffer<float> d_data,
+ const gmx::ArrayRef<const gmx::RVec> h_data,
+ int dataSize,
+ AtomLocality atomLocality);
+
+ /*! \brief Performs the copy of data from device to host buffer.
+ *
+ * \param[in,out] h_data Host-side buffer.
+ * \param[in,out] d_data Device-side buffer.
+ * \param[in] dataSize Device-side data allocation size.
+ * \param[in] atomLocality If all, local or non-local ranges should be copied.
+ */
+ void copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
+ DeviceBuffer<float> d_data,
+ int dataSize,
+ AtomLocality atomLocality);
+};
+
+} // namespace gmx
+
+#endif // GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Definitions of interfaces for GPU state data propagator object.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdtypes
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#if GMX_GPU != GMX_GPU_NONE
+
+#if GMX_GPU == GMX_GPU_CUDA
+#include "gromacs/gpu_utils/cudautils.cuh"
+#endif
+#include "gromacs/gpu_utils/devicebuffer.h"
+#if GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/oclutils.h"
+#endif
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
+#include "gromacs/utility/classhelpers.h"
+
+#include "state_propagator_data_gpu_impl.h"
+
+namespace gmx
+{
+
+StatePropagatorDataGpu::Impl::Impl(gmx_unused const void *commandStream,
+ gmx_unused const void *gpuContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize) :
+ transferKind_(transferKind),
+ paddingSize_(paddingSize)
+{
+
+ GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
+
+ // Set the stream-context pair for the OpenCL builds,
+ // use the nullptr stream for CUDA builds
+#if GMX_GPU == GMX_GPU_OPENCL
+ if (commandStream != nullptr)
+ {
+ commandStream_ = *static_cast<const CommandStream*>(commandStream);
+ }
+ if (gpuContext != nullptr)
+ {
+ gpuContext_ = *static_cast<const Context*>(gpuContext);
+ }
+#endif
+
+}
+
+StatePropagatorDataGpu::Impl::~Impl()
+{
+}
+
+void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
+{
+#if GMX_GPU == GMX_GPU_OPENCL
+ GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds.");
+#endif
+ numAtomsLocal_ = numAtomsLocal;
+ numAtomsAll_ = numAtomsAll;
+
+ int numAtomsPadded;
+ if (paddingSize_ > 0)
+ {
+ numAtomsPadded = ((numAtomsAll_ + paddingSize_ - 1 ) / paddingSize_ )*paddingSize_;
+ }
+ else
+ {
+ numAtomsPadded = numAtomsAll_;
+ }
+
+ reallocateDeviceBuffer(&d_x_, DIM*numAtomsPadded, &d_xSize_, &d_xCapacity_, gpuContext_);
+
+ const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
+ if (paddingAllocationSize > 0)
+ {
+ clearDeviceBufferAsync(&d_x_, DIM*numAtomsAll_, DIM*paddingAllocationSize, commandStream_);
+ }
+
+ reallocateDeviceBuffer(&d_v_, DIM*numAtomsAll_, &d_vSize_, &d_vCapacity_, gpuContext_);
+ reallocateDeviceBuffer(&d_f_, DIM*numAtomsAll_, &d_fSize_, &d_fCapacity_, gpuContext_);
+
+}
+
+std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
+{
+ int atomsStartAt = 0;
+ int numAtomsToCopy = 0;
+ switch (atomLocality)
+ {
+ case AtomLocality::All:
+ atomsStartAt = 0;
+ numAtomsToCopy = numAtomsAll_;
+ break;
+ case AtomLocality::Local:
+ atomsStartAt = 0;
+ numAtomsToCopy = numAtomsLocal_;
+ break;
+ case AtomLocality::NonLocal:
+ atomsStartAt = numAtomsLocal_;
+ numAtomsToCopy = numAtomsAll_ - numAtomsLocal_;
+ break;
+ default:
+ GMX_RELEASE_ASSERT(false, "Wrong range of atoms requested in GPU state data manager. Should be All, Local or NonLocal.");
+ }
+ GMX_ASSERT(atomsStartAt >= 0, "The first elemtnt to copy has negative index. Probably, the GPU propagator state was not initialized.");
+ GMX_ASSERT(numAtomsToCopy >= 0, "Number of atoms to copy is negative. Probably, the GPU propagator state was not initialized.");
+ return std::make_tuple(atomsStartAt, numAtomsToCopy);
+}
+
+void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float> d_data,
+ const gmx::ArrayRef<const gmx::RVec> h_data,
+ int dataSize,
+ AtomLocality atomLocality)
+{
+
+#if GMX_GPU == GMX_GPU_OPENCL
+ GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds.");
+#endif
+
+ GMX_UNUSED_VALUE(dataSize);
+
+ GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
+
+ int atomsStartAt, numAtomsToCopy;
+ std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
+
+ int elementsStartAt = atomsStartAt*DIM;
+ int numElementsToCopy = numAtomsToCopy*DIM;
+
+ if (numAtomsToCopy != 0)
+ {
+ GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range.");
+ GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range.");
+
+ // TODO: Use the proper stream
+ copyToDeviceBuffer(&d_data, reinterpret_cast<const float *>(&h_data.data()[atomsStartAt]),
+ elementsStartAt, numElementsToCopy,
+ commandStream_, transferKind_, nullptr);
+ }
+}
+
+void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
+ DeviceBuffer<float> d_data,
+ int dataSize,
+ AtomLocality atomLocality)
+{
+
+#if GMX_GPU == GMX_GPU_OPENCL
+ GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds.");
+#endif
+
+ GMX_UNUSED_VALUE(dataSize);
+
+ GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
+
+ int atomsStartAt, numAtomsToCopy;
+ std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
+
+ int elementsStartAt = atomsStartAt*DIM;
+ int numElementsToCopy = numAtomsToCopy*DIM;
+
+ if (numAtomsToCopy != 0)
+ {
+ GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range.");
+ GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range.");
+
+ // TODO: Use the proper stream
+ copyFromDeviceBuffer(reinterpret_cast<float*>(&h_data.data()[atomsStartAt]), &d_data,
+ elementsStartAt, numElementsToCopy,
+ commandStream_, transferKind_, nullptr);
+
+ }
+}
+
+DeviceBuffer<float> StatePropagatorDataGpu::Impl::getCoordinates()
+{
+ return d_x_;
+}
+
+void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
+ AtomLocality atomLocality)
+{
+ copyToDevice(d_x_, h_x, d_xSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+ AtomLocality atomLocality)
+{
+ copyFromDevice(h_x, d_x_, d_xSize_, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::Impl::getVelocities()
+{
+ return d_v_;
+}
+
+void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
+ AtomLocality atomLocality)
+{
+ copyToDevice(d_v_, h_v, d_vSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v,
+ AtomLocality atomLocality)
+{
+ copyFromDevice(h_v, d_v_, d_vSize_, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::Impl::getForces()
+{
+ return d_f_;
+}
+
+void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
+ AtomLocality atomLocality)
+{
+ copyToDevice(d_f_, h_f, d_fSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f,
+ AtomLocality atomLocality)
+{
+ copyFromDevice(h_f, d_f_, d_fSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::synchronizeStream()
+{
+ gpuStreamSynchronize(commandStream_);
+}
+
+int StatePropagatorDataGpu::Impl::numAtomsLocal()
+{
+ return numAtomsLocal_;
+}
+
+int StatePropagatorDataGpu::Impl::numAtomsAll()
+{
+ return numAtomsAll_;
+}
+
+
+
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void *commandStream,
+ const void *gpuContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize)
+ : impl_(new Impl(commandStream,
+ gpuContext,
+ transferKind,
+ paddingSize))
+{
+}
+
+StatePropagatorDataGpu::~StatePropagatorDataGpu() = default;
+
+
+void StatePropagatorDataGpu::reinit(int numAtomsLocal, int numAtomsAll)
+{
+ return impl_->reinit(numAtomsLocal, numAtomsAll);
+}
+
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
+{
+ return impl_->getAtomRangesFromAtomLocality(atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+{
+ return impl_->getCoordinates();
+}
+
+void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
+ AtomLocality atomLocality)
+{
+ return impl_->copyCoordinatesToGpu(h_x, atomLocality);
+}
+
+void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec> h_x,
+ AtomLocality atomLocality)
+{
+ return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+{
+ return impl_->getVelocities();
+}
+
+void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
+ AtomLocality atomLocality)
+{
+ return impl_->copyVelocitiesToGpu(h_v, atomLocality);
+}
+
+void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<RVec> h_v,
+ AtomLocality atomLocality)
+{
+ return impl_->copyVelocitiesFromGpu(h_v, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+{
+ return impl_->getForces();
+}
+
+void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
+ AtomLocality atomLocality)
+{
+ return impl_->copyForcesToGpu(h_f, atomLocality);
+}
+
+void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec> h_f,
+ AtomLocality atomLocality)
+{
+ return impl_->copyForcesFromGpu(h_f, atomLocality);
+}
+
+void StatePropagatorDataGpu::synchronizeStream()
+{
+ return impl_->synchronizeStream();
+}
+
+int StatePropagatorDataGpu::numAtomsLocal()
+{
+ return impl_->numAtomsLocal();
+}
+
+int StatePropagatorDataGpu::numAtomsAll()
+{
+ return impl_->numAtomsAll();
+}
+
+} // namespace gmx
+
+#endif // GMX_GPU == GMX_GPU_NONE
}
}
-void nbnxn_atomdata_copy_x_to_gpu(const Nbnxm::GridSet &gridSet,
- const Nbnxm::AtomLocality locality,
- bool fillLocal,
- nbnxn_atomdata_t *nbat,
- gmx_nbnxn_gpu_t *gpu_nbv,
- const rvec *coordinatesHost)
-{
- int gridBegin = 0;
- int gridEnd = 0;
- getAtomRanges(gridSet, locality, &gridBegin, &gridEnd);
-
- if (fillLocal)
- {
- nbat->natoms_local = gridSet.grids()[0].atomIndexEnd();
- }
-
- for (int g = gridBegin; g < gridEnd; g++)
- {
- nbnxn_gpu_copy_x_to_gpu(gridSet.grids()[g],
- gpu_nbv,
- locality,
- coordinatesHost);
- }
-}
-
-DeviceBuffer<float> nbnxn_atomdata_get_x_gpu(gmx_nbnxn_gpu_t *gpu_nbv)
-{
- return Nbnxm::nbnxn_gpu_get_x_gpu(gpu_nbv);
-}
-
/* Copies (and reorders) the coordinates to nbnxn_atomdata_t on the GPU*/
void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
const Nbnxm::AtomLocality locality,
bool fillLocal,
gmx_nbnxn_gpu_t *gpu_nbv,
- DeviceBuffer<float> coordinatesDevice)
+ DeviceBuffer<float> d_x)
{
int gridBegin = 0;
nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g],
fillLocal && g == 0,
gpu_nbv,
- coordinatesDevice,
+ d_x,
locality,
g,
gridSet.numColumnsMax());
accumulateForce);
}
-DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv)
-{
- return Nbnxm::nbnxn_gpu_get_f_gpu(gpu_nbv);
-}
-
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t &nbat,
gmx::ArrayRef<gmx::RVec> fshift)
{
struct t_mdatoms;
struct tMPI_Atomic;
-enum class BufferOpsUseGpu;
-
class GpuEventSynchronizer;
namespace Nbnxm
const rvec *coordinates,
nbnxn_atomdata_t *nbat);
-/*! \brief Copies the coordinates to the GPU (in plain rvec format)
- *
- * This function copied data to the gpu so that the transformation to the NBNXM format can be done on the GPU.
- *
- * \param[in] gridSet The grids data.
- * \param[in] locality If local or non local coordinates should be copied.
- * \param[in] fillLocal If the local filler particle coordinates should be zeroed.
- * \param[in] nbat Data in NBNXM format, used to zero coordinates of filler particles.
- * \param[in] gpu_nbv The NBNXM GPU data structure.
- * \param[in] coordinatesHost Coordinates to be copied (in plain rvec format).
- */
-void nbnxn_atomdata_copy_x_to_gpu(const Nbnxm::GridSet &gridSet,
- Nbnxm::AtomLocality locality,
- bool fillLocal,
- nbnxn_atomdata_t *nbat,
- gmx_nbnxn_gpu_t *gpu_nbv,
- const rvec *coordinatesHost);
-
-/*!\brief Getter for the GPU coordinates buffer
- *
- * \param[in] gpu_nbv The NBNXM GPU data structure.
- */
-DeviceBuffer<float> nbnxn_atomdata_get_x_gpu(gmx_nbnxn_gpu_t *gpu_nbv);
-
/*! \brief Transform coordinates to xbat layout on GPU
*
* Creates a GPU copy of the coordinates buffer using short-range ordering.
* As input, uses coordinates in plain rvec format in GPU memory.
*
- * \param[in] gridSet The grids data.
- * \param[in] locality If the transformation should be applied to local or non local coordinates.
- * \param[in] fillLocal Tells if the local filler particle coordinates should be zeroed.
- * \param[in,out] gpu_nbv The NBNXM GPU data structure.
- * \param[in] coordinatesDevice Coordinates to be copied (in plain rvec format).
+ * \param[in] gridSet The grids data.
+ * \param[in] locality If the transformation should be applied to local or non local coordinates.
+ * \param[in] fillLocal Tells if the local filler particle coordinates should be zeroed.
+ * \param[in,out] gpu_nbv The NBNXM GPU data structure.
+ * \param[in] d_x Coordinates to be copied (in plain rvec format).
*/
void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
Nbnxm::AtomLocality locality,
bool fillLocal,
gmx_nbnxn_gpu_t *gpu_nbv,
- DeviceBuffer<float> coordinatesDevice);
+ DeviceBuffer<float> d_x);
/*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
*
bool useGpuFPmeReduction,
bool accumulateForce);
-/*!\brief Getter for the GPU forces buffer
- *
- * \todo Will be removed when the buffer management is lifted out of the NBNXM
- *
- * \param[in] gpu_nbv The NBNXM GPU data structure.
- *
- * \returns Device forces buffer
- */
-DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv);
-
/* Add the fshift force stored in nbat to fshift */
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t &nbat,
gmx::ArrayRef<gmx::RVec> fshift);
}
}
-/* X buffer operations on GPU: copies coordinates to the GPU in rvec format. */
-void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid &grid,
- gmx_nbnxn_gpu_t *nb,
- const Nbnxm::AtomLocality locality,
- const rvec *coordinatesHost)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- bool bDoTime = nb->bDoTime;
-
- Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality);
- int numCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
- int copyAtomStart = grid.srcAtomBegin();
-
- cudaStream_t stream = nb->stream[interactionLoc];
-
- // empty domain avoid launching zero-byte copy
- if (numCopyAtoms == 0)
- {
- return;
- }
- GMX_ASSERT(coordinatesHost, "Need a valid host pointer");
-
- if (bDoTime)
- {
- nb->timers->xf[locality].nb_h2d.openTimingRegion(stream);
- }
-
- rvec *devicePtrDest = reinterpret_cast<rvec *> (nb->xrvec[copyAtomStart]);
- const rvec *devicePtrSrc = reinterpret_cast<const rvec *> (coordinatesHost[copyAtomStart]);
- copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, numCopyAtoms,
- stream, GpuApiCallBehavior::Async, nullptr);
-
- if (interactionLoc == Nbnxm::InteractionLocality::Local)
- {
- nb->xAvailableOnDevice->markEvent(stream);
- }
-
- if (bDoTime)
- {
- nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
- }
-}
-
-DeviceBuffer<float> nbnxn_gpu_get_x_gpu(gmx_nbnxn_gpu_t *nb)
-{
- return reinterpret_cast< DeviceBuffer<float> >(nb->xrvec);
-}
-
/* X buffer operations on GPU: performs conversion from rvec to nb format. */
void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
bool setFillerCoords,
gmx_nbnxn_gpu_t *nb,
- DeviceBuffer<float> coordinatesDevice,
+ DeviceBuffer<float> d_x,
const Nbnxm::AtomLocality locality,
int gridId,
int numColumnsMax)
if (numAtoms != 0)
{
// TODO: This will only work with CUDA
- GMX_ASSERT(coordinatesDevice, "Need a valid device pointer");
+ GMX_ASSERT(d_x, "Need a valid device pointer");
KernelLaunchConfig config;
config.blockSize[0] = c_bufOpsThreadsPerBlock;
&numColumns,
&xqPtr,
&setFillerCoords,
- &coordinatesDevice,
+ &d_x,
&d_atomIndices,
&d_cxy_na,
&d_cxy_ind,
}
-DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t *nb)
-{
- return reinterpret_cast< DeviceBuffer<float> >(nb->frvec);
-}
-
-void nbnxn_launch_copy_f_to_gpu(const AtomLocality atomLocality,
- const Nbnxm::GridSet &gridSet,
- gmx_nbnxn_gpu_t *nb,
- rvec *f)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
- cudaStream_t stream = nb->stream[iLocality];
-
- bool bDoTime = nb->bDoTime;
- cu_timers_t *t = nb->timers;
-
- int atomStart = 0, numCopyAtoms = 0;
-
- nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &numCopyAtoms);
-
- // Avoiding launching copy with no work
- if (numCopyAtoms == 0)
- {
- return;
- }
- GMX_ASSERT(f, "Need a valid f pointer");
-
- if (bDoTime)
- {
- t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
- }
-
- rvec *ptrDest = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
- rvec *ptrSrc = reinterpret_cast<rvec *> (f[atomStart]);
- //copyToDeviceBuffer(&ptrDest, ptrSrc, 0, numCopyAtoms,
- // stream, GpuApiCallBehavior::Async, nullptr);
- //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
- cudaMemcpyAsync(ptrDest, ptrSrc, numCopyAtoms*sizeof(rvec), cudaMemcpyHostToDevice,
- stream);
-
- if (bDoTime)
- {
- t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
- }
-
- return;
-}
-
-void nbnxn_launch_copy_f_from_gpu(const AtomLocality atomLocality,
- const Nbnxm::GridSet &gridSet,
- gmx_nbnxn_gpu_t *nb,
- rvec *f)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
- cudaStream_t stream = nb->stream[iLocality];
-
- bool bDoTime = nb->bDoTime;
- cu_timers_t *t = nb->timers;
- int atomStart, numCopyAtoms;
-
- nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &numCopyAtoms);
-
- // Avoiding launching copy with no work
- if (numCopyAtoms == 0)
- {
- return;
- }
- GMX_ASSERT(f, "Need a valid f pointer");
-
- if (bDoTime)
- {
- t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
- }
-
- GMX_ASSERT(nb->frvec, "Need a valid nb->frvec pointer");
- rvec *ptrDest = reinterpret_cast<rvec *> (f[atomStart]);
- rvec *ptrSrc = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
- //copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, numCopyAtoms,
- // stream, GpuApiCallBehavior::Async, nullptr);
- //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
- cudaMemcpyAsync(ptrDest, ptrSrc, numCopyAtoms*sizeof(rvec), cudaMemcpyDeviceToHost,
- stream);
-
- if (bDoTime)
- {
- t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
- }
-
- return;
-}
-
-void nbnxn_launch_copy_x_from_gpu(const AtomLocality atomLocality,
- const Nbnxm::GridSet &gridSet,
- gmx_nbnxn_gpu_t *nb,
- rvec *x)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
- GMX_ASSERT(x, "Need a valid x pointer");
-
- const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
- cudaStream_t stream = nb->stream[iLocality];
-
- bool bDoTime = nb->bDoTime;
- cu_timers_t *t = nb->timers;
- int atomStart, nAtoms;
-
- nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
-
- if (bDoTime)
- {
- t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
- }
-
- GMX_ASSERT(nb->xrvec, "Need a valid nb->xrvec pointer");
- rvec *ptrDest = reinterpret_cast<rvec *> (x[atomStart]);
- rvec *ptrSrc = reinterpret_cast<rvec *> (nb->xrvec[atomStart]);
- copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
- stream, GpuApiCallBehavior::Async, stream);
-
- if (atomLocality == AtomLocality::NonLocal)
- {
- nb->xNonLocalCopyD2HDone->markEvent(stream);
- }
-
- if (bDoTime)
- {
- t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
- }
-
- return;
-}
-
void nbnxn_wait_for_gpu_force_reduction(const AtomLocality gmx_unused atomLocality,
gmx_nbnxn_gpu_t *nb)
{
}
-void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t *gpu_nbv)
-{
- return static_cast<void *> (gpu_nbv->xrvec);
-}
-
-void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t *gpu_nbv)
-{
- return static_cast<void *> (gpu_nbv->frvec);
-}
-
void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb)
{
return static_cast<void*> (nb->xAvailableOnDevice);
cuda_init_const(nb, ic, listParams, nbat->params());
- nb->natoms = 0;
- nb->natoms_alloc = 0;
nb->atomIndicesSize = 0;
nb->atomIndicesSize_alloc = 0;
nb->ncxy_na = 0;
nb->ncxy_na_alloc = 0;
nb->ncxy_ind = 0;
nb->ncxy_ind_alloc = 0;
- nb->nfrvec = 0;
- nb->nfrvec_alloc = 0;
nb->ncell = 0;
nb->ncell_alloc = 0;
const int atomIndicesSize = gridSet.atomIndices().size();
const int *cxy_na = grid.cxy_na().data();
const int *cxy_ind = grid.cxy_ind().data();
- // TODO Should be done once per gridset
- const int numRealAtomsTotal = gridSet.numRealAtomsTotal();
- reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
if (atomIndicesSize > 0)
cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
- reallocateDeviceBuffer(&gpu_nbv->frvec, natoms_total, &gpu_nbv->nfrvec, &gpu_nbv->nfrvec_alloc, nullptr);
-
if (natoms_total > 0)
{
reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
bool bUseTwoStreams;
//! atom data
cu_atomdata_t *atdat;
- //! coordinates in rvec format
- rvec *xrvec;
- //! number of atoms
- int natoms;
- //! number of atoms allocated in device buffer
- int natoms_alloc;
- //! force in rvec format
- rvec *frvec;
- //! number of atoms in force buffer
- int nfrvec;
- //! number of atoms allocated in force buffer
- int nfrvec_alloc;
//! f buf ops cell index mapping
int *cell;
//! number of indices in cell buffer
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
}
-
-void nonbonded_verlet_t::copyCoordinatesToGpu(const Nbnxm::AtomLocality locality,
- const bool fillLocal,
- gmx::ArrayRef<const gmx::RVec> coordinatesHost)
-{
- wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
-
- nbnxn_atomdata_copy_x_to_gpu(pairSearch_->gridSet(), locality, fillLocal,
- nbat.get(), gpu_nbv,
- as_rvec_array(coordinatesHost.data()));
-
- wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
- wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
-}
-
-DeviceBuffer<float> nonbonded_verlet_t::getDeviceCoordinates()
-{
- return nbnxn_atomdata_get_x_gpu(gpu_nbv);
-}
-
void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality locality,
const bool fillLocal,
- DeviceBuffer<float> coordinatesDevice)
+ DeviceBuffer<float> d_x)
{
wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal,
gpu_nbv,
- coordinatesDevice);
+ d_x);
wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
}
-DeviceBuffer<float> nonbonded_verlet_t::getDeviceForces()
-{
- return nbnxn_atomdata_get_f_gpu(gpu_nbv);
-}
-
real nonbonded_verlet_t::pairlistInnerRadius() const
{
return pairlistSets_->params().rlistInner;
Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
}
-void nonbonded_verlet_t::launch_copy_f_to_gpu(rvec *f, const Nbnxm::AtomLocality locality)
-{
- nbnxn_launch_copy_f_to_gpu(locality,
- pairSearch_->gridSet(),
- gpu_nbv,
- f);
-}
-
-void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocality locality)
-{
- nbnxn_launch_copy_f_from_gpu(locality,
- pairSearch_->gridSet(),
- gpu_nbv,
- f);
-}
-
-void nonbonded_verlet_t::launch_copy_x_from_gpu(rvec *x, const Nbnxm::AtomLocality locality)
-{
- nbnxn_launch_copy_x_from_gpu(locality,
- pairSearch_->gridSet(),
- gpu_nbv,
- x);
-}
-
void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality)
{
nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv);
}
-void* nonbonded_verlet_t::get_gpu_xrvec()
-{
- return Nbnxm::nbnxn_get_gpu_xrvec(gpu_nbv);
-}
-
void* nonbonded_verlet_t::get_x_on_device_event()
{
return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv);
Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
}
-void* nonbonded_verlet_t::get_gpu_frvec()
-{
- return Nbnxm::nbnxn_get_gpu_frvec(gpu_nbv);
-}
-
void nonbonded_verlet_t::stream_local_wait_for_nonlocal()
{
Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv);
bool fillLocal,
gmx::ArrayRef<const gmx::RVec> coordinates);
- /*!\brief Copy coordinates to the GPU memory.
- *
- * This function uses the internal NBNXM GPU pointer to copy coordinates in the plain rvec format
- * into the GPU memory.
- *
- * \todo This function will be removed in future patches as the management of the device buffers
- * is moved to a separate object.
- *
- * \param[in] locality Whether coordinates for local or non-local atoms should be transformed.
- * \param[in] fillLocal If the coordinates for filler particles should be zeroed.
- * \param[in] coordinatesHost Coordinates in plain rvec format to be transformed.
- */
- void copyCoordinatesToGpu(Nbnxm::AtomLocality locality,
- bool fillLocal,
- gmx::ArrayRef<const gmx::RVec> coordinatesHost);
-
- /*!\brief Getter for the GPU coordinates buffer.
- *
- * \todo This function will be removed in future patches as the management of the device buffers
- * is moved to a separate object.
- *
- * \returns The coordinates buffer in plain rvec format.
- */
- DeviceBuffer<float> getDeviceCoordinates();
-
/*!\brief Convert the coordinates to NBNXM format on the GPU for the given locality
*
* The API function for the transformation of the coordinates from one layout to another in the GPU memory.
*
- * \param[in] locality Whether coordinates for local or non-local atoms should be transformed.
- * \param[in] fillLocal If the coordinates for filler particles should be zeroed.
- * \param[in] coordinatesDevice GPU coordinates buffer in plain rvec format to be transformed.
+ * \param[in] locality Whether coordinates for local or non-local atoms should be transformed.
+ * \param[in] fillLocal If the coordinates for filler particles should be zeroed.
+ * \param[in] d_x GPU coordinates buffer in plain rvec format to be transformed.
*/
void convertCoordinatesGpu(Nbnxm::AtomLocality locality,
bool fillLocal,
- DeviceBuffer<float> coordinatesDevice);
+ DeviceBuffer<float> d_x);
//! Init for GPU version of setup coordinates in Nbnxm
void atomdata_init_copy_x_to_nbat_x_gpu();
bool useGpuFPmeReduction,
bool accumulateForce);
- /*!\brief Getter for the GPU force buffer.
- *
- * \todo This function will be removed in future patches as the management of the device buffers
- * is moved to a separate object.
- *
- * \returns The force buffer in plain rvec format.
- */
- DeviceBuffer<float> getDeviceForces();
-
/*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
void atomdata_init_add_nbat_f_to_f_gpu();
- /*! \brief H2D transfer of force buffer*/
- void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality);
-
- /*! \brief D2H transfer of force buffer*/
- void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
-
- /*! \brief D2H transfer of coordinate buffer*/
- void launch_copy_x_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
-
/*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete
*
* FIXME: need more details: when should be called / after which operation, etc.
*/
void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality);
- /*! \brief return GPU pointer to x in rvec format */
- void* get_gpu_xrvec();
-
/*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
void* get_x_on_device_event();
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM;
-/*! \brief Copy coordinates from host to device memory.
- *
- * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
+/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
*
- * \param[in] grid Grid to be copied.
+ * \param[in] grid Grid to be converted.
+ * \param[in] setFillerCoords If the filler coordinates are used.
* \param[in,out] gpu_nbv The nonbonded data GPU structure.
+ * \param[in] d_x Device-side coordinates in plain rvec format.
* \param[in] locality Copy coordinates for local or non-local atoms.
- * \param[in] coordinatesHost Host-side coordinates in plain rvec format.
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid gmx_unused &grid,
- gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
- Nbnxm::AtomLocality gmx_unused locality,
- const rvec gmx_unused *coordinatesHost) CUDA_FUNC_TERM;
-
-/*! \brief Getter for the device coordinates buffer.
- *
- * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
- *
- * \param[in] gpu_nbv The nonbonded data GPU structure.
- *
- * \returns Device coordinates buffer in plain rvec format.
- */
-CUDA_FUNC_QUALIFIER
-DeviceBuffer<float> nbnxn_gpu_get_x_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
-
-
-/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
- *
- * \param[in] grid Grid to be converted.
- * \param[in] setFillerCoords If the filler coordinates are used.
- * \param[in,out] gpu_nbv The nonbonded data GPU structure.
- * \param[in] coordinatesDevice Device-side coordinates in plain rvec format.
- * \param[in] locality Copy coordinates for local or non-local atoms.
- * \param[in] gridId Index of the grid being converted.
- * \param[in] numColumnsMax Maximum number of columns in the grid.
+ * \param[in] gridId Index of the grid being converted.
+ * \param[in] numColumnsMax Maximum number of columns in the grid.
*/
CUDA_FUNC_QUALIFIER
void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid,
bool gmx_unused setFillerCoords,
gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
- DeviceBuffer<float> gmx_unused coordinatesDevice,
+ DeviceBuffer<float> gmx_unused d_x,
Nbnxm::AtomLocality gmx_unused locality,
int gmx_unused gridId,
int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
bool gmx_unused useGpuFPmeReduction,
bool gmx_unused accumulateForce) CUDA_FUNC_TERM;
-/*! \brief Getter for the device coordinates buffer.
- *
- * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
- *
- * \param[in] gpu_nbv The nonbonded data GPU structure.
- *
- * \returns Device coordinates buffer in plain rvec format.
- */
-CUDA_FUNC_QUALIFIER
-DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
-
-/*! \brief Copy force buffer from CPU to GPU */
-CUDA_FUNC_QUALIFIER
-void nbnxn_launch_copy_f_to_gpu(AtomLocality gmx_unused atomLocality,
- const Nbnxm::GridSet gmx_unused &gridSet,
- gmx_nbnxn_gpu_t gmx_unused *nb,
- rvec gmx_unused *f) CUDA_FUNC_TERM;
-
-/*! \brief Copy force buffer from GPU to CPU */
-CUDA_FUNC_QUALIFIER
-void nbnxn_launch_copy_f_from_gpu(AtomLocality gmx_unused atomLocality,
- const Nbnxm::GridSet gmx_unused &gridSet,
- gmx_nbnxn_gpu_t gmx_unused *nb,
- rvec gmx_unused *f) CUDA_FUNC_TERM;
-
-/*! \brief Asynchronous launch of copying coordinate buffer from GPU to CPU
- * \param[in] atomLocality Locality for data trasnfer
- * \param[in] gridSet The Grid Set data object
- * \param[in] nb The nonbonded data GPU structure
- * \param[out] x Coordinate buffer on CPU
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_launch_copy_x_from_gpu(AtomLocality gmx_unused atomLocality,
- const Nbnxm::GridSet gmx_unused &gridSet,
- gmx_nbnxn_gpu_t gmx_unused *nb,
- rvec gmx_unused *x) CUDA_FUNC_TERM;
-
/*! \brief Wait for GPU stream to complete */
CUDA_FUNC_QUALIFIER
void nbnxn_wait_for_gpu_force_reduction(AtomLocality gmx_unused atomLocality,
CUDA_FUNC_QUALIFIER
void* nbnxn_get_x_on_device_event(const gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
-/*! \brief return GPU pointer to x in rvec format
- * \param[in] nb The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
-
/*! \brief Wait for non-local copy of coordinate buffer from device to host
* \param[in] nb The nonbonded data GPU structure
*/
CUDA_FUNC_QUALIFIER
void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;
-/*! \brief return GPU pointer to f in rvec format
- * \param[in] nb The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
-
/*! \brief Ensure local stream waits for non-local stream
* \param[in] nb The nonbonded data GPU structure
*/