/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
+ * Copyright (c) 2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 */
#include "gromacs/ewald/pme_spread.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/device_management.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
+#include "testutils/test_hardware_environment.h"
#include "testutils/testasserts.h"
-#include "testhardwarecontexts.h"
+class DeviceContext;
namespace gmx
{
bool pmeSupportsInputForMode(const gmx_hw_info_t& hwinfo, const t_inputrec* inputRec, CodePath mode)
{
- bool implemented;
- gmx_mtop_t mtop;
+ bool implemented;
switch (mode)
{
case CodePath::CPU: implemented = true; break;
case CodePath::GPU:
implemented = (pme_gpu_supports_build(nullptr) && pme_gpu_supports_hardware(hwinfo, nullptr)
- && pme_gpu_supports_input(*inputRec, mtop, nullptr));
+ && pme_gpu_supports_input(*inputRec, nullptr));
break;
default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
//! PME initialization
-PmeSafePointer pmeInitWrapper(const t_inputrec* inputRec,
- const CodePath mode,
- const DeviceInformation* deviceInfo,
- const PmeGpuProgram* pmeGpuProgram,
- const Matrix3x3& box,
- const real ewaldCoeff_q,
- const real ewaldCoeff_lj)
+PmeSafePointer pmeInitWrapper(const t_inputrec* inputRec,
+ const CodePath mode,
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream,
+ const PmeGpuProgram* pmeGpuProgram,
+ const Matrix3x3& box,
+ const real ewaldCoeff_q,
+ const real ewaldCoeff_lj)
{
const MDLogger dummyLogger;
const auto runMode = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::Mixed;
t_commrec dummyCommrec = { 0 };
NumPmeDomains numPmeDomains = { 1, 1 };
- gmx_pme_t* pmeDataRaw =
- gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, false, false, true, ewaldCoeff_q,
- ewaldCoeff_lj, 1, runMode, nullptr, deviceInfo, pmeGpuProgram, dummyLogger);
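+    // Initialize a bare PME structure; per-atom data is supplied later through gmx_pme_reinit_atoms().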
+ gmx_pme_t* pmeDataRaw = gmx_pme_init(&dummyCommrec,
+ numPmeDomains,
+ inputRec,
+ false,
+ false,
+ true,
+ ewaldCoeff_q,
+ ewaldCoeff_lj,
+ 1,
+ runMode,
+ nullptr,
+ deviceContext,
+ deviceStream,
+ pmeGpuProgram,
+ dummyLogger);
PmeSafePointer pme(pmeDataRaw); // taking ownership
// TODO get rid of this with proper matrix type
return pme;
}
-//! Simple PME initialization based on input, no atom data
-PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec,
- const CodePath mode,
- const DeviceInformation* deviceInfo,
- const PmeGpuProgram* pmeGpuProgram,
- const Matrix3x3& box,
- const real ewaldCoeff_q,
- const real ewaldCoeff_lj)
-{
- return pmeInitWrapper(inputRec, mode, deviceInfo, pmeGpuProgram, box, ewaldCoeff_q, ewaldCoeff_lj);
- // hiding the fact that PME actually needs to know the number of atoms in advance
-}
-
PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec)
{
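+    // Identity box and zero Ewald coefficients serve as placeholders for a CPU-only empty setup.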
const Matrix3x3 defaultBox = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } };
- return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
+ return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
}
//! Make a GPU state-propagator manager
std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t& pme,
- const DeviceContext& deviceContext)
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream)
{
// TODO: Pin the host buffer and use async memory copies
    // TODO: The special constructor for PME-only ranks / PME tests is used here. There should be
    // a mechanism to prevent the other constructor from being used here.
- return std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(&pme), deviceContext,
- GpuApiCallBehavior::Sync,
- pme_gpu_get_block_size(&pme), nullptr);
+ return std::make_unique<StatePropagatorDataGpu>(
+ deviceStream, *deviceContext, GpuApiCallBehavior::Sync, pme_gpu_get_block_size(&pme), nullptr);
}
//! PME initialization with atom data
atc = &(pme->atc[0]);
atc->x = coordinates;
atc->coefficient = charges;
- gmx_pme_reinit_atoms(pme, atomCount, charges.data());
+ gmx_pme_reinit_atoms(pme, atomCount, charges, {});
/* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
break;
atc = &(pme->atc[0]);
    // We need to set atc->n so that the atom count is available in the tests
atc->setNumAtoms(atomCount);
- gmx_pme_reinit_atoms(pme, atomCount, charges.data());
+ gmx_pme_reinit_atoms(pme, atomCount, charges, {});
stateGpu->reinit(atomCount, atomCount);
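+            // All atoms are local in these single-rank tests, so the Local locality covers the
+            // whole coordinate buffer.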
stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()),
- gmx::AtomLocality::All);
+ gmx::AtomLocality::Local);
pme_gpu_set_kernelparam_coordinates(pme->gpu, stateGpu->getCoordinates());
break;
switch (mode)
{
case CodePath::CPU:
- gmx_parallel_3dfft_real_limits(pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused,
- paddedGridSize);
+ gmx_parallel_3dfft_real_limits(
+ pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
break;
case CodePath::GPU:
{
const size_t gridIndex = 0;
IVec gridOffsetUnused, complexOrderUnused;
- gmx_parallel_3dfft_complex_limits(pme->pfft_setup[gridIndex], complexOrderUnused, gridSize,
- gridOffsetUnused, paddedGridSize); // TODO: what about YZX ordering?
+ gmx_parallel_3dfft_complex_limits(
+ pme->pfft_setup[gridIndex], complexOrderUnused, gridSize, gridOffsetUnused, paddedGridSize); // TODO: what about YZX ordering?
}
//! Getting the PME grid memory buffer and its sizes - template definition
IVec& /*unused*/) //NOLINT(google-runtime-references)
{
GMX_THROW(InternalError("Deleted function call"));
- // explicitly deleting general template does not compile in clang/icc, see https://llvm.org/bugs/show_bug.cgi?id=17537
+    // explicitly deleting the general template does not compile in clang, see https://llvm.org/bugs/show_bug.cgi?id=17537
}
//! Getting the PME real grid memory buffer and its sizes
PmeAtomComm* atc = &(pme->atc[0]);
const size_t gridIndex = 0;
const bool computeSplinesForZeroCharges = true;
- real* fftgrid = spreadCharges ? pme->fftgrid[gridIndex] : nullptr;
+ real** fftgrid = spreadCharges ? pme->fftgrid : nullptr;
real* pmegrid = pme->pmegrid[gridIndex].grid.grid;
switch (mode)
{
case CodePath::CPU:
- spread_on_grid(pme, atc, &pme->pmegrid[gridIndex], computeSplines, spreadCharges,
- fftgrid, computeSplinesForZeroCharges, gridIndex);
+ spread_on_grid(pme,
+ atc,
+ &pme->pmegrid[gridIndex],
+ computeSplines,
+ spreadCharges,
+ fftgrid != nullptr ? fftgrid[gridIndex] : nullptr,
+ computeSplinesForZeroCharges,
+ gridIndex);
if (spreadCharges && !pme->bUseThreads)
{
wrap_periodic_pmegrid(pme, pmegrid);
- copy_pmegrid_to_fftgrid(pme, pmegrid, fftgrid, gridIndex);
+ copy_pmegrid_to_fftgrid(
+ pme, pmegrid, fftgrid != nullptr ? fftgrid[gridIndex] : nullptr, gridIndex);
}
break;
+/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
+ * double precision. GPUs are not used with double precision anyhow. */
+#if !GMX_DOUBLE
case CodePath::GPU:
{
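+            // Free-energy perturbation is not exercised in these tests; a fixed
+            // charge-interpolation lambda is used.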
+ const real lambdaQ = 1.0;
// no synchronization needed as x is transferred in the PME stream
GpuEventSynchronizer* xReadyOnDevice = nullptr;
- pme_gpu_spread(pme->gpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
+
+ bool useGpuDirectComm = false;
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
+
+ pme_gpu_spread(pme->gpu,
+ xReadyOnDevice,
+ fftgrid,
+ computeSplines,
+ spreadCharges,
+ lambdaQ,
+ useGpuDirectComm,
+ pmeCoordinateReceiverGpu);
}
break;
+#endif
default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
t_complex* h_grid = pmeGetComplexGridInternal(pme);
const bool useLorentzBerthelot = false;
const size_t threadIndex = 0;
+ const size_t gridIndex = 0;
switch (mode)
{
case CodePath::CPU:
break;
case PmeSolveAlgorithm::LennardJones:
- solve_pme_lj_yzx(pme, &h_grid, useLorentzBerthelot, cellVolume,
- computeEnergyAndVirial, pme->nthread, threadIndex);
+ solve_pme_lj_yzx(pme,
+ &h_grid,
+ useLorentzBerthelot,
+ cellVolume,
+ computeEnergyAndVirial,
+ pme->nthread,
+ threadIndex);
break;
default: GMX_THROW(InternalError("Test not implemented for this mode"));
switch (method)
{
case PmeSolveAlgorithm::Coulomb:
- pme_gpu_solve(pme->gpu, h_grid, gridOrdering, computeEnergyAndVirial);
+ pme_gpu_solve(pme->gpu, gridIndex, h_grid, gridOrdering, computeEnergyAndVirial);
break;
default: GMX_THROW(InternalError("Test not implemented for this mode"));
const size_t threadIndex = 0;
const size_t gridIndex = 0;
real* pmegrid = pme->pmegrid[gridIndex].grid.grid;
- real* fftgrid = pme->fftgrid[gridIndex];
+ real** fftgrid = pme->fftgrid;
switch (mode)
{
// something which is normally done in serial spline computation (make_thread_local_ind())
atc->spline[threadIndex].n = atomCount;
}
- copy_fftgrid_to_pmegrid(pme, fftgrid, pmegrid, gridIndex, pme->nthread, threadIndex);
+ copy_fftgrid_to_pmegrid(pme, fftgrid[gridIndex], pmegrid, gridIndex, pme->nthread, threadIndex);
unwrap_periodic_pmegrid(pme, pmegrid);
gather_f_bsplines(pme, pmegrid, true, atc, &atc->spline[threadIndex], scale);
break;
+/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
+ * double precision. GPUs are not used with double precision anyhow. */
+#if !GMX_DOUBLE
case CodePath::GPU:
{
// Variable initialization needs a non-switch scope
const bool computeEnergyAndVirial = false;
- PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial);
+ const real lambdaQ = 1.0;
+ PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, lambdaQ);
GMX_ASSERT(forces.size() == output.forces_.size(),
"Size of force buffers did not match");
- pme_gpu_gather(pme->gpu, reinterpret_cast<float*>(fftgrid));
+ pme_gpu_gather(pme->gpu, fftgrid, lambdaQ);
std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces));
}
break;
+#endif
default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
/*! \brief Return the number of atoms per warp */
static int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
{
- if (pmeGpu->settings.useOrderThreadsPerAtom)
- {
- return pmeGpu->programHandle_->warpSize() / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
- }
- else
- {
- return pmeGpu->programHandle_->warpSize() / c_pmeSpreadGatherThreadsPerAtom;
- }
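+    // Example: with a warp size of 32 and PME order 4, ThreadsPerAtom::Order gives
+    // 32 / 4 = 8 atoms per warp, while ThreadsPerAtom::OrderSquared gives 32 / 16 = 2.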
+ const int order = pmeGpu->common->pme_order;
+ const int threadsPerAtom =
+ (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? order : order * order);
+ return pmeGpu->programHandle_->warpSize() / threadsPerAtom;
}
/*! \brief Rearranges the atom spline data between the GPU and host layouts.
switch (mode)
{
case CodePath::GPU:
- memcpy(pme_gpu_staging(pme->gpu).h_gridlineIndices, gridLineIndices.data(),
+ memcpy(pme_gpu_staging(pme->gpu).h_gridlineIndices,
+ gridLineIndices.data(),
atomCount * sizeof(gridLineIndices[0]));
break;
{
case CodePath::GPU: // intentional absence of break, the grid will be copied from the host buffer in testing mode
case CodePath::CPU:
- std::memset(grid, 0,
- paddedGridSize[XX] * paddedGridSize[YY] * paddedGridSize[ZZ] * sizeof(ValueType));
+ std::memset(grid, 0, paddedGridSize[XX] * paddedGridSize[YY] * paddedGridSize[ZZ] * sizeof(ValueType));
for (const auto& gridValue : gridValues)
{
for (int i = 0; i < DIM; i++)
//! Getting the reciprocal energy and virial
PmeOutput pmeGetReciprocalEnergyAndVirial(const gmx_pme_t* pme, CodePath mode, PmeSolveAlgorithm method)
{
- PmeOutput output;
+ PmeOutput output;
+ const real lambdaQ = 1.0;
switch (mode)
{
case CodePath::CPU:
case CodePath::GPU:
switch (method)
{
- case PmeSolveAlgorithm::Coulomb: pme_gpu_getEnergyAndVirial(*pme, &output); break;
+ case PmeSolveAlgorithm::Coulomb:
+ pme_gpu_getEnergyAndVirial(*pme, lambdaQ, &output);
+ break;
default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
return output;
}
+const char* codePathToString(CodePath codePath)
+{
+ switch (codePath)
+ {
+ case CodePath::CPU: return "CPU";
+ case CodePath::GPU: return "GPU";
+ default: GMX_THROW(NotImplementedError("This CodePath should support codePathToString"));
+ }
+}
+
+PmeTestHardwareContext::PmeTestHardwareContext() : codePath_(CodePath::CPU) {}
+
+PmeTestHardwareContext::PmeTestHardwareContext(TestDevice* testDevice) :
+ codePath_(CodePath::GPU), testDevice_(testDevice)
+{
+ setActiveDevice(testDevice_->deviceInfo());
+ pmeGpuProgram_ = buildPmeGpuProgram(testDevice_->deviceContext());
+}
+
+//! Returns a human-readable context description line
+std::string PmeTestHardwareContext::description() const
+{
+ switch (codePath_)
+ {
+ case CodePath::CPU: return "CPU";
+ case CodePath::GPU: return "GPU (" + testDevice_->description() + ")";
+ default: return "Unknown code path.";
+ }
+}
+
+void PmeTestHardwareContext::activate() const
+{
+ if (codePath_ == CodePath::GPU)
+ {
+ setActiveDevice(testDevice_->deviceInfo());
+ }
+}
+
+std::vector<std::unique_ptr<PmeTestHardwareContext>> createPmeTestHardwareContextList()
+{
+ std::vector<std::unique_ptr<PmeTestHardwareContext>> pmeTestHardwareContextList;
+ // Add CPU
+ pmeTestHardwareContextList.emplace_back(std::make_unique<PmeTestHardwareContext>());
+ // Add GPU devices
+ const auto& testDeviceList = getTestHardwareEnvironment()->getTestDeviceList();
+ for (const auto& testDevice : testDeviceList)
+ {
+ pmeTestHardwareContextList.emplace_back(std::make_unique<PmeTestHardwareContext>(testDevice.get()));
+ }
+ return pmeTestHardwareContextList;
+}
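+
+// Typical use: a test iterates this list, calls activate() on each context, and then runs
+// the PME stage under test on that context's code path.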
+
} // namespace test
} // namespace gmx