}
}
-void pme_gpu_gather(const PmeGpu *pmeGpu,
- float *h_forces,
+void pme_gpu_gather(PmeGpu *pmeGpu,
PmeForceOutputHandling forceTreatment,
const float *h_grid
)
/* Copying the input CPU forces for reduction */
if (forceTreatment != PmeForceOutputHandling::Set)
{
- pme_gpu_copy_input_forces(pmeGpu, h_forces);
+ pme_gpu_copy_input_forces(pmeGpu);
}
cudaStream_t stream = pmeGpu->archSpecific->pmeStream;
CU_LAUNCH_ERR("pme_gather_kernel");
pme_gpu_stop_timing(pmeGpu, gtPME_GATHER);
- pme_gpu_copy_output_forces(pmeGpu, h_forces);
+ pme_gpu_copy_output_forces(pmeGpu);
}
return kernelParamsPtr;
}
+gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGPU)
+{
+ return pmeGPU->staging.h_forces;
+}
+
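// A hypothetical caller-side sketch: the returned ArrayRef aliases staging.h_forces
// rather than copying it, so the view is only valid until the next
// pme_gpu_realloc_forces() call. The loop body is illustrative only:
//
//     gmx::ArrayRef<gmx::RVec> forces = pme_gpu_get_forces(pmeGPU);
//     for (gmx::RVec &f : forces)
//     {
//         f = gmx::RVec(0, 0, 0); // e.g. clear the staging buffer before an accumulating gather
//     }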
void pme_gpu_get_energy_virial(const PmeGpu *pmeGPU, real *energy, matrix virial)
{
for (int j = 0; j < c_virialAndEnergyCount; j++)
#include "gromacs/fft/fft.h" // for the gmx_fft_direction enum
#include "gromacs/gpu_utils/gpu_macros.h" // for the CUDA_FUNC_ macros
+#include "gromacs/utility/arrayref.h"
#include "pme-gpu-types.h" // for the inline functions accessing PmeGpu members
*
* \param[in] pmeGPU The PME GPU structure.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
/*! \libinternal \brief
 * Copies the input forces from the CPU buffer onto the GPU.
* To be called e.g. after the bonded calculations.
*
* \param[in] pmeGPU The PME GPU structure.
- * \param[in] h_forces The input forces rvec buffer.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU),
- const float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
/*! \libinternal \brief
* Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
*
* \param[in] pmeGPU The PME GPU structure.
- * \param[out] h_forces The output forces rvec buffer.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU),
- float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
/*! \libinternal \brief
 * A GPU force gathering function.
*
* \param[in] pmeGpu The PME GPU structure.
- * \param[in,out] h_forces The host buffer with input and output forces.
* \param[in] forceTreatment Tells how data in h_forces should be treated.
* TODO: determine efficiency/balance of host/device-side reductions.
* \param[in] h_grid The host-side grid buffer (used only in testing mode)
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_gather(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- float *CUDA_FUNC_ARGUMENT(h_forces),
+CUDA_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
PmeForceOutputHandling CUDA_FUNC_ARGUMENT(forceTreatment),
const float *CUDA_FUNC_ARGUMENT(h_grid)
) CUDA_FUNC_TERM
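// For orientation, the two forceTreatment modes in a hedged sketch
// (PmeForceOutputHandling::ReduceWithInput is assumed to be the enum's other value,
// matching the reduction branch in pme_gpu_gather()):
//
//     pme_gpu_gather(pmeGpu, PmeForceOutputHandling::Set, h_grid);
//     // gathered forces simply overwrite staging.h_forces
//
//     pme_gpu_gather(pmeGpu, PmeForceOutputHandling::ReduceWithInput, h_grid);
//     // staging.h_forces is uploaded first (pme_gpu_copy_input_forces) and the
//     // kernel accumulates onto it; the reduction is non-atomic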
/* A block of C++ functions that live in pme-gpu-internal.cpp */
+/*! \libinternal \brief
+ * Returns the host-side staging buffer for the forces gathered on the GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \returns The input/output forces.
+ */
+gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGPU);
+
/*! \libinternal \brief
* Returns the output virial and energy of the PME solving.
* Should be called after pme_gpu_finish_computation.
#include "gromacs/ewald/pme.h"
#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/hostallocator.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/utility/basedefinitions.h"
*/
struct PmeGpuStaging
{
+    // TODO: pin this buffer with whatever pinning method we settle on
+ //! Host-side force buffer
+ std::vector < gmx::RVec, gmx::HostAllocator < gmx::RVec>> h_forces;
+
/*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT. */
float *h_virialAndEnergy;
/*! \brief B-spline values intermediate host-side buffer. */
void pme_gpu_launch_gather(const gmx_pme_t *pme,
gmx_wallcycle_t gmx_unused wcycle,
- rvec *forces,
PmeForceOutputHandling forceTreatment)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
const unsigned int gridIndex = 0;
real *fftgrid = pme->fftgrid[gridIndex];
- pme_gpu_gather(pme->gpu, reinterpret_cast<float *>(forces), forceTreatment, reinterpret_cast<float *>(fftgrid));
+ pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid));
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
-void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
- gmx_wallcycle_t wcycle,
- matrix vir_q,
- real *energy_q)
+void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
+ gmx_wallcycle_t wcycle,
+ gmx::ArrayRef<const gmx::RVec> *forces,
+ matrix virial,
+ real *energy)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
pme_gpu_finish_computation(pme->gpu);
wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+ *forces = pme_gpu_get_forces(pme->gpu);
+
if (haveComputedEnergyAndVirial)
{
- if (pme->doCoulomb)
+ if (pme_gpu_performs_solve(pme->gpu))
{
- if (pme_gpu_performs_solve(pme->gpu))
- {
- pme_gpu_get_energy_virial(pme->gpu, energy_q, vir_q);
- }
- else
- {
- get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy_q, vir_q);
- }
+ pme_gpu_get_energy_virial(pme->gpu, energy, virial);
}
else
{
- *energy_q = 0;
+ get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy, virial);
}
}
- /* No additional haveComputedForces code since forces are copied to the output host buffer with no transformation. */
}
/*! \brief Send the PME mesh force, virial and energy to the PP-only ranks. */
static void gmx_pme_send_force_vir_ener(gmx_pme_pp *pme_pp,
+ const rvec *f,
matrix vir_q, real energy_q,
matrix vir_lj, real energy_lj,
real dvdlambda_q, real dvdlambda_lj,
{
ind_start = ind_end;
ind_end = ind_start + receiver.numAtoms;
- if (MPI_Isend(pme_pp->f[ind_start], (ind_end-ind_start)*sizeof(rvec), MPI_BYTE,
+ if (MPI_Isend(const_cast<void *>(static_cast<const void *>(f[ind_start])),
+ (ind_end-ind_start)*sizeof(rvec), MPI_BYTE,
receiver.rankId, 0,
pme_pp->mpi_comm_mysim, &pme_pp->req[messages++]) != 0)
{
#else
gmx_call("MPI not enabled");
GMX_UNUSED_VALUE(pme_pp);
+ GMX_UNUSED_VALUE(f);
GMX_UNUSED_VALUE(vir_q);
GMX_UNUSED_VALUE(energy_q);
GMX_UNUSED_VALUE(vir_lj);
// from mdatoms for the other call to gmx_pme_do), so we have
// fewer lines of code and less parameter passing.
const int pmeFlags = GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0);
+ gmx::ArrayRef<const gmx::RVec> forces;
if (runMode != PmeRunMode::CPU)
{
const bool boxChanged = true;
pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags);
pme_gpu_launch_spread(pme, as_rvec_array(pme_pp->x.data()), wcycle);
pme_gpu_launch_complex_transforms(pme, wcycle);
- pme_gpu_launch_gather(pme, wcycle, as_rvec_array(pme_pp->f.data()), PmeForceOutputHandling::Set);
- pme_gpu_wait_for_gpu(pme, wcycle, vir_q, &energy_q);
+ pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
+ pme_gpu_wait_for_gpu(pme, wcycle, &forces, vir_q, &energy_q);
}
else
{
vir_q, vir_lj,
&energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
pmeFlags);
+ forces = pme_pp->f;
}
cycles = wallcycle_stop(wcycle, ewcPMEMESH);
- gmx_pme_send_force_vir_ener(pme_pp.get(),
+ gmx_pme_send_force_vir_ener(pme_pp.get(), as_rvec_array(forces.data()),
vir_q, energy_q, vir_lj, energy_lj,
dvdlambda_q, dvdlambda_lj, cycles);
&pmeGPU->archSpecific->splineValuesSizeAlloc);
}
-void pme_gpu_realloc_forces(const PmeGpu *pmeGPU)
+void pme_gpu_realloc_forces(PmeGpu *pmeGPU)
{
const size_t newForcesSize = pmeGPU->nAtomsAlloc * DIM;
GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_forces, nullptr, sizeof(float),
&pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc, newForcesSize, pmeGPU->archSpecific->pmeStream, true);
+ pmeGPU->staging.h_forces.reserve(pmeGPU->nAtomsAlloc);
+ pmeGPU->staging.h_forces.resize(pmeGPU->kernelParams->atoms.nAtoms);
}
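// The reserve/resize pair above keeps the capacity at the padded allocation size
// while exposing only the valid atom range; a minimal sketch of the pattern with
// hypothetical sizes (paddedAtomCount/atomCount stand in for nAtomsAlloc/nAtoms):
//
//     std::vector<gmx::RVec> buffer;
//     buffer.reserve(paddedAtomCount); // capacity tracks the padded allocation; reallocates rarely
//     buffer.resize(atomCount);        // size tracks the valid atoms; cheap while within capacity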
void pme_gpu_free_forces(const PmeGpu *pmeGPU)
cu_free_buffered(pmeGPU->kernelParams->atoms.d_forces, &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc);
}
-void pme_gpu_copy_input_forces(const PmeGpu *pmeGPU, const float *h_forces)
+void pme_gpu_copy_input_forces(PmeGpu *pmeGPU)
{
- GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
- cu_copy_H2D(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
+ cu_copy_H2D(pmeGPU->kernelParams->atoms.d_forces, pmeGPU->staging.h_forces.data(), forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
}
-void pme_gpu_copy_output_forces(const PmeGpu *pmeGPU, float *h_forces)
+void pme_gpu_copy_output_forces(PmeGpu *pmeGPU)
{
- GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
- cu_copy_D2H(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
+ cu_copy_D2H(pmeGPU->staging.h_forces.data(), pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
}
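// Both copies above are enqueued asynchronously on pmeStream; a hedged sketch of
// the implied synchronization contract, assuming pme_gpu_finish_computation()
// blocks on that stream as the pme_gpu_wait_for_gpu() path in this change suggests:
//
//     pme_gpu_copy_output_forces(pmeGPU);       // async D2H into staging.h_forces
//     pme_gpu_finish_computation(pmeGPU);       // synchronize the PME stream
//     auto forces = pme_gpu_get_forces(pmeGPU); // now safe to read on the host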
void pme_gpu_realloc_coordinates(const PmeGpu *pmeGPU)
#include "gromacs/math/vectypes.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/real.h"
*
* \param[in] pme The PME data structure.
* \param[in] wcycle The wallclock counter.
- * \param[in,out] forces The array of local atoms' resulting forces.
- * \param[in] forceTreatment Tells how data in h_forces should be treated. The gathering kernel either stores
+ * \param[in] forceTreatment Tells how data should be treated. The gathering kernel either stores
 * the output reciprocal forces into the host staging buffer, or copies its contents to the GPU first
* and accumulates. The reduction is non-atomic.
*/
void pme_gpu_launch_gather(const gmx_pme_t *pme,
gmx_wallcycle_t wcycle,
- rvec *forces,
PmeForceOutputHandling forceTreatment);
/*! \brief
* (if they were to be computed).
*
* \param[in] pme The PME data structure.
- * \param[in] wcycle The wallclock counter.
- * \param[out] vir_q The output virial matrix.
- * \param[out] energy_q The output energy.
+ * \param[in]  wcycle         The wallclock counter.
+ * \param[out] forces The output forces.
+ * \param[out] virial The output virial matrix.
+ * \param[out] energy The output energy.
*/
-void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
- gmx_wallcycle_t wcycle,
- matrix vir_q,
- real *energy_q);
+void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
+ gmx_wallcycle_t wcycle,
+ gmx::ArrayRef<const gmx::RVec> *forces,
+ matrix virial,
+ real *energy);
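// A minimal caller sketch of the launch/wait pairing, mirroring the PME-only rank
// path earlier in this change (vir_q and energy_q are assumed to be declared by
// the caller):
//
//     gmx::ArrayRef<const gmx::RVec> forces;
//     pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
//     pme_gpu_wait_for_gpu(pme, wcycle, &forces, vir_q, &energy_q);
//     // forces now views the PME GPU staging buffer; reduce or copy it out
//     // before the next step reuses the staging memory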
#endif
break;
case CodePath::CUDA:
- pme_gpu_gather(pme->gpu, reinterpret_cast<float *>(forces.begin()), inputTreatment, reinterpret_cast<float *>(fftgrid));
- break;
+ {
+ // Variable initialization needs a non-switch scope
+ auto stagingForces = pme_gpu_get_forces(pme->gpu);
+ GMX_ASSERT(forces.size() == stagingForces.size(), "Size of force buffers did not match");
+ if (forceReductionWithInput)
+ {
+ for (size_t i = 0; i != forces.size(); ++i)
+ {
+ stagingForces[i] = forces[i];
+ }
+ }
+ pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid));
+ for (size_t i = 0; i != forces.size(); ++i)
+ {
+ forces[i] = stagingForces[i];
+ }
+ }
+ break;
default:
GMX_THROW(InternalError("Test not implemented for this mode"));
{
fr->forceBufferForDirectVirialContributions->resize(natoms_f_novirsum);
}
-
- if (fr->ic->cutoff_scheme == ecutsVERLET)
- {
- fr->forceBufferIntermediate->resize(ncg_home);
- }
}
static real cutoff_inf(real cutoff)
fr->forceBufferForDirectVirialContributions = new std::vector<gmx::RVec>;
}
- fr->forceBufferIntermediate = new std::vector<gmx::RVec>; //TODO add proper conditionals
-
if (fr->cutoff_scheme == ecutsGROUP &&
ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
{
* This function only implements setting the output forces (no accumulation).
*
* \param[in] pmedata The PME structure
- * \param[out] pmeGpuForces The array of where the output forces are copied
* \param[in] wcycle The wallcycle structure
*/
static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
- ArrayRef<RVec> pmeGpuForces,
gmx_wallcycle_t wcycle)
{
pme_gpu_launch_complex_transforms(pmedata, wcycle);
- pme_gpu_launch_gather(pmedata, wcycle, as_rvec_array(pmeGpuForces.data()), PmeForceOutputHandling::Set);
+ pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
}
static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
// TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Hybrid));
- // a comment for uncrustify
- const ArrayRef<RVec> pmeGpuForces = *fr->forceBufferIntermediate;
/* At a search step we need to start the first balancing region
* somewhere early inside the step after communication during domain
// X copy/transform to allow overlap.
// Note that this is advantageous for the case where NB and PME
// tasks run on the same device, but may not be ideal otherwise.
- launchPmeGpuFftAndGather(fr->pmedata, pmeGpuForces, wcycle);
+ launchPmeGpuFftAndGather(fr->pmedata, wcycle);
}
if (bUseGPU)
// PME GPU - intermediate CPU work in mixed mode
// TODO - move this below till after do_force_lowlevel() / special forces?
// (to allow overlap of spread/grid D2H with some CPU work)
- launchPmeGpuFftAndGather(fr->pmedata, pmeGpuForces, wcycle);
+ launchPmeGpuFftAndGather(fr->pmedata, wcycle);
}
/* Communicate coordinates and sum dipole if necessary +
if (useGpuPme)
{
+ gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
matrix vir_Q;
real Vlr_q;
- pme_gpu_wait_for_gpu(fr->pmedata, wcycle, vir_Q, &Vlr_q);
-
- pme_gpu_reduce_outputs(wcycle, &forceWithVirial,
- pmeGpuForces,
- enerd, vir_Q, Vlr_q);
+ pme_gpu_wait_for_gpu(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
+ pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
}
if (bUseOrEmulGPU)
gmx_bool haveDirectVirialContributions;
#ifdef __cplusplus
/* TODO: Replace the pointer by an object once we get rid of C */
- std::vector<gmx::RVec> *forceBufferForDirectVirialContributions;
- /* This buffer is currently only used for storing the PME GPU output until reduction.
- * TODO: Pagelock/pin it
- * TODO: Replace the pointer by an object once we got rid of C */
- std::vector<gmx::RVec> *forceBufferIntermediate;
+ std::vector<gmx::RVec> *forceBufferForDirectVirialContributions;
#else
void *forceBufferForDirectVirialContributions_dummy;
- void *forceBufferIntermediate_dummy;
#endif
/* Data for PPPM/PME/Ewald */