#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2014,2015,2016,2017, by the GROMACS development team, led by
+# Copyright (c) 2014,2015,2016,2017,2018, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
# the research papers on the package. Check out http://www.gromacs.org.
file(GLOB EWALD_SOURCES *.cpp)
-set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${EWALD_SOURCES} PARENT_SCOPE)
+file(GLOB EWALD_GPU_SHARED_SOURCES pme-gpu*.cpp)
+file(GLOB EWALD_CUDA_SOURCES *.cu)
+
if (GMX_USE_CUDA)
- file(GLOB EWALD_CUDA_SOURCES *.cu)
gmx_add_libgromacs_sources(${EWALD_CUDA_SOURCES})
+else ()
+ # Removing the GPU PME plain C++ files from the build target
+ # TODO: do not remove them for GMX_USE_OPENCL.
+ foreach (GPU_ONLY_SOURCE ${EWALD_GPU_SHARED_SOURCES})
+ list(REMOVE_ITEM EWALD_SOURCES ${GPU_ONLY_SOURCE})
+ endforeach()
endif()
+set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${EWALD_SOURCES} PARENT_SCOPE)
+
if (BUILD_TESTING)
add_subdirectory(tests)
endif()
#include "pme-gpu-internal.h"
-#include "config.h"
-
#include <list>
#include <string>
pmeGpu->common->boxScaler = pme->boxScaler;
}
-/*! \brief \libinternal
- * Finds out if PME with given inputs is possible to run on GPU.
- *
- * \param[in] pme The PME structure.
- * \param[out] error The error message if the input is not supported on GPU.
- * \returns True if this PME input is possible to run on GPU, false otherwise.
- */
-static bool pme_gpu_check_restrictions(const gmx_pme_t *pme, std::string *error)
-{
- std::list<std::string> errorReasons;
- if (pme->nnodes != 1)
- {
- errorReasons.push_back("PME decomposition");
- }
- if (pme->pme_order != 4)
- {
- errorReasons.push_back("interpolation orders other than 4");
- }
- if (pme->bFEP)
- {
- errorReasons.push_back("free energy calculations (multiple grids)");
- }
- if (pme->doLJ)
- {
- errorReasons.push_back("Lennard-Jones PME");
- }
-#if GMX_DOUBLE
- {
- errorReasons.push_back("double precision");
- }
-#endif
-#if GMX_GPU != GMX_GPU_CUDA
- {
- errorReasons.push_back("non-CUDA build of GROMACS");
- }
-#endif
-
- bool inputSupported = errorReasons.empty();
- if (!inputSupported && error)
- {
- std::string regressionTestMarker = "PME GPU does not support";
- // this prefix is tested for in the regression tests script gmxtest.pl
- *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
- }
- return inputSupported;
-}
-
/*! \libinternal \brief
* Initializes the PME GPU data at the beginning of the run.
*
*/
static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
{
- std::string errorString;
- bool canRunOnGpu = pme_gpu_check_restrictions(pme, &errorString);
- if (!canRunOnGpu)
- {
- GMX_THROW(gmx::NotImplementedError(errorString));
- }
-
pme->gpu = new PmeGpu();
PmeGpu *pmeGpu = pme->gpu;
changePinningPolicy(&pmeGpu->staging.h_forces, gmx::PinningPolicy::CanBePinned);
*/
inline void pme_gpu_set_testing(PmeGpu *pmeGpu, bool testing)
{
- pmeGpu->settings.copyAllOutputs = testing;
- pmeGpu->settings.transferKind = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
+ if (pmeGpu)
+ {
+ pmeGpu->settings.copyAllOutputs = testing;
+ pmeGpu->settings.transferKind = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
+ }
}
/*! \libinternal \brief
* \param[in] pmeGpu The PME GPU structure.
* \returns The input/output forces.
*/
-gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGpu);
+CUDA_FUNC_QUALIFIER gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM_WITH_RETURN(gmx::EmptyArrayRef())
/*! \libinternal \brief
* Returns the output virial and energy of the PME solving.
* \param[out] energy The output energy.
* \param[out] virial The output virial matrix.
*/
-void pme_gpu_get_energy_virial(const PmeGpu *pmeGpu, real *energy, matrix virial);
+CUDA_FUNC_QUALIFIER void pme_gpu_get_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
+ real *CUDA_FUNC_ARGUMENT(energy),
+ matrix CUDA_FUNC_ARGUMENT(virial)) CUDA_FUNC_TERM
/*! \libinternal \brief
* Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
* \param[in] pmeGpu The PME GPU structure.
* \param[in] box The unit cell box.
*/
-void pme_gpu_update_input_box(PmeGpu *pmeGpu, const matrix box);
+CUDA_FUNC_QUALIFIER void pme_gpu_update_input_box(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
+ const matrix CUDA_FUNC_ARGUMENT(box)) CUDA_FUNC_TERM
/*! \libinternal \brief
* Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
* \param[in] dimIndex Dimension index.
* \param[in] transform Layout transform type
*/
-void pme_gpu_transform_spline_atom_data(const PmeGpu *pmeGpu, const pme_atomcomm_t *atc,
- PmeSplineDataType type, int dimIndex, PmeLayoutTransform transform);
+CUDA_FUNC_QUALIFIER void pme_gpu_transform_spline_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
+ const pme_atomcomm_t *CUDA_FUNC_ARGUMENT(atc),
+ PmeSplineDataType CUDA_FUNC_ARGUMENT(type),
+ int CUDA_FUNC_ARGUMENT(dimIndex),
+ PmeLayoutTransform CUDA_FUNC_ARGUMENT(transform)) CUDA_FUNC_TERM
/*! \libinternal \brief
* Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
* \param[out] gridSize Pointer to the grid dimensions to fill in.
* \param[out] paddedGridSize Pointer to the padded grid dimensions to fill in.
*/
-void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGpu, gmx::IVec *gridSize, gmx::IVec *paddedGridSize);
+CUDA_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
+ gmx::IVec *CUDA_FUNC_ARGUMENT(gridSize),
+ gmx::IVec *CUDA_FUNC_ARGUMENT(paddedGridSize)) CUDA_FUNC_TERM
/*! \libinternal \brief
* (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
* \param[in,out] gpuInfo The GPU information structure.
* \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
*/
-void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo);
+CUDA_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
+ gmx_device_info_t *CUDA_FUNC_ARGUMENT(gpuInfo)) CUDA_FUNC_TERM
/*! \libinternal \brief
* Destroys the PME GPU data at the end of the run.
*
* \param[in] pmeGpu The PME GPU structure.
*/
-void pme_gpu_destroy(PmeGpu *pmeGpu);
+CUDA_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
/*! \libinternal \brief
* Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
* This is a function that should only be called in the beginning of the run and on domain decomposition.
* Should be called before the pme_gpu_set_io_ranges.
*/
-void pme_gpu_reinit_atoms(PmeGpu *pmeGpu,
- const int nAtoms,
- const real *charges);
+CUDA_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
+ const int CUDA_FUNC_ARGUMENT(nAtoms),
+ const real *CUDA_FUNC_ARGUMENT(charges)) CUDA_FUNC_TERM
/*! \brief \libinternal
* The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
#include "gmxpre.h"
-#include "config.h"
-
#include <list>
#include "gromacs/ewald/ewald-utils.h"
#include "pme-internal.h"
#include "pme-solve.h"
-PmeRunMode pme_run_mode(const gmx_pme_t *pme)
-{
- GMX_ASSERT(pme != nullptr, "Expecting valid PME data pointer");
- return pme->runMode;
-}
-
-bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error)
-{
- std::list<std::string> errorReasons;
- if (!EEL_PME(ir->coulombtype))
- {
- errorReasons.push_back("systems that do not use PME for electrostatics");
- }
- if (ir->pme_order != 4)
- {
- errorReasons.push_back("interpolation orders other than 4");
- }
- if (ir->efep != efepNO)
- {
- errorReasons.push_back("free energy calculations (multiple grids)");
- }
- if (EVDW_PME(ir->vdwtype))
- {
- errorReasons.push_back("Lennard-Jones PME");
- }
-#if GMX_DOUBLE
- {
- errorReasons.push_back("double precision");
- }
-#endif
-#if GMX_GPU != GMX_GPU_CUDA
- {
- errorReasons.push_back("non-CUDA build of GROMACS");
- }
-#endif
- if (ir->cutoff_scheme == ecutsGROUP)
- {
- errorReasons.push_back("group cutoff scheme");
- }
- if (EI_TPI(ir->eI))
- {
- errorReasons.push_back("test particle insertion");
- }
-
- bool inputSupported = errorReasons.empty();
- if (!inputSupported && error)
- {
- std::string regressionTestMarker = "PME GPU does not support";
- // this prefix is tested for in the regression tests script gmxtest.pl
- *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
- }
- return inputSupported;
-}
-
void pme_gpu_reset_timings(const gmx_pme_t *pme)
{
if (pme_gpu_active(pme))
#include <cmath>
#include <algorithm>
+#include <list>
#include "gromacs/ewald/ewald-utils.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "pme-spline-work.h"
#include "pme-spread.h"
+bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error)
+{
+ std::list<std::string> errorReasons;
+ if (!EEL_PME(ir->coulombtype))
+ {
+ errorReasons.push_back("systems that do not use PME for electrostatics");
+ }
+ if (ir->pme_order != 4)
+ {
+ errorReasons.push_back("interpolation orders other than 4");
+ }
+ if (ir->efep != efepNO)
+ {
+ errorReasons.push_back("free energy calculations (multiple grids)");
+ }
+ if (EVDW_PME(ir->vdwtype))
+ {
+ errorReasons.push_back("Lennard-Jones PME");
+ }
+#if GMX_DOUBLE
+ {
+ errorReasons.push_back("double precision");
+ }
+#endif
+#if GMX_GPU != GMX_GPU_CUDA
+ {
+ errorReasons.push_back("non-CUDA build of GROMACS");
+ }
+#endif
+ if (ir->cutoff_scheme == ecutsGROUP)
+ {
+ errorReasons.push_back("group cutoff scheme");
+ }
+ if (EI_TPI(ir->eI))
+ {
+ errorReasons.push_back("test particle insertion");
+ }
+
+ bool inputSupported = errorReasons.empty();
+ if (!inputSupported && error)
+ {
+ std::string regressionTestMarker = "PME GPU does not support";
+ // this prefix is tested for in the regression tests script gmxtest.pl
+ *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
+ }
+ return inputSupported;
+}
+
+/*! \brief \libinternal
+ * Finds out if PME with given inputs is possible to run on GPU.
+ * This function is an internal final check, validating the whole PME structure on creation.
+ * As a safeguard, it deliberately repeats several of the preliminary checks (interpolation order, FEP, LJ-PME, precision, CUDA build) from the externally exposed pme_gpu_supports_input() above.
+ *
+ * \param[in] pme The PME structure.
+ * \param[out] error The error message if the input is not supported on GPU.
+ * \returns True if this PME input is possible to run on GPU, false otherwise.
+ */
+static bool pme_gpu_check_restrictions(const gmx_pme_t *pme, std::string *error)
+{
+ std::list<std::string> errorReasons;
+ if (pme->nnodes != 1)
+ {
+ errorReasons.push_back("PME decomposition");
+ }
+ if (pme->pme_order != 4)
+ {
+ errorReasons.push_back("interpolation orders other than 4");
+ }
+ if (pme->bFEP)
+ {
+ errorReasons.push_back("free energy calculations (multiple grids)");
+ }
+ if (pme->doLJ)
+ {
+ errorReasons.push_back("Lennard-Jones PME");
+ }
+#if GMX_DOUBLE
+ {
+ errorReasons.push_back("double precision");
+ }
+#endif
+#if GMX_GPU != GMX_GPU_CUDA
+ {
+ errorReasons.push_back("non-CUDA build of GROMACS");
+ }
+#endif
+
+ bool inputSupported = errorReasons.empty();
+ if (!inputSupported && error)
+ {
+ std::string regressionTestMarker = "PME GPU does not support";
+ // this prefix is tested for in the regression tests script gmxtest.pl
+ *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
+ }
+ return inputSupported;
+}
+
+PmeRunMode pme_run_mode(const gmx_pme_t *pme)
+{
+ GMX_ASSERT(pme != nullptr, "Expecting valid PME data pointer");
+ return pme->runMode;
+}
+
/*! \brief Number of bytes in a cache line.
*
* Must also be a multiple of the SIMD and SIMD4 register size, to
pme->lb_buf2 = nullptr;
pme->lb_buf_nalloc = 0;
- pme_gpu_reinit(pme.get(), gpuInfo);
+ if (pme_gpu_active(pme.get()))
+ {
+ if (!pme->gpu)
+ {
+ // Initial check of validity of the data
+ std::string errorString;
+ bool canRunOnGpu = pme_gpu_check_restrictions(pme.get(), &errorString);
+ if (!canRunOnGpu)
+ {
+ GMX_THROW(gmx::NotImplementedError(errorString));
+ }
+ }
+
+ pme_gpu_reinit(pme.get(), gpuInfo);
+ }
pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
#include <string>
+#include "gromacs/gpu_utils/gpu_macros.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/arrayref.h"
return (pme != nullptr) && (pme_run_mode(pme) != PmeRunMode::CPU);
}
+// The following functions are all the PME GPU entry points,
+// currently inlining to nothing on non-CUDA builds.
+
/*! \brief
* Resets the PME GPU timings. To be called at the reset step.
*
* \param[in] pme The PME structure.
*/
-void pme_gpu_reset_timings(const gmx_pme_t *pme);
+CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme)) CUDA_FUNC_TERM
/*! \brief
* Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end.
* \param[in] pme The PME structure.
* \param[in] timings The gmx_wallclock_gpu_pme_t structure.
*/
-void pme_gpu_get_timings(const gmx_pme_t *pme,
- gmx_wallclock_gpu_pme_t *timings);
+CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
+ gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
/* The main PME GPU functions */
* \param[in] flags The combination of flags to affect this PME computation.
* The flags are the GMX_PME_ flags from pme.h.
*/
-void pme_gpu_prepare_computation(gmx_pme_t *pme,
- bool needToUpdateBox,
- const matrix box,
- gmx_wallcycle *wcycle,
- int flags);
+CUDA_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
+ bool CUDA_FUNC_ARGUMENT(needToUpdateBox),
+ const matrix CUDA_FUNC_ARGUMENT(box),
+ gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
+ int CUDA_FUNC_ARGUMENT(flags)) CUDA_FUNC_TERM
/*! \brief
* Launches first stage of PME on GPU - H2D input transfers, spreading kernel, and D2H grid transfer if needed.
* \param[in] x The array of local atoms' coordinates.
* \param[in] wcycle The wallclock counter.
*/
-void pme_gpu_launch_spread(gmx_pme_t *pme,
- const rvec *x,
- gmx_wallcycle *wcycle);
+CUDA_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
+ const rvec *CUDA_FUNC_ARGUMENT(x),
+ gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle)) CUDA_FUNC_TERM
/*! \brief
* Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
* \param[in] pme The PME data structure.
* \param[in] wcycle The wallclock counter.
*/
-void pme_gpu_launch_complex_transforms(gmx_pme_t *pme,
- gmx_wallcycle *wcycle);
+CUDA_FUNC_QUALIFIER void pme_gpu_launch_complex_transforms(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle)) CUDA_FUNC_TERM
/*! \brief
* Launches last stage of PME on GPU - force gathering and D2H force transfer.
* the output reciprocal forces into the host array, or copies its contents to the GPU first
* and accumulates. The reduction is non-atomic.
*/
-void pme_gpu_launch_gather(const gmx_pme_t *pme,
- gmx_wallcycle *wcycle,
- PmeForceOutputHandling forceTreatment);
+CUDA_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
+ PmeForceOutputHandling CUDA_FUNC_ARGUMENT(forceTreatment)) CUDA_FUNC_TERM
/*! \brief
* Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
* \param[out] virial The output virial matrix.
* \param[out] energy The output energy.
*/
-void pme_gpu_wait_finish_task(const gmx_pme_t *pme,
- gmx_wallcycle *wcycle,
- gmx::ArrayRef<const gmx::RVec> *forces,
- matrix virial,
- real *energy);
+CUDA_FUNC_QUALIFIER void pme_gpu_wait_finish_task(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
+ gmx::ArrayRef<const gmx::RVec> *CUDA_FUNC_ARGUMENT(forces),
+ matrix CUDA_FUNC_ARGUMENT(virial),
+ real *CUDA_FUNC_ARGUMENT(energy)) CUDA_FUNC_TERM
/*! \brief
* Attempts to complete PME GPU tasks.
*
* \param[in] completionKind Indicates whether PME task completion should only be checked rather than waited for
* \returns True if the PME GPU tasks have completed
*/
-bool pme_gpu_try_finish_task(const gmx_pme_t *pme,
- gmx_wallcycle *wcycle,
- gmx::ArrayRef<const gmx::RVec> *forces,
- matrix virial,
- real *energy,
- GpuTaskCompletion completionKind);
+CUDA_FUNC_QUALIFIER bool pme_gpu_try_finish_task(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
+ gmx::ArrayRef<const gmx::RVec> *CUDA_FUNC_ARGUMENT(forces),
+ matrix CUDA_FUNC_ARGUMENT(virial),
+ real *CUDA_FUNC_ARGUMENT(energy),
+ GpuTaskCompletion CUDA_FUNC_ARGUMENT(completionKind)) CUDA_FUNC_TERM_WITH_RETURN(false)
#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
const size_t atomCount = atc->n;
GMX_RELEASE_ASSERT(atomCount == gridLineIndices.size(), "Mismatch in gridline indices size");
- IVec paddedGridSizeUnused, gridSize;
+ IVec paddedGridSizeUnused, gridSize(0, 0, 0);
pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSizeUnused);
for (const auto &index : gridLineIndices)
GridOrdering gridOrdering,
const SparseGridValuesInput<ValueType> &gridValues)
{
- IVec gridSize, paddedGridSize;
+ IVec gridSize(0, 0, 0), paddedGridSize(0, 0, 0);
ValueType *grid;
pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
template<typename ValueType>
static SparseGridValuesOutput<ValueType> pmeGetGridInternal(const gmx_pme_t *pme, CodePath mode, GridOrdering gridOrdering)
{
- IVec gridSize, paddedGridSize;
+ IVec gridSize(0, 0, 0), paddedGridSize(0, 0, 0);
ValueType *grid;
pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
SparseGridValuesOutput<ValueType> gridValues;
{
real energy = 0.0f;
Matrix3x3 virial;
- matrix virialTemp; //TODO get rid of
+ matrix virialTemp = {{0}}; //TODO get rid of
switch (mode)
{
case CodePath::CPU:
{
gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
matrix vir_Q;
- real Vlr_q;
+ real Vlr_q = 0.0;
pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
}