pme-solve.cpp
pme-spline-work.cpp
pme-spread.cpp
+ # Files that implement stubs
+ pme-gpu-program.cpp
)
if (GMX_USE_CUDA)
gmx_add_libgromacs_sources(
pme-solve.cu
pme-spread.cu
pme-timings.cu
+ pme-gpu-program-impl.cu
# GPU-specific sources
pme-gpu.cpp
pme-gpu-internal.cpp
)
+else()
+ gmx_add_libgromacs_sources(
+ # Files that implement stubs
+ pme-gpu-program-impl.cpp
+ )
endif()
if (BUILD_TESTING)
#include "gromacs/utility/gmxassert.h"
#include "pme.cuh"
+#include "pme-gpu-program-impl.h"
#include "pme-timings.cuh"
//! Gathering max block width in warps - picked empirically among 2, 4, 8, 16 for max. occupancy and min. runtime
}
}
+//! Kernel instantiations
+template __global__ void pme_gather_kernel<4, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true>(const PmeGpuCudaKernelParams);
+
void pme_gpu_gather(PmeGpu *pmeGpu,
PmeForceOutputHandling forceTreatment,
const float *h_grid
GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
}
- constexpr bool wrapX = true;
- constexpr bool wrapY = true;
- GMX_UNUSED_VALUE(wrapX);
- GMX_UNUSED_VALUE(wrapY);
-
// TODO test different cache configs
int timingId = gtPME_GATHER;
- void (*kernelPtr)(const PmeGpuCudaKernelParams) = (forceTreatment == PmeForceOutputHandling::Set) ?
- pme_gather_kernel<4, true, wrapX, wrapY> :
- pme_gather_kernel<4, false, wrapX, wrapY>;
+ // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
+ PmeGpuProgramImpl::PmeKernelHandle kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+ pmeGpu->programHandle_->impl_->gatherKernel :
+ pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
pme_gpu_start_timing(pmeGpu, timingId);
auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
/*! \libinternal \brief
* Initializes the PME GPU data at the beginning of the run.
+ * TODO: this should become PmeGpu::PmeGpu()
*
- * \param[in,out] pme The PME structure.
- * \param[in,out] gpuInfo The GPU information structure.
+ * \param[in,out] pme The PME structure.
+ * \param[in,out] gpuInfo The GPU information structure.
+ * \param[in] pmeGpuProgram The handle to the program/kernel data created outside (e.g. in unit tests/runner)
*/
-static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
+static void pme_gpu_init(gmx_pme_t *pme,
+ gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram)
{
pme->gpu = new PmeGpu();
PmeGpu *pmeGpu = pme->gpu;
pme_gpu_set_testing(pmeGpu, false);
pmeGpu->deviceInfo = gpuInfo;
+ GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
+ pmeGpu->programHandle_ = pmeGpuProgram;
pme_gpu_init_internal(pmeGpu);
pme_gpu_init_sync_events(pmeGpu);
}
}
-void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
+void pme_gpu_reinit(gmx_pme_t *pme,
+ gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram)
{
if (!pme_gpu_active(pme))
{
if (!pme->gpu)
{
/* First-time initialization */
- pme_gpu_init(pme, gpuInfo);
+ pme_gpu_init(pme, gpuInfo, pmeGpuProgram);
}
else
{
/*! \libinternal \brief
* (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
*
- * \param[in,out] pme The PME structure.
- * \param[in,out] gpuInfo The GPU information structure.
+ * \param[in,out] pme The PME structure.
+ * \param[in,out] gpuInfo The GPU information structure.
+ * \param[in] pmeGpuProgram The PME GPU program data
* \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
*/
CUDA_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_device_info_t *CUDA_FUNC_ARGUMENT(gpuInfo)) CUDA_FUNC_TERM
+ gmx_device_info_t *CUDA_FUNC_ARGUMENT(gpuInfo),
+ PmeGpuProgramHandle CUDA_FUNC_ARGUMENT(pmeGpuProgram)) CUDA_FUNC_TERM
/*! \libinternal \brief
* Destroys the PME GPU data at the end of the run.
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgramImpl for non-GPU builds.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "pme-gpu-program-impl.h"
+
+PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *) {};
+
+PmeGpuProgramImpl::~PmeGpuProgramImpl() = default;
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgramImpl, which stores permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+#include "gmxpre.h"
+
+#include "pme-gpu-program-impl.h"
+
+#include "pme-gpu-internal.h" // for GridOrdering enum
+#include "pme-gpu-types-host.h"
+
+//! PME CUDA kernels forward declarations. Kernels are documented in their respective files.
+template <
+ const int order,
+ const bool computeSplines,
+ const bool spreadCharges,
+ const bool wrapX,
+ const bool wrapY
+ >
+void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
+
+template<
+ GridOrdering gridOrdering,
+ bool computeEnergyAndVirial
+ >
+void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams);
+
+template <
+ const int order,
+ const bool overwriteForces,
+ const bool wrapX,
+ const bool wrapY
+ >
+void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
+
+
+PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
+{
+ // PME interpolation order
+ constexpr int pmeOrder = 4;
+ GMX_UNUSED_VALUE(pmeOrder);
+ // These hardcoded spread/gather parameters refer to not-implemented PME GPU 2D decomposition in X/Y
+ constexpr bool wrapX = true;
+ constexpr bool wrapY = true;
+ GMX_UNUSED_VALUE(wrapX);
+ GMX_UNUSED_VALUE(wrapY);
+ splineAndSpreadKernel = pme_spline_and_spread_kernel<pmeOrder, true, true, wrapX, wrapY>;
+ splineKernel = pme_spline_and_spread_kernel<pmeOrder, true, false, wrapX, wrapY>;
+ spreadKernel = pme_spline_and_spread_kernel<pmeOrder, false, true, wrapX, wrapY>;
+ gatherKernel = pme_gather_kernel<pmeOrder, true, wrapX, wrapY>;
+ gatherReduceWithInputKernel = pme_gather_kernel<pmeOrder, false, wrapX, wrapY>;
+ solveXYZKernel = pme_solve_kernel<GridOrdering::XYZ, false>;
+ solveXYZEnergyKernel = pme_solve_kernel<GridOrdering::XYZ, true>;
+ solveYZXKernel = pme_solve_kernel<GridOrdering::YZX, false>;
+ solveYZXEnergyKernel = pme_solve_kernel<GridOrdering::YZX, true>;
+}
+
+PmeGpuProgramImpl::~PmeGpuProgramImpl()
+{
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Declares PmeGpuProgramImpl, which stores PME GPU (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+#ifndef GMX_EWALD_PME_PME_GPU_PROGRAM_IMPL_H
+#define GMX_EWALD_PME_PME_GPU_PROGRAM_IMPL_H
+
+#include "config.h"
+
+#include "gromacs/utility/classhelpers.h"
+
+#if GMX_GPU == GMX_GPU_CUDA
+// TODO uncomment when we learn to compile .cpp with CUDA compiler
+//! include "gromacs/gpu_utils/gputraits.cuh"
+using Context = void *;
+#elif GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#elif GMX_GPU == GMX_GPU_NONE
+// TODO place in gputraits_stub.h
+using Context = void *;
+#endif
+
+struct gmx_device_info_t;
+
+/*! \internal
+ * \brief
+ * PME GPU persistent host program/kernel data, which should be initialized once for the whole execution.
+ *
+ * Primary purpose of this is to not recompile GPU kernels for each OpenCL unit test,
+ * while the relevant GPU context (e.g. cl_context) instance persists.
+ * In CUDA, this just assigns the kernel function pointers.
+ * This also implicitly relies on the fact that reasonable share of the kernels are always used.
+ * If there were more template parameters, even smaller share of all possible kernels would be used.
+ *
+ * \todo In future if we would need to react to either user input or
+ * auto-tuning to compile different kernels, then we might wish to
+ * revisit the number of kernels we pre-compile, and/or the management
+ * of their lifetime.
+ *
+ * This also doesn't manage cuFFT/clFFT kernels, which depend on the PME grid dimensions.
+ *
+ * TODO: pass cl_context to the constructor and not create it inside.
+ * See also Redmine #2522.
+ */
+struct PmeGpuProgramImpl
+{
+ /*! \brief
+ * This is a handle to the GPU context, which is just a dummy in CUDA,
+ * but is created/destroyed by this class in OpenCL.
+ * TODO: Later we want to be able to own the context at a higher level and not here,
+ * but this class would still need the non-owning context handle to build the kernels.
+ */
+ Context context;
+
+ //! Conveniently all the PME kernels use the same single argument type
+#if GMX_GPU == GMX_GPU_CUDA
+ using PmeKernelHandle = void(*)(const struct PmeGpuCudaKernelParams);
+#elif GMX_GPU == GMX_GPU_OPENCL
+ using PmeKernelHandle = cl_kernel;
+#else
+ using PmeKernelHandle = void *;
+#endif
+
+ //@{
+ /**
+ * Spread/spline kernels are compiled only for order of 4.
+ * Spreading kernels also have hardcoded X/Y indices wrapping parameters,
+ * as a placeholder for implementing 1/2D decomposition.
+ */
+ PmeKernelHandle splineKernel;
+ PmeKernelHandle spreadKernel;
+ PmeKernelHandle splineAndSpreadKernel;
+ //@}
+
+ //@{
+ /** Same for gather: hardcoded X/Y unwrap parameters, order of 4, plus
+ * it can either reduce with previous forces in the host buffer, or ignore them.
+ */
+ PmeKernelHandle gatherReduceWithInputKernel;
+ PmeKernelHandle gatherKernel;
+ //@}
+
+ //@{
+ /** Solve kernel doesn't care about the interpolation order, but can optionally
+ * compute energy and virial, and supports XYZ and YZX grid orderings.
+ */
+ PmeKernelHandle solveYZXKernel;
+ PmeKernelHandle solveXYZKernel;
+ PmeKernelHandle solveYZXEnergyKernel;
+ PmeKernelHandle solveXYZEnergyKernel;
+ //@}
+
+ PmeGpuProgramImpl() = delete;
+ //! Constructor for the given device
+ explicit PmeGpuProgramImpl(const gmx_device_info_t *deviceInfo);
+ ~PmeGpuProgramImpl();
+ GMX_DISALLOW_COPY_AND_ASSIGN(PmeGpuProgramImpl);
+};
+
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgram, which wrap arounds PmeGpuProgramImpl
+ * to store permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "pme-gpu-program.h"
+
+#include "gromacs/compat/make_unique.h"
+
+#include "pme-gpu-program-impl.h"
+
+PmeGpuProgram::PmeGpuProgram(const gmx_device_info_t *deviceInfo) :
+ impl_(gmx::compat::make_unique<PmeGpuProgramImpl>(deviceInfo))
+{
+}
+
+PmeGpuProgram::~PmeGpuProgram() = default;
+
+PmeGpuProgramStorage buildPmeGpuProgram(const gmx_device_info_t *deviceInfo)
+{
+ return gmx::compat::make_unique<PmeGpuProgram>(deviceInfo);
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief
+ * Declares PmeGpuProgram, which wrap arounds PmeGpuProgramImpl
+ * to store permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ * \inlibraryapi
+ */
+
+#ifndef GMX_EWALD_PME_PME_GPU_PROGRAM_H
+#define GMX_EWALD_PME_PME_GPU_PROGRAM_H
+
+#include <memory>
+
+struct PmeGpuProgramImpl;
+struct gmx_device_info_t;
+
+class PmeGpuProgram
+{
+ public:
+ explicit PmeGpuProgram(const gmx_device_info_t *deviceInfo);
+ ~PmeGpuProgram();
+
+ // TODO: design getters for information inside, if needed for PME, and make this private?
+ std::unique_ptr<PmeGpuProgramImpl> impl_;
+};
+
+/*! \brief This is an owning handle for the compiled PME GPU kernels.
+ */
+using PmeGpuProgramStorage = std::unique_ptr<PmeGpuProgram>;
+
+/*! \brief This is a handle for passing references to PME GPU program data.
+ * TODO: it should be a const reference, but for that the PmeGpu types need to be C++
+ */
+using PmeGpuProgramHandle = const PmeGpuProgram *;
+
+/*! \brief
+ * Factory function used to build persistent PME GPU program for the device at once.
+ * \todo This should shortly become GPU_FUNC to support OpenCL.
+ */
+PmeGpuProgramStorage buildPmeGpuProgram(const gmx_device_info_t *);
+
+#endif
#include <vector>
#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-gpu-program.h"
#include "gromacs/gpu_utils/gpu_utils.h" // for GpuApiCallBehavior
#include "gromacs/gpu_utils/hostallocator.h"
#include "gromacs/math/vectypes.h"
/*! \brief The information copied once per reinit from the CPU structure. */
std::shared_ptr<PmeShared> common; // TODO: make the CPU structure use the same type
+ //! A handle to the program created by buildPmeGpuProgram()
+ PmeGpuProgramHandle programHandle_;
+
/*! \brief The settings. */
PmeGpuSettings settings;
#include "gromacs/utility/gmxassert.h"
#include "pme.cuh"
+#include "pme-gpu-program-impl.h"
#include "pme-timings.cuh"
//! Solving kernel max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime
}
}
+//! Kernel instantiations
+template __global__ void pme_solve_kernel<GridOrdering::YZX, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::YZX, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::XYZ, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::XYZ, false>(const PmeGpuCudaKernelParams);
+
void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,
GridOrdering gridOrdering, bool computeEnergyAndVirial)
{
config.stream = pmeGpu->archSpecific->pmeStream;
int timingId = gtPME_SOLVE;
- void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr;
+ PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
if (gridOrdering == GridOrdering::YZX)
{
kernelPtr = computeEnergyAndVirial ?
- pme_solve_kernel<GridOrdering::YZX, true> :
- pme_solve_kernel<GridOrdering::YZX, false>;
+ pmeGpu->programHandle_->impl_->solveYZXEnergyKernel :
+ pmeGpu->programHandle_->impl_->solveYZXKernel;
}
else if (gridOrdering == GridOrdering::XYZ)
{
kernelPtr = computeEnergyAndVirial ?
- pme_solve_kernel<GridOrdering::XYZ, true> :
- pme_solve_kernel<GridOrdering::XYZ, false>;
+ pmeGpu->programHandle_->impl_->solveXYZEnergyKernel :
+ pmeGpu->programHandle_->impl_->solveXYZKernel;
}
pme_gpu_start_timing(pmeGpu, timingId);
#include "gromacs/utility/gmxassert.h"
#include "pme.cuh"
+#include "pme-gpu-program-impl.h"
#include "pme-grid.h"
#include "pme-timings.cuh"
}
}
+//! Kernel instantiations
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true>(const PmeGpuCudaKernelParams);
+
void pme_gpu_spread(const PmeGpu *pmeGpu,
int gmx_unused gridIndex,
real *h_grid,
GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
}
- // These should later check for PME decomposition
- constexpr bool wrapX = true;
- constexpr bool wrapY = true;
- GMX_UNUSED_VALUE(wrapX);
- GMX_UNUSED_VALUE(wrapY);
-
int timingId;
- void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr;
+ PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
if (computeSplines)
{
if (spreadCharges)
{
timingId = gtPME_SPLINEANDSPREAD;
- kernelPtr = pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY>;
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
}
else
{
timingId = gtPME_SPLINE;
- kernelPtr = pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY>;
+ kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
}
}
else
{
timingId = gtPME_SPREAD;
- kernelPtr = pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY>;
+ kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
}
pme_gpu_start_timing(pmeGpu, timingId);
PmeRunMode runMode,
PmeGpu *pmeGpu,
gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram,
const gmx::MDLogger & /*mdlog*/)
{
int use_threads, sum_use_threads, i;
}
}
- pme_gpu_reinit(pme.get(), gpuInfo);
+ pme_gpu_reinit(pme.get(), gpuInfo, pmeGpuProgram);
}
pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
NumPmeDomains numPmeDomains = { pme_src->nnodes_major, pme_src->nnodes_minor };
*pmedata = gmx_pme_init(cr, numPmeDomains,
&irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj,
- pme_src->nthread, pme_src->runMode, pme_src->gpu, nullptr, dummyLogger);
+ pme_src->nthread, pme_src->runMode, pme_src->gpu, nullptr, nullptr, dummyLogger);
//TODO this is mostly passing around current values
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
struct NumPmeDomains;
enum class GpuTaskCompletion;
+class PmeGpuProgram;
+//! Convenience name.
+using PmeGpuProgramHandle = const PmeGpuProgram *;
namespace gmx
{
*
* \throws gmx::InconsistentInputError if input grid sizes/PME order are inconsistent.
* \returns Pointer to newly allocated and initialized PME data.
+ *
+ * \todo We should evolve something like a \c GpuManager that holds \c
+ * gmx_device_info_t * and \c PmeGpuProgramHandle and perhaps other
+ * related things whose lifetime can/should exceed that of a task (or
+ * perhaps task manager). See Redmine #2522.
*/
gmx_pme_t *gmx_pme_init(const t_commrec *cr,
const NumPmeDomains &numPmeDomains,
PmeRunMode runMode,
PmeGpu *pmeGpu,
gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram,
const gmx::MDLogger &mdlog);
/*! \brief Destroys the PME data structure.*/
TestReferenceData refData;
for (const auto &context : getPmeTestEnv()->getHardwareContexts())
{
- CodePath codePath = context.getCodePath();
+ CodePath codePath = context->getCodePath();
const bool supportedInput = pmeSupportsInputForMode(&inputRec, codePath);
if (!supportedInput)
{
/* Testing the failure for the unsupported input */
- EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, inputAtomData.coordinates, inputAtomData.charges, box), NotImplementedError);
+ EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, inputAtomData.coordinates, inputAtomData.charges, box), NotImplementedError);
continue;
}
/* Describing the test uniquely */
SCOPED_TRACE(formatString("Testing force gathering with %s %sfor PME grid size %d %d %d"
", order %d, %zu atoms, %s",
- codePathToString(codePath), context.getDescription().c_str(),
+ codePathToString(codePath), context->getDescription().c_str(),
gridSize[XX], gridSize[YY], gridSize[ZZ],
pmeOrder,
atomCount,
(inputForceTreatment == PmeForceOutputHandling::ReduceWithInput) ? "with reduction" : "without reduction"
));
- PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context.getDeviceInfo(), inputAtomData.coordinates, inputAtomData.charges, box);
+ PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
+ context->getPmeGpuProgram(), inputAtomData.coordinates, inputAtomData.charges, box);
/* Setting some more inputs */
pmeSetRealGrid(pmeSafe.get(), codePath, nonZeroGridValues);
TestReferenceData refData;
for (const auto &context : getPmeTestEnv()->getHardwareContexts())
{
- CodePath codePath = context.getCodePath();
+ CodePath codePath = context->getCodePath();
const bool supportedInput = pmeSupportsInputForMode(&inputRec, codePath);
if (!supportedInput)
{
/* Testing the failure for the unsupported input */
- EXPECT_THROW(pmeInitEmpty(&inputRec, codePath, nullptr, box, ewaldCoeff_q, ewaldCoeff_lj), NotImplementedError);
+ EXPECT_THROW(pmeInitEmpty(&inputRec, codePath, nullptr, nullptr, box, ewaldCoeff_q, ewaldCoeff_lj), NotImplementedError);
continue;
}
gridOrdering.second.c_str(),
computeEnergyAndVirial ? "with" : "without",
codePathToString(codePath),
- context.getDescription().c_str(),
+ context->getDescription().c_str(),
gridSize[XX], gridSize[YY], gridSize[ZZ],
ewaldCoeff_q, ewaldCoeff_lj
));
/* Running the test */
- PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, codePath, context.getDeviceInfo(), box, ewaldCoeff_q, ewaldCoeff_lj);
+ PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, codePath, context->getDeviceInfo(),
+ context->getPmeGpuProgram(), box, ewaldCoeff_q, ewaldCoeff_lj);
pmeSetComplexGrid(pmeSafe.get(), codePath, gridOrdering.first, nonZeroGridValues);
const real cellVolume = box[0] * box[4] * box[8];
//FIXME - this is box[XX][XX] * box[YY][YY] * box[ZZ][ZZ], should be stored in the PME structure
for (const auto &context : getPmeTestEnv()->getHardwareContexts())
{
- CodePath codePath = context.getCodePath();
+ CodePath codePath = context->getCodePath();
const bool supportedInput = pmeSupportsInputForMode(&inputRec, codePath);
if (!supportedInput)
{
/* Testing the failure for the unsupported input */
- EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, coordinates, charges, box), NotImplementedError);
+ EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, coordinates, charges, box), NotImplementedError);
continue;
}
SCOPED_TRACE(formatString("Testing %s with %s %sfor PME grid size %d %d %d"
", order %d, %zu atoms",
option.second.c_str(), codePathToString(codePath),
- context.getDescription().c_str(),
+ context->getDescription().c_str(),
gridSize[XX], gridSize[YY], gridSize[ZZ],
pmeOrder,
atomCount));
/* Running the test */
- PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context.getDeviceInfo(), coordinates, charges, box);
+ PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
+ context->getPmeGpuProgram(), coordinates, charges, box);
const bool computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
const bool spreadCharges = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
static PmeSafePointer pmeInitInternal(const t_inputrec *inputRec,
CodePath mode,
gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram,
size_t atomCount,
const Matrix3x3 &box,
real ewaldCoeff_q = 1.0f,
t_commrec dummyCommrec = {0};
NumPmeDomains numPmeDomains = { 1, 1 };
gmx_pme_t *pmeDataRaw = gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, atomCount, false, false, true,
- ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, gpuInfo, dummyLogger);
+ ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, gpuInfo, pmeGpuProgram, dummyLogger);
PmeSafePointer pme(pmeDataRaw); // taking ownership
// TODO get rid of this with proper matrix type
PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
CodePath mode,
gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram,
const Matrix3x3 &box,
real ewaldCoeff_q,
real ewaldCoeff_lj
)
{
- return pmeInitInternal(inputRec, mode, gpuInfo, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
+ return pmeInitInternal(inputRec, mode, gpuInfo, pmeGpuProgram, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
// hiding the fact that PME actually needs to know the number of atoms in advance
}
PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec,
CodePath mode,
gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram,
const CoordinatesVector &coordinates,
const ChargesVector &charges,
const Matrix3x3 &box
{
const size_t atomCount = coordinates.size();
GMX_RELEASE_ASSERT(atomCount == charges.size(), "Mismatch in atom data");
- PmeSafePointer pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, atomCount, box);
+ PmeSafePointer pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, pmeGpuProgram, atomCount, box);
pme_atomcomm_t *atc = nullptr;
switch (mode)
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
// PME stages
-// TODO: currently PME initializations do not store CodePath. They probably should (unless we would need mixed CPU-GPU execution?).
//! Simple PME initialization (no atom data)
PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
CodePath mode = CodePath::CPU,
gmx_device_info_t *gpuInfo = nullptr,
+ PmeGpuProgramHandle pmeGpuProgram = nullptr,
const Matrix3x3 &box = {{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}},
real ewaldCoeff_q = 0.0f, real ewaldCoeff_lj = 0.0f);
//! PME initialization with atom data and system box
PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec,
CodePath mode,
gmx_device_info_t *gpuInfo,
+ PmeGpuProgramHandle pmeGpuProgram,
const CoordinatesVector &coordinates,
const ChargesVector &charges,
const Matrix3x3 &box
#include "testhardwarecontexts.h"
+#include "gromacs/compat/make_unique.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/hw_info.h"
namespace test
{
+TestHardwareContext::~TestHardwareContext() = default;
+
const char *codePathToString(CodePath codePath)
{
switch (codePath)
void PmeTestEnvironment::SetUp()
{
- hardwareContexts_.emplace_back(TestHardwareContext(CodePath::CPU, "", nullptr));
+ hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
hardwareInfo_ = hardwareInit();
if (!pme_gpu_supports_build(nullptr))
get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
std::string description = "(GPU " + std::string(stmp) + ") ";
// TODO should this be CodePath::GPU?
- hardwareContexts_.emplace_back(TestHardwareContext(CodePath::CUDA, description.c_str(), deviceInfo));
+ hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>
+ (CodePath::CUDA, description.c_str(),
+ deviceInfo));
}
}
#include <gtest/gtest.h>
+#include "gromacs/ewald/pme-gpu-program.h"
#include "gromacs/hardware/detecthardware.h"
#include "gromacs/hardware/gpu_hw_info.h"
const char *codePathToString(CodePath codePath);
/*! \internal \brief
- * A structure to describe a hardware context - an abstraction over
- * gmx_device_info_t with a human-readable string.
+ * A structure to describe a hardware context that persists over the lifetime
+ * of the test binary - an abstraction over PmeGpuProgram with a human-readable string.
*/
struct TestHardwareContext
{
//! Hardware path for the code being tested.
- CodePath codePath_;
+ CodePath codePath_;
//! Readable description
- std::string description_;
+ std::string description_;
//! Device information pointer
- gmx_device_info_t *deviceInfo_;
+ gmx_device_info_t *deviceInfo_;
+ //! Persistent compiled GPU kernels for PME.
+ PmeGpuProgramStorage program_;
public:
//! Retuns the code path for this context.
CodePath getCodePath() const { return codePath_; }
//! Returns a human-readable context description line
- std::string getDescription() const{return description_; }
+ std::string getDescription() const{return description_; }
//! Returns the device info pointer
- gmx_device_info_t *getDeviceInfo() const{return deviceInfo_; }
+ gmx_device_info_t *getDeviceInfo() const{return deviceInfo_; }
+ //! Returns the persistent PME GPU kernels
+ PmeGpuProgramHandle getPmeGpuProgram() const{return program_.get(); }
//! Constructs the context
TestHardwareContext(CodePath codePath, const char *description, gmx_device_info_t *deviceInfo) :
- codePath_(codePath), description_(description), deviceInfo_(deviceInfo){}
+ codePath_(codePath), description_(description), deviceInfo_(deviceInfo),
+ program_(buildPmeGpuProgram(deviceInfo_)) {}
+ ~TestHardwareContext();
};
-//! A container of hardware contexts
-typedef std::vector<TestHardwareContext> TestHardwareContexts;
+//! A container of handles to hardware contexts
+typedef std::vector < std::unique_ptr < TestHardwareContext>> TestHardwareContexts;
/*! \internal \brief
* This class performs one-time test initialization (enumerating the hardware)
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "config.h"
+#include "gromacs/utility/basedefinitions.h" // for gmx_unused
+
/* These macros that let us define inlineable null implementations so
that non-GPU Gromacs can run with no overhead without conditionality
everywhere a GPU function is called. */
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/ewald/ewald-utils.h"
#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-gpu-program.h"
#include "gromacs/fileio/checkpoint.h"
#include "gromacs/fileio/oenv.h"
#include "gromacs/fileio/tpxio.h"
}
gmx_device_info_t *pmeDeviceInfo = nullptr;
+ // Later, this program could contain kernels that might be later
+ // re-used as auto-tuning progresses, or subsequent simulations
+ // are invoked.
+ PmeGpuProgramStorage pmeGpuProgram;
// This works because only one task of each type is currently permitted.
- auto pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
+ auto pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
if (pmeGpuTaskMapping != gpuTaskAssignment.end())
{
pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
init_gpu(mdlog, pmeDeviceInfo);
+ pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
}
/* getting number of PP/PME threads
mdrunOptions.reproducible,
ewaldcoeff_q, ewaldcoeff_lj,
nthreads_pme,
- pmeRunMode, nullptr, pmeDeviceInfo, mdlog);
+ pmeRunMode, nullptr,
+ pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}