Support persistent device context-derived data in PME tests
author    Aleksei Iupinov <a.yupinov@gmail.com>
Thu, 26 Apr 2018 13:15:24 +0000 (15:15 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Mon, 28 May 2018 13:11:53 +0000 (15:11 +0200)
PME OpenCL will need to avoid recompiling its kernels for every unit
test run. With this in mind, a persistent PmeGpuProgram class now
handles the GPU kernel setup and is passed around via a typedef
(PmeGpuProgramHandle). The purpose of the class is to hold the PME
program data that should be set up only once and persist for the
lifetime of the given device context. The PmeGpuProgramImpl structure
now manages the function pointers to the CUDA kernel instances, and
will later also compile the OpenCL kernels.
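
The intended usage pattern is roughly what runner.cpp and the test
harness below now do (a minimal sketch; the local variables are
illustrative and assumed to be set up by the caller, much as in
pmetestcommon.cpp):

    #include "gromacs/ewald/pme.h"             // gmx_pme_init()
    #include "gromacs/ewald/pme-gpu-program.h" // buildPmeGpuProgram()

    // Compile/assign the kernels once per device: with CUDA this only
    // assigns function pointers; with OpenCL it will later build the
    // device programs.
    PmeGpuProgramStorage pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);

    // Each PME (re)initialization receives the non-owning handle, which
    // is stored in PmeGpu::programHandle_ and used to pick launch kernels.
    gmx_pme_t *pme = gmx_pme_init(&dummyCommrec, numPmeDomains, &inputRec,
                                  atomCount, false, false, true,
                                  ewaldCoeff_q, ewaldCoeff_lj, 1, runMode,
                                  nullptr, pmeDeviceInfo,
                                  pmeGpuProgram.get(), mdLogger);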

Refs #2453, #2522

Change-Id: I85a01bfc92ec3a108825414b14e2be3731433c9a

23 files changed:
src/gromacs/ewald/CMakeLists.txt
src/gromacs/ewald/pme-gather.cu
src/gromacs/ewald/pme-gpu-internal.cpp
src/gromacs/ewald/pme-gpu-internal.h
src/gromacs/ewald/pme-gpu-program-impl.cpp [new file with mode: 0644]
src/gromacs/ewald/pme-gpu-program-impl.cu [new file with mode: 0644]
src/gromacs/ewald/pme-gpu-program-impl.h [new file with mode: 0644]
src/gromacs/ewald/pme-gpu-program.cpp [new file with mode: 0644]
src/gromacs/ewald/pme-gpu-program.h [new file with mode: 0644]
src/gromacs/ewald/pme-gpu-types-host.h
src/gromacs/ewald/pme-solve.cu
src/gromacs/ewald/pme-spread.cu
src/gromacs/ewald/pme.cpp
src/gromacs/ewald/pme.h
src/gromacs/ewald/tests/pmegathertest.cpp
src/gromacs/ewald/tests/pmesolvetest.cpp
src/gromacs/ewald/tests/pmesplinespreadtest.cpp
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/ewald/tests/pmetestcommon.h
src/gromacs/ewald/tests/testhardwarecontexts.cpp
src/gromacs/ewald/tests/testhardwarecontexts.h
src/gromacs/gpu_utils/gpu_macros.h
src/gromacs/mdrun/runner.cpp

index 91cbfdc8bf0ebe9b361442037797466288712989..a8f444ce6f216f488b1470f519c4678c065c3687 100644 (file)
@@ -47,6 +47,8 @@ gmx_add_libgromacs_sources(
     pme-solve.cpp
     pme-spline-work.cpp
     pme-spread.cpp
+    # Files that implement stubs
+    pme-gpu-program.cpp
     )
 if (GMX_USE_CUDA)
     gmx_add_libgromacs_sources(
@@ -57,10 +59,16 @@ if (GMX_USE_CUDA)
         pme-solve.cu
         pme-spread.cu
         pme-timings.cu
+        pme-gpu-program-impl.cu
         # GPU-specific sources
         pme-gpu.cpp
         pme-gpu-internal.cpp
         )
+else()
+    gmx_add_libgromacs_sources(
+        # Files that implement stubs
+        pme-gpu-program-impl.cpp
+        )
 endif()
 
 if (BUILD_TESTING)
index 71d1687926a63104abab09ed1d4c3b943f2b1e99..67232973a93403dc7f23dadd1518ae302e3a4f76 100644 (file)
@@ -48,6 +48,7 @@
 #include "gromacs/utility/gmxassert.h"
 
 #include "pme.cuh"
+#include "pme-gpu-program-impl.h"
 #include "pme-timings.cuh"
 
 //! Gathering max block width in warps - picked empirically among 2, 4, 8, 16 for max. occupancy and min. runtime
@@ -417,6 +418,10 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
     }
 }
 
+//! Kernel instantiations
+template __global__ void pme_gather_kernel<4, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true>(const PmeGpuCudaKernelParams);
+
 void pme_gpu_gather(PmeGpu                *pmeGpu,
                     PmeForceOutputHandling forceTreatment,
                     const float           *h_grid
@@ -459,17 +464,13 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
         GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
     }
 
-    constexpr bool wrapX = true;
-    constexpr bool wrapY = true;
-    GMX_UNUSED_VALUE(wrapX);
-    GMX_UNUSED_VALUE(wrapY);
-
     // TODO test different cache configs
 
     int  timingId = gtPME_GATHER;
-    void (*kernelPtr)(const PmeGpuCudaKernelParams) = (forceTreatment == PmeForceOutputHandling::Set) ?
-        pme_gather_kernel<4, true, wrapX, wrapY> :
-        pme_gather_kernel<4, false, wrapX, wrapY>;
+    // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+        pmeGpu->programHandle_->impl_->gatherKernel :
+        pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
 
     pme_gpu_start_timing(pmeGpu, timingId);
     auto      *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
index 01eb64cba2e5962a22d3712ae40af26cc3795fa8..105f88fad72526e441614b21756680c72d128430 100644 (file)
@@ -210,11 +210,15 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme)
 
 /*! \libinternal \brief
  * Initializes the PME GPU data at the beginning of the run.
+ * TODO: this should become PmeGpu::PmeGpu()
  *
- * \param[in,out] pme       The PME structure.
- * \param[in,out] gpuInfo   The GPU information structure.
+ * \param[in,out] pme            The PME structure.
+ * \param[in,out] gpuInfo        The GPU information structure.
+ * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
  */
-static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
+static void pme_gpu_init(gmx_pme_t          *pme,
+                         gmx_device_info_t  *gpuInfo,
+                         PmeGpuProgramHandle pmeGpuProgram)
 {
     pme->gpu          = new PmeGpu();
     PmeGpu *pmeGpu = pme->gpu;
@@ -230,6 +234,8 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
     pme_gpu_set_testing(pmeGpu, false);
 
     pmeGpu->deviceInfo = gpuInfo;
+    GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
+    pmeGpu->programHandle_ = pmeGpuProgram;
 
     pme_gpu_init_internal(pmeGpu);
     pme_gpu_init_sync_events(pmeGpu);
@@ -314,7 +320,9 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGpu, gmx::IVec *gridSize, gmx:
     }
 }
 
-void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
+void pme_gpu_reinit(gmx_pme_t          *pme,
+                    gmx_device_info_t  *gpuInfo,
+                    PmeGpuProgramHandle pmeGpuProgram)
 {
     if (!pme_gpu_active(pme))
     {
@@ -324,7 +332,7 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
     if (!pme->gpu)
     {
         /* First-time initialization */
-        pme_gpu_init(pme, gpuInfo);
+        pme_gpu_init(pme, gpuInfo, pmeGpuProgram);
     }
     else
     {
index e6f5a3572d2251f91aa65a17eb9c3bd8e95e7428..06738dce1b147377e3be3d444ab2b5a17ea69e74 100644 (file)
@@ -671,12 +671,14 @@ CUDA_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu *CUDA_FUNC_ARG
 /*! \libinternal \brief
  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
  *
- * \param[in,out] pme       The PME structure.
- * \param[in,out] gpuInfo   The GPU information structure.
+ * \param[in,out] pme             The PME structure.
+ * \param[in,out] gpuInfo         The GPU information structure.
+ * \param[in]     pmeGpuProgram   The PME GPU program data
  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
  */
 CUDA_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
-                                        gmx_device_info_t *CUDA_FUNC_ARGUMENT(gpuInfo)) CUDA_FUNC_TERM
+                                        gmx_device_info_t *CUDA_FUNC_ARGUMENT(gpuInfo),
+                                        PmeGpuProgramHandle CUDA_FUNC_ARGUMENT(pmeGpuProgram)) CUDA_FUNC_TERM
 
 /*! \libinternal \brief
  * Destroys the PME GPU data at the end of the run.
diff --git a/src/gromacs/ewald/pme-gpu-program-impl.cpp b/src/gromacs/ewald/pme-gpu-program-impl.cpp
new file mode 100644 (file)
index 0000000..1fc76b5
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgramImpl for non-GPU builds.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "pme-gpu-program-impl.h"
+
+PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *) {};
+
+PmeGpuProgramImpl::~PmeGpuProgramImpl() = default;
diff --git a/src/gromacs/ewald/pme-gpu-program-impl.cu b/src/gromacs/ewald/pme-gpu-program-impl.cu
new file mode 100644 (file)
index 0000000..4d37929
--- /dev/null
@@ -0,0 +1,99 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgramImpl, which stores permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+#include "gmxpre.h"
+
+#include "pme-gpu-program-impl.h"
+
+#include "pme-gpu-internal.h" // for GridOrdering enum
+#include "pme-gpu-types-host.h"
+
+//! Forward declarations of the PME CUDA kernels. Kernels are documented in their respective files.
+template <
+    const int order,
+    const bool computeSplines,
+    const bool spreadCharges,
+    const bool wrapX,
+    const bool wrapY
+    >
+void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
+
+template<
+    GridOrdering gridOrdering,
+    bool computeEnergyAndVirial
+    >
+void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams);
+
+template <
+    const int order,
+    const bool overwriteForces,
+    const bool wrapX,
+    const bool wrapY
+    >
+void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
+
+
+PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
+{
+    // PME interpolation order
+    constexpr int  pmeOrder = 4;
+    GMX_UNUSED_VALUE(pmeOrder);
+    // These hardcoded spread/gather parameters refer to not-implemented PME GPU 2D decomposition in X/Y
+    constexpr bool wrapX = true;
+    constexpr bool wrapY = true;
+    GMX_UNUSED_VALUE(wrapX);
+    GMX_UNUSED_VALUE(wrapY);
+    splineAndSpreadKernel       = pme_spline_and_spread_kernel<pmeOrder, true, true, wrapX, wrapY>;
+    splineKernel                = pme_spline_and_spread_kernel<pmeOrder, true, false, wrapX, wrapY>;
+    spreadKernel                = pme_spline_and_spread_kernel<pmeOrder, false, true, wrapX, wrapY>;
+    gatherKernel                = pme_gather_kernel<pmeOrder, true, wrapX, wrapY>;
+    gatherReduceWithInputKernel = pme_gather_kernel<pmeOrder, false, wrapX, wrapY>;
+    solveXYZKernel              = pme_solve_kernel<GridOrdering::XYZ, false>;
+    solveXYZEnergyKernel        = pme_solve_kernel<GridOrdering::XYZ, true>;
+    solveYZXKernel              = pme_solve_kernel<GridOrdering::YZX, false>;
+    solveYZXEnergyKernel        = pme_solve_kernel<GridOrdering::YZX, true>;
+}
+
+PmeGpuProgramImpl::~PmeGpuProgramImpl()
+{
+}
diff --git a/src/gromacs/ewald/pme-gpu-program-impl.h b/src/gromacs/ewald/pme-gpu-program-impl.h
new file mode 100644 (file)
index 0000000..ee551d1
--- /dev/null
@@ -0,0 +1,137 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Declares PmeGpuProgramImpl, which stores PME GPU (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+#ifndef GMX_EWALD_PME_PME_GPU_PROGRAM_IMPL_H
+#define GMX_EWALD_PME_PME_GPU_PROGRAM_IMPL_H
+
+#include "config.h"
+
+#include "gromacs/utility/classhelpers.h"
+
+#if GMX_GPU == GMX_GPU_CUDA
+// TODO uncomment when we learn to compile .cpp with CUDA compiler
+//! include "gromacs/gpu_utils/gputraits.cuh"
+using Context = void *;
+#elif GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#elif GMX_GPU == GMX_GPU_NONE
+// TODO place in gputraits_stub.h
+using Context = void *;
+#endif
+
+struct gmx_device_info_t;
+
+/*! \internal
+ * \brief
+ * PME GPU persistent host program/kernel data, which should be initialized once for the whole execution.
+ *
+ * The primary purpose of this is to avoid recompiling GPU kernels for each OpenCL unit test,
+ * while the relevant GPU context (e.g. cl_context) instance persists.
+ * In CUDA, this just assigns the kernel function pointers.
+ * This also implicitly relies on the fact that a reasonable share of the kernels is always used.
+ * If there were more template parameters, an even smaller share of all possible kernels would be used.
+ *
+ * \todo In future if we would need to react to either user input or
+ * auto-tuning to compile different kernels, then we might wish to
+ * revisit the number of kernels we pre-compile, and/or the management
+ * of their lifetime.
+ *
+ * This also doesn't manage cuFFT/clFFT kernels, which depend on the PME grid dimensions.
+ *
+ * TODO: pass cl_context to the constructor and not create it inside.
+ * See also Redmine #2522.
+ */
+struct PmeGpuProgramImpl
+{
+    /*! \brief
+     * This is a handle to the GPU context, which is just a dummy in CUDA,
+     * but is created/destroyed by this class in OpenCL.
+     * TODO: Later we want to be able to own the context at a higher level and not here,
+     * but this class would still need the non-owning context handle to build the kernels.
+     */
+    Context context;
+
+    //! Conveniently all the PME kernels use the same single argument type
+#if GMX_GPU == GMX_GPU_CUDA
+    using PmeKernelHandle = void(*)(const struct PmeGpuCudaKernelParams);
+#elif GMX_GPU == GMX_GPU_OPENCL
+    using PmeKernelHandle = cl_kernel;
+#else
+    using PmeKernelHandle = void *;
+#endif
+
+    //@{
+    /**
+     * Spread/spline kernels are compiled only for order of 4.
+     * Spreading kernels also have hardcoded X/Y indices wrapping parameters,
+     * as a placeholder for implementing 1/2D decomposition.
+     */
+    PmeKernelHandle splineKernel;
+    PmeKernelHandle spreadKernel;
+    PmeKernelHandle splineAndSpreadKernel;
+    //@}
+
+    //@{
+    /** Same for gather: hardcoded X/Y unwrap parameters, order of 4, plus
+     * it can either reduce with previous forces in the host buffer, or ignore them.
+     */
+    PmeKernelHandle gatherReduceWithInputKernel;
+    PmeKernelHandle gatherKernel;
+    //@}
+
+    //@{
+    /** Solve kernel doesn't care about the interpolation order, but can optionally
+     * compute energy and virial, and supports XYZ and YZX grid orderings.
+     */
+    PmeKernelHandle solveYZXKernel;
+    PmeKernelHandle solveXYZKernel;
+    PmeKernelHandle solveYZXEnergyKernel;
+    PmeKernelHandle solveXYZEnergyKernel;
+    //@}
+
+    PmeGpuProgramImpl() = delete;
+    //! Constructor for the given device
+    explicit PmeGpuProgramImpl(const gmx_device_info_t *deviceInfo);
+    ~PmeGpuProgramImpl();
+    GMX_DISALLOW_COPY_AND_ASSIGN(PmeGpuProgramImpl);
+};
+
+#endif
diff --git a/src/gromacs/ewald/pme-gpu-program.cpp b/src/gromacs/ewald/pme-gpu-program.cpp
new file mode 100644 (file)
index 0000000..50031f3
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgram, which wraps around PmeGpuProgramImpl
+ * to store permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "pme-gpu-program.h"
+
+#include "gromacs/compat/make_unique.h"
+
+#include "pme-gpu-program-impl.h"
+
+PmeGpuProgram::PmeGpuProgram(const gmx_device_info_t *deviceInfo) :
+    impl_(gmx::compat::make_unique<PmeGpuProgramImpl>(deviceInfo))
+{
+}
+
+PmeGpuProgram::~PmeGpuProgram() = default;
+
+PmeGpuProgramStorage buildPmeGpuProgram(const gmx_device_info_t *deviceInfo)
+{
+    return gmx::compat::make_unique<PmeGpuProgram>(deviceInfo);
+}
diff --git a/src/gromacs/ewald/pme-gpu-program.h b/src/gromacs/ewald/pme-gpu-program.h
new file mode 100644 (file)
index 0000000..aa54a36
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief
+ * Declares PmeGpuProgram, which wraps around PmeGpuProgramImpl
+ * to store permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ * \inlibraryapi
+ */
+
+#ifndef GMX_EWALD_PME_PME_GPU_PROGRAM_H
+#define GMX_EWALD_PME_PME_GPU_PROGRAM_H
+
+#include <memory>
+
+struct PmeGpuProgramImpl;
+struct gmx_device_info_t;
+
+class PmeGpuProgram
+{
+    public:
+        explicit PmeGpuProgram(const gmx_device_info_t *deviceInfo);
+        ~PmeGpuProgram();
+
+        // TODO: design getters for information inside, if needed for PME, and make this private?
+        std::unique_ptr<PmeGpuProgramImpl> impl_;
+};
+
+/*! \brief This is an owning handle for the compiled PME GPU kernels.
+ */
+using PmeGpuProgramStorage = std::unique_ptr<PmeGpuProgram>;
+
+/*! \brief This is a handle for passing references to PME GPU program data.
+ * TODO: it should be a const reference, but for that the PmeGpu types need to be C++
+ */
+using PmeGpuProgramHandle = const PmeGpuProgram *;
+
+/*! \brief
+ * Factory function used to build persistent PME GPU program for the device at once.
+ * \todo This should shortly become GPU_FUNC to support OpenCL.
+ */
+PmeGpuProgramStorage buildPmeGpuProgram(const gmx_device_info_t *);
+
+#endif
index f74f90e53396d8586b0197bee527ee2ad6cb199b..ccc83940b1ad3e06ec167886610fc1d653c706a3 100644 (file)
@@ -54,6 +54,7 @@
 #include <vector>
 
 #include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-gpu-program.h"
 #include "gromacs/gpu_utils/gpu_utils.h"      // for GpuApiCallBehavior
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/math/vectypes.h"
@@ -171,6 +172,9 @@ struct PmeGpu
     /*! \brief The information copied once per reinit from the CPU structure. */
     std::shared_ptr<PmeShared> common; // TODO: make the CPU structure use the same type
 
+    //! A handle to the program created by buildPmeGpuProgram()
+    PmeGpuProgramHandle programHandle_;
+
     /*! \brief The settings. */
     PmeGpuSettings settings;
 
index 6a035dd4107c45001c7471f4287225db218c25ce..51516300d279acf9aac3a8cae877776f0cdf1ba5 100644 (file)
@@ -48,6 +48,7 @@
 #include "gromacs/utility/gmxassert.h"
 
 #include "pme.cuh"
+#include "pme-gpu-program-impl.h"
 #include "pme-timings.cuh"
 
 //! Solving kernel max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime
@@ -404,6 +405,12 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
     }
 }
 
+//! Kernel instantiations
+template __global__ void pme_solve_kernel<GridOrdering::YZX, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::YZX, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::XYZ, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::XYZ, false>(const PmeGpuCudaKernelParams);
+
 void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,
                    GridOrdering gridOrdering, bool computeEnergyAndVirial)
 {
@@ -456,18 +463,18 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,
     config.stream       = pmeGpu->archSpecific->pmeStream;
 
     int  timingId = gtPME_SOLVE;
-    void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr;
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
     if (gridOrdering == GridOrdering::YZX)
     {
         kernelPtr = computeEnergyAndVirial ?
-            pme_solve_kernel<GridOrdering::YZX, true> :
-            pme_solve_kernel<GridOrdering::YZX, false>;
+            pmeGpu->programHandle_->impl_->solveYZXEnergyKernel :
+            pmeGpu->programHandle_->impl_->solveYZXKernel;
     }
     else if (gridOrdering == GridOrdering::XYZ)
     {
         kernelPtr = computeEnergyAndVirial ?
-            pme_solve_kernel<GridOrdering::XYZ, true> :
-            pme_solve_kernel<GridOrdering::XYZ, false>;
+            pmeGpu->programHandle_->impl_->solveXYZEnergyKernel :
+            pmeGpu->programHandle_->impl_->solveXYZKernel;
     }
 
     pme_gpu_start_timing(pmeGpu, timingId);
index 89df2c77f9f2eebff514d93c9f088caba66e7656..564bdfee079498266cece0c760fa3d85cbe8265d 100644 (file)
@@ -53,6 +53,7 @@
 #include "gromacs/utility/gmxassert.h"
 
 #include "pme.cuh"
+#include "pme-gpu-program-impl.h"
 #include "pme-grid.h"
 #include "pme-timings.cuh"
 
@@ -482,6 +483,11 @@ __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernel
     }
 }
 
+//! Kernel instantiations
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true>(const PmeGpuCudaKernelParams);
+
 void pme_gpu_spread(const PmeGpu    *pmeGpu,
                     int gmx_unused   gridIndex,
                     real            *h_grid,
@@ -517,31 +523,25 @@ void pme_gpu_spread(const PmeGpu    *pmeGpu,
         GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
     }
 
-    // These should later check for PME decomposition
-    constexpr bool wrapX = true;
-    constexpr bool wrapY = true;
-    GMX_UNUSED_VALUE(wrapX);
-    GMX_UNUSED_VALUE(wrapY);
-
     int  timingId;
-    void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr;
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
     if (computeSplines)
     {
         if (spreadCharges)
         {
             timingId  = gtPME_SPLINEANDSPREAD;
-            kernelPtr = pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY>;
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
         }
         else
         {
             timingId  = gtPME_SPLINE;
-            kernelPtr = pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY>;
+            kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
         }
     }
     else
     {
         timingId  = gtPME_SPREAD;
-        kernelPtr = pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY>;
+        kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
     }
 
     pme_gpu_start_timing(pmeGpu, timingId);
index c18fa0881a396e968ea80bf08e8efb9bc84e90e4..8dc6f329c65fad846dbad8225c93a28a9e37aa23 100644 (file)
@@ -608,6 +608,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
                         PmeRunMode           runMode,
                         PmeGpu              *pmeGpu,
                         gmx_device_info_t   *gpuInfo,
+                        PmeGpuProgramHandle  pmeGpuProgram,
                         const gmx::MDLogger  & /*mdlog*/)
 {
     int               use_threads, sum_use_threads, i;
@@ -954,7 +955,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
             }
         }
 
-        pme_gpu_reinit(pme.get(), gpuInfo);
+        pme_gpu_reinit(pme.get(), gpuInfo, pmeGpuProgram);
     }
 
     pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
@@ -1007,7 +1008,7 @@ void gmx_pme_reinit(struct gmx_pme_t **pmedata,
         NumPmeDomains numPmeDomains = { pme_src->nnodes_major, pme_src->nnodes_minor };
         *pmedata = gmx_pme_init(cr, numPmeDomains,
                                 &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj,
-                                pme_src->nthread, pme_src->runMode, pme_src->gpu, nullptr, dummyLogger);
+                                pme_src->nthread, pme_src->runMode, pme_src->gpu, nullptr, nullptr, dummyLogger);
         //TODO this is mostly passing around current values
     }
     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
index 88ffa8d8ab812b1deb1c7804d6f4084e41cc166e..2b5a4925571124b867350eaefeb1fa00aefdbd57 100644 (file)
@@ -69,6 +69,9 @@ struct gmx_wallcycle;
 struct NumPmeDomains;
 
 enum class GpuTaskCompletion;
+class PmeGpuProgram;
+//! Convenience name.
+using PmeGpuProgramHandle = const PmeGpuProgram *;
 
 namespace gmx
 {
@@ -123,6 +126,11 @@ bool gmx_pme_check_restrictions(int pme_order,
  *
  * \throws   gmx::InconsistentInputError if input grid sizes/PME order are inconsistent.
  * \returns  Pointer to newly allocated and initialized PME data.
+ *
+ * \todo We should evolve something like a \c GpuManager that holds \c
+ * gmx_device_info_t * and \c PmeGpuProgramHandle and perhaps other
+ * related things whose lifetime can/should exceed that of a task (or
+ * perhaps task manager). See Redmine #2522.
  */
 gmx_pme_t *gmx_pme_init(const t_commrec *cr,
                         const NumPmeDomains &numPmeDomains,
@@ -134,6 +142,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec *cr,
                         PmeRunMode runMode,
                         PmeGpu *pmeGpu,
                         gmx_device_info_t *gpuInfo,
+                        PmeGpuProgramHandle pmeGpuProgram,
                         const gmx::MDLogger &mdlog);
 
 /*! \brief Destroys the PME data structure.*/
index f89e8e4eaf03825267bcb2ecdfecc9606251c94e..6d1286f79429badb3781c6324933137cf627e969 100644 (file)
@@ -389,26 +389,27 @@ class PmeGatherTest : public ::testing::TestWithParam<GatherInputParameters>
             TestReferenceData refData;
             for (const auto &context : getPmeTestEnv()->getHardwareContexts())
             {
-                CodePath   codePath       = context.getCodePath();
+                CodePath   codePath       = context->getCodePath();
                 const bool supportedInput = pmeSupportsInputForMode(&inputRec, codePath);
                 if (!supportedInput)
                 {
                     /* Testing the failure for the unsupported input */
-                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, inputAtomData.coordinates, inputAtomData.charges, box), NotImplementedError);
+                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, inputAtomData.coordinates, inputAtomData.charges, box), NotImplementedError);
                     continue;
                 }
 
                 /* Describing the test uniquely */
                 SCOPED_TRACE(formatString("Testing force gathering with %s %sfor PME grid size %d %d %d"
                                           ", order %d, %zu atoms, %s",
-                                          codePathToString(codePath), context.getDescription().c_str(),
+                                          codePathToString(codePath), context->getDescription().c_str(),
                                           gridSize[XX], gridSize[YY], gridSize[ZZ],
                                           pmeOrder,
                                           atomCount,
                                           (inputForceTreatment == PmeForceOutputHandling::ReduceWithInput) ? "with reduction" : "without reduction"
                                           ));
 
-                PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context.getDeviceInfo(), inputAtomData.coordinates, inputAtomData.charges, box);
+                PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
+                                                      context->getPmeGpuProgram(), inputAtomData.coordinates, inputAtomData.charges, box);
 
                 /* Setting some more inputs */
                 pmeSetRealGrid(pmeSafe.get(), codePath, nonZeroGridValues);
index a86b356ede3928f85e5b3d83afe8463d3f1b7251..6f418fabe205360305b249dbeb340016b21d89b7 100644 (file)
@@ -112,12 +112,12 @@ class PmeSolveTest : public ::testing::TestWithParam<SolveInputParameters>
             TestReferenceData refData;
             for (const auto &context : getPmeTestEnv()->getHardwareContexts())
             {
-                CodePath   codePath       = context.getCodePath();
+                CodePath   codePath       = context->getCodePath();
                 const bool supportedInput = pmeSupportsInputForMode(&inputRec, codePath);
                 if (!supportedInput)
                 {
                     /* Testing the failure for the unsupported input */
-                    EXPECT_THROW(pmeInitEmpty(&inputRec, codePath, nullptr, box, ewaldCoeff_q, ewaldCoeff_lj), NotImplementedError);
+                    EXPECT_THROW(pmeInitEmpty(&inputRec, codePath, nullptr, nullptr, box, ewaldCoeff_q, ewaldCoeff_lj), NotImplementedError);
                     continue;
                 }
 
@@ -136,13 +136,14 @@ class PmeSolveTest : public ::testing::TestWithParam<SolveInputParameters>
                                                   gridOrdering.second.c_str(),
                                                   computeEnergyAndVirial ? "with" : "without",
                                                   codePathToString(codePath),
-                                                  context.getDescription().c_str(),
+                                                  context->getDescription().c_str(),
                                                   gridSize[XX], gridSize[YY], gridSize[ZZ],
                                                   ewaldCoeff_q, ewaldCoeff_lj
                                                   ));
 
                         /* Running the test */
-                        PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, codePath, context.getDeviceInfo(), box, ewaldCoeff_q, ewaldCoeff_lj);
+                        PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, codePath, context->getDeviceInfo(),
+                                                              context->getPmeGpuProgram(), box, ewaldCoeff_q, ewaldCoeff_lj);
                         pmeSetComplexGrid(pmeSafe.get(), codePath, gridOrdering.first, nonZeroGridValues);
                         const real     cellVolume = box[0] * box[4] * box[8];
                         //FIXME - this is box[XX][XX] * box[YY][YY] * box[ZZ][ZZ], should be stored in the PME structure
index 71988a6ef79508e8734907d5a4398ccc3a7101c8..01df43ed82c11fbc2d66627376993902980c7e5d 100644 (file)
@@ -123,12 +123,12 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
 
             for (const auto &context : getPmeTestEnv()->getHardwareContexts())
             {
-                CodePath   codePath       = context.getCodePath();
+                CodePath   codePath       = context->getCodePath();
                 const bool supportedInput = pmeSupportsInputForMode(&inputRec, codePath);
                 if (!supportedInput)
                 {
                     /* Testing the failure for the unsupported input */
-                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, coordinates, charges, box), NotImplementedError);
+                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, coordinates, charges, box), NotImplementedError);
                     continue;
                 }
 
@@ -139,14 +139,15 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
                     SCOPED_TRACE(formatString("Testing %s with %s %sfor PME grid size %d %d %d"
                                               ", order %d, %zu atoms",
                                               option.second.c_str(), codePathToString(codePath),
-                                              context.getDescription().c_str(),
+                                              context->getDescription().c_str(),
                                               gridSize[XX], gridSize[YY], gridSize[ZZ],
                                               pmeOrder,
                                               atomCount));
 
                     /* Running the test */
 
-                    PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context.getDeviceInfo(), coordinates, charges, box);
+                    PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
+                                                          context->getPmeGpuProgram(), coordinates, charges, box);
 
                     const bool     computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
                     const bool     spreadCharges  = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
index b30a8c77a511a2bbe3660c4befdd596b902c0ad0..6d2890a88ab51ceb5a9264f53eab03c1e8773dd0 100644 (file)
@@ -102,6 +102,7 @@ gmx_uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder)
 static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
                                       CodePath                  mode,
                                       gmx_device_info_t        *gpuInfo,
+                                      PmeGpuProgramHandle       pmeGpuProgram,
                                       size_t                    atomCount,
                                       const Matrix3x3          &box,
                                       real                      ewaldCoeff_q = 1.0f,
@@ -113,7 +114,7 @@ static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
     t_commrec      dummyCommrec  = {0};
     NumPmeDomains  numPmeDomains = { 1, 1 };
     gmx_pme_t     *pmeDataRaw    = gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, atomCount, false, false, true,
-                                                ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, gpuInfo, dummyLogger);
+                                                ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, gpuInfo, pmeGpuProgram, dummyLogger);
     PmeSafePointer pme(pmeDataRaw); // taking ownership
 
     // TODO get rid of this with proper matrix type
@@ -150,12 +151,13 @@ static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
 PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
                             CodePath                  mode,
                             gmx_device_info_t        *gpuInfo,
+                            PmeGpuProgramHandle       pmeGpuProgram,
                             const Matrix3x3          &box,
                             real                      ewaldCoeff_q,
                             real                      ewaldCoeff_lj
                             )
 {
-    return pmeInitInternal(inputRec, mode, gpuInfo, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
+    return pmeInitInternal(inputRec, mode, gpuInfo, pmeGpuProgram, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
     // hiding the fact that PME actually needs to know the number of atoms in advance
 }
 
@@ -163,6 +165,7 @@ PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
                             CodePath                  mode,
                             gmx_device_info_t        *gpuInfo,
+                            PmeGpuProgramHandle       pmeGpuProgram,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
                             const Matrix3x3          &box
@@ -170,7 +173,7 @@ PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
 {
     const size_t    atomCount = coordinates.size();
     GMX_RELEASE_ASSERT(atomCount == charges.size(), "Mismatch in atom data");
-    PmeSafePointer  pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, atomCount, box);
+    PmeSafePointer  pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, pmeGpuProgram, atomCount, box);
     pme_atomcomm_t *atc     = nullptr;
 
     switch (mode)
index cb981d6b70f563d2c2482a79936ff590712f1e46..4378d1528c4f854c548bbeb040e0cce48aaa6146 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -117,17 +117,18 @@ gmx_uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder);
 
 // PME stages
 
-// TODO: currently PME initializations do not store CodePath. They probably should (unless we would need mixed CPU-GPU execution?).
 //! Simple PME initialization (no atom data)
 PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
                             CodePath mode = CodePath::CPU,
                             gmx_device_info_t *gpuInfo = nullptr,
+                            PmeGpuProgramHandle pmeGpuProgram = nullptr,
                             const Matrix3x3 &box = {{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}},
                             real ewaldCoeff_q = 0.0f, real ewaldCoeff_lj = 0.0f);
 //! PME initialization with atom data and system box
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
                             CodePath                  mode,
                             gmx_device_info_t        *gpuInfo,
+                            PmeGpuProgramHandle       pmeGpuProgram,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
                             const Matrix3x3          &box
index 29e4e31ea843eef53fbe634004779784281c73ce..b3c95b29eb7bf6e4c60d4af8834a3e874bc9f047 100644 (file)
@@ -44,6 +44,7 @@
 
 #include "testhardwarecontexts.h"
 
+#include "gromacs/compat/make_unique.h"
 #include "gromacs/ewald/pme.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/hardware/hw_info.h"
@@ -57,6 +58,8 @@ namespace gmx
 namespace test
 {
 
+TestHardwareContext::~TestHardwareContext() = default;
+
 const char *codePathToString(CodePath codePath)
 {
     switch (codePath)
@@ -106,7 +109,7 @@ static gmx_hw_info_t *hardwareInit()
 
 void PmeTestEnvironment::SetUp()
 {
-    hardwareContexts_.emplace_back(TestHardwareContext(CodePath::CPU, "", nullptr));
+    hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
 
     hardwareInfo_ = hardwareInit();
     if (!pme_gpu_supports_build(nullptr))
@@ -125,7 +128,9 @@ void PmeTestEnvironment::SetUp()
         get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
         std::string description = "(GPU " + std::string(stmp) + ") ";
         // TODO should this be CodePath::GPU?
-        hardwareContexts_.emplace_back(TestHardwareContext(CodePath::CUDA, description.c_str(), deviceInfo));
+        hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>
+                                           (CodePath::CUDA, description.c_str(),
+                                           deviceInfo));
     }
 }
 
index 09ab2d1c358d6707aebe2bbc5198802c96f935f7..e7723b709ff44b2fad3d79cec8cc8aeab18e3ae0 100644 (file)
@@ -48,6 +48,7 @@
 
 #include <gtest/gtest.h>
 
+#include "gromacs/ewald/pme-gpu-program.h"
 #include "gromacs/hardware/detecthardware.h"
 #include "gromacs/hardware/gpu_hw_info.h"
 
@@ -66,32 +67,38 @@ enum class CodePath
 const char *codePathToString(CodePath codePath);
 
 /*! \internal \brief
- * A structure to describe a hardware context - an abstraction over
- * gmx_device_info_t with a human-readable string.
+ * A structure to describe a hardware context that persists over the lifetime
+ * of the test binary - an abstraction over PmeGpuProgram with a human-readable string.
  */
 struct TestHardwareContext
 {
     //! Hardware path for the code being tested.
-    CodePath           codePath_;
+    CodePath                  codePath_;
     //! Readable description
-    std::string        description_;
+    std::string               description_;
     //! Device information pointer
-    gmx_device_info_t *deviceInfo_;
+    gmx_device_info_t        *deviceInfo_;
+    //! Persistent compiled GPU kernels for PME.
+    PmeGpuProgramStorage      program_;
 
     public:
         //! Retuns the code path for this context.
         CodePath getCodePath() const { return codePath_; }
         //! Returns a human-readable context description line
-        std::string        getDescription() const{return description_; }
+        std::string         getDescription() const{return description_; }
         //! Returns the device info pointer
-        gmx_device_info_t *getDeviceInfo() const{return deviceInfo_; }
+        gmx_device_info_t  *getDeviceInfo() const{return deviceInfo_; }
+        //! Returns the persistent PME GPU kernels
+        PmeGpuProgramHandle getPmeGpuProgram() const{return program_.get(); }
         //! Constructs the context
         TestHardwareContext(CodePath codePath, const char *description, gmx_device_info_t *deviceInfo) :
-            codePath_(codePath), description_(description), deviceInfo_(deviceInfo){}
+            codePath_(codePath), description_(description), deviceInfo_(deviceInfo),
+            program_(buildPmeGpuProgram(deviceInfo_)) {}
+        ~TestHardwareContext();
 };
 
-//! A container of hardware contexts
-typedef std::vector<TestHardwareContext> TestHardwareContexts;
+//! A container of handles to hardware contexts
+typedef std::vector < std::unique_ptr < TestHardwareContext>> TestHardwareContexts;
 
 /*! \internal \brief
  * This class performs one-time test initialization (enumerating the hardware)
index 182c2035bead423318fee60f6690800b2f39c40f..e9d597a7548e0e4f3e095786b0351e3d1c573b9c 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -37,6 +37,8 @@
 
 #include "config.h"
 
+#include "gromacs/utility/basedefinitions.h" // for gmx_unused
+
 /* These macros that let us define inlineable null implementations so
    that non-GPU Gromacs can run with no overhead without conditionality
    everywhere a GPU function is called. */
index b08c74bc1e0b7697a71bcb3818db22ad43538ea5..3a77c003518ce8f69ca83d0a249f814dea22da61 100644 (file)
@@ -59,6 +59,7 @@
 #include "gromacs/domdec/domdec_struct.h"
 #include "gromacs/ewald/ewald-utils.h"
 #include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-gpu-program.h"
 #include "gromacs/fileio/checkpoint.h"
 #include "gromacs/fileio/oenv.h"
 #include "gromacs/fileio/tpxio.h"
@@ -1036,12 +1037,17 @@ int Mdrunner::mdrunner()
     }
 
     gmx_device_info_t *pmeDeviceInfo = nullptr;
+    // Later, this program could contain kernels that might be
+    // re-used as auto-tuning progresses, or as subsequent
+    // simulations are invoked.
+    PmeGpuProgramStorage pmeGpuProgram;
     // This works because only one task of each type is currently permitted.
-    auto               pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
+    auto                 pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
     if (pmeGpuTaskMapping != gpuTaskAssignment.end())
     {
         pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
         init_gpu(mdlog, pmeDeviceInfo);
+        pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
     }
 
     /* getting number of PP/PME threads
@@ -1234,7 +1240,8 @@ int Mdrunner::mdrunner()
                                        mdrunOptions.reproducible,
                                        ewaldcoeff_q, ewaldcoeff_lj,
                                        nthreads_pme,
-                                       pmeRunMode, nullptr, pmeDeviceInfo, mdlog);
+                                       pmeRunMode, nullptr,
+                                       pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
             }
             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
         }