Support persistent device context-derived data in PME tests
author    Aleksei Iupinov <a.yupinov@gmail.com>
Thu, 26 Apr 2018 13:15:24 +0000 (15:15 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Mon, 28 May 2018 13:11:53 +0000 (15:11 +0200)
PME OpenCL will need to avoid recompiling its kernels for every unit
test run. With this in mind, a persistent PmeGpuProgram class now
handles the GPU kernel setup and is passed around via a typedef
(PmeGpuProgramHandle). The purpose of the class is to hold the PME
program data that should be set up only once and persist for the
lifetime of the given device context. The PmeGpuProgramImpl structure
now manages the function pointers to the CUDA kernel instances, and
will later also compile the OpenCL kernels.
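
The intended usage pattern is roughly what runner.cpp and the test
harness below now do (a minimal sketch; the local variables are
illustrative and assumed to be set up by the caller, much as in
pmetestcommon.cpp):

    #include "gromacs/ewald/pme.h"             // gmx_pme_init()
    #include "gromacs/ewald/pme-gpu-program.h" // buildPmeGpuProgram()

    // Compile/assign the kernels once per device: with CUDA this only
    // assigns function pointers; with OpenCL it will later build the
    // device programs.
    PmeGpuProgramStorage pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);

    // Each PME (re)initialization receives the non-owning handle, which
    // is stored in PmeGpu::programHandle_ and used to pick launch kernels.
    gmx_pme_t *pme = gmx_pme_init(&dummyCommrec, numPmeDomains, &inputRec,
                                  atomCount, false, false, true,
                                  ewaldCoeff_q, ewaldCoeff_lj, 1, runMode,
                                  nullptr, pmeDeviceInfo,
                                  pmeGpuProgram.get(), mdLogger);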

Refs #2453, #2522

Change-Id: I85a01bfc92ec3a108825414b14e2be3731433c9a

23 files changed:
src/gromacs/ewald/CMakeLists.txt
src/gromacs/ewald/pme-gather.cu
src/gromacs/ewald/pme-gpu-internal.cpp
src/gromacs/ewald/pme-gpu-internal.h
src/gromacs/ewald/pme-gpu-program-impl.cpp [new file with mode: 0644]
src/gromacs/ewald/pme-gpu-program-impl.cu [new file with mode: 0644]
src/gromacs/ewald/pme-gpu-program-impl.h [new file with mode: 0644]
src/gromacs/ewald/pme-gpu-program.cpp [new file with mode: 0644]
src/gromacs/ewald/pme-gpu-program.h [new file with mode: 0644]
src/gromacs/ewald/pme-gpu-types-host.h
src/gromacs/ewald/pme-solve.cu
src/gromacs/ewald/pme-spread.cu
src/gromacs/ewald/pme.cpp
src/gromacs/ewald/pme.h
src/gromacs/ewald/tests/pmegathertest.cpp
src/gromacs/ewald/tests/pmesolvetest.cpp
src/gromacs/ewald/tests/pmesplinespreadtest.cpp
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/ewald/tests/pmetestcommon.h
src/gromacs/ewald/tests/testhardwarecontexts.cpp
src/gromacs/ewald/tests/testhardwarecontexts.h
src/gromacs/gpu_utils/gpu_macros.h
src/gromacs/mdrun/runner.cpp

index 91cbfdc8bf0ebe9b361442037797466288712989..a8f444ce6f216f488b1470f519c4678c065c3687 100644 (file)
@@ -47,6 +47,8 @@ gmx_add_libgromacs_sources(
     pme-solve.cpp
     pme-spline-work.cpp
     pme-spread.cpp
+    # Files that implement stubs
+    pme-gpu-program.cpp
     )
 if (GMX_USE_CUDA)
     gmx_add_libgromacs_sources(
@@ -57,10 +59,16 @@ if (GMX_USE_CUDA)
         pme-solve.cu
         pme-spread.cu
         pme-timings.cu
+        pme-gpu-program-impl.cu
         # GPU-specific sources
         pme-gpu.cpp
         pme-gpu-internal.cpp
         )
+else()
+    gmx_add_libgromacs_sources(
+        # Files that implement stubs
+        pme-gpu-program-impl.cpp
+        )
 endif()
 
 if (BUILD_TESTING)
index 71d1687926a63104abab09ed1d4c3b943f2b1e99..67232973a93403dc7f23dadd1518ae302e3a4f76 100644 (file)
@@ -48,6 +48,7 @@
 #include "gromacs/utility/gmxassert.h"
 
 #include "pme.cuh"
+#include "pme-gpu-program-impl.h"
 #include "pme-timings.cuh"
 
 //! Gathering max block width in warps - picked empirically among 2, 4, 8, 16 for max. occupancy and min. runtime
@@ -417,6 +418,10 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
     }
 }
 
+//! Kernel instantiations
+template __global__ void pme_gather_kernel<4, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true>(const PmeGpuCudaKernelParams);
+
 void pme_gpu_gather(PmeGpu                *pmeGpu,
                     PmeForceOutputHandling forceTreatment,
                     const float           *h_grid
@@ -459,17 +464,13 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
         GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
     }
 
-    constexpr bool wrapX = true;
-    constexpr bool wrapY = true;
-    GMX_UNUSED_VALUE(wrapX);
-    GMX_UNUSED_VALUE(wrapY);
-
     // TODO test different cache configs
 
     int  timingId = gtPME_GATHER;
-    void (*kernelPtr)(const PmeGpuCudaKernelParams) = (forceTreatment == PmeForceOutputHandling::Set) ?
-        pme_gather_kernel<4, true, wrapX, wrapY> :
-        pme_gather_kernel<4, false, wrapX, wrapY>;
+    // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+        pmeGpu->programHandle_->impl_->gatherKernel :
+        pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
 
     pme_gpu_start_timing(pmeGpu, timingId);
     auto      *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
index 01eb64cba2e5962a22d3712ae40af26cc3795fa8..105f88fad72526e441614b21756680c72d128430 100644 (file)
@@ -210,11 +210,15 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme)
 
 /*! \libinternal \brief
  * Initializes the PME GPU data at the beginning of the run.
+ * TODO: this should become PmeGpu::PmeGpu()
  *
- * \param[in,out] pme       The PME structure.
- * \param[in,out] gpuInfo   The GPU information structure.
+ * \param[in,out] pme            The PME structure.
+ * \param[in,out] gpuInfo        The GPU information structure.
+ * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
  */
-static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
+static void pme_gpu_init(gmx_pme_t          *pme,
+                         gmx_device_info_t  *gpuInfo,
+                         PmeGpuProgramHandle pmeGpuProgram)
 {
     pme->gpu          = new PmeGpu();
     PmeGpu *pmeGpu = pme->gpu;
@@ -230,6 +234,8 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
     pme_gpu_set_testing(pmeGpu, false);
 
     pmeGpu->deviceInfo = gpuInfo;
+    GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
+    pmeGpu->programHandle_ = pmeGpuProgram;
 
     pme_gpu_init_internal(pmeGpu);
     pme_gpu_init_sync_events(pmeGpu);
@@ -314,7 +320,9 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGpu, gmx::IVec *gridSize, gmx:
     }
 }
 
-void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
+void pme_gpu_reinit(gmx_pme_t          *pme,
+                    gmx_device_info_t  *gpuInfo,
+                    PmeGpuProgramHandle pmeGpuProgram)
 {
     if (!pme_gpu_active(pme))
     {
@@ -324,7 +332,7 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
     if (!pme->gpu)
     {
         /* First-time initialization */
-        pme_gpu_init(pme, gpuInfo);
+        pme_gpu_init(pme, gpuInfo, pmeGpuProgram);
     }
     else
     {
index e6f5a3572d2251f91aa65a17eb9c3bd8e95e7428..06738dce1b147377e3be3d444ab2b5a17ea69e74 100644 (file)
@@ -671,12 +671,14 @@ CUDA_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu *CUDA_FUNC_ARG
 /*! \libinternal \brief
  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
  *
- * \param[in,out] pme       The PME structure.
- * \param[in,out] gpuInfo   The GPU information structure.
+ * \param[in,out] pme             The PME structure.
+ * \param[in,out] gpuInfo         The GPU information structure.
+ * \param[in]     pmeGpuProgram   The PME GPU program data
  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
  */
 CUDA_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
-                                        gmx_device_info_t *CUDA_FUNC_ARGUMENT(gpuInfo)) CUDA_FUNC_TERM
+                                        gmx_device_info_t *CUDA_FUNC_ARGUMENT(gpuInfo),
+                                        PmeGpuProgramHandle CUDA_FUNC_ARGUMENT(pmeGpuProgram)) CUDA_FUNC_TERM
 
 /*! \libinternal \brief
  * Destroys the PME GPU data at the end of the run.
diff --git a/src/gromacs/ewald/pme-gpu-program-impl.cpp b/src/gromacs/ewald/pme-gpu-program-impl.cpp
new file mode 100644 (file)
index 0000000..1fc76b5
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgramImpl for non-GPU builds.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "pme-gpu-program-impl.h"
+
+PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *) {};
+
+PmeGpuProgramImpl::~PmeGpuProgramImpl() = default;
diff --git a/src/gromacs/ewald/pme-gpu-program-impl.cu b/src/gromacs/ewald/pme-gpu-program-impl.cu
new file mode 100644 (file)
index 0000000..4d37929
--- /dev/null
@@ -0,0 +1,99 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgramImpl, which stores permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+#include "gmxpre.h"
+
+#include "pme-gpu-program-impl.h"
+
+#include "pme-gpu-internal.h" // for GridOrdering enum
+#include "pme-gpu-types-host.h"
+
+//! Forward declarations of the PME CUDA kernels. Kernels are documented in their respective files.
+template <
+    const int order,
+    const bool computeSplines,
+    const bool spreadCharges,
+    const bool wrapX,
+    const bool wrapY
+    >
+void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
+
+template<
+    GridOrdering gridOrdering,
+    bool computeEnergyAndVirial
+    >
+void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams);
+
+template <
+    const int order,
+    const bool overwriteForces,
+    const bool wrapX,
+    const bool wrapY
+    >
+void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
+
+
+PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
+{
+    // PME interpolation order
+    constexpr int  pmeOrder = 4;
+    GMX_UNUSED_VALUE(pmeOrder);
+    // These hardcoded spread/gather parameters refer to not-implemented PME GPU 2D decomposition in X/Y
+    constexpr bool wrapX = true;
+    constexpr bool wrapY = true;
+    GMX_UNUSED_VALUE(wrapX);
+    GMX_UNUSED_VALUE(wrapY);
+    splineAndSpreadKernel       = pme_spline_and_spread_kernel<pmeOrder, true, true, wrapX, wrapY>;
+    splineKernel                = pme_spline_and_spread_kernel<pmeOrder, true, false, wrapX, wrapY>;
+    spreadKernel                = pme_spline_and_spread_kernel<pmeOrder, false, true, wrapX, wrapY>;
+    gatherKernel                = pme_gather_kernel<pmeOrder, true, wrapX, wrapY>;
+    gatherReduceWithInputKernel = pme_gather_kernel<pmeOrder, false, wrapX, wrapY>;
+    solveXYZKernel              = pme_solve_kernel<GridOrdering::XYZ, false>;
+    solveXYZEnergyKernel        = pme_solve_kernel<GridOrdering::XYZ, true>;
+    solveYZXKernel              = pme_solve_kernel<GridOrdering::YZX, false>;
+    solveYZXEnergyKernel        = pme_solve_kernel<GridOrdering::YZX, true>;
+}
+
+PmeGpuProgramImpl::~PmeGpuProgramImpl()
+{
+}
diff --git a/src/gromacs/ewald/pme-gpu-program-impl.h b/src/gromacs/ewald/pme-gpu-program-impl.h
new file mode 100644 (file)
index 0000000..ee551d1
--- /dev/null
@@ -0,0 +1,137 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Declares PmeGpuProgramImpl, which stores PME GPU (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+#ifndef GMX_EWALD_PME_PME_GPU_PROGRAM_IMPL_H
+#define GMX_EWALD_PME_PME_GPU_PROGRAM_IMPL_H
+
+#include "config.h"
+
+#include "gromacs/utility/classhelpers.h"
+
+#if GMX_GPU == GMX_GPU_CUDA
+// TODO uncomment when we learn to compile .cpp with CUDA compiler
+//! include "gromacs/gpu_utils/gputraits.cuh"
+using Context = void *;
+#elif GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#elif GMX_GPU == GMX_GPU_NONE
+// TODO place in gputraits_stub.h
+using Context = void *;
+#endif
+
+struct gmx_device_info_t;
+
+/*! \internal
+ * \brief
+ * PME GPU persistent host program/kernel data, which should be initialized once for the whole execution.
+ *
+ * The primary purpose of this is to avoid recompiling GPU kernels for each OpenCL unit test,
+ * while the relevant GPU context (e.g. cl_context) instance persists.
+ * In CUDA, this just assigns the kernel function pointers.
+ * This also implicitly relies on the fact that a reasonable share of the kernels is always used.
+ * If there were more template parameters, an even smaller share of all possible kernels would be used.
+ *
+ * \todo In future if we would need to react to either user input or
+ * auto-tuning to compile different kernels, then we might wish to
+ * revisit the number of kernels we pre-compile, and/or the management
+ * of their lifetime.
+ *
+ * This also doesn't manage cuFFT/clFFT kernels, which depend on the PME grid dimensions.
+ *
+ * TODO: pass cl_context to the constructor and not create it inside.
+ * See also Redmine #2522.
+ */
+struct PmeGpuProgramImpl
+{
+    /*! \brief
+     * This is a handle to the GPU context, which is just a dummy in CUDA,
+     * but is created/destroyed by this class in OpenCL.
+     * TODO: Later we want to be able to own the context at a higher level and not here,
+     * but this class would still need the non-owning context handle to build the kernels.
+     */
+    Context context;
+
+    //! Conveniently all the PME kernels use the same single argument type
+#if GMX_GPU == GMX_GPU_CUDA
+    using PmeKernelHandle = void(*)(const struct PmeGpuCudaKernelParams);
+#elif GMX_GPU == GMX_GPU_OPENCL
+    using PmeKernelHandle = cl_kernel;
+#else
+    using PmeKernelHandle = void *;
+#endif
+
+    //@{
+    /**
+     * Spread/spline kernels are compiled only for order of 4.
+     * Spreading kernels also have hardcoded X/Y indices wrapping parameters,
+     * as a placeholder for implementing 1/2D decomposition.
+     */
+    PmeKernelHandle splineKernel;
+    PmeKernelHandle spreadKernel;
+    PmeKernelHandle splineAndSpreadKernel;
+    //@}
+
+    //@{
+    /** Same for gather: hardcoded X/Y unwrap parameters, order of 4, plus
+     * it can either reduce with previous forces in the host buffer, or ignore them.
+     */
+    PmeKernelHandle gatherReduceWithInputKernel;
+    PmeKernelHandle gatherKernel;
+    //@}
+
+    //@{
+    /** Solve kernel doesn't care about the interpolation order, but can optionally
+     * compute energy and virial, and supports XYZ and YZX grid orderings.
+     */
+    PmeKernelHandle solveYZXKernel;
+    PmeKernelHandle solveXYZKernel;
+    PmeKernelHandle solveYZXEnergyKernel;
+    PmeKernelHandle solveXYZEnergyKernel;
+    //@}
+
+    PmeGpuProgramImpl() = delete;
+    //! Constructor for the given device
+    explicit PmeGpuProgramImpl(const gmx_device_info_t *deviceInfo);
+    ~PmeGpuProgramImpl();
+    GMX_DISALLOW_COPY_AND_ASSIGN(PmeGpuProgramImpl);
+};
+
+#endif
diff --git a/src/gromacs/ewald/pme-gpu-program.cpp b/src/gromacs/ewald/pme-gpu-program.cpp
new file mode 100644 (file)
index 0000000..50031f3
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgram, which wraps around PmeGpuProgramImpl
+ * to store permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "pme-gpu-program.h"
+
+#include "gromacs/compat/make_unique.h"
+
+#include "pme-gpu-program-impl.h"
+
+PmeGpuProgram::PmeGpuProgram(const gmx_device_info_t *deviceInfo) :
+    impl_(gmx::compat::make_unique<PmeGpuProgramImpl>(deviceInfo))
+{
+}
+
+PmeGpuProgram::~PmeGpuProgram() = default;
+
+PmeGpuProgramStorage buildPmeGpuProgram(const gmx_device_info_t *deviceInfo)
+{
+    return gmx::compat::make_unique<PmeGpuProgram>(deviceInfo);
+}
diff --git a/src/gromacs/ewald/pme-gpu-program.h b/src/gromacs/ewald/pme-gpu-program.h
new file mode 100644 (file)
index 0000000..aa54a36
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief
+ * Declares PmeGpuProgram, which wraps around PmeGpuProgramImpl
+ * to store permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ * \inlibraryapi
+ */
+
+#ifndef GMX_EWALD_PME_PME_GPU_PROGRAM_H
+#define GMX_EWALD_PME_PME_GPU_PROGRAM_H
+
+#include <memory>
+
+struct PmeGpuProgramImpl;
+struct gmx_device_info_t;
+
+class PmeGpuProgram
+{
+    public:
+        explicit PmeGpuProgram(const gmx_device_info_t *deviceInfo);
+        ~PmeGpuProgram();
+
+        // TODO: design getters for information inside, if needed for PME, and make this private?
+        std::unique_ptr<PmeGpuProgramImpl> impl_;
+};
+
+/*! \brief This is an owning handle for the compiled PME GPU kernels.
+ */
+using PmeGpuProgramStorage = std::unique_ptr<PmeGpuProgram>;
+
+/*! \brief This is a handle for passing references to PME GPU program data.
+ * TODO: it should be a const reference, but for that the PmeGpu types need to be C++
+ */
+using PmeGpuProgramHandle = const PmeGpuProgram *;
+
+/*! \brief
+ * Factory function used to build persistent PME GPU program for the device at once.
+ * \todo This should shortly become GPU_FUNC to support OpenCL.
+ */
+PmeGpuProgramStorage buildPmeGpuProgram(const gmx_device_info_t *);
+
+#endif
index f74f90e53396d8586b0197bee527ee2ad6cb199b..ccc83940b1ad3e06ec167886610fc1d653c706a3 100644 (file)
@@ -54,6 +54,7 @@
 #include <vector>
 
 #include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-gpu-program.h"
 #include "gromacs/gpu_utils/gpu_utils.h"      // for GpuApiCallBehavior
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/math/vectypes.h"
@@ -171,6 +172,9 @@ struct PmeGpu
     /*! \brief The information copied once per reinit from the CPU structure. */
     std::shared_ptr<PmeShared> common; // TODO: make the CPU structure use the same type
 
+    //! A handle to the program created by buildPmeGpuProgram()
+    PmeGpuProgramHandle programHandle_;
+
     /*! \brief The settings. */
     PmeGpuSettings settings;
 
index 6a035dd4107c45001c7471f4287225db218c25ce..51516300d279acf9aac3a8cae877776f0cdf1ba5 100644 (file)
@@ -48,6 +48,7 @@
 #include "gromacs/utility/gmxassert.h"
 
 #include "pme.cuh"
+#include "pme-gpu-program-impl.h"
 #include "pme-timings.cuh"
 
 //! Solving kernel max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime
@@ -404,6 +405,12 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
     }
 }
 
+//! Kernel instantiations
+template __global__ void pme_solve_kernel<GridOrdering::YZX, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::YZX, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::XYZ, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::XYZ, false>(const PmeGpuCudaKernelParams);
+
 void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,
                    GridOrdering gridOrdering, bool computeEnergyAndVirial)
 {
@@ -456,18 +463,18 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,
     config.stream       = pmeGpu->archSpecific->pmeStream;
 
     int  timingId = gtPME_SOLVE;
-    void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr;
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
     if (gridOrdering == GridOrdering::YZX)
     {
         kernelPtr = computeEnergyAndVirial ?
-            pme_solve_kernel<GridOrdering::YZX, true> :
-            pme_solve_kernel<GridOrdering::YZX, false>;
+            pmeGpu->programHandle_->impl_->solveYZXEnergyKernel :
+            pmeGpu->programHandle_->impl_->solveYZXKernel;
     }
     else if (gridOrdering == GridOrdering::XYZ)
     {
         kernelPtr = computeEnergyAndVirial ?
-            pme_solve_kernel<GridOrdering::XYZ, true> :
-            pme_solve_kernel<GridOrdering::XYZ, false>;
+            pmeGpu->programHandle_->impl_->solveXYZEnergyKernel :
+            pmeGpu->programHandle_->impl_->solveXYZKernel;
     }
 
     pme_gpu_start_timing(pmeGpu, timingId);
index 89df2c77f9f2eebff514d93c9f088caba66e7656..564bdfee079498266cece0c760fa3d85cbe8265d 100644 (file)
@@ -53,6 +53,7 @@
 #include "gromacs/utility/gmxassert.h"
 
 #include "pme.cuh"
+#include "pme-gpu-program-impl.h"
 #include "pme-grid.h"
 #include "pme-timings.cuh"
 
@@ -482,6 +483,11 @@ __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernel
     }
 }
 
+//! Kernel instantiations
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true>(const PmeGpuCudaKernelParams);
+
 void pme_gpu_spread(const PmeGpu    *pmeGpu,
                     int gmx_unused   gridIndex,
                     real            *h_grid,
@@ -517,31 +523,25 @@ void pme_gpu_spread(const PmeGpu    *pmeGpu,
         GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!"));
     }
 
-    // These should later check for PME decomposition
-    constexpr bool wrapX = true;
-    constexpr bool wrapY = true;
-    GMX_UNUSED_VALUE(wrapX);
-    GMX_UNUSED_VALUE(wrapY);
-
     int  timingId;
-    void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr;
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
     if (computeSplines)
     {
         if (spreadCharges)
         {
             timingId  = gtPME_SPLINEANDSPREAD;
-            kernelPtr = pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY>;
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
         }
         else
         {
             timingId  = gtPME_SPLINE;
-            kernelPtr = pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY>;
+            kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
         }
     }
     else
     {
         timingId  = gtPME_SPREAD;
-        kernelPtr = pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY>;
+        kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
     }
 
     pme_gpu_start_timing(pmeGpu, timingId);
index c18fa0881a396e968ea80bf08e8efb9bc84e90e4..8dc6f329c65fad846dbad8225c93a28a9e37aa23 100644 (file)
@@ -608,6 +608,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
                         PmeRunMode           runMode,
                         PmeGpu              *pmeGpu,
                         gmx_device_info_t   *gpuInfo,
+                        PmeGpuProgramHandle  pmeGpuProgram,
                         const gmx::MDLogger  & /*mdlog*/)
 {
     int               use_threads, sum_use_threads, i;
@@ -954,7 +955,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
             }
         }
 
-        pme_gpu_reinit(pme.get(), gpuInfo);
+        pme_gpu_reinit(pme.get(), gpuInfo, pmeGpuProgram);
     }
 
     pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
@@ -1007,7 +1008,7 @@ void gmx_pme_reinit(struct gmx_pme_t **pmedata,
         NumPmeDomains numPmeDomains = { pme_src->nnodes_major, pme_src->nnodes_minor };
         *pmedata = gmx_pme_init(cr, numPmeDomains,
                                 &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj,
-                                pme_src->nthread, pme_src->runMode, pme_src->gpu, nullptr, dummyLogger);
+                                pme_src->nthread, pme_src->runMode, pme_src->gpu, nullptr, nullptr, dummyLogger);
         //TODO this is mostly passing around current values
     }
     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
index 88ffa8d8ab812b1deb1c7804d6f4084e41cc166e..2b5a4925571124b867350eaefeb1fa00aefdbd57 100644 (file)
@@ -69,6 +69,9 @@ struct gmx_wallcycle;
 struct NumPmeDomains;
 
 enum class GpuTaskCompletion;
+class PmeGpuProgram;
+//! Convenience name.
+using PmeGpuProgramHandle = const PmeGpuProgram *;
 
 namespace gmx
 {
@@ -123,6 +126,11 @@ bool gmx_pme_check_restrictions(int pme_order,
  *
  * \throws   gmx::InconsistentInputError if input grid sizes/PME order are inconsistent.
  * \returns  Pointer to newly allocated and initialized PME data.
+ *
+ * \todo We should evolve something like a \c GpuManager that holds \c
+ * gmx_device_info_t * and \c PmeGpuProgramHandle and perhaps other
+ * related things whose lifetime can/should exceed that of a task (or
+ * perhaps task manager). See Redmine #2522.
  */
 gmx_pme_t *gmx_pme_init(const t_commrec *cr,
                         const NumPmeDomains &numPmeDomains,
@@ -134,6 +142,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec *cr,
                         PmeRunMode runMode,
                         PmeGpu *pmeGpu,
                         gmx_device_info_t *gpuInfo,
+                        PmeGpuProgramHandle pmeGpuProgram,
                         const gmx::MDLogger &mdlog);
 
 /*! \brief Destroys the PME data structure.*/
index f89e8e4eaf03825267bcb2ecdfecc9606251c94e..6d1286f79429badb3781c6324933137cf627e969 100644 (file)
@@ -389,26 +389,27 @@ class PmeGatherTest : public ::testing::TestWithParam<GatherInputParameters>
             TestReferenceData refData;
             for (const auto &context : getPmeTestEnv()->getHardwareContexts())
             {
-                CodePath   codePath       = context.getCodePath();
+                CodePath   codePath       = context->getCodePath();
                 const bool supportedInput = pmeSupportsInputForMode(&inputRec, codePath);
                 if (!supportedInput)
                 {
                     /* Testing the failure for the unsupported input */
-                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, inputAtomData.coordinates, inputAtomData.charges, box), NotImplementedError);
+                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, inputAtomData.coordinates, inputAtomData.charges, box), NotImplementedError);
                     continue;
                 }
 
                 /* Describing the test uniquely */
                 SCOPED_TRACE(formatString("Testing force gathering with %s %sfor PME grid size %d %d %d"
                                           ", order %d, %zu atoms, %s",
-                                          codePathToString(codePath), context.getDescription().c_str(),
+                                          codePathToString(codePath), context->getDescription().c_str(),
                                           gridSize[XX], gridSize[YY], gridSize[ZZ],
                                           pmeOrder,
                                           atomCount,
                                           (inputForceTreatment == PmeForceOutputHandling::ReduceWithInput) ? "with reduction" : "without reduction"
                                           ));
 
-                PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context.getDeviceInfo(), inputAtomData.coordinates, inputAtomData.charges, box);
+                PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
+                                                      context->getPmeGpuProgram(), inputAtomData.coordinates, inputAtomData.charges, box);
 
                 /* Setting some more inputs */
                 pmeSetRealGrid(pmeSafe.get(), codePath, nonZeroGridValues);
index a86b356ede3928f85e5b3d83afe8463d3f1b7251..6f418fabe205360305b249dbeb340016b21d89b7 100644 (file)
@@ -112,12 +112,12 @@ class PmeSolveTest : public ::testing::TestWithParam<SolveInputParameters>
             TestReferenceData refData;
             for (const auto &context : getPmeTestEnv()->getHardwareContexts())
             {
-                CodePath   codePath       = context.getCodePath();
+                CodePath   codePath       = context->getCodePath();
                 const bool supportedInput = pmeSupportsInputForMode(&inputRec, codePath);
                 if (!supportedInput)
                 {
                     /* Testing the failure for the unsupported input */
-                    EXPECT_THROW(pmeInitEmpty(&inputRec, codePath, nullptr, box, ewaldCoeff_q, ewaldCoeff_lj), NotImplementedError);
+                    EXPECT_THROW(pmeInitEmpty(&inputRec, codePath, nullptr, nullptr, box, ewaldCoeff_q, ewaldCoeff_lj), NotImplementedError);
                     continue;
                 }
 
@@ -136,13 +136,14 @@ class PmeSolveTest : public ::testing::TestWithParam<SolveInputParameters>
                                                   gridOrdering.second.c_str(),
                                                   computeEnergyAndVirial ? "with" : "without",
                                                   codePathToString(codePath),
-                                                  context.getDescription().c_str(),
+                                                  context->getDescription().c_str(),
                                                   gridSize[XX], gridSize[YY], gridSize[ZZ],
                                                   ewaldCoeff_q, ewaldCoeff_lj
                                                   ));
 
                         /* Running the test */
-                        PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, codePath, context.getDeviceInfo(), box, ewaldCoeff_q, ewaldCoeff_lj);
+                        PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, codePath, context->getDeviceInfo(),
+                                                              context->getPmeGpuProgram(), box, ewaldCoeff_q, ewaldCoeff_lj);
                         pmeSetComplexGrid(pmeSafe.get(), codePath, gridOrdering.first, nonZeroGridValues);
                         const real     cellVolume = box[0] * box[4] * box[8];
                         //FIXME - this is box[XX][XX] * box[YY][YY] * box[ZZ][ZZ], should be stored in the PME structure
index 71988a6ef79508e8734907d5a4398ccc3a7101c8..01df43ed82c11fbc2d66627376993902980c7e5d 100644 (file)
@@ -123,12 +123,12 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
 
             for (const auto &context : getPmeTestEnv()->getHardwareContexts())
             {
-                CodePath   codePath       = context.getCodePath();
+                CodePath   codePath       = context->getCodePath();
                 const bool supportedInput = pmeSupportsInputForMode(&inputRec, codePath);
                 if (!supportedInput)
                 {
                     /* Testing the failure for the unsupported input */
-                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, coordinates, charges, box), NotImplementedError);
+                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, coordinates, charges, box), NotImplementedError);
                     continue;
                 }
 
@@ -139,14 +139,15 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
                     SCOPED_TRACE(formatString("Testing %s with %s %sfor PME grid size %d %d %d"
                                               ", order %d, %zu atoms",
                                               option.second.c_str(), codePathToString(codePath),
-                                              context.getDescription().c_str(),
+                                              context->getDescription().c_str(),
                                               gridSize[XX], gridSize[YY], gridSize[ZZ],
                                               pmeOrder,
                                               atomCount));
 
                     /* Running the test */
 
-                    PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context.getDeviceInfo(), coordinates, charges, box);
+                    PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
+                                                          context->getPmeGpuProgram(), coordinates, charges, box);
 
                     const bool     computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
                     const bool     spreadCharges  = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
index b30a8c77a511a2bbe3660c4befdd596b902c0ad0..6d2890a88ab51ceb5a9264f53eab03c1e8773dd0 100644 (file)
@@ -102,6 +102,7 @@ gmx_uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder)
 static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
                                       CodePath                  mode,
                                       gmx_device_info_t        *gpuInfo,
+                                      PmeGpuProgramHandle       pmeGpuProgram,
                                       size_t                    atomCount,
                                       const Matrix3x3          &box,
                                       real                      ewaldCoeff_q = 1.0f,
@@ -113,7 +114,7 @@ static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
     t_commrec      dummyCommrec  = {0};
     NumPmeDomains  numPmeDomains = { 1, 1 };
     gmx_pme_t     *pmeDataRaw    = gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, atomCount, false, false, true,
-                                                ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, gpuInfo, dummyLogger);
+                                                ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, gpuInfo, pmeGpuProgram, dummyLogger);
     PmeSafePointer pme(pmeDataRaw); // taking ownership
 
     // TODO get rid of this with proper matrix type
@@ -150,12 +151,13 @@ static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
 PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
                             CodePath                  mode,
                             gmx_device_info_t        *gpuInfo,
+                            PmeGpuProgramHandle       pmeGpuProgram,
                             const Matrix3x3          &box,
                             real                      ewaldCoeff_q,
                             real                      ewaldCoeff_lj
                             )
 {
-    return pmeInitInternal(inputRec, mode, gpuInfo, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
+    return pmeInitInternal(inputRec, mode, gpuInfo, pmeGpuProgram, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
     // hiding the fact that PME actually needs to know the number of atoms in advance
 }
 
@@ -163,6 +165,7 @@ PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
                             CodePath                  mode,
                             gmx_device_info_t        *gpuInfo,
+                            PmeGpuProgramHandle       pmeGpuProgram,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
                             const Matrix3x3          &box
@@ -170,7 +173,7 @@ PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
 {
     const size_t    atomCount = coordinates.size();
     GMX_RELEASE_ASSERT(atomCount == charges.size(), "Mismatch in atom data");
-    PmeSafePointer  pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, atomCount, box);
+    PmeSafePointer  pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, pmeGpuProgram, atomCount, box);
     pme_atomcomm_t *atc     = nullptr;
 
     switch (mode)
index cb981d6b70f563d2c2482a79936ff590712f1e46..4378d1528c4f854c548bbeb040e0cce48aaa6146 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -117,17 +117,18 @@ gmx_uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder);
 
 // PME stages
 
-// TODO: currently PME initializations do not store CodePath. They probably should (unless we would need mixed CPU-GPU execution?).
 //! Simple PME initialization (no atom data)
 PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
                             CodePath mode = CodePath::CPU,
                             gmx_device_info_t *gpuInfo = nullptr,
+                            PmeGpuProgramHandle pmeGpuProgram = nullptr,
                             const Matrix3x3 &box = {{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}},
                             real ewaldCoeff_q = 0.0f, real ewaldCoeff_lj = 0.0f);
 //! PME initialization with atom data and system box
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
                             CodePath                  mode,
                             gmx_device_info_t        *gpuInfo,
+                            PmeGpuProgramHandle       pmeGpuProgram,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
                             const Matrix3x3          &box
index 29e4e31ea843eef53fbe634004779784281c73ce..b3c95b29eb7bf6e4c60d4af8834a3e874bc9f047 100644 (file)
@@ -44,6 +44,7 @@
 
 #include "testhardwarecontexts.h"
 
+#include "gromacs/compat/make_unique.h"
 #include "gromacs/ewald/pme.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/hardware/hw_info.h"
@@ -57,6 +58,8 @@ namespace gmx
 namespace test
 {
 
+TestHardwareContext::~TestHardwareContext() = default;
+
 const char *codePathToString(CodePath codePath)
 {
     switch (codePath)
@@ -106,7 +109,7 @@ static gmx_hw_info_t *hardwareInit()
 
 void PmeTestEnvironment::SetUp()
 {
-    hardwareContexts_.emplace_back(TestHardwareContext(CodePath::CPU, "", nullptr));
+    hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
 
     hardwareInfo_ = hardwareInit();
     if (!pme_gpu_supports_build(nullptr))
@@ -125,7 +128,9 @@ void PmeTestEnvironment::SetUp()
         get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
         std::string description = "(GPU " + std::string(stmp) + ") ";
         // TODO should this be CodePath::GPU?
-        hardwareContexts_.emplace_back(TestHardwareContext(CodePath::CUDA, description.c_str(), deviceInfo));
+        hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>
+                                           (CodePath::CUDA, description.c_str(),
+                                           deviceInfo));
     }
 }
 
index 09ab2d1c358d6707aebe2bbc5198802c96f935f7..e7723b709ff44b2fad3d79cec8cc8aeab18e3ae0 100644 (file)
@@ -48,6 +48,7 @@
 
 #include <gtest/gtest.h>
 
+#include "gromacs/ewald/pme-gpu-program.h"
 #include "gromacs/hardware/detecthardware.h"
 #include "gromacs/hardware/gpu_hw_info.h"
 
@@ -66,32 +67,38 @@ enum class CodePath
 const char *codePathToString(CodePath codePath);
 
 /*! \internal \brief
- * A structure to describe a hardware context - an abstraction over
- * gmx_device_info_t with a human-readable string.
+ * A structure to describe a hardware context that persists over the lifetime
+ * of the test binary - an abstraction over PmeGpuProgram with a human-readable string.
  */
 struct TestHardwareContext
 {
     //! Hardware path for the code being tested.
-    CodePath           codePath_;
+    CodePath                  codePath_;
     //! Readable description
-    std::string        description_;
+    std::string               description_;
     //! Device information pointer
-    gmx_device_info_t *deviceInfo_;
+    gmx_device_info_t        *deviceInfo_;
+    //! Persistent compiled GPU kernels for PME.
+    PmeGpuProgramStorage      program_;
 
     public:
         //! Retuns the code path for this context.
         CodePath getCodePath() const { return codePath_; }
         //! Returns a human-readable context description line
-        std::string        getDescription() const{return description_; }
+        std::string         getDescription() const{return description_; }
         //! Returns the device info pointer
-        gmx_device_info_t *getDeviceInfo() const{return deviceInfo_; }
+        gmx_device_info_t  *getDeviceInfo() const{return deviceInfo_; }
+        //! Returns the persistent PME GPU kernels
+        PmeGpuProgramHandle getPmeGpuProgram() const{return program_.get(); }
         //! Constructs the context
         TestHardwareContext(CodePath codePath, const char *description, gmx_device_info_t *deviceInfo) :
-            codePath_(codePath), description_(description), deviceInfo_(deviceInfo){}
+            codePath_(codePath), description_(description), deviceInfo_(deviceInfo),
+            program_(buildPmeGpuProgram(deviceInfo_)) {}
+        ~TestHardwareContext();
 };
 
-//! A container of hardware contexts
-typedef std::vector<TestHardwareContext> TestHardwareContexts;
+//! A container of handles to hardware contexts
+typedef std::vector < std::unique_ptr < TestHardwareContext>> TestHardwareContexts;
 
 /*! \internal \brief
  * This class performs one-time test initialization (enumerating the hardware)
index 182c2035bead423318fee60f6690800b2f39c40f..e9d597a7548e0e4f3e095786b0351e3d1c573b9c 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015,2017, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -37,6 +37,8 @@
 
 #include "config.h"
 
+#include "gromacs/utility/basedefinitions.h" // for gmx_unused
+
 /* These macros that let us define inlineable null implementations so
    that non-GPU Gromacs can run with no overhead without conditionality
    everywhere a GPU function is called. */
index b08c74bc1e0b7697a71bcb3818db22ad43538ea5..3a77c003518ce8f69ca83d0a249f814dea22da61 100644 (file)
@@ -59,6 +59,7 @@
 #include "gromacs/domdec/domdec_struct.h"
 #include "gromacs/ewald/ewald-utils.h"
 #include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-gpu-program.h"
 #include "gromacs/fileio/checkpoint.h"
 #include "gromacs/fileio/oenv.h"
 #include "gromacs/fileio/tpxio.h"
@@ -1036,12 +1037,17 @@ int Mdrunner::mdrunner()
     }
 
     gmx_device_info_t *pmeDeviceInfo = nullptr;
+    // Later, this program could contain kernels that might be
+    // re-used as auto-tuning progresses, or as subsequent
+    // simulations are invoked.
+    PmeGpuProgramStorage pmeGpuProgram;
     // This works because only one task of each type is currently permitted.
-    auto               pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
+    auto                 pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
     if (pmeGpuTaskMapping != gpuTaskAssignment.end())
     {
         pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
         init_gpu(mdlog, pmeDeviceInfo);
+        pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
     }
 
     /* getting number of PP/PME threads
@@ -1234,7 +1240,8 @@ int Mdrunner::mdrunner()
                                        mdrunOptions.reproducible,
                                        ewaldcoeff_q, ewaldcoeff_lj,
                                        nthreads_pme,
-                                       pmeRunMode, nullptr, pmeDeviceInfo, mdlog);
+                                       pmeRunMode, nullptr,
+                                       pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
             }
             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
         }