From 379b2954fd3efbd1b2c724964932c2ef03078939 Mon Sep 17 00:00:00 2001 From: Aleksei Iupinov Date: Mon, 26 Sep 2016 12:53:29 +0200 Subject: [PATCH] PME GPU/CUDA data framework. This patch adds most of the PME GPU data handling routines (pme.h and pme-gpu.cpp / pme.cu(h) / pme-gpu-internal.h/cpp), data structures used on host and on device (pme-gpu-types.h / pme.cuh). The PME GPU kernels and their host code will live in separate files. There is also cuFFT code (pme-3dfft.cu(h)) and optional CUDA timing events (pme-timings.cu(h)) included. Currently this is a dead code, PME GPU is not actually getting initialized as the corresponding enum is always set for CPU path, which is asserted. Change-Id: I9b03f54a2412885e25ea27f17bf2dca1b01f9f78 --- src/gromacs/CMakeLists.txt | 1 + src/gromacs/ewald/CMakeLists.txt | 4 + src/gromacs/ewald/pme-3dfft.cu | 146 +++++ src/gromacs/ewald/pme-3dfft.cuh | 74 +++ src/gromacs/ewald/pme-gpu-internal.cpp | 397 +++++++++++++ src/gromacs/ewald/pme-gpu-internal.h | 551 ++++++++++++++++++ src/gromacs/ewald/pme-gpu-types.h | 356 +++++++++++ src/gromacs/ewald/pme-gpu.cpp | 114 ++++ src/gromacs/ewald/pme-grid.cpp | 2 + src/gromacs/ewald/pme-grid.h | 13 +- src/gromacs/ewald/pme-internal.h | 108 +++- src/gromacs/ewald/pme-load-balancing.cpp | 11 +- src/gromacs/ewald/pme-only.cpp | 18 +- src/gromacs/ewald/pme-pp.cpp | 6 +- src/gromacs/ewald/pme-timings.cu | 112 ++++ src/gromacs/ewald/pme-timings.cuh | 66 +++ src/gromacs/ewald/pme.cpp | 60 +- src/gromacs/ewald/pme.cu | 515 ++++++++++++++++ src/gromacs/ewald/pme.cuh | 243 ++++++++ src/gromacs/ewald/pme.h | 75 ++- src/gromacs/ewald/tests/pmetestcommon.cpp | 11 +- src/gromacs/gpu_utils/gpuregiontimer.h | 20 +- src/gromacs/gpu_utils/gpuregiontimer_ocl.h | 3 +- src/gromacs/mdlib/mdsetup.cpp | 8 + src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu | 18 +- .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 7 +- .../mdlib/nbnxn_cuda/nbnxn_cuda_types.h | 6 +- src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h | 5 +- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp | 18 +- .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp | 7 +- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h | 20 +- src/gromacs/mdlib/sim_util.cpp | 13 +- src/gromacs/mdlib/sim_util.h | 1 + src/gromacs/timing/gpu_timing.h | 52 +- src/gromacs/timing/wallcycle.cpp | 76 ++- src/gromacs/timing/wallcycle.h | 3 +- src/gromacs/timing/wallcyclereporting.h | 8 +- src/programs/mdrun/md.cpp | 14 +- src/programs/mdrun/runner.cpp | 6 +- 39 files changed, 3026 insertions(+), 142 deletions(-) create mode 100644 src/gromacs/ewald/pme-3dfft.cu create mode 100644 src/gromacs/ewald/pme-3dfft.cuh create mode 100644 src/gromacs/ewald/pme-gpu-internal.cpp create mode 100644 src/gromacs/ewald/pme-gpu-internal.h create mode 100644 src/gromacs/ewald/pme-gpu-types.h create mode 100644 src/gromacs/ewald/pme-gpu.cpp create mode 100644 src/gromacs/ewald/pme-timings.cu create mode 100644 src/gromacs/ewald/pme-timings.cuh create mode 100644 src/gromacs/ewald/pme.cu create mode 100644 src/gromacs/ewald/pme.cuh diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt index ba8f6d2a55..333405f21e 100644 --- a/src/gromacs/CMakeLists.txt +++ b/src/gromacs/CMakeLists.txt @@ -174,6 +174,7 @@ if (GMX_USE_CUDA) else() add_library(libgromacs ${LIBGROMACS_SOURCES}) endif() + target_link_libraries(libgromacs PRIVATE ${CUDA_CUFFT_LIBRARIES}) else() add_library(libgromacs ${LIBGROMACS_SOURCES}) endif() diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt index 8d1904c469..05210bda31 100644 --- 
a/src/gromacs/ewald/CMakeLists.txt +++ b/src/gromacs/ewald/CMakeLists.txt @@ -34,6 +34,10 @@ file(GLOB EWALD_SOURCES *.cpp) set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${EWALD_SOURCES} PARENT_SCOPE) +if (GMX_USE_CUDA) + file(GLOB EWALD_CUDA_SOURCES *.cu) + gmx_add_libgromacs_sources(${EWALD_CUDA_SOURCES}) +endif() if (BUILD_TESTING) add_subdirectory(tests) diff --git a/src/gromacs/ewald/pme-3dfft.cu b/src/gromacs/ewald/pme-3dfft.cu new file mode 100644 index 0000000000..38f2e75cb0 --- /dev/null +++ b/src/gromacs/ewald/pme-3dfft.cu @@ -0,0 +1,146 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Implements CUDA FFT routines for PME GPU. 
+ * + * \author Aleksei Iupinov + */ + +#include "gmxpre.h" + +#include "pme-3dfft.cuh" + +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/gmxassert.h" + +#include "pme.cuh" +#include "pme-gpu-types.h" + +static void handleCufftError(cufftResult_t status, const char *msg) +{ + if (status != CUFFT_SUCCESS) + { + gmx_fatal(FARGS, "%s (error code %d)\n", msg, status); + } +} + +GpuParallel3dFft::GpuParallel3dFft(const pme_gpu_t *pmeGPU) +{ + const pme_gpu_cuda_kernel_params_t *kernelParamsPtr = pmeGPU->kernelParams.get(); + ivec realGridSize, realGridSizePadded, complexGridSizePadded; + for (int i = 0; i < DIM; i++) + { + realGridSize[i] = kernelParamsPtr->grid.realGridSize[i]; + realGridSizePadded[i] = kernelParamsPtr->grid.realGridSizePadded[i]; + complexGridSizePadded[i] = kernelParamsPtr->grid.complexGridSizePadded[i]; + } + + GMX_RELEASE_ASSERT(!pme_gpu_uses_dd(pmeGPU), "FFT decomposition not implemented"); + + const int complexGridSizePaddedTotal = complexGridSizePadded[XX] * complexGridSizePadded[YY] * complexGridSizePadded[ZZ]; + const int realGridSizePaddedTotal = realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ]; + + realGrid_ = (cufftReal *)kernelParamsPtr->grid.d_realGrid; + GMX_RELEASE_ASSERT(realGrid_, "Bad (null) input real-space grid"); + complexGrid_ = (cufftComplex *)kernelParamsPtr->grid.d_fourierGrid; + GMX_RELEASE_ASSERT(complexGrid_, "Bad (null) input complex grid"); + + cufftResult_t result; + /* Commented code for a simple 3D grid with no padding */ + /* + result = cufftPlan3d(&planR2C_, realGridSize[XX], realGridSize[YY], realGridSize[ZZ], CUFFT_R2C); + handleCufftError(result, "cufftPlan3d R2C plan failure"); + + result = cufftPlan3d(&planC2R_, realGridSize[XX], realGridSize[YY], realGridSize[ZZ], CUFFT_C2R); + handleCufftError(result, "cufftPlan3d C2R plan failure"); + */ + + const int rank = 3, batch = 1; + result = cufftPlanMany(&planR2C_, rank, realGridSize, + realGridSizePadded, 1, realGridSizePaddedTotal, + complexGridSizePadded, 1, complexGridSizePaddedTotal, + CUFFT_R2C, + batch); + handleCufftError(result, "cufftPlanMany R2C plan failure"); + + result = cufftPlanMany(&planC2R_, rank, realGridSize, + complexGridSizePadded, 1, complexGridSizePaddedTotal, + realGridSizePadded, 1, realGridSizePaddedTotal, + CUFFT_C2R, + batch); + handleCufftError(result, "cufftPlanMany C2R plan failure"); + + cudaStream_t stream = pmeGPU->archSpecific->pmeStream; + GMX_RELEASE_ASSERT(stream, "Using the default CUDA stream for PME cuFFT"); + + result = cufftSetStream(planR2C_, stream); + handleCufftError(result, "cufftSetStream R2C failure"); + + result = cufftSetStream(planC2R_, stream); + handleCufftError(result, "cufftSetStream C2R failure"); +} + +GpuParallel3dFft::~GpuParallel3dFft() +{ + cufftResult_t result; + result = cufftDestroy(planR2C_); + handleCufftError(result, "cufftDestroy R2C failure"); + result = cufftDestroy(planC2R_); + handleCufftError(result, "cufftDestroy C2R failure"); +} + +void GpuParallel3dFft::perform3dFft(gmx_fft_direction dir) +{ + cufftResult_t result; + if (dir == GMX_FFT_REAL_TO_COMPLEX) + { + result = cufftExecR2C(planR2C_, realGrid_, complexGrid_); + handleCufftError(result, "cuFFT R2C execution failure"); + } + else + { + result = cufftExecC2R(planC2R_, complexGrid_, realGrid_); + handleCufftError(result, "cuFFT C2R execution failure"); + } +} + +inline void pme_gpu_3dfft(const pme_gpu_t *pmeGPU, gmx_fft_direction dir, int grid_index) +{ + int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? 
gtPME_FFT_R2C : gtPME_FFT_C2R; + pme_gpu_start_timing(pmeGPU, timerId); + pmeGPU->archSpecific->fftSetup[grid_index]->perform3dFft(dir); + pme_gpu_stop_timing(pmeGPU, timerId); +} diff --git a/src/gromacs/ewald/pme-3dfft.cuh b/src/gromacs/ewald/pme-3dfft.cuh new file mode 100644 index 0000000000..775c3241d3 --- /dev/null +++ b/src/gromacs/ewald/pme-3dfft.cuh @@ -0,0 +1,74 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \libinternal \file + * \brief Defines the CUDA 3D-FFT functions for PME. + * + * \author Aleksei Iupinov + */ + +#ifndef GMX_EWALD_PME_3DFFT_CUH +#define GMX_EWALD_PME_3DFFT_CUH + +#include // for the cufft types + +#include "gromacs/fft/fft.h" // for the enum gmx_fft_direction + +struct pme_gpu_t; + +/*! \brief \internal A 3D FFT class for performing R2C/C2R transforms + * \todo Make this class actually parallel over multiple GPUs + */ +class GpuParallel3dFft +{ + cufftHandle planR2C_; + cufftHandle planC2R_; + cufftReal *realGrid_; + cufftComplex *complexGrid_; + + public: + /*! \brief + * Constructs CUDA FFT plans for performing 3D FFT on a PME grid. + * + * \param[in] pmeGPU The PME GPU structure. + */ + GpuParallel3dFft(const pme_gpu_t *pmeGPU); + /*! \brief Destroys CUDA FFT plans. */ + ~GpuParallel3dFft(); + /*! \brief Performs the FFT transform in given direction */ + void perform3dFft(gmx_fft_direction dir); +}; + +#endif diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp new file mode 100644 index 0000000000..3349b63b79 --- /dev/null +++ b/src/gromacs/ewald/pme-gpu-internal.cpp @@ -0,0 +1,397 @@ +/* + * This file is part of the GROMACS molecular simulation package. 
+ * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * + * \brief This file contains internal function implementations + * for performing the PME calculations on GPU. + * + * \author Aleksei Iupinov + * \ingroup module_ewald + */ + +#include "gmxpre.h" + +#include "pme-gpu-internal.h" + +#include "config.h" + +#include +#include + +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/math/invertmatrix.h" +#include "gromacs/math/units.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/stringutil.h" + +#include "pme-grid.h" +#include "pme-internal.h" + +/*! \internal \brief + * Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns The pointer to the kernel parameters. 
+ */ +static pme_gpu_kernel_params_base_t *pme_gpu_get_kernel_params_base_ptr(const pme_gpu_t *pmeGPU) +{ + // reinterpret_cast is needed because the derived CUDA structure is not known in this file + auto *kernelParamsPtr = reinterpret_cast<pme_gpu_kernel_params_base_t *>(pmeGPU->kernelParams.get()); + return kernelParamsPtr; +} + +void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial) +{ + GMX_ASSERT(energy, "Invalid energy output pointer in PME GPU"); + unsigned int j = 0; + virial[XX][XX] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + virial[YY][YY] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + virial[ZZ][ZZ] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + virial[XX][YY] = virial[YY][XX] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + virial[XX][ZZ] = virial[ZZ][XX] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + virial[YY][ZZ] = virial[ZZ][YY] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + *energy = 0.5f * pmeGPU->staging.h_virialAndEnergy[j++]; +} + +void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box) +{ + auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU); + kernelParamsPtr->step.boxVolume = box[XX][XX] * box[YY][YY] * box[ZZ][ZZ]; + GMX_ASSERT(kernelParamsPtr->step.boxVolume != 0.0f, "Zero volume of the unit cell"); + +#if GMX_DOUBLE + GMX_THROW(gmx::NotImplementedError("PME is implemented for single-precision only on GPU")); +#else + matrix recipBox; + gmx::invertBoxMatrix(box, recipBox); + /* The GPU recipBox is transposed as compared to the CPU recipBox. + * Spread uses matrix columns (while solve and gather use rows). + * There is no particular reason for this; it might be further rethought/optimized for better access patterns. + */ + const real newRecipBox[DIM][DIM] = + { + {recipBox[XX][XX], recipBox[YY][XX], recipBox[ZZ][XX]}, + { 0.0, recipBox[YY][YY], recipBox[ZZ][YY]}, + { 0.0, 0.0, recipBox[ZZ][ZZ]} + }; + memcpy(kernelParamsPtr->step.recipBox, newRecipBox, sizeof(matrix)); +#endif +} + +void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates) +{ + pme_gpu_copy_input_coordinates(pmeGPU, h_coordinates); + if (needToUpdateBox) + { + pme_gpu_update_input_box(pmeGPU, box); + } +} + +/*! \brief \libinternal + * The PME GPU reinitialization function that is called both at the end of any MD step and on any load balancing step. + * + * \param[in] pmeGPU The PME GPU structure. + */ +static void pme_gpu_reinit_step(const pme_gpu_t *pmeGPU) +{ + pme_gpu_clear_grids(pmeGPU); + pme_gpu_clear_energy_virial(pmeGPU); +} + +void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcF, const bool bCalcEnerVir) +{ + /* Needed for copy back as well as timing events */ + pme_gpu_synchronize(pmeGPU); + + if (bCalcF && pme_gpu_performs_gather(pmeGPU)) + { + pme_gpu_sync_output_forces(pmeGPU); + } + if (bCalcEnerVir && pme_gpu_performs_solve(pmeGPU)) + { + pme_gpu_sync_output_energy_virial(pmeGPU); + } + pme_gpu_update_timings(pmeGPU); + pme_gpu_reinit_step(pmeGPU); +} + +/*! \brief \libinternal + * (Re-)initializes all the PME GPU data related to the grid size and cut-off. + * + * \param[in] pmeGPU The PME GPU structure.
+ */ +static void pme_gpu_reinit_grids(pme_gpu_t *pmeGPU) +{ + auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU); + kernelParamsPtr->grid.ewaldFactor = (M_PI * M_PI) / (pmeGPU->common->ewaldcoeff_q * pmeGPU->common->ewaldcoeff_q); + + /* The grid size variants */ + for (int i = 0; i < DIM; i++) + { + kernelParamsPtr->grid.realGridSize[i] = pmeGPU->common->nk[i]; + kernelParamsPtr->grid.realGridSizeFP[i] = (float)kernelParamsPtr->grid.realGridSize[i]; + kernelParamsPtr->grid.realGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i]; + + // The complex grid currently uses no padding; + // if it starts to do so, then another test should be added for that + kernelParamsPtr->grid.complexGridSize[i] = kernelParamsPtr->grid.realGridSize[i]; + kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i]; + } + /* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */ + if (!pme_gpu_performs_FFT(pmeGPU)) + { + // This allows for GPU spreading grid and CPU fftgrid to have the same layout, so that we can copy the data directly + kernelParamsPtr->grid.realGridSizePadded[ZZ] = (kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2; + } + + /* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */ + kernelParamsPtr->grid.complexGridSize[ZZ] /= 2; + kernelParamsPtr->grid.complexGridSize[ZZ]++; + kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ]; + + pme_gpu_realloc_and_copy_fract_shifts(pmeGPU); + pme_gpu_realloc_and_copy_bspline_values(pmeGPU); + pme_gpu_realloc_grids(pmeGPU); + pme_gpu_reinit_3dfft(pmeGPU); +} + +/* Several GPU functions that refer to the CPU PME data live here. + * We would like to keep these away from the GPU-framework specific code for clarity, + * as well as compilation issues with MPI. + */ + +/*! \brief \libinternal + * Copies everything useful from the PME CPU to the PME GPU structure. + * The goal is to minimize interaction with the PME CPU structure in the GPU code. + * + * \param[in] pme The PME structure. + */ +static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme) +{ + /* TODO: Consider refactoring the CPU PME code to use the same structure, + * so that this function becomes 2 lines */ + pme_gpu_t *pmeGPU = pme->gpu; + pmeGPU->common->ngrids = pme->ngrids; + pmeGPU->common->epsilon_r = pme->epsilon_r; + pmeGPU->common->ewaldcoeff_q = pme->ewaldcoeff_q; + pmeGPU->common->nk[XX] = pme->nkx; + pmeGPU->common->nk[YY] = pme->nky; + pmeGPU->common->nk[ZZ] = pme->nkz; + pmeGPU->common->pmegrid_n[XX] = pme->pmegrid_nx; + pmeGPU->common->pmegrid_n[YY] = pme->pmegrid_ny; + pmeGPU->common->pmegrid_n[ZZ] = pme->pmegrid_nz; + pmeGPU->common->pme_order = pme->pme_order; + for (int i = 0; i < DIM; i++) + { + pmeGPU->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGPU->common->nk[i]); + } + const int cellCount = c_pmeNeighborUnitcellCount; + pmeGPU->common->fsh[XX].assign(pme->fshx, pme->fshx + cellCount * pme->nkx); + pmeGPU->common->fsh[YY].assign(pme->fshy, pme->fshy + cellCount * pme->nky); + pmeGPU->common->fsh[ZZ].assign(pme->fshz, pme->fshz + cellCount * pme->nkz); + pmeGPU->common->nn[XX].assign(pme->nnx, pme->nnx + cellCount * pme->nkx); + pmeGPU->common->nn[YY].assign(pme->nny, pme->nny + cellCount * pme->nky); + pmeGPU->common->nn[ZZ].assign(pme->nnz, pme->nnz + cellCount * pme->nkz); + pmeGPU->common->runMode = pme->runMode; +} + +/*! 
\brief \libinternal + * Checks whether PME with the given inputs can run on a GPU. + * + * \param[in] pme The PME structure. + * \param[out] error The error message if the input is not supported on GPU. + * \returns True if this PME input can run on a GPU, false otherwise. + */ +static bool pme_gpu_check_restrictions(const gmx_pme_t *pme, std::string *error) +{ + std::list<std::string> errorReasons; + if (pme->nnodes != 1) + { + errorReasons.push_back("PME decomposition"); + } + if (pme->pme_order != 4) + { + errorReasons.push_back("interpolation orders other than 4"); + } + if (pme->bFEP) + { + errorReasons.push_back("free energy calculations (multiple grids)"); + } + if (pme->doLJ) + { + errorReasons.push_back("Lennard-Jones PME"); + } +#if GMX_DOUBLE + { + errorReasons.push_back("double precision"); + } +#endif +#if GMX_GPU != GMX_GPU_CUDA + { + errorReasons.push_back("non-CUDA build of GROMACS"); + } +#endif + + bool inputSupported = errorReasons.empty(); + if (!inputSupported && error) + { + std::string regressionTestMarker = "PME GPU does not support"; + // this prefix is tested for in the regression tests script gmxtest.pl + *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + "."; + } + return inputSupported; +} + +/*! \libinternal \brief + * Initializes the PME GPU data at the beginning of the run. + * + * \param[in,out] pme The PME structure. + * \param[in,out] gpuInfo The GPU information structure. + * \param[in] mdlog The logger. + * \param[in] cr The communication structure. + */ +static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, + const t_commrec *cr) +{ + std::string errorString; + bool canRunOnGpu = pme_gpu_check_restrictions(pme, &errorString); + if (!canRunOnGpu) + { + GMX_THROW(gmx::NotImplementedError(errorString)); + } + + pme->gpu = new pme_gpu_t(); + pme_gpu_t *pmeGPU = pme->gpu; + pmeGPU->common = std::shared_ptr<pme_shared_t>(new pme_shared_t()); + + /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */ + /* A convenience variable. */ + pmeGPU->settings.useDecomposition = (pme->nnodes == 1); + /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout.
*/ + pmeGPU->settings.performGPUGather = true; + + // GPU initialization + init_gpu(mdlog, cr->nodeid, gpuInfo); + pmeGPU->deviceInfo = gpuInfo; + + pme_gpu_init_internal(pmeGPU); + pme_gpu_init_sync_events(pmeGPU); + pme_gpu_alloc_energy_virial(pmeGPU); + + pme_gpu_copy_common_data_from(pme); + + GMX_ASSERT(pmeGPU->common->epsilon_r != 0.0f, "PME GPU: bad electrostatic coefficient"); + + auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU); + kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGPU->common->epsilon_r; +} + +void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr) +{ + if (!pme_gpu_active(pme)) + { + return; + } + + if (!pme->gpu) + { + /* First-time initialization */ + pme_gpu_init(pme, gpuInfo, mdlog, cr); + } + else + { + /* After this call nothing in the GPU code should refer to the gmx_pme_t *pme itself - until the next pme_gpu_reinit */ + pme_gpu_copy_common_data_from(pme); + } + /* GPU FFT will only get used for a single rank.*/ + pme->gpu->settings.performGPUFFT = (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme_gpu_uses_dd(pme->gpu); + pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU); + + pme_gpu_reinit_grids(pme->gpu); + pme_gpu_reinit_step(pme->gpu); +} + +void pme_gpu_destroy(pme_gpu_t *pmeGPU) +{ + /* Free lots of data */ + pme_gpu_free_energy_virial(pmeGPU); + pme_gpu_free_bspline_values(pmeGPU); + pme_gpu_free_forces(pmeGPU); + pme_gpu_free_coordinates(pmeGPU); + pme_gpu_free_coefficients(pmeGPU); + pme_gpu_free_spline_data(pmeGPU); + pme_gpu_free_grid_indices(pmeGPU); + pme_gpu_free_fract_shifts(pmeGPU); + pme_gpu_free_grids(pmeGPU); + + pme_gpu_destroy_3dfft(pmeGPU); + pme_gpu_destroy_sync_events(pmeGPU); + + /* Free the GPU-framework specific data last */ + pme_gpu_destroy_specific(pmeGPU); + + delete pmeGPU; +} + +void pme_gpu_reinit_atoms(pme_gpu_t *pmeGPU, const int nAtoms, const real *charges) +{ + auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU); + kernelParamsPtr->atoms.nAtoms = nAtoms; + const int alignment = pme_gpu_get_atom_data_alignment(pmeGPU); + pmeGPU->nAtomsPadded = ((nAtoms + alignment - 1) / alignment) * alignment; + const int nAtomsAlloc = c_usePadding ? pmeGPU->nAtomsPadded : nAtoms; + const bool haveToRealloc = (pmeGPU->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */ + pmeGPU->nAtomsAlloc = nAtomsAlloc; + +#if GMX_DOUBLE + GMX_RELEASE_ASSERT(false, "Only single precision supported"); + GMX_UNUSED_VALUE(charges); +#else + pme_gpu_realloc_and_copy_input_coefficients(pmeGPU, reinterpret_cast(charges)); + /* Could also be checked for haveToRealloc, but the copy always needs to be performed */ +#endif + + if (haveToRealloc) + { + pme_gpu_realloc_coordinates(pmeGPU); + pme_gpu_realloc_forces(pmeGPU); + pme_gpu_realloc_spline_data(pmeGPU); + pme_gpu_realloc_grid_indices(pmeGPU); + } +} diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h new file mode 100644 index 0000000000..03fe8de852 --- /dev/null +++ b/src/gromacs/ewald/pme-gpu-internal.h @@ -0,0 +1,551 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. 
+ * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * + * \brief This file contains internal function definitions for performing the PME calculations on GPU. + * These are not meant to be exposed outside of the PME GPU code. + * As of now, their bodies are still in the common pme-gpu.cpp files. + * + * \author Aleksei Iupinov + * \ingroup module_ewald + */ + +#ifndef GMX_EWALD_PME_GPU_INTERNAL_H +#define GMX_EWALD_PME_GPU_INTERNAL_H + +#include "gromacs/gpu_utils/gpu_macros.h" // for the CUDA_FUNC_ macros + +#include "pme-gpu-types.h" // for the inline functions accessing pme_gpu_t members + +struct gmx_hw_info_t; +struct gmx_gpu_opt_t; +struct gmx_pme_t; // only used in pme_gpu_reinit +struct t_commrec; +struct gmx_wallclock_gpu_pme_t; +struct pme_atomcomm_t; + +namespace gmx +{ +class MDLogger; +} + +/* Some general constants for PME GPU behaviour follow. */ + +/*! \brief \libinternal + * false: The atom data GPU buffers are sized precisely according to the number of atoms. + * (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp). + * The atom index checks in the spread/gather code potentially hinder the performance. + * true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms + * fitting in is divisible by PME_ATOM_DATA_ALIGNMENT. + * The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen. + * Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now). + * \todo Estimate performance differences + */ +const bool c_usePadding = true; + +/*! \brief \libinternal + * false: Atoms with zero charges are processed by PME. Could introduce some overhead. + * true: Atoms with zero charges are not processed by PME. Adds branching to the spread/gather. + * Could be good for performance in specific systems with lots of neutral atoms. + * \todo Estimate performance differences. + */ +const bool c_skipNeutralAtoms = false; + +/*! \brief \libinternal + * Number of PME solve output floating point numbers. + * 6 for symmetric virial matrix + 1 for reciprocal energy. 
+ */ +const int c_virialAndEnergyCount = 7; + +/* A block of CUDA-only functions that live in pme.cu */ + +/*! \libinternal \brief + * Returns the number of atoms per chunk in the atom charges/coordinates data layout. + * Depends on CUDA-specific block sizes, needed for the atom data padding. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns Number of atoms in a single GPU atom data chunk. + */ +CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1) + +/*! \libinternal \brief + * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns Number of atoms in a single GPU atom spline data chunk. + */ +CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_spline_data_alignment(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1) + +/*! \libinternal \brief + * Synchronizes the current step, waiting for the GPU kernels/transfers to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Allocates the fixed size energy and virial buffer both on GPU and CPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the energy and virial memory both on GPU and CPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Clears the energy and virial memory on GPU with 0. + * Should be called at the end of the energy/virial calculation step. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates and copies the pre-computed B-spline values to the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers). + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the GPU buffer for the PME forces. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the GPU buffer for the PME forces. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces). + * To be called e.g. after the bonded calculations. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] h_forces The input forces rvec buffer. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + const float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM + +/*! 
\libinternal \brief + * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[out] h_forces The output forces rvec buffer. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Waits for the PME GPU output forces copying to the CPU buffer to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed). + * + * \param[in] pmeGPU The PME GPU structure. + * + * Needs to be called on every DD step/in the beginning. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Copies the input coordinates from the CPU buffer onto the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] h_coordinates Input coordinates (XYZ rvec array). + * + * Needs to be called every MD step. The coordinates are then used in the spline calculation. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + const rvec *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the coordinates on the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer. + * Clears the padded part if needed. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] h_coefficients The input atom charges/coefficients. + * + * Does not need to be done every MD step, only whenever the local charges change. + * (So, in the beginning of the run, or on DD step). + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + const float *CUDA_FUNC_ARGUMENT(h_coefficients)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the charges/coefficients on the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the buffers on the GPU and the host for the atoms spline data. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the buffers on the GPU for the atoms spline data. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the buffers on the GPU and the host for the particle gridline indices. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the buffer on the GPU for the particle gridline indices. + * + * \param[in] pmeGPU The PME GPU structure. 
+ */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Clears the real space grid on the GPU. + * Should be called at the end of each MD step. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the pre-computed fractional coordinates' shifts on the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Waits for the output virial/energy copying to the intermediate CPU buffer to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Copies the input real-space grid from the host to the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] h_grid The host-side grid buffer. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Copies the output real-space grid from the GPU to the host. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[out] h_grid The host-side grid buffer. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Waits for the grid copying to the host-side buffer after spreading to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Waits for the atom data copying to the intermediate host-side buffer after spline computation to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_sync_spline_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Waits for the grid copying to the host-side buffer after solving to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_sync_solve_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Does the one-time GPU-framework specific PME initialization. + * For CUDA, the PME stream is created with the highest priority. + * + * \param[in] pmeGPU The PME GPU structure. 
+ */ +CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Destroys the PME GPU-framework specific data. + * Should be called last in the PME GPU destructor. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Initializes the PME GPU synchronization events. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Destroys the PME GPU synchronization events. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Initializes the CUDA FFT structures. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Destroys the CUDA FFT structures. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/* Several CUDA event-based timing functions that live in pme-timings.cu */ + +/*! \libinternal \brief + * Finalizes all the PME GPU stage timings for the current step. Should be called at the end of every step. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \brief + * Resets the PME GPU timings. To be called at the reset step. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] timings The gmx_wallclock_gpu_pme_t structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM + +/* The inlined convenience PME GPU status getters */ + +/*! \libinternal \brief + * Tells if PME runs on multiple GPUs with the decomposition. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns True if PME runs on multiple GPUs, false otherwise. + */ +gmx_inline bool pme_gpu_uses_dd(const pme_gpu_t *pmeGPU) +{ + return !pmeGPU->settings.useDecomposition; +} + +/*! \libinternal \brief + * Tells if PME performs the gathering stage on GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns True if the gathering is performed on GPU, false otherwise. + */ +gmx_inline bool pme_gpu_performs_gather(const pme_gpu_t *pmeGPU) +{ + return pmeGPU->settings.performGPUGather; +} + +/*! \libinternal \brief + * Tells if PME performs the FFT stages on GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns True if FFT is performed on GPU, false otherwise. + */ +gmx_inline bool pme_gpu_performs_FFT(const pme_gpu_t *pmeGPU) +{ + return pmeGPU->settings.performGPUFFT; +} + +/*! \libinternal \brief + * Tells if PME performs the grid (un-)wrapping on GPU. + * + * \param[in] pmeGPU The PME GPU structure. 
+ * \returns True if (un-)wrapping is performed on GPU, false otherwise. + */ +gmx_inline bool pme_gpu_performs_wrapping(const pme_gpu_t *pmeGPU) +{ + return pmeGPU->settings.useDecomposition; +} + +/*! \libinternal \brief + * Tells if PME performs the grid solving on GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns True if solving is performed on GPU, false otherwise. + */ +gmx_inline bool pme_gpu_performs_solve(const pme_gpu_t *pmeGPU) +{ + return pmeGPU->settings.performGPUSolve; +} + +/* A block of C++ functions that live in pme-gpu-internal.cpp */ + +/*! \libinternal \brief + * Returns the output virial and energy of the PME solving. + * Should be called after pme_gpu_finish_step. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[out] energy The output energy. + * \param[out] virial The output virial matrix. + */ +void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial); + +/*! \libinternal \brief + * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_start_step(). + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] box The unit cell box. + */ +void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box); + +/*! \libinternal \brief + * Starts the PME GPU step (copies coordinates onto GPU, possibly sets the unit cell parameters). + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] needToUpdateBox Tells if the stored unit cell parameters should be updated from \p box. + * \param[in] box The unit cell box which does not necessarily change every step (only with pressure coupling enabled). + * Would only be used if \p needToUpdateBox is true. + * \param[in] h_coordinates Input coordinates (XYZ rvec array). + */ +void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates); + +/*! \libinternal \brief + * Finishes the PME GPU step, waiting for the output forces and/or energy/virial to be copied to the host. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] bCalcForces The left-over flag from the CPU code which tells the function to copy the forces to the CPU side. Should be passed to the launch call instead. FIXME + * \param[in] bCalcEnerVir The left-over flag from the CPU code which tells the function to copy the energy/virial to the CPU side. Should be passed to the launch call instead. + */ +void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcForces, + const bool bCalcEnerVir); + +/*! \libinternal \brief + * (Re-)initializes the PME GPU data at the beginning of the run or on DLB. + * + * \param[in,out] pme The PME structure. + * \param[in,out] gpuInfo The GPU information structure. + * \param[in] mdlog The logger. + * \param[in] cr The communication structure. + * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs. + */ +void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr); + +/*! \libinternal \brief + * Destroys the PME GPU data at the end of the run. + * + * \param[in] pmeGPU The PME GPU structure. + */ +void pme_gpu_destroy(pme_gpu_t *pmeGPU); + +/*! \libinternal \brief + * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] nAtoms The number of particles. + * \param[in] charges The pointer to the host-side array of particle charges. 
+ * + * This is a function that should only be called in the beginning of the run and on domain decomposition. + * Should be called before the pme_gpu_set_io_ranges. + */ +void pme_gpu_reinit_atoms(pme_gpu_t *pmeGPU, + const int nAtoms, + const real *charges); + +#endif diff --git a/src/gromacs/ewald/pme-gpu-types.h b/src/gromacs/ewald/pme-gpu-types.h new file mode 100644 index 0000000000..1bf3a88c1a --- /dev/null +++ b/src/gromacs/ewald/pme-gpu-types.h @@ -0,0 +1,356 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \libinternal \file + * \brief Defines the GPU-agnostic PME GPU data structures + * (the host-side PME GPU data, and the GPU function parameters). + * \todo Due to Gerrit workflow and time constraints, some renaming/refactoring + * which does not impair the performance will be performed once + * most of the initial PME CUDA implementation is merged + * into the master branch (likely, after release 2017). + * This should include: + * -- bringing the structure/function names up to guidelines + * ---- pme_gpu_settings_t -> PmeGpuTasks + * -- refining GPU notation application (#2053) + * -- renaming coefficients to charges (?) + * + * \author Aleksei Iupinov + * \ingroup module_ewald + */ + +#ifndef GMX_EWALD_PME_GPU_TYPES_H +#define GMX_EWALD_PME_GPU_TYPES_H + +#include "config.h" + +#include +#include + +#include "gromacs/math/vectypes.h" +#include "gromacs/utility/basedefinitions.h" + +struct gmx_hw_info; +struct gmx_device_info_t; + +/*! \brief Possible PME codepaths + * \todo: make this enum class with gmx_pme_t C++ refactoring + */ +enum PmeRunMode +{ + CPU, //!< Whole PME step is done on CPU + GPU, //!< Whole PME step is done on GPU + Hybrid, //!< Mixed mode: only spread and gather run on GPU; FFT and solving are done on CPU. +}; + +#if GMX_GPU == GMX_GPU_CUDA + +struct pme_gpu_cuda_t; +/*! 
\brief A typedef for including the GPU host data by pointer */ +typedef pme_gpu_cuda_t pme_gpu_specific_t; + +struct pme_gpu_cuda_kernel_params_t; +/*! \brief A typedef for including the GPU kernel arguments data by pointer */ +typedef pme_gpu_cuda_kernel_params_t pme_gpu_kernel_params_t; + +#else + +/*! \brief A dummy typedef for the GPU host data placeholder on non-GPU builds */ +typedef int pme_gpu_specific_t; +/*! \brief A dummy typedef for the GPU kernel arguments data placeholder on non-GPU builds */ +typedef int pme_gpu_kernel_params_t; + +#endif + +/* What follows is all the PME GPU function arguments, + * sorted into several device-side structures depending on the update rate. + * This is GPU agnostic (float3 replaced by float[3], etc.). + * The GPU-framework specifics (e.g. cudaTextureObject_t handles) are described + * in the larger structure pme_gpu_cuda_kernel_params_t in the pme.cuh. + */ + +/*! \internal \brief + * A GPU data structure for storing the constant PME data. + * This only has to be initialized once. + */ +struct pme_gpu_const_params_t +{ + /*! \brief Electrostatics coefficient = ONE_4PI_EPS0 / pme->epsilon_r */ + float elFactor; + /*! \brief Virial and energy GPU array. Size is PME_GPU_ENERGY_AND_VIRIAL_COUNT (7) floats. + * The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */ + float *d_virialAndEnergy; +}; + +/*! \internal \brief + * A GPU data structure for storing the PME data related to the grid sizes and cut-off. + * This only has to be updated at every DD step. + */ +struct pme_gpu_grid_params_t +{ + /* Grid sizes */ + /*! \brief Real-space grid data dimensions. */ + int realGridSize[DIM]; + /*! \brief Real-space grid dimensions, only converted to floating point. */ + float realGridSizeFP[DIM]; + /*! \brief Real-space grid dimensions (padded). The padding as compared to realGridSize includes the (order - 1) overlap. */ + int realGridSizePadded[DIM]; /* Is major dimension of this ever used in kernels? */ + /*! \brief Fourier grid dimensions. This counts the complex numbers! */ + int complexGridSize[DIM]; + /*! \brief Fourier grid dimensions (padded). This counts the complex numbers! */ + int complexGridSizePadded[DIM]; + + /* Grid pointers */ + /*! \brief Real space grid. */ + float *d_realGrid; + /*! \brief Complex grid - used in FFT/solve. If inplace cuFFT is used, then it is the same pointer as realGrid. */ + float *d_fourierGrid; + + /*! \brief Ewald solving factor = (M_PI / pme->ewaldcoeff_q)^2 */ + float ewaldFactor; + + /*! \brief Grid spline values as in pme->bsp_mod + * (laid out sequentially (XXX....XYYY......YZZZ.....Z)) + */ + float *d_splineModuli; + /*! \brief Offsets for X/Y/Z components of d_splineModuli */ + int splineValuesOffset[DIM]; + + /*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */ + float *d_fractShiftsTable; + /*! \brief Gridline indices lookup table + * (modulo lookup table as in pme->nnx/nny/nnz, laid out sequentially (XXX....XYYY......YZZZ.....Z)) */ + int *d_gridlineIndicesTable; + /*! \brief Offsets for X/Y/Z components of d_fractShiftsTable and d_gridlineIndicesTable */ + int tablesOffsets[DIM]; +}; + +/*! \internal \brief + * A GPU data structure for storing the PME data of the atoms, local to this process' domain partition. + * This only has to be updated every DD step. + */ +struct pme_gpu_atom_params_t +{ + /*! \brief Number of local atoms */ + int nAtoms; + /*! 
\brief Pointer to the global GPU memory with input rvec atom coordinates. + * The coordinates themselves change and need to be copied to the GPU every MD step, + * but reallocation happens only at DD. + */ + float *d_coordinates; + /*! \brief Pointer to the global GPU memory with input atom charges. + * The charges only need to be reallocated and copied to the GPU at DD step. + */ + float *d_coefficients; + /*! \brief Pointer to the global GPU memory with input/output rvec atom forces. + * The forces change and need to be copied from (and possibly to) the GPU every MD step, + * but reallocation happens only at DD. + */ + float *d_forces; + /*! \brief Pointer to the global GPU memory with ivec atom gridline indices. + * Computed on GPU in the spline calculation part. + */ + int *d_gridlineIndices; + + /* B-spline parameters are computed entirely on GPU every MD step, not copied. + * Unless we want to try something like GPU spread + CPU gather? + */ + /*! \brief Pointer to the global GPU memory with B-spline values */ + float *d_theta; + /*! \brief Pointer to the global GPU memory with B-spline derivative values */ + float *d_dtheta; +}; + +/*! \internal \brief + * A GPU data structure for storing the PME data which might change every MD step. + */ +struct pme_gpu_step_params_t +{ + /* The box parameters. The box only changes size each step with pressure coupling enabled. */ + /*! \brief + * Reciprocal (inverted unit cell) box. + * + * The box is transposed as compared to the CPU pme->recipbox. + * Basically, spread uses matrix columns (while solve and gather use rows). + * This storage format might be not the most optimal since the box is always triangular so there are zeroes. + */ + float recipBox[DIM][DIM]; + /*! \brief The unit cell volume for solving. */ + float boxVolume; +}; + +/*! \internal \brief + * A single structure encompassing almost all the PME data used in GPU kernels on device. + * This is inherited by the GPU framework-specific structure + * (pme_gpu_cuda_kernel_params_t in pme.cuh). + * This way, most code preparing the kernel parameters can be GPU-agnostic by casting + * the kernel parameter data pointer to pme_gpu_kernel_params_base_t. + */ +struct pme_gpu_kernel_params_base_t +{ + /*! \brief Constant data that is set once. */ + pme_gpu_const_params_t constants; + /*! \brief Data dependent on the grid size/cutoff. */ + pme_gpu_grid_params_t grid; + /*! \brief Data dependent on the DD and local atoms. */ + pme_gpu_atom_params_t atoms; + /*! \brief Data that possibly changes on every MD step. */ + pme_gpu_step_params_t step; +}; + +/* Here are the host-side structures */ + +/*! \internal \brief + * The PME GPU settings structure, included in the main PME GPU structure by value. + */ +struct pme_gpu_settings_t +{ + /* Permanent settings set on initialization */ + /*! \brief A boolean which tells if the solving is performed on GPU. Currently always true */ + bool performGPUSolve; + /*! \brief A boolean which tells if the gathering is performed on GPU. Currently always true */ + bool performGPUGather; + /*! \brief A boolean which tells if the FFT is performed on GPU. Currently true for a single MPI rank. */ + bool performGPUFFT; + /*! \brief A convenience boolean which tells if PME decomposition is used. */ + bool useDecomposition; + /*! \brief A boolean which tells if any PME GPU stage should copy all of its outputs to the host. + * Only intended to be used by the test framework. + */ + bool copyAllOutputs; + /*! 
\brief Various computation flags for the current step, corresponding to the GMX_PME_ flags in pme.h. */
+ int stepFlags;
+};
+
+/*! \internal \brief
+ * The PME GPU intermediate buffers structure, included in the main PME GPU structure by value.
+ * Buffers are managed by the PME GPU module.
+ */
+struct pme_gpu_staging_t
+{
+ /*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT. */
+ float *h_virialAndEnergy;
+ /*! \brief B-spline values intermediate host-side buffer. */
+ float *h_splineModuli;
+
+ /*! \brief Pointer to the host memory with B-spline values. Only used for host-side gather, or unit tests */
+ float *h_theta;
+ /*! \brief Pointer to the host memory with B-spline derivative values. Only used for host-side gather, or unit tests */
+ float *h_dtheta;
+ /*! \brief Pointer to the host memory with ivec atom gridline indices. Only used for host-side gather, or unit tests */
+ int *h_gridlineIndices;
+};
+
+/*! \internal \brief
+ * The PME GPU structure for all the data copied directly from the CPU PME structure.
+ * The copying is done when the CPU PME structure is already (re-)initialized
+ * (pme_gpu_reinit is called at the end of gmx_pme_init).
+ * All the variables here are named almost the same way as in gmx_pme_t.
+ * The types are different: pointers are replaced by vectors.
+ * TODO: use the shared data with the PME CPU.
+ * Included in the main PME GPU structure by value.
+ */
+struct pme_shared_t
+{
+ /*! \brief Grid count - currently always 1 on GPU */
+ int ngrids;
+ /*! \brief Grid dimensions - nkx, nky, nkz */
+ int nk[DIM];
+ /*! \brief Padded grid dimensions - pmegrid_nx, pmegrid_ny, pmegrid_nz
+ * TODO: find out if these are really needed for the CPU FFT compatibility.
+ */
+ int pmegrid_n[DIM];
+ /*! \brief PME interpolation order */
+ int pme_order;
+ /*! \brief Ewald splitting coefficient for Coulomb */
+ real ewaldcoeff_q;
+ /*! \brief Electrostatics parameter */
+ real epsilon_r;
+ /*! \brief Gridline indices - nnx, nny, nnz */
+ std::vector<int> nn[DIM];
+ /*! \brief Fractional shifts - fshx, fshy, fshz */
+ std::vector<real> fsh[DIM];
+ /*! \brief Precomputed B-spline values */
+ std::vector<real> bsp_mod[DIM];
+ /*! \brief The PME codepath being taken */
+ PmeRunMode runMode;
+};
+
+/*! \internal \brief
+ * The main PME GPU host structure, included in the PME CPU structure by pointer.
+ */
+struct pme_gpu_t
+{
+ /*! \brief The information copied once per reinit from the CPU structure. */
+ std::shared_ptr<pme_shared_t> common; // TODO: make the CPU structure use the same type
+
+ /*! \brief The settings. */
+ pme_gpu_settings_t settings;
+
+ /*! \brief The host-side buffers.
+ * The device-side buffers are buried in kernelParams, but that will have to change.
+ */
+ pme_gpu_staging_t staging;
+
+ /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT.
+ * Used for kernel scheduling.
+ * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying.
+ * TODO: this and the next member represent memory allocation/padding properties -
+ * what a container type should do ideally.
+ */
+ int nAtomsPadded;
+ /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT
+ * if c_usePadding is true.
+ * Used only as a basic size for almost all the atom data allocations
+ * (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP).
+ * This should be the same as (c_usePadding ? nAtomsPadded : kernelParams.atoms.nAtoms).
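+ * For example, with the order-4, 32-wide-warp defaults PME_ATOM_DATA_ALIGNMENT is 16 * 2 = 32,
+ * so 1000 local atoms (an illustrative count, not taken from this patch) would give nAtomsAlloc = 1024.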
+ * kernelParams.atoms.nAtoms is the actual atom count to be used for most data copying. + */ + int nAtomsAlloc; + + /*! \brief A pointer to the device used during the execution. */ + gmx_device_info_t *deviceInfo; + + /*! \brief A single structure encompassing all the PME data used on GPU. + * Its value is the only argument to all the PME GPU kernels. + * \todo Test whether this should be copied to the constant GPU memory once per MD step + * (or even less often with no box updates) instead of being an argument. + */ + std::shared_ptr kernelParams; + + /*! \brief The pointer to GPU-framework specific host-side data, such as CUDA streams and events. */ + std::shared_ptr archSpecific; /* FIXME: make it an unique_ptr */ +}; + +#endif diff --git a/src/gromacs/ewald/pme-gpu.cpp b/src/gromacs/ewald/pme-gpu.cpp new file mode 100644 index 0000000000..e830c53d8d --- /dev/null +++ b/src/gromacs/ewald/pme-gpu.cpp @@ -0,0 +1,114 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Implements high-level PME GPU functions which do not require GPU framework-specific code. 
+ * + * \author Aleksei Iupinov + * \ingroup module_ewald + */ + +#include "gmxpre.h" + +#include "gromacs/ewald/pme.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/gmxassert.h" + +#include "pme-gpu-internal.h" +#include "pme-grid.h" +#include "pme-internal.h" +#include "pme-solve.h" + +bool pme_gpu_task_enabled(const gmx_pme_t *pme) +{ + return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU); +} + +void pme_gpu_reset_timings(const gmx_pme_t *pme) +{ + if (pme_gpu_active(pme)) + { + pme_gpu_reset_timings(pme->gpu); + } +} + +void pme_gpu_get_timings(const gmx_pme_t *pme, gmx_wallclock_gpu_pme_t *timings) +{ + if (pme_gpu_active(pme)) + { + pme_gpu_get_timings(pme->gpu, timings); + } +} + +void pme_gpu_get_results(const gmx_pme_t *pme, + gmx_wallcycle_t wcycle, + matrix vir_q, + real *energy_q) +{ + GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); + + const bool haveComputedEnergyAndVirial = pme->gpu->settings.stepFlags & GMX_PME_CALC_ENER_VIR; + const bool haveComputedForces = pme->gpu->settings.stepFlags & GMX_PME_CALC_F; + + // The wallcycle hierarchy is messy - we pause ewcFORCE just to exclude the PME GPU sync time + wallcycle_stop(wcycle, ewcFORCE); + + wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER); + pme_gpu_finish_step(pme->gpu, haveComputedForces, haveComputedEnergyAndVirial); + wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); + + wallcycle_start_nocount(wcycle, ewcFORCE); + + if (haveComputedEnergyAndVirial) + { + if (pme->doCoulomb) + { + if (pme_gpu_performs_solve(pme->gpu)) + { + pme_gpu_get_energy_virial(pme->gpu, energy_q, vir_q); + } + else + { + get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy_q, vir_q); + } + } + else + { + *energy_q = 0; + } + } + /* No additional haveComputedForces code since forces are copied to the output host buffer with no transformation. */ +} diff --git a/src/gromacs/ewald/pme-grid.cpp b/src/gromacs/ewald/pme-grid.cpp index 5c01efa499..82479cbf56 100644 --- a/src/gromacs/ewald/pme-grid.cpp +++ b/src/gromacs/ewald/pme-grid.cpp @@ -50,6 +50,8 @@ #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/smalloc.h" +#include "pme-internal.h" + #ifdef DEBUG_PME #include "gromacs/fileio/pdbio.h" #include "gromacs/utility/cstringutil.h" diff --git a/src/gromacs/ewald/pme-grid.h b/src/gromacs/ewald/pme-grid.h index 7f398a24f9..e5e4f62368 100644 --- a/src/gromacs/ewald/pme-grid.h +++ b/src/gromacs/ewald/pme-grid.h @@ -41,8 +41,7 @@ #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/real.h" -#include "pme-internal.h" - +struct gmx_pme_t; /*! 
\brief * We allow coordinates to be out the unit-cell by up to 2 box lengths, @@ -56,24 +55,26 @@ constexpr int c_pmeMaxUnitcellShift = 2; */ constexpr int c_pmeNeighborUnitcellCount = 2*c_pmeMaxUnitcellShift + 1; +struct pmegrid_t; +struct pmegrids_t; #if GMX_MPI void -gmx_sum_qgrid_dd(struct gmx_pme_t *pme, real *grid, int direction); +gmx_sum_qgrid_dd(gmx_pme_t *pme, real *grid, int direction); #endif int copy_pmegrid_to_fftgrid(const gmx_pme_t *pme, real *pmegrid, real *fftgrid, int grid_index); int -copy_fftgrid_to_pmegrid(struct gmx_pme_t *pme, const real *fftgrid, real *pmegrid, int grid_index, +copy_fftgrid_to_pmegrid(gmx_pme_t *pme, const real *fftgrid, real *pmegrid, int grid_index, int nthread, int thread); void wrap_periodic_pmegrid(const gmx_pme_t *pme, real *pmegrid); void -unwrap_periodic_pmegrid(struct gmx_pme_t *pme, real *pmegrid); +unwrap_periodic_pmegrid(gmx_pme_t *pme, real *pmegrid); void pmegrid_init(pmegrid_t *grid, @@ -110,6 +111,6 @@ reuse_pmegrids(const pmegrids_t *oldgrid, pmegrids_t *newgrid); /* This function is called from gmx_pme_do() only from debugging code that is commented out. */ void -dump_local_fftgrid(struct gmx_pme_t *pme, const real *fftgrid); +dump_local_fftgrid(gmx_pme_t *pme, const real *fftgrid); #endif diff --git a/src/gromacs/ewald/pme-internal.h b/src/gromacs/ewald/pme-internal.h index fbb0dfe37a..0feb977419 100644 --- a/src/gromacs/ewald/pme-internal.h +++ b/src/gromacs/ewald/pme-internal.h @@ -64,11 +64,14 @@ #include "gromacs/timing/walltime_accounting.h" #include "gromacs/utility/gmxmpi.h" +#include "pme-gpu-types.h" + //! A repeat of typedef from parallel_3dfft.h typedef struct gmx_parallel_3dfft *gmx_parallel_3dfft_t; struct t_commrec; struct t_inputrec; +struct pme_gpu_t; //@{ //! Grid indices for A state for charge and Lennard-Jones C6 @@ -167,7 +170,7 @@ typedef struct { } splinedata_t; /*! \brief Data structure for coordinating transfer between PP and PME ranks*/ -typedef struct { +struct pme_atomcomm_t{ int dimind; /* The index of the dimension, 0=x, 1=y */ int nslab; int nodeid; @@ -203,20 +206,20 @@ typedef struct { int *thread_idx; /* Which thread should spread which coefficient */ thread_plist_t *thread_plist; splinedata_t *spline; -} pme_atomcomm_t; +}; /*! \brief Data structure for a single PME grid */ -typedef struct { +struct pmegrid_t{ ivec ci; /* The spatial location of this grid */ ivec n; /* The used size of *grid, including order-1 */ ivec offset; /* The grid offset from the full node grid */ int order; /* PME spreading order */ ivec s; /* The allocated size of *grid, s >= n */ real *grid; /* The grid local thread, size n */ -} pmegrid_t; +}; /*! \brief Data structures for PME grids */ -typedef struct { +struct pmegrids_t{ pmegrid_t grid; /* The full node grid (non thread-local) */ int nthread; /* The number of threads operating on this grid */ ivec nc; /* The local spatial decomposition over the threads */ @@ -224,7 +227,7 @@ typedef struct { real *grid_all; /* Allocated array for the grids in *grid_th */ int *g2t[DIM]; /* The grid to thread index */ ivec nthread_comm; /* The number of threads to communicate with */ -} pmegrids_t; +}; /*! \brief Data structure for spline-interpolation working buffers */ struct pme_spline_work; @@ -233,7 +236,7 @@ struct pme_spline_work; struct pme_solve_work_t; /*! 
\brief Master PME data structure */ -typedef struct gmx_pme_t { +struct gmx_pme_t { int ndecompdim; /* The number of decomposition dimensions */ int nodeid; /* Our nodeid in mpi->mpi_comm */ int nodeid_major; @@ -264,7 +267,26 @@ typedef struct gmx_pme_t { real ewaldcoeff_lj; /* Ewald splitting coefficient for r^-6 */ real epsilon_r; - class EwaldBoxZScaler *boxScaler; /*! The scaling data Ewald uses with walls (set at pme_init constant for the entire run) */ + + enum PmeRunMode runMode; /* Which codepath is the PME runner taking - CPU, GPU, mixed; + * TODO: this is the information that should be owned by the task scheduler, + * and ideally not be duplicated here. + */ + + pme_gpu_t *gpu; /* A pointer to the GPU data. + * TODO: this should be unique or a shared pointer. + * Currently in practice there is a single gmx_pme_t instance while a code + * is partially set up for many of them. The PME tuning calls gmx_pme_reinit() + * which fully reinitializes the one and only PME structure anew while maybe + * keeping the old grid buffers if they were already large enough. + * This small choice should be made clear in the later refactoring - + * do we store many PME objects for different grid sizes, + * or a single PME object that handles different grid sizes gracefully. + */ + + + class EwaldBoxZScaler *boxScaler; /**< The scaling data Ewald uses with walls (set at pme_init constant for the entire run) */ + int ljpme_combination_rule; /* Type of combination rule in LJ-PME */ @@ -279,6 +301,7 @@ typedef struct gmx_pme_t { * This can probably be done in a better way * but this simple hack works for now */ + /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */ int pmegrid_nx, pmegrid_ny, pmegrid_nz; /* pmegrid_nz might be larger than strictly necessary to ensure @@ -329,10 +352,40 @@ typedef struct gmx_pme_t { /* Work data for sum_qgrid */ real * sum_qgrid_tmp; real * sum_qgrid_dd_tmp; -} t_gmx_pme_t; +}; //! @endcond +/*! \brief + * Finds out if PME is currently running on GPU. + * TODO: should this be removed eventually? + * + * \param[in] pme The PME structure. + * \returns True if PME runs on GPU currently, false otherwise. + */ +inline bool pme_gpu_active(const gmx_pme_t *pme) +{ + return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU); +} + +/*! \brief Check restrictions on pme_order and the PME grid nkx,nky,nkz. + * + * With bFatal=TRUE, a fatal error is generated on violation, + * bValidSettings=NULL can be passed. + * With bFatal=FALSE, *bValidSettings reports the validity of the settings. + * bUseThreads tells if any MPI rank doing PME uses more than 1 threads. + * If at calling you bUseThreads is unknown, pass TRUE for conservative + * checking. + * + * TODO: the GPU restrictions are checked separately during pme_gpu_init(). + */ +void gmx_pme_check_restrictions(int pme_order, + int nkx, int nky, int nkz, + int nnodes_major, + gmx_bool bUseThreads, + gmx_bool bFatal, + gmx_bool *bValidSettings); + /*! \brief Initialize the PME-only side of the PME <-> PP communication */ gmx_pme_pp_t gmx_pme_pp_init(t_commrec *cr); @@ -349,11 +402,33 @@ enum { /*! \brief Called by PME-only ranks to receive coefficients and coordinates * - * The return value is used to control further processing, with meanings: - * pmerecvqxX: all parameters set, chargeA and chargeB can be NULL - * pmerecvqxFINISH: no parameters set - * pmerecvqxSWITCHGRID: only grid_size and *ewaldcoeff are set - * pmerecvqxRESETCOUNTERS: *step is set + * \param[in,out] pme_pp PME-PP communication structure. 
+ * \param[out] natoms Number of received atoms. + * \param[out] chargeA State A charges, if received. + * \param[out] chargeB State B charges, if received. + * \param[out] sqrt_c6A State A coefficients, if received. + * \param[out] sqrt_c6B State B coefficients, if received. + * \param[out] sigmaA State A coefficients, if received. + * \param[out] sigmaB State B coefficients, if received. + * \param[out] box System box, if received. + * \param[out] x Atoms' coordinates, if received. + * \param[out] f Atoms' PME forces, if received. + * \param[out] maxshift_x Maximum shift in X direction, if received. + * \param[out] maxshift_y Maximum shift in Y direction, if received. + * \param[out] lambda_q Free-energy lambda for electrostatics, if received. + * \param[out] lambda_lj Free-energy lambda for Lennard-Jones, if received. + * \param[out] bEnerVir Set to true if this is an energy/virial calculation step, otherwise set to false. + * \param[out] step MD integration step number. + * \param[out] grid_size PME grid size, if received. + * \param[out] ewaldcoeff_q Ewald cut-off parameter for electrostatics, if received. + * \param[out] ewaldcoeff_lj Ewald cut-off parameter for Lennard-Jones, if received. + * \param[out] atomSetChanged Set to true only if the local domain atom data (charges/coefficients) + * has been received (after DD) and should be reinitialized. Otherwise not changed. + * + * \retval pmerecvqxX All parameters were set, chargeA and chargeB can be NULL. + * \retval pmerecvqxFINISH No parameters were set. + * \retval pmerecvqxSWITCHGRID Only grid_size and *ewaldcoeff were set. + * \retval pmerecvqxRESETCOUNTERS *step was set. */ int gmx_pme_recv_coeffs_coords(struct gmx_pme_pp *pme_pp, int *natoms, @@ -365,7 +440,10 @@ int gmx_pme_recv_coeffs_coords(struct gmx_pme_pp *pme_pp, real *lambda_q, real *lambda_lj, gmx_bool *bEnerVir, gmx_int64_t *step, - ivec grid_size, real *ewaldcoeff_q, real *ewaldcoeff_lj); + ivec grid_size, + real *ewaldcoeff_q, + real *ewaldcoeff_lj, + bool *atomSetChanged); /*! \brief Send the PME mesh force, virial and energy to the PP-only nodes */ void gmx_pme_send_force_vir_ener(struct gmx_pme_pp *pme_pp, diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp index d42337c153..dc85a081cf 100644 --- a/src/gromacs/ewald/pme-load-balancing.cpp +++ b/src/gromacs/ewald/pme-load-balancing.cpp @@ -845,7 +845,14 @@ pme_load_balance(pme_load_balancing_t *pme_lb, if (!pme_lb->bSepPMERanks) { - if (pme_lb->setup[pme_lb->cur].pmedata == nullptr) + /* FIXME: + * CPU PME keeps a list of allocated pmedata's, that's why pme_lb->setup[pme_lb->cur].pmedata is not always nullptr. + * GPU PME, however, currently needs the gmx_pme_reinit always called on load balancing + * (pme_gpu_reinit might be not sufficiently decoupled from gmx_pme_init). + * This can lead to a lot of reallocations for PME GPU. + * Would be nicer if the allocated grid list was hidden within a single pmedata structure. + */ + if ((pme_lb->setup[pme_lb->cur].pmedata == nullptr) || pme_gpu_task_enabled(pme_lb->setup[pme_lb->cur].pmedata)) { /* Generate a new PME data structure, * copying part of the old pointers. 
@@ -914,7 +921,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb, int n_prev; double cycles_prev; - assert(pme_lb != NULL); + assert(pme_lb != nullptr); if (!pme_lb->bActive) { diff --git a/src/gromacs/ewald/pme-only.cpp b/src/gromacs/ewald/pme-only.cpp index 66a7728b24..75eb9fe2a0 100644 --- a/src/gromacs/ewald/pme-only.cpp +++ b/src/gromacs/ewald/pme-only.cpp @@ -122,8 +122,13 @@ static void gmx_pmeonly_switch(int *npmedata, struct gmx_pme_t ***pmedata, pme->nky == grid_size[YY] && pme->nkz == grid_size[ZZ]) { + /* Here we have found an existing PME data structure that suits us. + * However, in the GPU case, we have to reinitialize it - there's only one GPU structure. + * This should not cause actual GPU reallocations, at least (the allocated buffers are never shrunk). + * So, just some grid size updates in the GPU kernel parameters. + */ + gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size, ewaldcoeff_q, ewaldcoeff_lj); *pme_ret = pme; - return; } @@ -183,6 +188,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme, do { /* Domain decomposition */ + bool atomSetChanged = false; ret = gmx_pme_recv_coeffs_coords(pme_pp, &natoms, &chargeA, &chargeB, @@ -193,7 +199,10 @@ int gmx_pmeonly(struct gmx_pme_t *pme, &lambda_q, &lambda_lj, &bEnerVir, &step, - grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj); + grid_switch, + &ewaldcoeff_q, + &ewaldcoeff_lj, + &atomSetChanged); if (ret == pmerecvqxSWITCHGRID) { @@ -201,6 +210,11 @@ int gmx_pmeonly(struct gmx_pme_t *pme, gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, ewaldcoeff_q, ewaldcoeff_lj, cr, ir, &pme); } + if (atomSetChanged) + { + gmx_pme_reinit_atoms(pme, natoms, chargeA); + } + if (ret == pmerecvqxRESETCOUNTERS) { /* Reset the cycle and flop counters */ diff --git a/src/gromacs/ewald/pme-pp.cpp b/src/gromacs/ewald/pme-pp.cpp index 24819755e2..6a1de5199c 100644 --- a/src/gromacs/ewald/pme-pp.cpp +++ b/src/gromacs/ewald/pme-pp.cpp @@ -436,7 +436,8 @@ int gmx_pme_recv_coeffs_coords(struct gmx_pme_pp *pme_pp, gmx_int64_t *step, ivec grid_size, real *ewaldcoeff_q, - real *ewaldcoeff_lj) + real *ewaldcoeff_lj, + bool *atomSetChanged) { int status = -1; int nat = 0; @@ -493,6 +494,8 @@ int gmx_pme_recv_coeffs_coords(struct gmx_pme_pp *pme_pp, if (cnb.flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA)) { + *atomSetChanged = true; + /* Receive the send counts from the other PP nodes */ for (int sender = 0; sender < pme_pp->nnode; sender++) { @@ -644,6 +647,7 @@ int gmx_pme_recv_coeffs_coords(struct gmx_pme_pp *pme_pp, GMX_UNUSED_VALUE(grid_size); GMX_UNUSED_VALUE(ewaldcoeff_q); GMX_UNUSED_VALUE(ewaldcoeff_lj); + GMX_UNUSED_VALUE(atomSetChanged); status = pmerecvqxX; #endif diff --git a/src/gromacs/ewald/pme-timings.cu b/src/gromacs/ewald/pme-timings.cu new file mode 100644 index 0000000000..9fbe39dde9 --- /dev/null +++ b/src/gromacs/ewald/pme-timings.cu @@ -0,0 +1,112 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. 
+ * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Implements PME GPU timing events in CUDA. + * + * \author Aleksei Iupinov + */ + +#include "gmxpre.h" + +#include "pme-timings.cuh" + +#include "gromacs/utility/gmxassert.h" + +#include "pme.cuh" + +/*! \brief \internal + * Tells if CUDA-based performance tracking is enabled for PME. + * + * \param[in] pme The PME data structure. + * \returns True if timings are enabled, false otherwise. + */ +gmx_inline bool pme_gpu_timings_enabled(const pme_gpu_t *pmeGPU) +{ + return pmeGPU->archSpecific->useTiming; +} + +void pme_gpu_start_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId) +{ + if (pme_gpu_timings_enabled(pmeGPU)) + { + GMX_ASSERT(PMEStageId < pmeGPU->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index"); + pmeGPU->archSpecific->timingEvents[PMEStageId].openTimingRegion(pmeGPU->archSpecific->pmeStream); + } +} + +void pme_gpu_stop_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId) +{ + if (pme_gpu_timings_enabled(pmeGPU)) + { + GMX_ASSERT(PMEStageId < pmeGPU->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index"); + pmeGPU->archSpecific->timingEvents[PMEStageId].closeTimingRegion(pmeGPU->archSpecific->pmeStream); + } +} + +void pme_gpu_get_timings(const pme_gpu_t *pmeGPU, gmx_wallclock_gpu_pme_t *timings) +{ + if (pme_gpu_timings_enabled(pmeGPU)) + { + GMX_RELEASE_ASSERT(timings, "Null GPU timing pointer"); + for (size_t i = 0; i < pmeGPU->archSpecific->timingEvents.size(); i++) + { + timings->timing[i].t = pmeGPU->archSpecific->timingEvents[i].getTotalTime(); + timings->timing[i].c = pmeGPU->archSpecific->timingEvents[i].getCallCount(); + } + } +} + +void pme_gpu_update_timings(const pme_gpu_t *pmeGPU) +{ + if (pme_gpu_timings_enabled(pmeGPU)) + { + for (size_t i = 0; i < pmeGPU->archSpecific->timingEvents.size(); i++) + { + pmeGPU->archSpecific->timingEvents[i].getLastRangeTime(); + } + } +} + +void pme_gpu_reset_timings(const pme_gpu_t *pmeGPU) +{ + if (pme_gpu_timings_enabled(pmeGPU)) + { + for (size_t i = 0; i < pmeGPU->archSpecific->timingEvents.size(); i++) + { + pmeGPU->archSpecific->timingEvents[i].reset(); + } + } +} diff --git a/src/gromacs/ewald/pme-timings.cuh b/src/gromacs/ewald/pme-timings.cuh new file mode 100644 index 0000000000..8c94e7f817 --- /dev/null +++ b/src/gromacs/ewald/pme-timings.cuh @@ -0,0 +1,66 @@ +/* + * This file is part of the GROMACS molecular simulation package. 
+ * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \libinternal \file + * \brief Defines PME GPU timing functions. + * + * \author Aleksei Iupinov + */ + +#ifndef GMX_EWALD_PME_TIMINGS_CUH +#define GMX_EWALD_PME_TIMINGS_CUH + +#include "gromacs/gpu_utils/gpuregiontimer.cuh" +#include "gromacs/timing/gpu_timing.h" // TODO: move include to the source files + +struct pme_gpu_t; + +/*! \libinternal \brief + * Starts timing the certain PME GPU stage during a single step (if timings are enabled). + * + * \param[in] pmeGPU The PME GPU data structure. + * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h + */ +void pme_gpu_start_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId); + +/*! \libinternal \brief + * Stops timing the certain PME GPU stage during a single step (if timings are enabled). + * + * \param[in] pmeGPU The PME GPU data structure. 
+ * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h + */ +void pme_gpu_stop_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId); + +#endif diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp index fdd9b4a69b..cdd0a41f3d 100644 --- a/src/gromacs/ewald/pme.cpp +++ b/src/gromacs/ewald/pme.cpp @@ -104,6 +104,7 @@ #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/gmxmpi.h" #include "gromacs/utility/gmxomp.h" +#include "gromacs/utility/logger.h" #include "gromacs/utility/real.h" #include "gromacs/utility/smalloc.h" #include "gromacs/utility/stringutil.h" @@ -111,6 +112,7 @@ #include "calculate-spline-moduli.h" #include "pme-gather.h" +#include "pme-gpu-internal.h" #include "pme-grid.h" #include "pme-internal.h" #include "pme-redistribute.h" @@ -503,18 +505,22 @@ static int div_round_up(int enumerator, int denominator) return (enumerator + denominator - 1)/denominator; } -int gmx_pme_init(struct gmx_pme_t **pmedata, - t_commrec * cr, - int nnodes_major, - int nnodes_minor, - const t_inputrec * ir, - int homenr, - gmx_bool bFreeEnergy_q, - gmx_bool bFreeEnergy_lj, - gmx_bool bReproducible, - real ewaldcoeff_q, - real ewaldcoeff_lj, - int nthread) +int gmx_pme_init(struct gmx_pme_t **pmedata, + t_commrec *cr, + int nnodes_major, + int nnodes_minor, + const t_inputrec *ir, + int homenr, + gmx_bool bFreeEnergy_q, + gmx_bool bFreeEnergy_lj, + gmx_bool bReproducible, + real ewaldcoeff_q, + real ewaldcoeff_lj, + int nthread, + PmeRunMode runMode, + pme_gpu_t *pmeGPU, + gmx_device_info_t *gpuInfo, + const gmx::MDLogger &mdlog) { int use_threads, sum_use_threads, i; ivec ndata; @@ -745,6 +751,9 @@ int gmx_pme_init(struct gmx_pme_t **pmedata, snew(pme->bsp_mod[YY], pme->nky); snew(pme->bsp_mod[ZZ], pme->nkz); + pme->gpu = pmeGPU; /* Carrying over the single GPU structure */ + pme->runMode = runMode; + /* The required size of the interpolation grid, including overlap. * The allocated size (pmegrid_n?) might be slightly larger. */ @@ -755,7 +764,6 @@ int gmx_pme_init(struct gmx_pme_t **pmedata, pme->pmegrid_nz_base = pme->nkz; pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1; set_grid_alignment(&pme->pmegrid_nz, pme->pme_order); - pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major]; pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor]; pme->pmegrid_start_iz = 0; @@ -846,6 +854,8 @@ int gmx_pme_init(struct gmx_pme_t **pmedata, pme->lb_buf2 = nullptr; pme->lb_buf_nalloc = 0; + pme_gpu_reinit(pme.get(), gpuInfo, mdlog, cr); + pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx); // no exception was thrown during the init, so we hand over the PME structure handle @@ -891,8 +901,14 @@ int gmx_pme_reinit(struct gmx_pme_t **pmedata, try { + const gmx::MDLogger dummyLogger; + // This is reinit which is currently only changing grid size/coefficients, + // so we don't expect the actual logging. + // TODO: when PME is an object, it should take reference to mdlog on construction and save it. 
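+        // The GPU device info argument is passed as nullptr below: pme_src->gpu is carried over,
+        // and is assumed to keep referring to the device it was originally initialized with.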
ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor, - &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread); + &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj, + pme_src->nthread, pme_src->runMode, pme_src->gpu, nullptr, dummyLogger); + //TODO this is mostly passing around current values } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; @@ -985,6 +1001,8 @@ int gmx_pme_do(struct gmx_pme_t *pme, real *dvdlambda_q, real *dvdlambda_lj, int flags) { + GMX_ASSERT(pme->runMode == PmeRunMode::CPU, "gmx_pme_do should not be called on the GPU PME run."); + int d, i, j, npme, grid_index, max_grid_index; int n_d; pme_atomcomm_t *atc = nullptr; @@ -1738,5 +1756,19 @@ void gmx_pme_destroy(gmx_pme_t *pme) sfree(pme->sum_qgrid_tmp); sfree(pme->sum_qgrid_dd_tmp); + if (pme_gpu_active(pme) && pme->gpu) + { + pme_gpu_destroy(pme->gpu); + } + sfree(pme); } + +void gmx_pme_reinit_atoms(const gmx_pme_t *pme, const int nAtoms, const real *charges) +{ + if (pme_gpu_active(pme)) + { + pme_gpu_reinit_atoms(pme->gpu, nAtoms, charges); + } + // TODO: handle the CPU case here; handle the whole t_mdatoms +} diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu new file mode 100644 index 0000000000..bd67a01ee7 --- /dev/null +++ b/src/gromacs/ewald/pme.cu @@ -0,0 +1,515 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief This file contains internal CUDA function implementations + * for performing the PME calculations on GPU. 
+ * + * \author Aleksei Iupinov + */ + +#include "gmxpre.h" + +#include + +/* The rest */ +#include "pme.h" + +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/pmalloc_cuda.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/smalloc.h" + +#include "pme.cuh" +#include "pme-3dfft.cuh" +#include "pme-grid.h" + +int pme_gpu_get_atom_data_alignment(const pme_gpu_t *pmeGPU) +{ + const int order = pmeGPU->common->pme_order; + GMX_RELEASE_ASSERT(order > 0, "Invalid PME order"); + return PME_ATOM_DATA_ALIGNMENT; +} + +int pme_gpu_get_atom_spline_data_alignment(const pme_gpu_t *pmeGPU) +{ + const int order = pmeGPU->common->pme_order; + GMX_RELEASE_ASSERT(order > 0, "Invalid PME order"); + return PME_SPREADGATHER_ATOMS_PER_WARP; +} + +void pme_gpu_synchronize(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaStreamSynchronize(pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!"); +} + +void pme_gpu_alloc_energy_virial(const pme_gpu_t *pmeGPU) +{ + const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float); + cudaError_t stat = cudaMalloc((void **)&pmeGPU->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize); + CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial"); + pmalloc((void **)&pmeGPU->staging.h_virialAndEnergy, energyAndVirialSize); +} + +void pme_gpu_free_energy_virial(pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaFree(pmeGPU->kernelParams->constants.d_virialAndEnergy); + CU_RET_ERR(stat, "cudaFree failed on PME energy and virial"); + pmeGPU->kernelParams->constants.d_virialAndEnergy = nullptr; + pfree(pmeGPU->staging.h_virialAndEnergy); + pmeGPU->staging.h_virialAndEnergy = nullptr; +} + +void pme_gpu_clear_energy_virial(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->constants.d_virialAndEnergy, 0, + c_virialAndEnergyCount * sizeof(float), pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME energy/virial cudaMemsetAsync error"); +} + +void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *pmeGPU) +{ + const int splineValuesOffset[DIM] = { + 0, + pmeGPU->kernelParams->grid.realGridSize[XX], + pmeGPU->kernelParams->grid.realGridSize[XX] + pmeGPU->kernelParams->grid.realGridSize[YY] + }; + memcpy((void *)&pmeGPU->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset)); + + const int newSplineValuesSize = pmeGPU->kernelParams->grid.realGridSize[XX] + + pmeGPU->kernelParams->grid.realGridSize[YY] + + pmeGPU->kernelParams->grid.realGridSize[ZZ]; + const bool shouldRealloc = (newSplineValuesSize > pmeGPU->archSpecific->splineValuesSize); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->grid.d_splineModuli, nullptr, sizeof(float), + &pmeGPU->archSpecific->splineValuesSize, &pmeGPU->archSpecific->splineValuesSizeAlloc, newSplineValuesSize, pmeGPU->archSpecific->pmeStream, true); + if (shouldRealloc) + { + /* Reallocate the host buffer */ + pfree(pmeGPU->staging.h_splineModuli); + pmalloc((void **)&pmeGPU->staging.h_splineModuli, newSplineValuesSize * sizeof(float)); + } + for (int i = 0; i < DIM; i++) + { + memcpy(pmeGPU->staging.h_splineModuli + splineValuesOffset[i], pmeGPU->common->bsp_mod[i].data(), pmeGPU->common->bsp_mod[i].size() * sizeof(float)); + } + /* TODO: pin original buffer instead! 
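+     * (e.g. pinning the pmeGPU->common->bsp_mod storage with cudaHostRegister could make the
+     * h_splineModuli staging copy unnecessary - a possible approach, not part of this change)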
*/ + cu_copy_H2D_async(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli, + newSplineValuesSize * sizeof(float), pmeGPU->archSpecific->pmeStream); +} + +void pme_gpu_free_bspline_values(const pme_gpu_t *pmeGPU) +{ + pfree(pmeGPU->staging.h_splineModuli); + cu_free_buffered(pmeGPU->kernelParams->grid.d_splineModuli, &pmeGPU->archSpecific->splineValuesSize, + &pmeGPU->archSpecific->splineValuesSizeAlloc); +} + +void pme_gpu_realloc_forces(const pme_gpu_t *pmeGPU) +{ + const size_t newForcesSize = pmeGPU->nAtomsAlloc * DIM; + GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU"); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_forces, nullptr, sizeof(float), + &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc, newForcesSize, pmeGPU->archSpecific->pmeStream, true); +} + +void pme_gpu_free_forces(const pme_gpu_t *pmeGPU) +{ + cu_free_buffered(pmeGPU->kernelParams->atoms.d_forces, &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc); +} + +void pme_gpu_copy_input_forces(const pme_gpu_t *pmeGPU, const float *h_forces) +{ + GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU"); + const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float); + GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU"); + cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_forces, const_cast(h_forces), forcesSize, pmeGPU->archSpecific->pmeStream); +} + +void pme_gpu_copy_output_forces(const pme_gpu_t *pmeGPU, float *h_forces) +{ + GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU"); + const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float); + GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU"); + cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream); + cudaError_t stat = cudaEventRecord(pmeGPU->archSpecific->syncForcesD2H, pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME gather forces synchronization failure"); +} + +void pme_gpu_sync_output_forces(const pme_gpu_t *pmeGPU) +{ + cudaStream_t s = pmeGPU->archSpecific->pmeStream; + cudaError_t stat = cudaStreamWaitEvent(s, pmeGPU->archSpecific->syncForcesD2H, 0); + CU_RET_ERR(stat, "Error while waiting for the PME GPU forces"); +} + +void pme_gpu_realloc_coordinates(const pme_gpu_t *pmeGPU) +{ + const size_t newCoordinatesSize = pmeGPU->nAtomsAlloc * DIM; + GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU"); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coordinates, nullptr, sizeof(float), + &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc, newCoordinatesSize, pmeGPU->archSpecific->pmeStream, true); + if (c_usePadding) + { + const size_t paddingIndex = DIM * pmeGPU->kernelParams->atoms.nAtoms; + const size_t paddingCount = DIM * pmeGPU->nAtomsAlloc - paddingIndex; + if (paddingCount > 0) + { + cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coordinates + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME failed to clear the padded coordinates"); + } + } +} + +void pme_gpu_copy_input_coordinates(const pme_gpu_t *pmeGPU, const rvec *h_coordinates) +{ + GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU"); +#if GMX_DOUBLE + GMX_RELEASE_ASSERT(false, "Only single precision is supported"); + GMX_UNUSED_VALUE(h_coordinates); +#else + 
cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coordinates, const_cast(h_coordinates), + pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->archSpecific->pmeStream); +#endif +} + +void pme_gpu_free_coordinates(const pme_gpu_t *pmeGPU) +{ + cu_free_buffered(pmeGPU->kernelParams->atoms.d_coordinates, &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc); +} + +void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *pmeGPU, const float *h_coefficients) +{ + GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU"); + const size_t newCoefficientsSize = pmeGPU->nAtomsAlloc; + GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU"); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coefficients, nullptr, sizeof(float), + &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc, + newCoefficientsSize, pmeGPU->archSpecific->pmeStream, true); + cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coefficients, const_cast(h_coefficients), + pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->archSpecific->pmeStream); + if (c_usePadding) + { + const size_t paddingIndex = pmeGPU->kernelParams->atoms.nAtoms; + const size_t paddingCount = pmeGPU->nAtomsAlloc - paddingIndex; + if (paddingCount > 0) + { + cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coefficients + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME failed to clear the padded charges"); + } + } +} + +void pme_gpu_free_coefficients(const pme_gpu_t *pmeGPU) +{ + cu_free_buffered(pmeGPU->kernelParams->atoms.d_coefficients, &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc); +} + +void pme_gpu_realloc_spline_data(const pme_gpu_t *pmeGPU) +{ + const int order = pmeGPU->common->pme_order; + const int alignment = pme_gpu_get_atom_spline_data_alignment(pmeGPU); + const size_t nAtomsPadded = ((pmeGPU->nAtomsAlloc + alignment - 1) / alignment) * alignment; + const int newSplineDataSize = DIM * order * nAtomsPadded; + GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU"); + /* Two arrays of the same size */ + const bool shouldRealloc = (newSplineDataSize > pmeGPU->archSpecific->splineDataSize); + int currentSizeTemp = pmeGPU->archSpecific->splineDataSize; + int currentSizeTempAlloc = pmeGPU->archSpecific->splineDataSizeAlloc; + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_theta, nullptr, sizeof(float), + ¤tSizeTemp, ¤tSizeTempAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_dtheta, nullptr, sizeof(float), + &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true); + // the host side reallocation + if (shouldRealloc) + { + pfree(pmeGPU->staging.h_theta); + pmalloc((void **)&pmeGPU->staging.h_theta, newSplineDataSize * sizeof(float)); + pfree(pmeGPU->staging.h_dtheta); + pmalloc((void **)&pmeGPU->staging.h_dtheta, newSplineDataSize * sizeof(float)); + } +} + +void pme_gpu_free_spline_data(const pme_gpu_t *pmeGPU) +{ + /* Two arrays of the same size */ + cu_free_buffered(pmeGPU->kernelParams->atoms.d_theta); + cu_free_buffered(pmeGPU->kernelParams->atoms.d_dtheta, &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc); + pfree(pmeGPU->staging.h_theta); + pfree(pmeGPU->staging.h_dtheta); +} + +void 
pme_gpu_realloc_grid_indices(const pme_gpu_t *pmeGPU) +{ + const size_t newIndicesSize = DIM * pmeGPU->nAtomsAlloc; + GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU"); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_gridlineIndices, nullptr, sizeof(int), + &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc, newIndicesSize, pmeGPU->archSpecific->pmeStream, true); + pfree(pmeGPU->staging.h_gridlineIndices); + pmalloc((void **)&pmeGPU->staging.h_gridlineIndices, newIndicesSize * sizeof(int)); +} + +void pme_gpu_free_grid_indices(const pme_gpu_t *pmeGPU) +{ + cu_free_buffered(pmeGPU->kernelParams->atoms.d_gridlineIndices, &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc); + pfree(pmeGPU->staging.h_gridlineIndices); +} + +void pme_gpu_realloc_grids(pme_gpu_t *pmeGPU) +{ + auto *kernelParamsPtr = pmeGPU->kernelParams.get(); + const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] * + kernelParamsPtr->grid.realGridSizePadded[YY] * + kernelParamsPtr->grid.realGridSizePadded[ZZ]; + const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] * + kernelParamsPtr->grid.complexGridSizePadded[YY] * + kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2; + // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float + if (pmeGPU->archSpecific->performOutOfPlaceFFT) + { + /* 2 separate grids */ + cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fourierGrid, nullptr, sizeof(float), + &pmeGPU->archSpecific->complexGridSize, &pmeGPU->archSpecific->complexGridSizeAlloc, + newComplexGridSize, pmeGPU->archSpecific->pmeStream, true); + cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float), + &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc, + newRealGridSize, pmeGPU->archSpecific->pmeStream, true); + } + else + { + /* A single buffer so that any grid will fit */ + const int newGridsSize = std::max(newRealGridSize, newComplexGridSize); + cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float), + &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc, + newGridsSize, pmeGPU->archSpecific->pmeStream, true); + kernelParamsPtr->grid.d_fourierGrid = kernelParamsPtr->grid.d_realGrid; + pmeGPU->archSpecific->complexGridSize = pmeGPU->archSpecific->realGridSize; + // the size might get used later for copying the grid + } +} + +void pme_gpu_free_grids(const pme_gpu_t *pmeGPU) +{ + if (pmeGPU->archSpecific->performOutOfPlaceFFT) + { + cu_free_buffered(pmeGPU->kernelParams->grid.d_fourierGrid); + } + cu_free_buffered(pmeGPU->kernelParams->grid.d_realGrid, + &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc); +} + +void pme_gpu_clear_grids(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->grid.d_realGrid, 0, + pmeGPU->archSpecific->realGridSize * sizeof(float), pmeGPU->archSpecific->pmeStream); + /* Should the complex grid be cleared in some weird case? 
*/ + CU_RET_ERR(stat, "cudaMemsetAsync on the PME grid error"); +} + +void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU) +{ + cudaStream_t stream = pmeGPU->archSpecific->pmeStream; + auto *kernelParamsPtr = pmeGPU->kernelParams.get(); + + const int nx = kernelParamsPtr->grid.realGridSize[XX]; + const int ny = kernelParamsPtr->grid.realGridSize[YY]; + const int nz = kernelParamsPtr->grid.realGridSize[ZZ]; + const int cellCount = c_pmeNeighborUnitcellCount; + const int gridDataOffset[DIM] = {0, cellCount * nx, cellCount * (nx + ny)}; + + memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset)); + + const int newFractShiftsSize = cellCount * (nx + ny + nz); + + /* Two arrays, same size */ + int currentSizeTemp = pmeGPU->archSpecific->fractShiftsSize; + int currentSizeTempAlloc = pmeGPU->archSpecific->fractShiftsSizeAlloc; + cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fractShiftsTable, nullptr, sizeof(float), + ¤tSizeTemp, ¤tSizeTempAlloc, + newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true); + float *fractShiftsArray = kernelParamsPtr->grid.d_fractShiftsTable; + cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_gridlineIndicesTable, nullptr, sizeof(int), + &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc, + newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true); + int *gridlineIndicesArray = kernelParamsPtr->grid.d_gridlineIndicesTable; + + /* TODO: pinning of the pmeGPU->common->fsh/nn host-side arrays */ + + for (int i = 0; i < DIM; i++) + { + kernelParamsPtr->grid.tablesOffsets[i] = gridDataOffset[i]; + cu_copy_H2D_async(fractShiftsArray + gridDataOffset[i], pmeGPU->common->fsh[i].data(), + cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(float), stream); + cu_copy_H2D_async(gridlineIndicesArray + gridDataOffset[i], pmeGPU->common->nn[i].data(), + cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(int), stream); + } + + pme_gpu_make_fract_shifts_textures(pmeGPU); +} + +void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU) +{ + pme_gpu_free_fract_shifts_textures(pmeGPU); + /* Two arrays, same size */ + cu_free_buffered(pmeGPU->kernelParams->grid.d_fractShiftsTable); + cu_free_buffered(pmeGPU->kernelParams->grid.d_gridlineIndicesTable, &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc); +} + +void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncEnerVirD2H, 0); + CU_RET_ERR(stat, "Error while waiting for PME solve output"); + + for (int j = 0; j < c_virialAndEnergyCount; j++) + { + GMX_ASSERT(std::isfinite(pmeGPU->staging.h_virialAndEnergy[j]), "PME GPU produces incorrect energy/virial."); + } +} + +void pme_gpu_copy_input_gather_grid(const pme_gpu_t *pmeGpu, float *h_grid) +{ + const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float); + cu_copy_H2D_async(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->archSpecific->pmeStream); +} + +void pme_gpu_copy_output_spread_grid(const pme_gpu_t *pmeGpu, float *h_grid) +{ + const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float); + cu_copy_D2H_async(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->archSpecific->pmeStream); + cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME spread grid sync event record failure"); +} + +void 
pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSpreadGridD2H, 0); + CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host"); +} + +void pme_gpu_sync_spline_atom_data(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSplineAtomDataD2H, 0); + CU_RET_ERR(stat, "Error while waiting for the PME GPU atom data to be copied to the host"); +} + +void pme_gpu_sync_solve_grid(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSolveGridD2H, 0); + CU_RET_ERR(stat, "Error while waiting for the PME GPU solve grid to be copied to the host"); + //should check for pme_gpu_performs_solve(pmeGPU) +} + +void pme_gpu_init_internal(pme_gpu_t *pmeGPU) +{ + /* Allocate the target-specific structures */ + pmeGPU->archSpecific.reset(new pme_gpu_specific_t()); + pmeGPU->kernelParams.reset(new pme_gpu_kernel_params_t()); + + pmeGPU->archSpecific->performOutOfPlaceFFT = true; + /* This should give better performance, according to the cuFFT documentation. + * The performance seems to be the same though. + * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7). + */ + + pmeGPU->archSpecific->useTiming = (getenv("GMX_DISABLE_CUDA_TIMING") == nullptr) && + (getenv("GMX_DISABLE_GPU_TIMING") == nullptr); + /* TODO: multiple CUDA streams on same GPU cause nonsense cudaEvent_t timings. + * This should probably also check for gpuId exclusivity? + */ + + /* Creating a PME CUDA stream */ + cudaError_t stat; + int highest_priority, lowest_priority; + stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority); + CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed"); + stat = cudaStreamCreateWithPriority(&pmeGPU->archSpecific->pmeStream, + cudaStreamDefault, //cudaStreamNonBlocking, + highest_priority); + CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed"); +} + +void pme_gpu_destroy_specific(const pme_gpu_t *pmeGPU) +{ + /* Destroy the CUDA stream */ + cudaError_t stat = cudaStreamDestroy(pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME cudaStreamDestroy error"); +} + +void pme_gpu_init_sync_events(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat; + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on syncEnerVirD2H failed"); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on syncForcesD2H failed"); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on syncSpreadGridD2H failed"); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on syncSplineAtomDataD2H failed"); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on syncSolveGridD2H failed"); +} + +void pme_gpu_destroy_sync_events(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat; + stat = cudaEventDestroy(pmeGPU->archSpecific->syncEnerVirD2H); + CU_RET_ERR(stat, "cudaEventDestroy failed on syncEnerVirD2H"); + stat = cudaEventDestroy(pmeGPU->archSpecific->syncForcesD2H); + 
CU_RET_ERR(stat, "cudaEventDestroy failed on syncForcesD2H"); + stat = cudaEventDestroy(pmeGPU->archSpecific->syncSpreadGridD2H); + CU_RET_ERR(stat, "cudaEventDestroy failed on syncSpreadGridD2H"); + stat = cudaEventDestroy(pmeGPU->archSpecific->syncSplineAtomDataD2H); + CU_RET_ERR(stat, "cudaEventDestroy failed on syncSplineAtomDataD2H"); + stat = cudaEventDestroy(pmeGPU->archSpecific->syncSolveGridD2H); + CU_RET_ERR(stat, "cudaEventDestroy failed on syncSolveGridD2H"); +} + +void pme_gpu_reinit_3dfft(const pme_gpu_t *pmeGPU) +{ + if (pme_gpu_performs_FFT(pmeGPU)) + { + pmeGPU->archSpecific->fftSetup.resize(0); + for (int i = 0; i < pmeGPU->common->ngrids; i++) + { + pmeGPU->archSpecific->fftSetup.push_back(std::unique_ptr(new GpuParallel3dFft(pmeGPU))); + } + } +} + +void pme_gpu_destroy_3dfft(const pme_gpu_t *pmeGPU) +{ + pmeGPU->archSpecific->fftSetup.resize(0); +} diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh new file mode 100644 index 0000000000..16b4b9aae8 --- /dev/null +++ b/src/gromacs/ewald/pme.cuh @@ -0,0 +1,243 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief This file defines the PME CUDA-specific data structure, + * various compile-time constants shared among the PME CUDA kernels, + * and also names some PME CUDA memory management routines. + * TODO: consider changing defines into variables where possible; have inline getters. 
+ *
+ * \author Aleksei Iupinov
+ */
+
+#ifndef GMX_EWALD_PME_CUH
+#define GMX_EWALD_PME_CUH
+
+#include
+
+#include
+
+#include "gromacs/gpu_utils/cuda_arch_utils.cuh" // for warp_size
+
+#include "pme-gpu-internal.h" // for the general PME GPU behaviour defines
+#include "pme-timings.cuh"
+
+class GpuParallel3dFft;
+
+/* Some defines for PME behaviour follow */
+
+/*
+    Here is a current memory layout for the theta/dtheta B-spline float parameter arrays.
+    This is the data in global memory used both by spreading and gathering kernels (with same scheduling).
+    This example has PME order 4 and 2 particles per warp/data chunk.
+    Each particle has 16 threads assigned to it, each thread works on 4 non-sequential global grid contributions.
+
+    ----------------------------------------------------------------------------
+    particles 0, 1                                        | particles 2, 3 | ...
+    ----------------------------------------------------------------------------
+    order index 0           | index 1 | index 2 | index 3 | order index 0 .....
+    ----------------------------------------------------------------------------
+    tx0 tx1 ty0 ty1 tz0 tz1 | ..........
+    ----------------------------------------------------------------------------
+
+    Each data chunk for a single warp is 24 floats. This goes both for theta and dtheta.
+    24 = 2 particles per warp * order 4 * 3 dimensions. 48 floats (1.5 warp size) per warp in total.
+    I have also tried intertwining theta and dtheta in a single array (they are used in pairs in the gathering stage anyway)
+    and it didn't seem to make a performance difference.
+
+    The corresponding defines follow.
+ */
+
+/* This is the distance between the neighbour theta elements - would be 2 for the intertwining layout */
+#define PME_SPLINE_THETA_STRIDE 1
+
+/*! \brief
+ * The number of GPU threads used for computing spread/gather contributions of a single atom as a function of the PME order.
+ * The assumption is currently that any thread processes only a single atom's contributions.
+ */
+#define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
+
+//! The spread/gather integer constant which depends on the templated order parameter (2 atoms per warp for order == 4)
+#define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
+
+//! Atom data alignment - has to be divisible both by spread and gather maximal atoms-per-block counts,
+//! which is asserted in case we use atom data padding at all.
+#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP)
+
+/*! \brief \internal
+ * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
+ *
+ * \param[in] atomDataIndex The atom data index.
+ * \param[in] nAtomData     The atom data array element count.
+ * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
+ *
+ * This is called from the spline_and_spread and gather PME kernels.
+ * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding enabled.
+ */
+int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
+{
+    return c_usePadding ? 1 : (atomDataIndex < nAtomData);
+}
+
+/*! \brief \internal
+ * An inline CUDA function for skipping the zero-charge atoms.
+ *
+ * \returns Non-0 if atom should be processed, 0 otherwise.
+ * \param[in] coefficient The atom charge.
+ *
+ * This is called from the spline_and_spread and gather PME kernels.
+ */
+int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
+{
+    assert(isfinite(coefficient));
+    return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+}
+
+/*! \brief \internal
+ * The main PME CUDA-specific host data structure, included in the PME GPU structure by the archSpecific pointer.
+ */
+struct pme_gpu_cuda_t
+{
+    /*! \brief The CUDA stream where everything related to the PME happens. */
+    cudaStream_t pmeStream;
+
+    /* Synchronization events */
+    /*! \brief Triggered after the energy/virial have been copied to the host (after the solving stage). */
+    cudaEvent_t syncEnerVirD2H;
+    /*! \brief Triggered after the output forces have been copied to the host (after the gathering stage). */
+    cudaEvent_t syncForcesD2H;
+    /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
+    cudaEvent_t syncSpreadGridD2H;
+    /*! \brief Triggered after the atom spline data has been copied to the host (after the spline computation). */
+    cudaEvent_t syncSplineAtomDataD2H;
+    /*! \brief Triggered after the grid has been copied to the host (after the solving stage). */
+    cudaEvent_t syncSolveGridD2H;
+
+    // TODO: consider moving some things below into the non-CUDA struct.
+
+    /* Settings which are set at the start of the run */
+    /*! \brief A boolean which tells whether the complex and real grids for cuFFT are different or the same. Currently true. */
+    bool performOutOfPlaceFFT;
+    /*! \brief A boolean which tells if the CUDA timing events are enabled.
+     *  True by default, disabled by setting the environment variable GMX_DISABLE_CUDA_TIMING.
+     *  FIXME: this should also be disabled if any other GPU task is running concurrently on the same device,
+     *  as CUDA events on multiple streams are untrustworthy.
+     */
+    bool useTiming;
+
+    std::vector<std::unique_ptr<GpuParallel3dFft> > fftSetup;
+
+    std::array timingEvents;
+
+    /* GPU arrays element counts (not the array sizes in bytes!).
+     * They might be larger than the actual meaningful data sizes.
+     * These are paired: the actual element count + the maximum element count that can fit in the current allocated memory.
+     * These integer pairs are mostly meaningful for the cu_realloc/free_buffered calls.
+     * As such, if cu_realloc/free_buffered is refactored, they can be freely changed, too.
+     * The only exceptions are realGridSize and complexGridSize which are also used for grid clearing/copying.
+     * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well.
+     */
+    /*! \brief The kernelParams.atoms.coordinates float element count (actual) */
+    int coordinatesSize;
+    /*! \brief The kernelParams.atoms.coordinates float element count (reserved) */
+    int coordinatesSizeAlloc;
+    /*! \brief The kernelParams.atoms.forces float element count (actual) */
+    int forcesSize;
+    /*! \brief The kernelParams.atoms.forces float element count (reserved) */
+    int forcesSizeAlloc;
+    /*! \brief The kernelParams.atoms.gridlineIndices int element count (actual) */
+    int gridlineIndicesSize;
+    /*! \brief The kernelParams.atoms.gridlineIndices int element count (reserved) */
+    int gridlineIndicesSizeAlloc;
+    /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (actual) */
+    int splineDataSize;
+    /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */
+    int splineDataSizeAlloc;
+    /*! \brief The kernelParams.atoms.coefficients float element count (actual) */
+    int coefficientsSize;
+    /*!
\brief The kernelParams.atoms.coefficients float element count (reserved) */ + int coefficientsSizeAlloc; + /*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */ + int splineValuesSize; + /*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */ + int splineValuesSizeAlloc; + /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (actual) */ + int fractShiftsSize; + /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (reserved) */ + int fractShiftsSizeAlloc; + /*! \brief The kernelParams.grid.realGrid float element count (actual) */ + int realGridSize; + /*! \brief The kernelParams.grid.realGrid float element count (reserved) */ + int realGridSizeAlloc; + /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */ + int complexGridSize; + /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (reserved) */ + int complexGridSizeAlloc; +}; + + +/*! \brief \internal + * A single structure encompassing all the PME data used in CUDA kernels. + * This inherits from pme_gpu_kernel_params_base_t and adds a couple cudaTextureObject_t handles, + * which we would like to avoid in plain C++. + */ +struct pme_gpu_cuda_kernel_params_t : pme_gpu_kernel_params_base_t +{ + /* These are CUDA texture objects, related to the grid size. */ + /*! \brief CUDA texture object for accessing grid.d_fractShiftsTable */ + cudaTextureObject_t fractShiftsTableTexture; + /*! \brief CUDA texture object for accessing grid.d_gridlineIndicesTable */ + cudaTextureObject_t gridlineIndicesTableTexture; +}; + +/* CUDA texture functions which will reside in respective kernel files + * (due to texture references having scope of a translation unit). + */ + +/*! \brief \internal + * Creates/binds 2 textures used in the spline parameter computation. + * + * \param[in, out] pmeGPU The PME GPU structure. + */ +inline void pme_gpu_make_fract_shifts_textures(pme_gpu_t gmx_unused *pmeGpu){}; + +/*! \brief \internal + * Frees/unbinds 2 textures used in the spline parameter computation. + * + * \param[in] pmeGPU The PME GPU structure. + */ +inline void pme_gpu_free_fract_shifts_textures(const pme_gpu_t gmx_unused *pmeGpu){}; + +#endif diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 1b6c0551db..ad25ca3abb 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -52,15 +52,24 @@ #include "gromacs/gmxlib/nrnb.h" #include "gromacs/math/vectypes.h" -#include "gromacs/mdtypes/forcerec.h" #include "gromacs/mdtypes/interaction_const.h" #include "gromacs/timing/wallcycle.h" #include "gromacs/timing/walltime_accounting.h" #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/real.h" +#include "pme-gpu-types.h" + struct t_commrec; struct t_inputrec; +struct pme_gpu_t; +struct gmx_wallclock_gpu_pme_t; +struct gmx_device_info_t; + +namespace gmx +{ +class MDLogger; +} enum { GMX_SUM_GRID_FORWARD, GMX_SUM_GRID_BACKWARD @@ -95,7 +104,11 @@ int gmx_pme_init(struct gmx_pme_t **pmedata, struct t_commrec *cr, gmx_bool bFreeEnergy_q, gmx_bool bFreeEnergy_lj, gmx_bool bReproducible, real ewaldcoeff_q, real ewaldcoeff_lj, - int nthread); + int nthread, + PmeRunMode runMode, + pme_gpu_t *pmeGPU, + gmx_device_info_t *gpuInfo, + const gmx::MDLogger &mdlog); /*! 
\brief Destroys the PME data structure.*/ void gmx_pme_destroy(gmx_pme_t *pme); @@ -115,7 +128,7 @@ void gmx_pme_destroy(gmx_pme_t *pme); #define GMX_PME_DO_ALL_F (GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_F) //@} -/*! \brief Do a PME calculation for the long range electrostatics and/or LJ. +/*! \brief Do a PME calculation on a CPU for the long range electrostatics and/or LJ. * * The meaning of \p flags is defined above, and determines which * parts of the calculation are performed. @@ -183,4 +196,60 @@ void gmx_pme_receive_f(struct t_commrec *cr, real *dvdlambda_q, real *dvdlambda_lj, float *pme_cycles); +/*! \brief + * This function updates the local atom data on GPU after DD (charges, coordinates, etc.). + * TODO: it should update the PME CPU atom data as well. + * (currently PME CPU call gmx_pme_do() gets passed the input pointers each step). + * + * \param[in] pme The PME structure. + * \param[in] nAtoms The number of particles. + * \param[in] charges The pointer to the array of particle charges. + */ +void gmx_pme_reinit_atoms(const gmx_pme_t *pme, const int nAtoms, const real *charges); + +/* A block of PME GPU functions */ + +/*! \brief + * Tells if PME is enabled to run on GPU (not necessarily active at the moment). + * For now, this decision is stored in the PME structure itself. + * FIXME: this is an information that should be managed by the task scheduler. + * As soon as such functionality appears, this function should be removed from this module. + * + * \param[in] pme The PME data structure. + * \returns true if PME can run on GPU, false otherwise. + */ +bool pme_gpu_task_enabled(const gmx_pme_t *pme); + +/*! \brief + * Resets the PME GPU timings. To be called at the reset step. + * + * \param[in] pme The PME structure. + */ +void pme_gpu_reset_timings(const gmx_pme_t *pme); + +/*! \brief + * Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end. + * + * \param[in] pme The PME structure. + * \param[in] timings The gmx_wallclock_gpu_pme_t structure. + */ +void pme_gpu_get_timings(const gmx_pme_t *pme, + gmx_wallclock_gpu_pme_t *timings); + +/* The main PME GPU functions */ + +/*! \brief + * Gets the output forces and virial/energy if corresponding flags are (were?) passed in. + * + * \param[in] pme The PME data structure. + * \param[in] wcycle The wallclock counter. + * \param[out] vir_q The output virial matrix. + * \param[out] energy_q The output energy. 
+ */ +void pme_gpu_get_results(const gmx_pme_t *pme, + gmx_wallcycle_t wcycle, + matrix vir_q, + real *energy_q); + + #endif diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index 475b84412f..d1cafc59d4 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -53,9 +53,11 @@ #include "gromacs/ewald/pme-spread.h" #include "gromacs/fft/parallel_3dfft.h" #include "gromacs/math/invertmatrix.h" +#include "gromacs/mdtypes/commrec.h" #include "gromacs/pbcutil/pbc.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/logger.h" #include "gromacs/utility/stringutil.h" namespace gmx @@ -71,9 +73,12 @@ static PmeSafePointer pmeInitInternal(const t_inputrec *inputRec, real ewaldCoeff_lj = 1.0f ) { - gmx_pme_t *pmeDataRaw = nullptr; - gmx_pme_init(&pmeDataRaw, nullptr, 1, 1, inputRec, - atomCount, false, false, true, ewaldCoeff_q, ewaldCoeff_lj, 1); + gmx_pme_t *pmeDataRaw = nullptr; + const MDLogger dummyLogger; + const auto runMode = PmeRunMode::CPU; + t_commrec dummyCommrec = {0}; + gmx_pme_init(&pmeDataRaw, &dummyCommrec, 1, 1, inputRec, atomCount, false, false, true, + ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, nullptr, dummyLogger); PmeSafePointer pme(pmeDataRaw); // taking ownership // TODO get rid of this with proper matrix type diff --git a/src/gromacs/gpu_utils/gpuregiontimer.h b/src/gromacs/gpu_utils/gpuregiontimer.h index 2cbbe575d2..e453a5e18b 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer.h +++ b/src/gromacs/gpu_utils/gpuregiontimer.h @@ -77,6 +77,7 @@ template struct GpuTraits /*! \libinternal \brief * This is a GPU region timing implementation interface. * It should provide methods for measuring the last timespan. + * Copying/assignment is disabled since the underlying timing events are owned by this. */ template class GpuRegionTimerImpl { @@ -85,8 +86,15 @@ template class GpuRegionTimerImpl using CommandEvent = typename GpuTraits::CommandEvent; public: + GpuRegionTimerImpl() = default; ~GpuRegionTimerImpl() = default; + //! No copying + GpuRegionTimerImpl(const GpuRegionTimerImpl &) = delete; + //! No assignment + GpuRegionTimerImpl &operator=(GpuRegionTimerImpl &&) = delete; + //! Moving is disabled but can be considered in the future if needed + GpuRegionTimerImpl(GpuRegionTimerImpl &&) = delete; /*! \brief Will be called before the region start. */ inline void openTimingRegion(CommandStream) = 0; @@ -121,21 +129,17 @@ template class GpuRegionTimerWrapper Idle, Recording, Stopped - } debugState_; + } debugState_ = TimerState::Idle; //! The number of times the timespan has been measured - unsigned int callCount_; + unsigned int callCount_ = 0; //! The accumulated duration of the timespans measured (milliseconds) - double totalMilliseconds_; + double totalMilliseconds_ = 0.0; //! The underlying region timer implementation GpuRegionTimerImpl impl_; public: - GpuRegionTimerWrapper() : impl_() - { - reset(); - } - ~GpuRegionTimerWrapper() = default; + /*! \brief * To be called before the region start. 
* diff --git a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h index e0b4b3f87f..fc64d04e96 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h +++ b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h @@ -75,8 +75,7 @@ template <> class GpuRegionTimerImpl size_t currentEvent_ = 0; public: - GpuRegionTimerImpl() = default; - ~GpuRegionTimerImpl() = default; + inline void openTimingRegion(CommandStream){} inline void closeTimingRegion(CommandStream){} diff --git a/src/gromacs/mdlib/mdsetup.cpp b/src/gromacs/mdlib/mdsetup.cpp index bbc42e5762..1876ab2f9e 100644 --- a/src/gromacs/mdlib/mdsetup.cpp +++ b/src/gromacs/mdlib/mdsetup.cpp @@ -38,6 +38,7 @@ #include "gromacs/domdec/domdec.h" #include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/pme.h" #include "gromacs/listed-forces/manage-threading.h" #include "gromacs/mdlib/mdatoms.h" #include "gromacs/mdlib/shellfc.h" @@ -139,4 +140,11 @@ void mdAlgorithmsSetupAtomData(t_commrec *cr, } setup_bonded_threading(fr, &top->idef); + + gmx_pme_reinit_atoms(fr->pmedata, numHomeAtoms, mdatoms->chargeA); + /* This handles the PP+PME rank case where fr->pmedata is valid. + * For PME-only ranks, gmx_pmeonly() has its own call to gmx_pme_reinit_atoms(). + * TODO: this only handles the GPU logic so far, should handle CPU as well. + * TODO: this also does not account for TPI. + */ } diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index ed6d41064d..3f79587105 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -774,9 +774,9 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, * \param[inout] timings GPU task timing data * \param[in] iloc interaction locality */ -static void countPruneKernelTime(cu_timers_t *timers, - gmx_wallclock_gpu_t *timings, - const int iloc) +static void countPruneKernelTime(cu_timers_t *timers, + gmx_wallclock_gpu_nbnxn_t *timings, + const int iloc) { // We might have not done any pruning (e.g. if we skipped with empty domains). if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc]) @@ -821,13 +821,13 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb, gmx_incons(stmp); } - cu_plist_t *plist = nb->plist[iloc]; - cu_timers_t *timers = nb->timers; - struct gmx_wallclock_gpu_t *timings = nb->timings; - nb_staging nbst = nb->nbst; + cu_plist_t *plist = nb->plist[iloc]; + cu_timers_t *timers = nb->timers; + struct gmx_wallclock_gpu_nbnxn_t *timings = nb->timings; + nb_staging nbst = nb->nbst; - bool bCalcEner = flags & GMX_FORCE_ENERGY; - bool bCalcFshift = flags & GMX_FORCE_VIRIAL; + bool bCalcEner = flags & GMX_FORCE_ENERGY; + bool bCalcFshift = flags & GMX_FORCE_VIRIAL; /* turn energy calculation always on/off (for debugging/testing only) */ bCalcEner = (bCalcEner || always_ener) && !never_ener; diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 9500e4b78f..e01e2a66fa 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -397,7 +397,7 @@ static void init_timers(cu_timers_t *t, bool bUseTwoStreams) } /*! Initializes the timings data structure. */ -static void init_timings(gmx_wallclock_gpu_t *t) +static void init_timings(gmx_wallclock_gpu_nbnxn_t *t) { int i, j; @@ -825,9 +825,10 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) } } -gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb) +//! 
This function is documented in the header file +gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb) { - return (nb != NULL && nb->bDoTime) ? nb->timings : NULL; + return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr; } void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv) diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h index 0e0be3c1e1..56e8270e60 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h @@ -274,9 +274,9 @@ struct gmx_nbnxn_cuda_t * concurrent streams, so we won't time if both l/nl work is done on GPUs. * Timer init/uninit is still done even with timing off so only the condition * setting bDoTime needs to be change if this CUDA "feature" gets fixed. */ - bool bDoTime; /**< True if event-based timing is enabled. */ - cu_timers_t *timers; /**< CUDA event-based timers. */ - gmx_wallclock_gpu_t *timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */ + bool bDoTime; /**< True if event-based timing is enabled. */ + cu_timers_t *timers; /**< CUDA event-based timers. */ + gmx_wallclock_gpu_nbnxn_t *timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */ }; #ifdef __cplusplus diff --git a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h index 841574e399..3211a9ec26 100644 --- a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h +++ b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h @@ -55,8 +55,9 @@ struct nonbonded_verlet_group_t; struct nbnxn_pairlist_t; struct nbnxn_atomdata_t; struct NbnxnListParameters; -struct gmx_wallclock_gpu_t; +struct gmx_wallclock_gpu_nbnxn_t; struct gmx_gpu_info_t; +struct gmx_device_info_t; /** Initializes the data structures related to GPU nonbonded calculations. */ GPU_FUNC_QUALIFIER @@ -104,7 +105,7 @@ void nbnxn_gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM /** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */ GPU_FUNC_QUALIFIER -struct gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(NULL) +struct gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(nullptr) /** Resets nonbonded GPU timings. */ GPU_FUNC_QUALIFIER diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp index 2e1102cef1..881688d14f 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp @@ -972,9 +972,9 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, * \param[inout] timings GPU task timing data * \param[in] iloc interaction locality */ -static void countPruneKernelTime(cl_timers_t *timers, - gmx_wallclock_gpu_t *timings, - const int iloc) +static void countPruneKernelTime(cl_timers_t *timers, + gmx_wallclock_gpu_nbnxn_t *timings, + const int iloc) { // We might have not done any pruning (e.g. if we skipped with empty domains). 
if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc]) @@ -1024,13 +1024,13 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb, gmx_incons(stmp); } - cl_plist_t *plist = nb->plist[iloc]; - cl_timers_t *timers = nb->timers; - struct gmx_wallclock_gpu_t *timings = nb->timings; - cl_nb_staging nbst = nb->nbst; + cl_plist_t *plist = nb->plist[iloc]; + cl_timers_t *timers = nb->timers; + struct gmx_wallclock_gpu_nbnxn_t *timings = nb->timings; + cl_nb_staging nbst = nb->nbst; - bool bCalcEner = flags & GMX_FORCE_ENERGY; - int bCalcFshift = flags & GMX_FORCE_VIRIAL; + bool bCalcEner = flags & GMX_FORCE_ENERGY; + int bCalcFshift = flags & GMX_FORCE_VIRIAL; /* turn energy calculation always on/off (for debugging/testing only) */ bCalcEner = (bCalcEner || always_ener) && !never_ener; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index da63c733c4..4da8909363 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -554,7 +554,7 @@ static void init_timers(cl_timers_t *t, /*! \brief Initializes the timings data structure. */ -static void init_timings(gmx_wallclock_gpu_t *t) +static void init_timings(gmx_wallclock_gpu_nbnxn_t *t) { int i, j; @@ -1239,11 +1239,10 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) } } - //! This function is documented in the header file -gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb) +gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb) { - return (nb != NULL && nb->bDoTime) ? nb->timings : NULL; + return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr; } //! This function is documented in the header file diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h index 0e9d527cee..4e1b64c6c0 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h @@ -359,16 +359,16 @@ struct gmx_nbnxn_ocl_t cl_command_queue stream[2]; /**< local and non-local GPU queues */ /** events used for synchronization */ - cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel - is done (and the local transfer can proceed) */ - cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in - the local stream that need to precede the - non-local force calculations are done - (e.g. f buffer 0-ing, local x/q H2D) */ - - cl_bool bDoTime; /**< True if event-based timing is enabled. */ - cl_timers_t *timers; /**< OpenCL event-based timers. */ - struct gmx_wallclock_gpu_t *timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */ + cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel + is done (and the local transfer can proceed) */ + cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in + the local stream that need to precede the + non-local force calculations are done + (e.g. f buffer 0-ing, local x/q H2D) */ + + cl_bool bDoTime; /**< True if event-based timing is enabled. */ + cl_timers_t *timers; /**< OpenCL event-based timers. */ + struct gmx_wallclock_gpu_nbnxn_t *timings; /**< Timing data. 
TODO: deprecate this and query timers for accumulated data instead */ }; #ifdef __cplusplus diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index b0267051a3..5278ad6873 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -2436,6 +2436,7 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr, t_nrnb nrnb[], gmx_wallcycle_t wcycle, gmx_walltime_accounting_t walltime_accounting, nonbonded_verlet_t *nbv, + gmx_pme_t *pme, gmx_bool bWriteStat) { t_nrnb *nrnb_tot = nullptr; @@ -2522,11 +2523,17 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr, if (printReport) { - struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr; - + auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr; + gmx_wallclock_gpu_pme_t pme_gpu_timings = {}; + if (pme_gpu_task_enabled(pme)) + { + pme_gpu_get_timings(pme, &pme_gpu_timings); + } wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme, elapsed_time_over_all_ranks, - wcycle, cycle_sum, gputimes); + wcycle, cycle_sum, + nbnxn_gpu_timings, + &pme_gpu_timings); if (EI_DYNAMICS(inputrec->eI)) { diff --git a/src/gromacs/mdlib/sim_util.h b/src/gromacs/mdlib/sim_util.h index cc3bd29c10..310e9859b5 100644 --- a/src/gromacs/mdlib/sim_util.h +++ b/src/gromacs/mdlib/sim_util.h @@ -126,6 +126,7 @@ void finish_run(FILE *log, const gmx::MDLogger &mdlog, t_commrec *cr, t_nrnb nrnb[], gmx_wallcycle_t wcycle, gmx_walltime_accounting_t walltime_accounting, nonbonded_verlet_t *nbv, + gmx_pme_t *pme, gmx_bool bWriteStat); void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr); diff --git a/src/gromacs/timing/gpu_timing.h b/src/gromacs/timing/gpu_timing.h index e9ec67d855..214b0120eb 100644 --- a/src/gromacs/timing/gpu_timing.h +++ b/src/gromacs/timing/gpu_timing.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2014,2015,2017, by the GROMACS development team, led by + * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -47,26 +47,52 @@ extern "C" { #endif -/*! \internal \brief Nonbonded kernel time and call count. */ -struct gmx_nbnxn_kernel_timing_data_t +/*! \internal \brief GPU kernel time and call count. */ +struct gmx_kernel_timing_data_t { double t; /**< Accumulated lapsed time */ int c; /**< Number of calls corresponding to the elapsed time */ }; -/*! \internal \brief GPU timings for kernels and H2d/D2H transfers. */ -struct gmx_wallclock_gpu_t +/*! \internal \brief + * + * PME GPU stages timing events indices, corresponding to the string in PMEStageNames in wallcycle.cpp. + */ +enum +{ + gtPME_SPLINE = 0, + gtPME_SPREAD, + gtPME_SPLINEANDSPREAD, + gtPME_FFT_R2C, + gtPME_SOLVE, + gtPME_FFT_C2R, + gtPME_GATHER, + gtPME_EVENT_COUNT /* not a stage ID but a static array size */ +}; + +/*! \internal \brief GPU timings for PME. */ +struct gmx_wallclock_gpu_pme_t +{ + /* A separate PME structure to avoid refactoring the NB code for gmx_wallclock_gpu_t later + * TODO: devise a better GPU timing data structuring. + */ + /*! \brief Array of PME GPU timing data. */ + gmx_kernel_timing_data_t timing[gtPME_EVENT_COUNT]; +}; + +/*! \internal \brief GPU NB timings for kernels and H2d/D2H transfers. 
*/ +struct gmx_wallclock_gpu_nbnxn_t { - gmx_nbnxn_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four + gmx_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four versions of the nonbonded kernels: force-only, force+energy, force+pruning, and force+energy+pruning */ - gmx_nbnxn_kernel_timing_data_t pruneTime; /**< table containing the timings of the 1st pass prune-only kernels */ - gmx_nbnxn_kernel_timing_data_t dynamicPruneTime; /**< table containing the timings of dynamic prune-only kernels */ - double nb_h2d_t; /**< host to device transfer time in nb calculation */ - double nb_d2h_t; /**< device to host transfer time in nb calculation */ - int nb_c; /**< total call count of the nonbonded gpu operations */ - double pl_h2d_t; /**< pair search step host to device transfer time */ - int pl_h2d_c; /**< pair search step host to device transfer call count */ + gmx_kernel_timing_data_t pruneTime; /**< table containing the timings of the 1st pass prune-only kernels */ + gmx_kernel_timing_data_t dynamicPruneTime; /**< table containing the timings of dynamic prune-only kernels */ + double nb_h2d_t; /**< host to device transfer time in nb calculation */ + double nb_d2h_t; /**< device to host transfer time in nb calculation */ + int nb_c; /**< total call count of the nonbonded gpu operations */ + double pl_h2d_t; /**< pair search step host to device transfer time */ + int pl_h2d_c; /**< pair search step host to device transfer call count */ }; #ifdef __cplusplus diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp index efa296bdff..2ad431a09d 100644 --- a/src/gromacs/timing/wallcycle.cpp +++ b/src/gromacs/timing/wallcycle.cpp @@ -105,7 +105,7 @@ static const char *wcn[ewcNR] = "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.", "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh", "PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec", - "PME wait for PP", "Wait + Recv. PME F", "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.", + "PME wait for PP", "Wait + Recv. PME F", "Wait PME GPU spread", "Wait PME GPU gather", "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.", "Vsite spread", "COM pull force", "Write traj.", "Update", "Constraints", "Comm. energies", "Enforced rotation", "Add rot. 
forces", "Position swapping", "IMD", "Test" @@ -128,6 +128,18 @@ static const char *wcsn[ewcsNR] = "NB F buffer ops.", }; +/* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */ +static const char *PMEStageNames[] = +{ + "Spline", + "Spread", + "Spline/spread", + "FFT r2c", + "Solve", + "FFT c2r", + "Gather", +}; + gmx_bool wallcycle_have_counter(void) { return gmx_cycles_have_counter(); @@ -717,9 +729,10 @@ static void print_header(FILE *fplog, int nrank_pp, int nth_pp, int nrank_pme, i void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int npme, int nth_pp, int nth_pme, double realtime, gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum, - struct gmx_wallclock_gpu_t *gpu_t) + const gmx_wallclock_gpu_nbnxn_t *gpu_nbnxn_t, + const gmx_wallclock_gpu_pme_t *gpu_pme_t) { - double tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k; + double tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio; double c2t, c2t_pp, c2t_pme = 0; int i, j, npp, nth_tot; char buf[STRLEN]; @@ -876,25 +889,31 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np } /* print GPU timing summary */ - if (gpu_t) + double tot_gpu = 0.0; + if (gpu_pme_t) + { + for (size_t k = 0; k < gtPME_EVENT_COUNT; k++) + { + tot_gpu += gpu_pme_t->timing[k].t; + } + } + if (gpu_nbnxn_t) { const char *k_log_str[2][2] = { {"Nonbonded F kernel", "Nonbonded F+ene k."}, {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."} }; - - tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t; + tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t; /* add up the kernel timings */ - tot_k = 0.0; for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { - tot_k += gpu_t->ktime[i][j].t; + tot_gpu += gpu_nbnxn_t->ktime[i][j].t; } } - tot_gpu += tot_k + gpu_t->pruneTime.t; + tot_gpu += gpu_nbnxn_t->pruneTime.t; tot_cpu_overlap = wc->wcc[ewcFORCE].c; if (wc->wcc[ewcPMEMESH].n > 0) @@ -907,44 +926,57 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np fprintf(fplog, " Computing: Count Wall t (s) ms/step %c\n", '%'); fprintf(fplog, "%s\n", hline); print_gputimes(fplog, "Pair list H2D", - gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu); + gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu); print_gputimes(fplog, "X / q H2D", - gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu); + gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu); for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { - if (gpu_t->ktime[i][j].c) + if (gpu_nbnxn_t->ktime[i][j].c) { print_gputimes(fplog, k_log_str[i][j], - gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu); + gpu_nbnxn_t->ktime[i][j].c, gpu_nbnxn_t->ktime[i][j].t, tot_gpu); } } } - if (gpu_t->pruneTime.c) + if (gpu_pme_t) { - print_gputimes(fplog, "Pruning kernel", gpu_t->pruneTime.c, gpu_t->pruneTime.t, tot_gpu); + for (size_t k = 0; k < gtPME_EVENT_COUNT; k++) + { + if (gpu_pme_t->timing[k].c) + { + print_gputimes(fplog, PMEStageNames[k], + gpu_pme_t->timing[k].c, + gpu_pme_t->timing[k].t, + tot_gpu); + } + } } - print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu); + if (gpu_nbnxn_t->pruneTime.c) + { + print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu); + } + print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu); fprintf(fplog, "%s\n", hline); - print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu); + print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, 
tot_gpu); fprintf(fplog, "%s\n", hline); - if (gpu_t->dynamicPruneTime.c) + if (gpu_nbnxn_t->dynamicPruneTime.c) { /* We print the dynamic pruning kernel timings after a separator * and avoid adding it to tot_gpu as this is not in the force * overlap. We print the fraction as relative to the rest. */ - print_gputimes(fplog, "*Dynamic pruning", gpu_t->dynamicPruneTime.c, gpu_t->dynamicPruneTime.t, tot_gpu); + print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c, gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu); fprintf(fplog, "%s\n", hline); } - gpu_cpu_ratio = tot_gpu/tot_cpu_overlap; - if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0) + if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0) { + // FIXME the code below is not updated for PME on GPU fprintf(fplog, "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = %.3f\n", - tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n, + tot_gpu/gpu_nbnxn_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n, gpu_cpu_ratio); } diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index 8ad165514a..01cec41b8f 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -52,13 +52,12 @@ enum { ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU, ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH, ewcPME_REDISTXF, ewcPME_SPREAD, ewcPME_GATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE, - ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS, + ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_PME_SPREAD, ewcWAIT_GPU_PME_GATHER, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS, ewcVSITESPREAD, ewcPULLPOT, ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD, ewcTEST, ewcNR }; - enum { ewcsDD_REDIST, ewcsDD_GRID, ewcsDD_SETUPCOMM, ewcsDD_MAKETOP, ewcsDD_MAKECONSTR, ewcsDD_TOPOTHER, diff --git a/src/gromacs/timing/wallcyclereporting.h b/src/gromacs/timing/wallcyclereporting.h index e84243fe8c..632ea53d29 100644 --- a/src/gromacs/timing/wallcyclereporting.h +++ b/src/gromacs/timing/wallcyclereporting.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2008, The GROMACS development team. - * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -54,7 +54,8 @@ class MDLogger; } typedef struct gmx_wallcycle *gmx_wallcycle_t; -typedef struct gmx_wallclock_gpu_t gmx_wallclock_gpu_t; +struct gmx_wallclock_gpu_nbnxn_t; +struct gmx_wallclock_gpu_pme_t; typedef std::array WallcycleCounts; /* Convenience typedef */ @@ -66,7 +67,8 @@ WallcycleCounts wallcycle_sum(struct t_commrec *cr, gmx_wallcycle_t wc); void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int npme, int nth_pp, int nth_pme, double realtime, gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum, - struct gmx_wallclock_gpu_t *gpu_t); + const gmx_wallclock_gpu_nbnxn_t *gpu_nbnxn_t, + const gmx_wallclock_gpu_pme_t *gpu_pme_t); /* Print the cycle and time accounting */ #endif diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp index 70f87b5390..e96e080667 100644 --- a/src/programs/mdrun/md.cpp +++ b/src/programs/mdrun/md.cpp @@ -163,7 +163,8 @@ static void reset_all_counters(FILE *fplog, const gmx::MDLogger &mdlog, t_commre gmx_int64_t *step_rel, t_inputrec *ir, gmx_wallcycle_t wcycle, t_nrnb *nrnb, gmx_walltime_accounting_t walltime_accounting, - struct nonbonded_verlet_t *nbv) + struct nonbonded_verlet_t *nbv, + struct gmx_pme_t *pme) { char sbuf[STEPSTRSIZE]; @@ -175,6 +176,15 @@ static void reset_all_counters(FILE *fplog, const gmx::MDLogger &mdlog, t_commre if (use_GPU(nbv)) { nbnxn_gpu_reset_timings(nbv); + } + + if (pme_gpu_task_enabled(pme)) + { + pme_gpu_reset_timings(pme); + } + + if (use_GPU(nbv) || pme_gpu_task_enabled(pme)) + { resetGpuProfiler(); } @@ -1861,7 +1871,7 @@ double gmx::do_md(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, "mdrun -resetstep.", step); } reset_all_counters(fplog, mdlog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting, - use_GPU(fr->nbv) ? fr->nbv : nullptr); + use_GPU(fr->nbv) ? fr->nbv : nullptr, fr->pmedata); wcycle_set_reset_counters(wcycle, -1); if (!(cr->duty & DUTY_PME)) { diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp index 9255754315..316c2e3f28 100644 --- a/src/programs/mdrun/runner.cpp +++ b/src/programs/mdrun/runner.cpp @@ -1080,11 +1080,14 @@ int Mdrunner::mdrunner() { try { + gmx_device_info_t *pmeGpuInfo = nullptr; + auto runMode = PmeRunMode::CPU; status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec, mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed, mdrunOptions.reproducible, ewaldcoeff_q, ewaldcoeff_lj, - nthreads_pme); + nthreads_pme, + runMode, nullptr, pmeGpuInfo, mdlog); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; if (status != 0) @@ -1186,6 +1189,7 @@ int Mdrunner::mdrunner() finish_run(fplog, mdlog, cr, inputrec, nrnb, wcycle, walltime_accounting, fr ? fr->nbv : nullptr, + fr ? fr->pmedata : nullptr, EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr)); // Free PME data -- 2.22.0
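The per-warp theta/dtheta layout documented in pme.cuh above boils down to a bit of integer arithmetic. The following standalone host-side sketch is not part of the patch; the constants and variable names are illustrative and simply mirror the example in that comment (PME order 4, warp size 32).

// Standalone illustration (not GROMACS code): the sizing arithmetic behind
// PME_SPREADGATHER_THREADS_PER_ATOM and PME_SPREADGATHER_ATOMS_PER_WARP
// for the example used in the pme.cuh layout comment.
#include <cstdio>

int main()
{
    const int order              = 4;
    const int warpSize           = 32;                        // warp_size from cuda_arch_utils.cuh
    const int threadsPerAtom     = order * order;             // 16 threads handle one atom
    const int atomsPerWarp       = warpSize / threadsPerAtom; // 2 atoms per warp/data chunk
    const int thetaFloatsPerWarp = atomsPerWarp * order * 3;  // 24 floats for theta (same for dtheta)
    std::printf("threads/atom: %d, atoms/warp: %d, theta floats/warp: %d, theta+dtheta: %d\n",
                threadsPerAtom, atomsPerWarp, thetaFloatsPerWarp, 2 * thetaFloatsPerWarp);
    return 0;
}

This reproduces the numbers quoted in the comment: 24 floats per warp for theta, 48 (1.5 warp sizes) for theta and dtheta together.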
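pme.cuh also states that pme_gpu_check_atom_data_index() and pme_gpu_check_atom_charge() are called from the spline_and_spread and gather kernels, which live in separate files not shown here. A minimal, hypothetical kernel fragment of that guard pattern is sketched below; the kernel name, parameters, and the trivial store are assumptions for illustration only, and the fragment assumes pme.cuh is on the include path.

// Hypothetical usage sketch (not part of the patch): how a spread/gather-style
// kernel is expected to use the inline guards defined in pme.cuh.
#include "pme.cuh"

__global__ void exampleSpreadGuard(const float *gm_coefficients, int nAtoms, float *gm_result)
{
    const int atomIndex = blockIdx.x * blockDim.x + threadIdx.x;
    /* Out-of-range atoms are skipped, unless c_usePadding guarantees that the
     * padded entries are valid to read (in which case the check compiles away). */
    if (!pme_gpu_check_atom_data_index(atomIndex, nAtoms))
    {
        return;
    }
    const float coefficient = gm_coefficients[atomIndex];
    /* Optionally skip neutral atoms - they contribute nothing to the grid. */
    if (!pme_gpu_check_atom_charge(coefficient))
    {
        return;
    }
    /* The spline computation and grid spreading would follow here. */
    gm_result[atomIndex] = coefficient;
}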
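Finally, the new gmx_wallclock_gpu_pme_t structure is just a fixed array of per-stage time/call counters indexed by the gtPME_* enum, and wallcycle_print() reduces it to a total before printing. A host-side paraphrase of that loop is below; the helper function name is not part of the patch.

// Paraphrase of the accumulation done in wallcycle_print(): sum the
// per-stage PME GPU timings into a single total.
#include "gromacs/timing/gpu_timing.h"

static double pmeGpuTotalTime(const gmx_wallclock_gpu_pme_t &pmeTimings)
{
    double total = 0.0;
    for (int k = 0; k < gtPME_EVENT_COUNT; k++)
    {
        total += pmeTimings.timing[k].t; // accumulated time of stage k
    }
    return total;
}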