From 9ce8e5c8c9848b0ff4ca0ce05ef96896fa40b3eb Mon Sep 17 00:00:00 2001
From: Mark Abraham <mark.j.abraham@gmail.com>
Date: Tue, 14 Jan 2020 16:53:26 +0100
Subject: [PATCH] Make nbnxm headers more self-contained

Resolves existing TODO, removing a dependency of the CPU kernels
on config.h

Eliminates an unnecessary function, and the inlining of a function
that is only called every NS step.

Renamed gmx_nbnxn_*_t to gmx_nbnxm_gpu_t because there was no
advantage to the previous scheme. Each GPU build configuration only
ever uses one type, so there is no problem having different
declarations in different build configurations. The compiler would
complain if there was. The renaming of the former collection of types
in gpu_types.h created a dependency on config.h which should be
avoided.

Something about the renaming of gmx_nbnxm_cuda_t broke the way Doxygen
understood nbnxm_cuda_types.h. I thought that declaring a Doxygen
header for atomdata.h had fixed it, somehow, but it didn't, so there
is a suppression for the suspected false positive. Also fixed the
Doxygen in atomdata.h

Change-Id: I18c573ad87cbab9ae4f38aa49541f0821dc1c145
---
 docs/doxygen/suppressions.txt                 |   3 +
 src/gromacs/mdlib/sim_util.cpp                |   1 +
 src/gromacs/nbnxm/atomdata.cpp                |   7 +-
 src/gromacs/nbnxm/atomdata.h                  | 137 +++++++++++-------
 src/gromacs/nbnxm/clusterdistancekerneltype.h |   4 +-
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          |  14 +-
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        |  52 +++----
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h     |  96 ++++++------
 src/gromacs/nbnxm/gpu_common.h                |  12 +-
 src/gromacs/nbnxm/gpu_common_utils.h          |   4 +-
 src/gromacs/nbnxm/gpu_data_mgmt.h             |  39 +++--
 src/gromacs/nbnxm/gpu_jit_support.h           |   6 +-
 src/gromacs/nbnxm/gpu_types.h                 |  65 ---------
 src/gromacs/nbnxm/grid.cpp                    |   2 +-
 src/gromacs/nbnxm/gridset.cpp                 |   3 +-
 src/gromacs/nbnxm/gridsetdata.h               |   3 +-
 src/gromacs/nbnxm/kernel_common.h             |   2 +-
 src/gromacs/nbnxm/kerneldispatch.cpp          |   1 +
 src/gromacs/nbnxm/nbnxm.cpp                   |  18 ++-
 src/gromacs/nbnxm/nbnxm.h                     |  28 ++--
 src/gromacs/nbnxm/nbnxm_gpu.h                 |  32 ++--
 src/gromacs/nbnxm/nbnxm_setup.cpp             |   9 +-
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp        |  10 +-
 .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp      |  38 +++--
 .../nbnxm/opencl/nbnxm_ocl_jit_support.cpp    |   2 +-
 src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h    |   2 +-
 src/gromacs/nbnxm/pairlist.cpp                |   2 +-
 src/gromacs/nbnxm/pairsearch.h                |   2 +-
 src/gromacs/nbnxm/prunekerneldispatch.cpp     |   3 +-
 29 files changed, 287 insertions(+), 310 deletions(-)
 delete mode 100644 src/gromacs/nbnxm/gpu_types.h

diff --git a/docs/doxygen/suppressions.txt b/docs/doxygen/suppressions.txt
index ab296f5250..8f3a2dd479 100644
--- a/docs/doxygen/suppressions.txt
+++ b/docs/doxygen/suppressions.txt
@@ -34,6 +34,9 @@ src/gromacs/nbnxm/pairlist_simd_4xm.h: warning: should include "simd.h"
 src/gromacs/nbnxm/kernels_simd_2xmm/kernel_common.h: warning: should include "nbnxm_simd.h"
 src/gromacs/nbnxm/kernels_simd_4xm/kernel_common.h: warning: should include "nbnxm_simd.h"
 
+# This seems to be a false positive
+src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h: error: gmx_nbnxm_gpu_t: is in internal file(s), but appears in public documentation
+
 # Temporary while we change the SIMD implementation
 src/gromacs/simd/impl_sparc64_hpc_ace/impl_sparc64_hpc_ace_common.h: warning: should include "simd.h"
 
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 934714a2a0..868ef05a4a 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -91,6 +91,7 @@
 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/mshift.h"
 #include "gromacs/pbcutil/pbc.h"
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index 07684c9009..8fd60a8f03 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 2012-2018, The GROMACS development team.
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -67,6 +67,7 @@
 #include "grid.h"
 #include "gridset.h"
 #include "nbnxm_geometry.h"
+#include "nbnxm_gpu.h"
 #include "pairlist.h"
 
 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
@@ -1073,7 +1074,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet&   gridSet,
 void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
                                     const gmx::AtomLocality locality,
                                     bool                    fillLocal,
-                                    gmx_nbnxn_gpu_t*        gpu_nbv,
+                                    gmx_nbnxm_gpu_t*        gpu_nbv,
                                     DeviceBuffer<float>     d_x,
                                     GpuEventSynchronizer*   xReadyOnDevice)
 {
@@ -1462,7 +1463,7 @@ void reduceForcesGpu(const gmx::AtomLocality                    locality,
                      const Nbnxm::GridSet&                      gridSet,
                      void*                                      pmeForcesDevice,
                      gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
-                     gmx_nbnxn_gpu_t*                           gpu_nbv,
+                     gmx_nbnxm_gpu_t*                           gpu_nbv,
                      bool                                       useGpuFPmeReduction,
                      bool                                       accumulateForce)
 {
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index 4aa2f72f23..68c25e5933 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -33,6 +33,15 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
+/*! \libinternal \file
+ *  \brief
+ *  Functionality for per-atom data in the nbnxm module
+ *
+ *  \author Berk Hess <hess@kth.se>
+ *  \ingroup module_nbnxm
+ *  \inlibraryapi
+ */
+
 
 #ifndef GMX_NBNXN_ATOMDATA_H
 #define GMX_NBNXN_ATOMDATA_H
@@ -47,13 +56,12 @@
 #include "gromacs/utility/bitmask.h"
 #include "gromacs/utility/real.h"
 
-#include "gpu_types.h"
-
 namespace gmx
 {
 class MDLogger;
 }
 
+struct gmx_nbnxm_gpu_t;
 struct nbnxn_atomdata_t;
 struct nonbonded_verlet_t;
 struct t_mdatoms;
@@ -67,7 +75,7 @@ class GridSet;
 enum class KernelType;
 } // namespace Nbnxm
 
-/* Convenience type for vector with aligned memory */
+//! Convenience type for vector with aligned memory
 template<typename T>
 using AlignedVector = std::vector<T, gmx::AlignedAllocator<T>>;
 
@@ -122,8 +130,9 @@ struct nbnxn_atomdata_output_t
     AlignedVector<real>   VSc;    // Temporary SIMD Coulomb group energy storage
 };
 
-/* Block size in atoms for the non-bonded thread force-buffer reduction,
- * should be a multiple of all cell and x86 SIMD sizes (i.e. 2, 4 and 8).
+/*! \brief Block size in atoms for the non-bonded thread force-buffer reduction.
+ *
+ * Should be a multiple of all cell and x86 SIMD sizes (i.e. 2, 4 and 8).
  * Should be small to reduce the reduction and zeroing cost,
  * but too small will result in overhead.
  * Currently the block size is NBNXN_BUFFERFLAG_SIZE*3*sizeof(real)=192 bytes.
@@ -134,20 +143,24 @@ struct nbnxn_atomdata_output_t
 #    define NBNXN_BUFFERFLAG_SIZE 16
 #endif
 
-/* We store the reduction flags as gmx_bitmask_t.
+/*! \brief We store the reduction flags as gmx_bitmask_t.
  * This limits the number of flags to BITMASK_SIZE.
  */
 #define NBNXN_BUFFERFLAG_MAX_THREADS (BITMASK_SIZE)
 
-/* Flags for telling if threads write to force output buffers */
+/*! \internal
+ * \brief Flags for telling if threads write to force output buffers */
 typedef struct
 {
-    int            nflag;       /* The number of flag blocks                         */
-    gmx_bitmask_t* flag;        /* Bit i is set when thread i writes to a cell-block */
-    int            flag_nalloc; /* Allocation size of cxy_flag                       */
+    //! The number of flag blocks
+    int nflag;
+    //! Bit i is set when thread i writes to a cell-block
+    gmx_bitmask_t* flag;
+    //! Allocation size of cxy_flag
+    int flag_nalloc;
 } nbnxn_buffer_flags_t;
 
-/* LJ combination rules: geometric, Lorentz-Berthelot, none */
+/*! \brief LJ combination rules: geometric, Lorentz-Berthelot, none */
 enum
 {
     ljcrGEOM,
@@ -156,7 +169,8 @@ enum
     ljcrNR
 };
 
-/* Struct that stores atom related data for the nbnxn module
+/*! \internal
+ * \brief Struct that stores atom related data for the nbnxn module
  *
  * Note: performance would improve slightly when all std::vector containers
  *       in this struct would not initialize during resize().
@@ -195,86 +209,103 @@ struct nbnxn_atomdata_t
         gmx::HostVector<int> energrp;
     };
 
-    // Diagonal and topology exclusion helper data for all SIMD kernels
+    /*! \internal
+     * \brief Diagonal and topology exclusion helper data for all SIMD kernels. */
     struct SimdMasks
     {
         SimdMasks();
 
-        // Helper data for setting up diagonal exclusion masks in the SIMD 4xN kernels
+        //! Helper data for setting up diagonal exclusion masks in the SIMD 4xN kernels
         AlignedVector<real> diagonal_4xn_j_minus_i;
-        // Helper data for setting up diaginal exclusion masks in the SIMD 2xNN kernels
+        //! Helper data for setting up diaginal exclusion masks in the SIMD 2xNN kernels
         AlignedVector<real> diagonal_2xnn_j_minus_i;
-        // Filters for topology exclusion masks for the SIMD kernels
+        //! Filters for topology exclusion masks for the SIMD kernels
         AlignedVector<uint32_t> exclusion_filter;
-        // Filters for topology exclusion masks for double SIMD kernels without SIMD int32 logical support
+        //! Filters for topology exclusion masks for double SIMD kernels without SIMD int32 logical support
         AlignedVector<uint64_t> exclusion_filter64;
-        // Array of masks needed for exclusions
+        //! Array of masks needed for exclusions
         AlignedVector<real> interaction_array;
     };
 
-    /* Constructor
+    /*! \brief Constructor
      *
      * \param[in] pinningPolicy  Sets the pinning policy for all data that might be transfered to a GPU
      */
     nbnxn_atomdata_t(gmx::PinningPolicy pinningPolicy);
 
-    /* Returns a const reference to the parameters */
+    //! Returns a const reference to the parameters
     const Params& params() const { return params_; }
 
-    /* Returns a non-const reference to the parameters */
+    //! Returns a non-const reference to the parameters
     Params& paramsDeprecated() { return params_; }
 
-    /* Returns the current total number of atoms stored */
+    //! Returns the current total number of atoms stored
     int numAtoms() const { return numAtoms_; }
 
-    /* Return the coordinate buffer, and q with xFormat==nbatXYZQ */
+    //! Return the coordinate buffer, and q with xFormat==nbatXYZQ
     gmx::ArrayRef<const real> x() const { return x_; }
 
-    /* Return the coordinate buffer, and q with xFormat==nbatXYZQ */
+    //! Return the coordinate buffer, and q with xFormat==nbatXYZQ
     gmx::ArrayRef<real> x() { return x_; }
 
-    /* Resizes the coordinate buffer and sets the number of atoms */
+    //! Resizes the coordinate buffer and sets the number of atoms
     void resizeCoordinateBuffer(int numAtoms);
 
-    /* Resizes the force buffers for the current number of atoms */
+    //! Resizes the force buffers for the current number of atoms
     void resizeForceBuffers();
 
 private:
-    // The LJ and charge parameters
+    //! The LJ and charge parameters
     Params params_;
-    // The total number of atoms currently stored
+    //! The total number of atoms currently stored
     int numAtoms_;
 
 public:
-    int                        natoms_local; /* Number of local atoms                           */
-    int                        XFormat;     /* The format of x (and q), enum                      */
-    int                        FFormat;     /* The format of f, enum                              */
-    gmx_bool                   bDynamicBox; /* Do we need to update shift_vec every step?    */
-    gmx::HostVector<gmx::RVec> shift_vec;   /* Shift vectors, copied from t_forcerec              */
-    int                        xstride;     /* stride for a coordinate in x (usually 3 or 4)      */
-    int                        fstride;     /* stride for a coordinate in f (usually 3 or 4)      */
+    //! Number of local atoms
+    int natoms_local;
+    //! The format of x (and q), enum
+    int XFormat;
+    //! The format of f, enum
+    int FFormat;
+    //! Do we need to update shift_vec every step?
+    gmx_bool bDynamicBox;
+    //! Shift vectors, copied from t_forcerec
+    gmx::HostVector<gmx::RVec> shift_vec;
+    //! stride for a coordinate in x (usually 3 or 4)
+    int xstride;
+    //! stride for a coordinate in f (usually 3 or 4)
+    int fstride;
+
 private:
-    gmx::HostVector<real> x_; /* x and possibly q, size natoms*xstride              */
+    //! x and possibly q, size natoms*xstride
+    gmx::HostVector<real> x_;
 
 public:
-    // Masks for handling exclusions in the SIMD kernels
+    //! Masks for handling exclusions in the SIMD kernels
     const SimdMasks simdMasks;
 
-    /* Output data */
-    std::vector<nbnxn_atomdata_output_t> out; /* Output data structures, 1 per thread */
-
-    /* Reduction related data */
-    gmx_bool             bUseBufferFlags; /* Use the flags or operate on all atoms     */
-    nbnxn_buffer_flags_t buffer_flags;    /* Flags for buffer zeroing+reduc.  */
-    gmx_bool             bUseTreeReduce;  /* Use tree for force reduction */
-    tMPI_Atomic*         syncStep;        /* Synchronization step for tree reduce */
+    //! Output data structures, 1 per thread
+    std::vector<nbnxn_atomdata_output_t> out;
+
+    //! Reduction related data
+    //! \{
+    //! Use the flags or operate on all atoms
+    gmx_bool bUseBufferFlags;
+    //! Flags for buffer zeroing+reduc.
+    nbnxn_buffer_flags_t buffer_flags;
+    //! Use tree for force reduction
+    gmx_bool bUseTreeReduce;
+    //! Synchronization step for tree reduce
+    tMPI_Atomic* syncStep;
+    //! \}
 };
 
-/* Copy na rvec elements from x to xnb using nbatFormat, start dest a0,
+/*! \brief Copy na rvec elements from x to xnb using nbatFormat, start dest a0,
  * and fills up to na_round with coordinates that are far away.
  */
 void copy_rvec_to_nbat_real(const int* a, int na, int na_round, const rvec* x, int nbatFormat, real* xnb, int a0);
 
+//! Describes the combination rule in use by this force field
 enum
 {
     enbnxninitcombruleDETECT,
@@ -283,7 +314,8 @@ enum
     enbnxninitcombruleNONE
 };
 
-/* Initialize the non-bonded atom data structure.
+/*! \brief Initialize the non-bonded atom data structure.
+ *
  * The enum for nbatXFormat is in the file defining nbnxn_atomdata_t.
  * Copy the ntypes*ntypes*2 sized nbfp non-bonded parameter list
  * to the atom data structure.
@@ -298,12 +330,13 @@ void nbnxn_atomdata_init(const gmx::MDLogger&      mdlog,
                          int                       n_energygroups,
                          int                       nout);
 
+//! Sets the atomdata after pair search
 void nbnxn_atomdata_set(nbnxn_atomdata_t*     nbat,
                         const Nbnxm::GridSet& gridSet,
                         const t_mdatoms*      mdatoms,
                         const int*            atinfo);
 
-/* Copy the shift vectors to nbat */
+//! Copy the shift vectors to nbat
 void nbnxn_atomdata_copy_shiftvec(gmx_bool dynamic_box, rvec* shift_vec, nbnxn_atomdata_t* nbat);
 
 /*! \brief Transform coordinates to xbat layout
@@ -337,7 +370,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet& gridSet,
 void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet,
                                     gmx::AtomLocality     locality,
                                     bool                  fillLocal,
-                                    gmx_nbnxn_gpu_t*      gpu_nbv,
+                                    gmx_nbnxm_gpu_t*      gpu_nbv,
                                     DeviceBuffer<float>   d_x,
                                     GpuEventSynchronizer* xReadyOnDevice);
 
@@ -366,14 +399,14 @@ void reduceForcesGpu(gmx::AtomLocality                          locality,
                      const Nbnxm::GridSet&                      gridSet,
                      void*                                      pmeForcesDevice,
                      gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
-                     gmx_nbnxn_gpu_t*                           gpu_nbv,
+                     gmx_nbnxm_gpu_t*                           gpu_nbv,
                      bool                                       useGpuFPmeReduction,
                      bool                                       accumulateForce);
 
-/* Add the fshift force stored in nbat to fshift */
+//! Add the fshift force stored in nbat to fshift
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t& nbat, gmx::ArrayRef<gmx::RVec> fshift);
 
-/* Get the atom start index and number of atoms for a given locality */
+//! Get the atom start index and number of atoms for a given locality
 void nbnxn_get_atom_range(gmx::AtomLocality     atomLocality,
                           const Nbnxm::GridSet& gridSet,
                           int*                  atomStart,
diff --git a/src/gromacs/nbnxm/clusterdistancekerneltype.h b/src/gromacs/nbnxm/clusterdistancekerneltype.h
index 43ce747dad..c7f680ab0c 100644
--- a/src/gromacs/nbnxm/clusterdistancekerneltype.h
+++ b/src/gromacs/nbnxm/clusterdistancekerneltype.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -45,10 +45,10 @@
 #ifndef GMX_NBNXM_CLUSTERDISTANCEKERNELTYPE_H
 #define GMX_NBNXM_CLUSTERDISTANCEKERNELTYPE_H
 
+#include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/simd/simd.h"
 #include "gromacs/utility/gmxassert.h"
 
-#include "atomdata.h"
 #include "pairlistparams.h"
 
 //! The types of kernel for calculating the distance between pairs of atom clusters
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 11a67cdae4..57504d06bb 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -363,7 +363,7 @@ static inline int calc_shmem_required_nonbonded(const int               num_thre
  *  the local, this function records the event if called with the local stream as
  *  argument and inserts in the GPU stream a wait on the event on the nonlocal.
  */
-void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t* nb, const InteractionLocality interactionLocality)
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxm_gpu_t* nb, const InteractionLocality interactionLocality)
 {
     cudaStream_t stream = nb->stream[interactionLocality];
 
@@ -389,7 +389,7 @@ void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t* nb, const Interact
 }
 
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
-void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
+void gpu_copy_xq_to_gpu(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -477,7 +477,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t* nb, const nbnxn_atomdata_t* nbatom, co
    the local x+q H2D (and all preceding) tasks are complete and synchronize
    with this event in the non-local stream before launching the non-bonded kernel.
  */
-void gpu_launch_kernel(gmx_nbnxn_cuda_t* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc)
+void gpu_launch_kernel(gmx_nbnxm_gpu_t* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc)
 {
     cu_atomdata_t* adat   = nb->atdat;
     cu_nbparam_t*  nbp    = nb->nbparam;
@@ -589,7 +589,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z)
     return shmem;
 }
 
-void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t* nb, const InteractionLocality iloc, const int numParts)
+void gpu_launch_kernel_pruneonly(gmx_nbnxm_gpu_t* nb, const InteractionLocality iloc, const int numParts)
 {
     cu_atomdata_t* adat   = nb->atdat;
     cu_nbparam_t*  nbp    = nb->nbparam;
@@ -713,7 +713,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t* nb, const InteractionLocality
     }
 }
 
-void gpu_launch_cpyback(gmx_nbnxn_cuda_t*        nb,
+void gpu_launch_cpyback(gmx_nbnxm_gpu_t*         nb,
                         nbnxn_atomdata_t*        nbatom,
                         const gmx::StepWorkload& stepWork,
                         const AtomLocality       atomLocality)
@@ -817,7 +817,7 @@ void cuda_set_cacheconfig()
 /* X buffer operations on GPU: performs conversion from rvec to nb format. */
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
                            bool                      setFillerCoords,
-                           gmx_nbnxn_gpu_t*          nb,
+                           gmx_nbnxm_gpu_t*          nb,
                            DeviceBuffer<float>       d_x,
                            GpuEventSynchronizer*     xReadyOnDevice,
                            const Nbnxm::AtomLocality locality,
@@ -884,7 +884,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
  */
 void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLocality,
                                DeviceBuffer<float>                        totalForcesDevice,
-                               gmx_nbnxn_gpu_t*                           nb,
+                               gmx_nbnxm_gpu_t*                           nb,
                                void*                                      pmeForcesDevice,
                                gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
                                int                                        atomStart,
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 5aea844726..2714dfee8d 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -45,7 +45,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t
+// TODO We would like to move this down, but the way gmx_nbnxm_gpu_t
 //      is currently declared means this has to be before gpu_types.h
 #include "nbnxm_cuda_types.h"
 
@@ -89,7 +89,7 @@ namespace Nbnxm
 static unsigned int gpu_min_ci_balanced_factor = 44;
 
 /* Fw. decl. */
-static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t* nb);
+static void nbnxn_cuda_clear_e_fshift(gmx_nbnxm_gpu_t* nb);
 
 /* Fw. decl, */
 static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t* nbparam);
@@ -400,7 +400,7 @@ static void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
 }
 
 /*! Initializes simulation constant data. */
-static void cuda_init_const(gmx_nbnxn_cuda_t*               nb,
+static void cuda_init_const(gmx_nbnxm_gpu_t*                nb,
                             const interaction_const_t*      ic,
                             const PairlistParams&           listParams,
                             const nbnxn_atomdata_t::Params& nbatParams)
@@ -412,16 +412,16 @@ static void cuda_init_const(gmx_nbnxn_cuda_t*               nb,
     nbnxn_cuda_clear_e_fshift(nb);
 }
 
-gmx_nbnxn_cuda_t* gpu_init(const gmx_device_info_t*   deviceInfo,
-                           const interaction_const_t* ic,
-                           const PairlistParams&      listParams,
-                           const nbnxn_atomdata_t*    nbat,
-                           int /*rank*/,
-                           gmx_bool bLocalAndNonlocal)
+gmx_nbnxm_gpu_t* gpu_init(const gmx_device_info_t*   deviceInfo,
+                          const interaction_const_t* ic,
+                          const PairlistParams&      listParams,
+                          const nbnxn_atomdata_t*    nbat,
+                          int /*rank*/,
+                          gmx_bool bLocalAndNonlocal)
 {
     cudaError_t stat;
 
-    gmx_nbnxn_cuda_t* nb;
+    gmx_nbnxm_gpu_t* nb;
     snew(nb, 1);
     snew(nb->atdat, 1);
     snew(nb->nbparam, 1);
@@ -509,7 +509,7 @@ gmx_nbnxn_cuda_t* gpu_init(const gmx_device_info_t*   deviceInfo,
     return nb;
 }
 
-void gpu_init_pairlist(gmx_nbnxn_cuda_t* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
+void gpu_init_pairlist(gmx_nbnxm_gpu_t* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
 {
     char         sbuf[STRLEN];
     bool         bDoTime = (nb->bDoTime && !h_plist->sci.empty());
@@ -565,7 +565,7 @@ void gpu_init_pairlist(gmx_nbnxn_cuda_t* nb, const NbnxnPairlistGpu* h_plist, co
     d_plist->haveFreshList = true;
 }
 
-void gpu_upload_shiftvec(gmx_nbnxn_cuda_t* nb, const nbnxn_atomdata_t* nbatom)
+void gpu_upload_shiftvec(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom)
 {
     cu_atomdata_t* adat = nb->atdat;
     cudaStream_t   ls   = nb->stream[InteractionLocality::Local];
@@ -579,7 +579,7 @@ void gpu_upload_shiftvec(gmx_nbnxn_cuda_t* nb, const nbnxn_atomdata_t* nbatom)
 }
 
 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
-static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t* nb, int natoms_clear)
+static void nbnxn_cuda_clear_f(gmx_nbnxm_gpu_t* nb, int natoms_clear)
 {
     cudaError_t    stat;
     cu_atomdata_t* adat = nb->atdat;
@@ -590,7 +590,7 @@ static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t* nb, int natoms_clear)
 }
 
 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
-static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t* nb)
+static void nbnxn_cuda_clear_e_fshift(gmx_nbnxm_gpu_t* nb)
 {
     cudaError_t    stat;
     cu_atomdata_t* adat = nb->atdat;
@@ -604,7 +604,7 @@ static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t* nb)
     CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
 }
 
-void gpu_clear_outputs(gmx_nbnxn_cuda_t* nb, bool computeVirial)
+void gpu_clear_outputs(gmx_nbnxm_gpu_t* nb, bool computeVirial)
 {
     nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
     /* clear shift force array and energies if the outputs were
@@ -615,7 +615,7 @@ void gpu_clear_outputs(gmx_nbnxn_cuda_t* nb, bool computeVirial)
     }
 }
 
-void gpu_init_atomdata(gmx_nbnxn_cuda_t* nb, const nbnxn_atomdata_t* nbat)
+void gpu_init_atomdata(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbat)
 {
     cudaError_t    stat;
     int            nalloc, natoms;
@@ -702,7 +702,7 @@ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t* nbparam)
     }
 }
 
-void gpu_free(gmx_nbnxn_cuda_t* nb)
+void gpu_free(gmx_nbnxm_gpu_t* nb)
 {
     cudaError_t    stat;
     cu_atomdata_t* atdat;
@@ -798,7 +798,7 @@ void gpu_free(gmx_nbnxn_cuda_t* nb)
 }
 
 //! This function is documented in the header file
-gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(gmx_nbnxn_cuda_t* nb)
+gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(gmx_nbnxm_gpu_t* nb)
 {
     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
 }
@@ -811,38 +811,38 @@ void gpu_reset_timings(nonbonded_verlet_t* nbv)
     }
 }
 
-int gpu_min_ci_balanced(gmx_nbnxn_cuda_t* nb)
+int gpu_min_ci_balanced(gmx_nbnxm_gpu_t* nb)
 {
     return nb != nullptr ? gpu_min_ci_balanced_factor * nb->dev_info->prop.multiProcessorCount : 0;
 }
 
-gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t* nb)
+gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxm_gpu_t* nb)
 {
     return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
 }
 
-void* gpu_get_command_stream(gmx_nbnxn_gpu_t* nb, const InteractionLocality iloc)
+void* gpu_get_command_stream(gmx_nbnxm_gpu_t* nb, const InteractionLocality iloc)
 {
     assert(nb);
 
     return static_cast<void*>(&nb->stream[iloc]);
 }
 
-void* gpu_get_xq(gmx_nbnxn_gpu_t* nb)
+void* gpu_get_xq(gmx_nbnxm_gpu_t* nb)
 {
     assert(nb);
 
     return static_cast<void*>(nb->atdat->xq);
 }
 
-void* gpu_get_f(gmx_nbnxn_gpu_t* nb)
+void* gpu_get_f(gmx_nbnxm_gpu_t* nb)
 {
     assert(nb);
 
     return static_cast<void*>(nb->atdat->f);
 }
 
-rvec* gpu_get_fshift(gmx_nbnxn_gpu_t* nb)
+rvec* gpu_get_fshift(gmx_nbnxm_gpu_t* nb)
 {
     assert(nb);
 
@@ -851,7 +851,7 @@ rvec* gpu_get_fshift(gmx_nbnxn_gpu_t* nb)
 
 /* Initialization for X buffer operations on GPU. */
 /* TODO  Remove explicit pinning from host arrays from here and manage in a more natural way*/
-void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, gmx_nbnxn_gpu_t* gpu_nbv)
+void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, gmx_nbnxm_gpu_t* gpu_nbv)
 {
     cudaStream_t stream        = gpu_nbv->stream[InteractionLocality::Local];
     bool         bDoTime       = gpu_nbv->bDoTime;
@@ -937,7 +937,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, gmx_nbnxn_gpu_t*
 
 /* Initialization for F buffer operations on GPU. */
 void nbnxn_gpu_init_add_nbat_f_to_f(const int*                  cell,
-                                    gmx_nbnxn_gpu_t*            gpu_nbv,
+                                    gmx_nbnxm_gpu_t*            gpu_nbv,
                                     int                         natoms_total,
                                     GpuEventSynchronizer* const localReductionDone)
 {
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index d65d308c48..797a0b4bc4 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2012, The GROMACS development team.
- * Copyright (c) 2013-2019, by the GROMACS development team, led by
+ * Copyright (c) 2013-2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -223,84 +223,94 @@ typedef struct Nbnxm::gpu_timers_t cu_timers_t;
 
 class GpuEventSynchronizer;
 
-/** \internal
+/*! \internal
  * \brief Main data structure for CUDA nonbonded force calculations.
  */
-struct gmx_nbnxn_cuda_t
+struct gmx_nbnxm_gpu_t
 {
-    //! CUDA device information
+    /*! \brief CUDA device information */
     const gmx_device_info_t* dev_info;
-    //! true if doing both local/non-local NB work on GPU
+    /*! \brief true if doing both local/non-local NB work on GPU */
     bool bUseTwoStreams;
-    //! atom data
+    /*! \brief atom data */
     cu_atomdata_t* atdat;
-    //! f buf ops cell index mapping
+    /*! \brief f buf ops cell index mapping */
     int* cell;
-    //! number of indices in cell buffer
+    /*! \brief number of indices in cell buffer */
     int ncell;
-    //! number of indices allocated in cell buffer
+    /*! \brief number of indices allocated in cell buffer */
     int ncell_alloc;
-    //! array of atom indices
+    /*! \brief array of atom indices */
     int* atomIndices;
-    //! size of atom indices
+    /*! \brief size of atom indices */
     int atomIndicesSize;
-    //! size of atom indices allocated in device buffer
+    /*! \brief size of atom indices allocated in device buffer */
     int atomIndicesSize_alloc;
-    //! x buf ops num of atoms
+    /*! \brief x buf ops num of atoms */
     int* cxy_na;
-    //! number of elements in cxy_na
+    /*! \brief number of elements in cxy_na */
     int ncxy_na;
-    //! number of elements allocated allocated in device buffer
+    /*! \brief number of elements allocated allocated in device buffer */
     int ncxy_na_alloc;
-    //! x buf ops cell index mapping
+    /*! \brief x buf ops cell index mapping */
     int* cxy_ind;
-    //! number of elements in cxy_ind
+    /*! \brief number of elements in cxy_ind */
     int ncxy_ind;
-    //! number of elements allocated allocated in device buffer
+    /*! \brief number of elements allocated allocated in device buffer */
     int ncxy_ind_alloc;
-    //! parameters required for the non-bonded calc.
+    /*! \brief parameters required for the non-bonded calc. */
     cu_nbparam_t* nbparam;
-    //! pair-list data structures (local and non-local)
+    /*! \brief pair-list data structures (local and non-local) */
     gmx::EnumerationArray<Nbnxm::InteractionLocality, cu_plist_t*> plist;
-    //! staging area where fshift/energies get downloaded
+    /*! \brief staging area where fshift/energies get downloaded */
     nb_staging_t nbst;
-    //! local and non-local GPU streams
+    /*! \brief local and non-local GPU streams */
     gmx::EnumerationArray<Nbnxm::InteractionLocality, cudaStream_t> stream;
 
-    /** events used for synchronization */
-    cudaEvent_t nonlocal_done; /**< event triggered when the non-local non-bonded kernel
-                                  is done (and the local transfer can proceed)           */
-    cudaEvent_t misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
-                                                the local stream that need to precede the
-                                                non-local force or buffer operation calculations are
-                                                done (e.g. f buffer 0-ing, local x/q H2D, buffer op
-                                                initialization in local stream that is required also
-                                                by nonlocal stream ) */
-
-    //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled
-    //  to be executed in the current domain. As long as bonded work is not split up into
-    //  local/nonlocal, if there is bonded GPU work, both flags will be true.
+    /*! \brief Events used for synchronization */
+    /*! \{ */
+    /*! \brief Event triggered when the non-local non-bonded
+     * kernel is done (and the local transfer can proceed) */
+    cudaEvent_t nonlocal_done;
+    /*! \brief Event triggered when the tasks issued in the local
+     * stream that need to precede the non-local force or buffer
+     * operation calculations are done (e.g. f buffer 0-ing, local
+     * x/q H2D, buffer op initialization in local stream that is
+     * required also by nonlocal stream ) */
+    cudaEvent_t misc_ops_and_local_H2D_done;
+    /*! \} */
+
+    /*! \brief True if there is work for the current domain in the
+     * respective locality.
+     *
+     * This includes local/nonlocal GPU work, either bonded or
+     * nonbonded, scheduled to be executed in the current
+     * domain. As long as bonded work is not split up into
+     * local/nonlocal, if there is bonded GPU work, both flags
+     * will be true. */
     gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
 
-    /*! \brief Pointer to event synchronizer triggered when the local GPU buffer ops / reduction is complete
+    /*! \brief Pointer to event synchronizer triggered when the local
+     * GPU buffer ops / reduction is complete
      *
-     * \note That the synchronizer is managed outside of this module in StatePropagatorDataGpu.
+     * \note That the synchronizer is managed outside of this module
+     * in StatePropagatorDataGpu.
      */
     GpuEventSynchronizer* localFReductionDone;
 
-    GpuEventSynchronizer* xNonLocalCopyD2HDone; /**< event triggered when
-                                                   non-local coordinate buffer has been
-                                                   copied from device to host*/
+    /*! \brief Event triggered when non-local coordinate buffer
+     * has been copied from device to host. */
+    GpuEventSynchronizer* xNonLocalCopyD2HDone;
 
     /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
      * concurrent streams, so we won't time if both l/nl work is done on GPUs.
      * Timer init/uninit is still done even with timing off so only the condition
      * setting bDoTime needs to be change if this CUDA "feature" gets fixed. */
-    //! True if event-based timing is enabled.
+    /*! \brief True if event-based timing is enabled. */
     bool bDoTime;
-    //! CUDA event-based timers.
+    /*! \brief CUDA event-based timers. */
     cu_timers_t* timers;
-    //! Timing data. TODO: deprecate this and query timers for accumulated data instead
+    /*! \brief Timing data. TODO: deprecate this and query timers for accumulated data instead */
     gmx_wallclock_gpu_nbnxn_t* timings;
 };
 
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index d209f0319a..a8369ce2d2 100644
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -124,7 +124,7 @@ static inline InteractionLocality gpuAtomToInteractionLocality(const AtomLocalit
 
 
 //NOLINTNEXTLINE(misc-definitions-in-headers)
-void setupGpuShortRangeWork(gmx_nbnxn_gpu_t*               nb,
+void setupGpuShortRangeWork(gmx_nbnxm_gpu_t*               nb,
                             const gmx::GpuBonded*          gpuBonded,
                             const gmx::InteractionLocality iLocality)
 {
@@ -146,13 +146,13 @@ void setupGpuShortRangeWork(gmx_nbnxn_gpu_t*               nb,
  * \param[inout]  nb        Pointer to the nonbonded GPU data structure
  * \param[in]     iLocality Interaction locality identifier
  */
-static bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t& nb, const gmx::InteractionLocality iLocality)
+static bool haveGpuShortRangeWork(const gmx_nbnxm_gpu_t& nb, const gmx::InteractionLocality iLocality)
 {
     return nb.haveWork[iLocality];
 }
 
 //NOLINTNEXTLINE(misc-definitions-in-headers)
-bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t* nb, const gmx::AtomLocality aLocality)
+bool haveGpuShortRangeWork(const gmx_nbnxm_gpu_t* nb, const gmx::AtomLocality aLocality)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -362,7 +362,7 @@ static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings,
  * \todo Move into shared source file with gmx_compile_cpp_as_cuda
  */
 //NOLINTNEXTLINE(misc-definitions-in-headers)
-bool gpu_try_finish_task(gmx_nbnxn_gpu_t*         nb,
+bool gpu_try_finish_task(gmx_nbnxm_gpu_t*         nb,
                          const gmx::StepWorkload& stepWork,
                          const AtomLocality       aloc,
                          real*                    e_lj,
@@ -458,7 +458,7 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t*         nb,
  * \return            The number of cycles the gpu wait took
  */
 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
-float gpu_wait_finish_task(gmx_nbnxn_gpu_t*         nb,
+float gpu_wait_finish_task(gmx_nbnxm_gpu_t*         nb,
                            const gmx::StepWorkload& stepWork,
                            AtomLocality             aloc,
                            real*                    e_lj,
diff --git a/src/gromacs/nbnxm/gpu_common_utils.h b/src/gromacs/nbnxm/gpu_common_utils.h
index 4c3333d82a..176ab8f045 100644
--- a/src/gromacs/nbnxm/gpu_common_utils.h
+++ b/src/gromacs/nbnxm/gpu_common_utils.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2017,2019, by the GROMACS development team, led by
+ * Copyright (c) 2017,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -64,7 +64,7 @@ namespace Nbnxm
  * local part of the force array also depends on the non-local kernel.
  * The skip of the local kernel is taken care of separately.
  */
-static inline bool canSkipNonbondedWork(const gmx_nbnxn_gpu_t& nb, InteractionLocality iloc)
+static inline bool canSkipNonbondedWork(const gmx_nbnxm_gpu_t& nb, InteractionLocality iloc)
 {
     assert(nb.plist[iloc]);
     return (iloc == InteractionLocality::NonLocal && nb.plist[iloc]->nsci == 0);
diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h
index c93c536bec..2f504e91f8 100644
--- a/src/gromacs/nbnxm/gpu_data_mgmt.h
+++ b/src/gromacs/nbnxm/gpu_data_mgmt.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -50,21 +50,20 @@
 #include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/mdtypes/locality.h"
 
-#include "gpu_types.h"
-
-struct NbnxnPairlistGpu;
-struct nbnxn_atomdata_t;
-struct PairlistParams;
-struct gmx_wallclock_gpu_nbnxn_t;
+struct gmx_nbnxm_gpu_t;
 struct gmx_gpu_info_t;
 struct gmx_device_info_t;
+struct gmx_wallclock_gpu_nbnxn_t;
+struct nbnxn_atomdata_t;
+struct NbnxnPairlistGpu;
+struct PairlistParams;
 
 namespace Nbnxm
 {
 
 /** Initializes the data structures related to GPU nonbonded calculations. */
 GPU_FUNC_QUALIFIER
-gmx_nbnxn_gpu_t* gpu_init(const gmx_device_info_t gmx_unused* deviceInfo,
+gmx_nbnxm_gpu_t* gpu_init(const gmx_device_info_t gmx_unused* deviceInfo,
                           const interaction_const_t gmx_unused* ic,
                           const PairlistParams gmx_unused& listParams,
                           const nbnxn_atomdata_t gmx_unused* nbat,
@@ -74,13 +73,13 @@ gmx_nbnxn_gpu_t* gpu_init(const gmx_device_info_t gmx_unused* deviceInfo,
 
 /** Initializes pair-list data for GPU, called at every pair search step. */
 GPU_FUNC_QUALIFIER
-void gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused*   nb,
+void gpu_init_pairlist(gmx_nbnxm_gpu_t gmx_unused*   nb,
                        const struct NbnxnPairlistGpu gmx_unused* h_nblist,
                        gmx::InteractionLocality gmx_unused iloc) GPU_FUNC_TERM;
 
 /** Initializes atom-data on the GPU, called at every pair search step. */
 GPU_FUNC_QUALIFIER
-void gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused* nb, const nbnxn_atomdata_t gmx_unused* nbat) GPU_FUNC_TERM;
+void gpu_init_atomdata(gmx_nbnxm_gpu_t gmx_unused* nb, const nbnxn_atomdata_t gmx_unused* nbat) GPU_FUNC_TERM;
 
 /*! \brief Re-generate the GPU Ewald force table, resets rlist, and update the
  *  electrostatic type switching to twin cut-off (or back) if needed.
@@ -91,19 +90,19 @@ void gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused* nb
 
 /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
 GPU_FUNC_QUALIFIER
-void gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused* nb, const nbnxn_atomdata_t gmx_unused* nbatom) GPU_FUNC_TERM;
+void gpu_upload_shiftvec(gmx_nbnxm_gpu_t gmx_unused* nb, const nbnxn_atomdata_t gmx_unused* nbatom) GPU_FUNC_TERM;
 
 /** Clears GPU outputs: nonbonded force, shift force and energy. */
 GPU_FUNC_QUALIFIER
-void gpu_clear_outputs(gmx_nbnxn_gpu_t gmx_unused* nb, bool gmx_unused computeVirial) GPU_FUNC_TERM;
+void gpu_clear_outputs(gmx_nbnxm_gpu_t gmx_unused* nb, bool gmx_unused computeVirial) GPU_FUNC_TERM;
 
 /** Frees all GPU resources used for the nonbonded calculations. */
 GPU_FUNC_QUALIFIER
-void gpu_free(gmx_nbnxn_gpu_t gmx_unused* nb) GPU_FUNC_TERM;
+void gpu_free(gmx_nbnxm_gpu_t gmx_unused* nb) GPU_FUNC_TERM;
 
 /** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */
 GPU_FUNC_QUALIFIER
-struct gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused* nb)
+struct gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(gmx_nbnxm_gpu_t gmx_unused* nb)
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
 /** Resets nonbonded GPU timings. */
@@ -113,37 +112,37 @@ void gpu_reset_timings(struct nonbonded_verlet_t gmx_unused* nbv) GPU_FUNC_TERM;
 /** Calculates the minimum size of proximity lists to improve SM load balance
  *  with GPU non-bonded kernels. */
 GPU_FUNC_QUALIFIER
-int gpu_min_ci_balanced(gmx_nbnxn_gpu_t gmx_unused* nb) GPU_FUNC_TERM_WITH_RETURN(-1);
+int gpu_min_ci_balanced(gmx_nbnxm_gpu_t gmx_unused* nb) GPU_FUNC_TERM_WITH_RETURN(-1);
 
 /** Returns if analytical Ewald GPU kernels are used. */
 GPU_FUNC_QUALIFIER
-gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_gpu_t gmx_unused* nb)
+gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxm_gpu_t gmx_unused* nb)
         GPU_FUNC_TERM_WITH_RETURN(FALSE);
 
 /** Returns an opaque pointer to the GPU command stream
  *  Note: CUDA only.
  */
 CUDA_FUNC_QUALIFIER
-void* gpu_get_command_stream(gmx_nbnxn_gpu_t gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc)
+void* gpu_get_command_stream(gmx_nbnxm_gpu_t gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc)
         CUDA_FUNC_TERM_WITH_RETURN(nullptr);
 
 /** Returns an opaque pointer to the GPU coordinate+charge array
  *  Note: CUDA only.
  */
 CUDA_FUNC_QUALIFIER
-void* gpu_get_xq(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+void* gpu_get_xq(gmx_nbnxm_gpu_t gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
 
 /** Returns an opaque pointer to the GPU force array
  *  Note: CUDA only.
  */
 CUDA_FUNC_QUALIFIER
-void* gpu_get_f(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+void* gpu_get_f(gmx_nbnxm_gpu_t gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
 
 /** Returns an opaque pointer to the GPU shift force array
  *  Note: CUDA only.
  */
 CUDA_FUNC_QUALIFIER
-rvec* gpu_get_fshift(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+rvec* gpu_get_fshift(gmx_nbnxm_gpu_t gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
 
 } // namespace Nbnxm
 
diff --git a/src/gromacs/nbnxm/gpu_jit_support.h b/src/gromacs/nbnxm/gpu_jit_support.h
index b784d37ce8..3a5928d2bb 100644
--- a/src/gromacs/nbnxm/gpu_jit_support.h
+++ b/src/gromacs/nbnxm/gpu_jit_support.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015,2019, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -46,9 +46,9 @@
 
 #include "gromacs/utility/basedefinitions.h"
 
-#include "gpu_types.h"
+struct gmx_nbnxm_gpu_t;
 
 /*! \brief Handles any JIT compilation of nbnxn kernels for the selected device */
-OPENCL_FUNC_QUALIFIER void nbnxn_gpu_compile_kernels(gmx_nbnxn_gpu_t gmx_unused* nb) OPENCL_FUNC_TERM;
+OPENCL_FUNC_QUALIFIER void nbnxn_gpu_compile_kernels(gmx_nbnxm_gpu_t gmx_unused* nb) OPENCL_FUNC_TERM;
 
 #endif
diff --git a/src/gromacs/nbnxm/gpu_types.h b/src/gromacs/nbnxm/gpu_types.h
deleted file mode 100644
index 43039f7986..0000000000
--- a/src/gromacs/nbnxm/gpu_types.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014,2015,2018 by the GROMACS development team.
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \libinternal \file
- * \brief Sets gmx_nbnxn_gpu_t to the correct type depending on the build
- *
- * \ingroup module_nbnxm
- */
-
-#ifndef GMX_NBNXN_GPU_TYPES_H
-#define GMX_NBNXN_GPU_TYPES_H
-
-#include "config.h"
-
-#ifndef DOXYGEN
-
-#    if GMX_GPU == GMX_GPU_OPENCL
-struct gmx_nbnxn_ocl_t;
-using gmx_nbnxn_gpu_t = gmx_nbnxn_ocl_t;
-#    endif
-
-#    if GMX_GPU == GMX_GPU_CUDA
-struct gmx_nbnxn_cuda_t;
-using gmx_nbnxn_gpu_t = gmx_nbnxn_cuda_t;
-#    endif
-
-#    if GMX_GPU == GMX_GPU_NONE
-using gmx_nbnxn_gpu_t = int;
-#    endif
-
-#endif // !DOXYGEN
-
-#endif
diff --git a/src/gromacs/nbnxm/grid.cpp b/src/gromacs/nbnxm/grid.cpp
index 060722c4e3..d852ec4f08 100644
--- a/src/gromacs/nbnxm/grid.cpp
+++ b/src/gromacs/nbnxm/grid.cpp
@@ -57,10 +57,10 @@
 #include "gromacs/mdlib/gmx_omp_nthreads.h"
 #include "gromacs/mdlib/updategroupscog.h"
 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
+#include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/simd/simd.h"
 #include "gromacs/simd/vector_operations.h"
 
-#include "atomdata.h"
 #include "boundingboxes.h"
 #include "gridsetdata.h"
 #include "nbnxm_geometry.h"
diff --git a/src/gromacs/nbnxm/gridset.cpp b/src/gromacs/nbnxm/gridset.cpp
index 1adcff76a6..a2e83ddc6a 100644
--- a/src/gromacs/nbnxm/gridset.cpp
+++ b/src/gromacs/nbnxm/gridset.cpp
@@ -48,10 +48,9 @@
 
 #include "gromacs/mdlib/gmx_omp_nthreads.h"
 #include "gromacs/mdlib/updategroupscog.h"
+#include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/utility/fatalerror.h"
 
-#include "atomdata.h"
-
 namespace Nbnxm
 {
 
diff --git a/src/gromacs/nbnxm/gridsetdata.h b/src/gromacs/nbnxm/gridsetdata.h
index ed8fc182bd..9d7301d345 100644
--- a/src/gromacs/nbnxm/gridsetdata.h
+++ b/src/gromacs/nbnxm/gridsetdata.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -49,6 +49,7 @@
 
 #include <vector>
 
+#include "gromacs/gpu_utils/hostallocator.h"
 
 namespace Nbnxm
 {
diff --git a/src/gromacs/nbnxm/kernel_common.h b/src/gromacs/nbnxm/kernel_common.h
index ecc6abf9d0..f93e03521d 100644
--- a/src/gromacs/nbnxm/kernel_common.h
+++ b/src/gromacs/nbnxm/kernel_common.h
@@ -48,9 +48,9 @@
 
 #include "gromacs/math/vectypes.h"
 /* nbnxn_atomdata_t and nbnxn_pairlist_t could be forward declared, but that requires modifications in all SIMD kernel files */
+#include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/utility/real.h"
 
-#include "atomdata.h"
 #include "pairlist.h"
 
 struct interaction_const_t;
diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp
index 165dd2a724..d3513966b4 100644
--- a/src/gromacs/nbnxm/kerneldispatch.cpp
+++ b/src/gromacs/nbnxm/kerneldispatch.cpp
@@ -59,6 +59,7 @@
 #include "gromacs/utility/real.h"
 
 #include "kernel_common.h"
+#include "nbnxm_gpu.h"
 #include "nbnxm_simd.h"
 #include "pairlistset.h"
 #include "pairlistsets.h"
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 1531f02816..07b24fc923 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -46,9 +46,10 @@
 #include "nbnxm.h"
 
 #include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/timing/wallcycle.h"
 
-#include "atomdata.h"
+#include "nbnxm_gpu.h"
 #include "pairlistsets.h"
 #include "pairsearch.h"
 
@@ -162,7 +163,7 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const gmx::AtomLocality  local
 
     /* Skip the reduction if there was no short-range GPU work to do
      * (either NB or both NB and bonded work). */
-    if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
+    if (!pairlistIsSimple() && !Nbnxm::haveGpuShortRangeWork(gpu_nbv, locality))
     {
         return;
     }
@@ -190,7 +191,7 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality lo
 
     /* Skip the reduction if there was no short-range GPU work to do
      * (either NB or both NB and bonded work). */
-    if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
+    if (!pairlistIsSimple() && !Nbnxm::haveGpuShortRangeWork(gpu_nbv, locality))
     {
         return;
     }
@@ -235,6 +236,15 @@ void nonbonded_verlet_t::changePairlistRadii(real rlistOuter, real rlistInner)
     pairlistSets_->changePairlistRadii(rlistOuter, rlistInner);
 }
 
+void nonbonded_verlet_t::setupGpuShortRangeWork(const gmx::GpuBonded*          gpuBonded,
+                                                const gmx::InteractionLocality iLocality)
+{
+    if (useGpu() && !emulateGpu())
+    {
+        Nbnxm::setupGpuShortRangeWork(gpu_nbv, gpuBonded, iLocality);
+    }
+}
+
 void nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu()
 {
     Nbnxm::nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(), gpu_nbv);
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 1a27e2e160..c820e57726 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -118,19 +118,17 @@
 #include "gromacs/mdtypes/locality.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/enumerationhelpers.h"
-#include "gromacs/utility/range.h"
 #include "gromacs/utility/real.h"
 
-// TODO: Remove this include
-#include "nbnxm_gpu.h"
-
 struct gmx_device_info_t;
 struct gmx_domdec_zones_t;
 struct gmx_enerdata_t;
 struct gmx_hw_info_t;
 struct gmx_mtop_t;
+struct gmx_nbnxm_gpu_t;
 struct gmx_wallcycle;
 struct interaction_const_t;
+struct nbnxn_atomdata_t;
 struct nonbonded_verlet_t;
 class PairSearch;
 class PairlistSets;
@@ -153,9 +151,13 @@ class GpuEventSynchronizer;
 namespace gmx
 {
 class ForceWithShiftForces;
+class GpuBonded;
 template<typename>
 class ListOfLists;
 class MDLogger;
+template<typename>
+class Range;
+class StepWorkload;
 class UpdateGroupsCog;
 } // namespace gmx
 
@@ -223,7 +225,7 @@ public:
                        std::unique_ptr<PairSearch>       pairSearch,
                        std::unique_ptr<nbnxn_atomdata_t> nbat,
                        const Nbnxm::KernelSetup&         kernelSetup,
-                       gmx_nbnxn_gpu_t*                  gpu_nbv,
+                       gmx_nbnxm_gpu_t*                  gpu_nbv,
                        gmx_wallcycle*                    wcycle);
 
     ~nonbonded_verlet_t();
@@ -382,19 +384,7 @@ public:
     void changePairlistRadii(real rlistOuter, real rlistInner);
 
     //! Set up internal flags that indicate what type of short-range work there is.
-    void setupGpuShortRangeWork(const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
-    {
-        if (useGpu() && !emulateGpu())
-        {
-            Nbnxm::setupGpuShortRangeWork(gpu_nbv, gpuBonded, iLocality);
-        }
-    }
-
-    //! Returns true if there is GPU short-range work for the given atom locality.
-    bool haveGpuShortRangeWork(const gmx::AtomLocality aLocality)
-    {
-        return ((useGpu() && !emulateGpu()) && Nbnxm::haveGpuShortRangeWork(gpu_nbv, aLocality));
-    }
+    void setupGpuShortRangeWork(const gmx::GpuBonded* gpuBonded, gmx::InteractionLocality iLocality);
 
     // TODO: Make all data members private
 public:
@@ -413,7 +403,7 @@ private:
 
 public:
     //! GPU Nbnxm data, only used with a physical GPU (TODO: use unique_ptr)
-    gmx_nbnxn_gpu_t* gpu_nbv;
+    gmx_nbnxm_gpu_t* gpu_nbv;
 };
 
 namespace Nbnxm
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 035d5f1ae8..adbbcf7f0c 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -47,12 +47,10 @@
 #include "gromacs/gpu_utils/gpu_macros.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/mdtypes/locality.h"
+#include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/utility/basedefinitions.h"
 #include "gromacs/utility/real.h"
 
-#include "atomdata.h"
-#include "gpu_types.h"
-
 struct interaction_const_t;
 struct nbnxn_atomdata_t;
 struct gmx_wallcycle;
@@ -80,7 +78,7 @@ class Grid;
  * \param [in]    aloc      Atom locality flag.
  */
 GPU_FUNC_QUALIFIER
-void gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused*   nb,
+void gpu_copy_xq_to_gpu(gmx_nbnxm_gpu_t gmx_unused*   nb,
                         const struct nbnxn_atomdata_t gmx_unused* nbdata,
                         gmx::AtomLocality gmx_unused aloc) GPU_FUNC_TERM;
 
@@ -95,7 +93,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused*   nb,
  *
  */
 GPU_FUNC_QUALIFIER
-void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused* nb,
+void gpu_launch_kernel(gmx_nbnxm_gpu_t gmx_unused* nb,
                        const gmx::StepWorkload gmx_unused& stepWork,
                        gmx::InteractionLocality gmx_unused iloc) GPU_FUNC_TERM;
 
@@ -135,7 +133,7 @@ void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused* nb,
  * \param [in]    numParts  Number of parts the pair list is split into in the rolling kernel.
  */
 GPU_FUNC_QUALIFIER
-void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused* nb,
+void gpu_launch_kernel_pruneonly(gmx_nbnxm_gpu_t gmx_unused* nb,
                                  gmx::InteractionLocality gmx_unused iloc,
                                  int gmx_unused numParts) GPU_FUNC_TERM;
 
@@ -144,7 +142,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused* nb,
  * (and energies/shift forces if required).
  */
 GPU_FUNC_QUALIFIER
-void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused* nb,
+void gpu_launch_cpyback(gmx_nbnxm_gpu_t gmx_unused* nb,
                         nbnxn_atomdata_t gmx_unused* nbatom,
                         const gmx::StepWorkload gmx_unused& stepWork,
                         gmx::AtomLocality gmx_unused aloc) GPU_FUNC_TERM;
@@ -187,7 +185,7 @@ void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused* nb,
  * \returns                   True if the nonbonded tasks associated with \p aloc locality have completed
  */
 GPU_FUNC_QUALIFIER
-bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused* nb,
+bool gpu_try_finish_task(gmx_nbnxm_gpu_t gmx_unused* nb,
                          const gmx::StepWorkload gmx_unused& stepWork,
                          gmx::AtomLocality gmx_unused aloc,
                          real gmx_unused* e_lj,
@@ -211,7 +209,7 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused* nb,
  * \param[out] shiftForces Shift forces buffer to accumulate into
  * \param[out] wcycle         Pointer to wallcycle data structure               */
 GPU_FUNC_QUALIFIER
-float gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused* nb,
+float gpu_wait_finish_task(gmx_nbnxm_gpu_t gmx_unused* nb,
                            const gmx::StepWorkload gmx_unused& stepWork,
                            gmx::AtomLocality gmx_unused aloc,
                            real gmx_unused* e_lj,
@@ -228,7 +226,7 @@ int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t gmx_unused& ic)
  * Called on the NS step and performs (re-)allocations and memory copies. !*/
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet,
-                                gmx_nbnxn_gpu_t gmx_unused* gpu_nbv) CUDA_FUNC_TERM;
+                                gmx_nbnxm_gpu_t gmx_unused* gpu_nbv) CUDA_FUNC_TERM;
 
 /*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
  *
@@ -244,7 +242,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet,
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
                            bool gmx_unused setFillerCoords,
-                           gmx_nbnxn_gpu_t gmx_unused* gpu_nbv,
+                           gmx_nbnxm_gpu_t gmx_unused* gpu_nbv,
                            DeviceBuffer<float> gmx_unused d_x,
                            GpuEventSynchronizer gmx_unused* xReadyOnDevice,
                            gmx::AtomLocality gmx_unused locality,
@@ -256,7 +254,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
  * \param[in] interactionLocality  Local or NonLocal sync point
  */
 CUDA_FUNC_QUALIFIER
-void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused* nb,
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxm_gpu_t gmx_unused* nb,
                                       gmx::InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM;
 
 /*! \brief Set up internal flags that indicate what type of short-range work there is.
@@ -272,7 +270,7 @@ void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused* nb,
  * \param[in]     iLocality  Interaction locality identifier
  */
 GPU_FUNC_QUALIFIER
-void setupGpuShortRangeWork(gmx_nbnxn_gpu_t gmx_unused* nb,
+void setupGpuShortRangeWork(gmx_nbnxm_gpu_t gmx_unused* nb,
                             const gmx::GpuBonded gmx_unused* gpuBonded,
                             gmx::InteractionLocality gmx_unused iLocality) GPU_FUNC_TERM;
 
@@ -286,13 +284,13 @@ void setupGpuShortRangeWork(gmx_nbnxn_gpu_t gmx_unused* nb,
  * \param[in]     aLocality Atom locality identifier
  */
 GPU_FUNC_QUALIFIER
-bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t gmx_unused* nb, gmx::AtomLocality gmx_unused aLocality)
+bool haveGpuShortRangeWork(const gmx_nbnxm_gpu_t gmx_unused* nb, gmx::AtomLocality gmx_unused aLocality)
         GPU_FUNC_TERM_WITH_RETURN(false);
 
 /*! \brief Initialization for F buffer operations on GPU */
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused* cell,
-                                    gmx_nbnxn_gpu_t gmx_unused* gpu_nbv,
+                                    gmx_nbnxm_gpu_t gmx_unused* gpu_nbv,
                                     int gmx_unused       natoms_total,
                                     GpuEventSynchronizer gmx_unused* localReductionDone) CUDA_FUNC_TERM;
 
@@ -315,7 +313,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused* cell,
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality,
                                DeviceBuffer<float> gmx_unused totalForcesDevice,
-                               gmx_nbnxn_gpu_t gmx_unused* gpu_nbv,
+                               gmx_nbnxm_gpu_t gmx_unused* gpu_nbv,
                                void gmx_unused*                           pmeForcesDevice,
                                gmx::ArrayRef<GpuEventSynchronizer* const> gmx_unused dependencyList,
                                int gmx_unused atomStart,
@@ -327,7 +325,7 @@ void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality,
  * \param[in] nb                   The nonbonded data GPU structure
  */
 CUDA_FUNC_QUALIFIER
-void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
+void nbnxn_wait_x_on_device(gmx_nbnxm_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
 
 } // namespace Nbnxm
 #endif
diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp
index cb3a230929..745414b67d 100644
--- a/src/gromacs/nbnxm/nbnxm_setup.cpp
+++ b/src/gromacs/nbnxm/nbnxm_setup.cpp
@@ -49,6 +49,7 @@
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/mdtypes/forcerec.h"
 #include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/pairlist_tuning.h"
@@ -56,8 +57,6 @@
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/logger.h"
 
-#include "atomdata.h"
-#include "gpu_types.h"
 #include "grid.h"
 #include "nbnxm_geometry.h"
 #include "nbnxm_simd.h"
@@ -321,7 +320,7 @@ namespace Nbnxm
 {
 
 /*! \brief Gets and returns the minimum i-list count for balacing based on the GPU used or env.var. when set */
-static int getMinimumIlistCountForGpuBalancing(gmx_nbnxn_gpu_t* nbnxmGpu)
+static int getMinimumIlistCountForGpuBalancing(gmx_nbnxm_gpu_t* nbnxmGpu)
 {
     int minimumIlistCount;
 
@@ -441,7 +440,7 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
                         fr->nbfp, mimimumNumEnergyGroupNonbonded,
                         (useGpu || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded));
 
-    gmx_nbnxn_gpu_t* gpu_nbv                          = nullptr;
+    gmx_nbnxm_gpu_t* gpu_nbv                          = nullptr;
     int              minimumIlistCountForGpuBalancing = 0;
     if (useGpu)
     {
@@ -470,7 +469,7 @@ nonbonded_verlet_t::nonbonded_verlet_t(std::unique_ptr<PairlistSets>     pairlis
                                        std::unique_ptr<PairSearch>       pairSearch,
                                        std::unique_ptr<nbnxn_atomdata_t> nbat_in,
                                        const Nbnxm::KernelSetup&         kernelSetup,
-                                       gmx_nbnxn_gpu_t*                  gpu_nbv_ptr,
+                                       gmx_nbnxm_gpu_t*                  gpu_nbv_ptr,
                                        gmx_wallcycle*                    wcycle) :
     pairlistSets_(std::move(pairlistSets)),
     pairSearch_(std::move(pairSearch)),
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index 356afeb9fb..c795937a84 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -341,7 +341,7 @@ static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[], bool fir
  *  OpenCL kernel objects are cached in nb. If the requested kernel is not
  *  found in the cache, it will be created and the cache will be updated.
  */
-static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t* nb, int eeltype, int evdwtype, bool bDoEne, bool bDoPrune)
+static inline cl_kernel select_nbnxn_kernel(gmx_nbnxm_gpu_t* nb, int eeltype, int evdwtype, bool bDoEne, bool bDoPrune)
 {
     const char* kernel_name_to_run;
     cl_kernel*  kernel_ptr;
@@ -471,7 +471,7 @@ static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event)
 }
 
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
-void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
+void gpu_copy_xq_to_gpu(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -575,7 +575,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t* nb, const nbnxn_atomdata_t* nbatom, con
    misc_ops_done event to record the point in time when the above  operations
    are finished and synchronize with this event in the non-local stream.
  */
-void gpu_launch_kernel(gmx_nbnxn_ocl_t* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
+void gpu_launch_kernel(gmx_nbnxm_gpu_t* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
 {
     cl_atomdata_t*   adat   = nb->atdat;
     cl_nbparam_t*    nbp    = nb->nbparam;
@@ -713,7 +713,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z)
  * Launch the pairlist prune only kernel for the given locality.
  * \p numParts tells in how many parts, i.e. calls the list will be pruned.
  */
-void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t* nb, const InteractionLocality iloc, const int numParts)
+void gpu_launch_kernel_pruneonly(gmx_nbnxm_gpu_t* nb, const InteractionLocality iloc, const int numParts)
 {
     cl_atomdata_t*   adat    = nb->atdat;
     cl_nbparam_t*    nbp     = nb->nbparam;
@@ -839,7 +839,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t* nb, const InteractionLocality
  * Launch asynchronously the download of nonbonded forces from the GPU
  * (and energies/shift forces if required).
  */
-void gpu_launch_cpyback(gmx_nbnxn_ocl_t*         nb,
+void gpu_launch_cpyback(gmx_nbnxm_gpu_t*         nb,
                         struct nbnxn_atomdata_t* nbatom,
                         const gmx::StepWorkload& stepWork,
                         const AtomLocality       aloc)
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
index cb42ce9cca..4943a8e0dd 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -52,11 +52,6 @@
 
 #include <cmath>
 
-// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t
-//      is currently declared means this has to be before gpu_types.h
-#include "nbnxm_ocl_types.h"
-
-// TODO Remove this comment when the above order issue is resolved
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/gpu_utils/oclutils.h"
 #include "gromacs/hardware/gpu_hw_info.h"
@@ -79,6 +74,7 @@
 #include "gromacs/utility/smalloc.h"
 
 #include "nbnxm_ocl_internal.h"
+#include "nbnxm_ocl_types.h"
 
 namespace Nbnxm
 {
@@ -411,7 +407,7 @@ void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interacti
     {
         return;
     }
-    gmx_nbnxn_ocl_t* nb  = nbv->gpu_nbv;
+    gmx_nbnxm_gpu_t* nb  = nbv->gpu_nbv;
     cl_nbparam_t*    nbp = nb->nbparam;
 
     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
@@ -531,7 +527,7 @@ static void nbnxn_gpu_create_context(gmx_device_runtime_data_t* runtimeData,
 }
 
 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
-static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t* nb, const char* kernel_name)
+static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxm_gpu_t* nb, const char* kernel_name)
 {
     cl_kernel kernel;
     cl_int    cl_error;
@@ -548,7 +544,7 @@ static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t* nb, const char* kernel
 
 /*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
  */
-static void nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t* nb)
+static void nbnxn_ocl_clear_e_fshift(gmx_nbnxm_gpu_t* nb)
 {
 
     cl_int           cl_error;
@@ -581,7 +577,7 @@ static void nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t* nb)
 }
 
 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
-static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t* nb)
+static void nbnxn_gpu_init_kernels(gmx_nbnxm_gpu_t* nb)
 {
     /* Init to 0 main kernel arrays */
     /* They will be later on initialized in select_nbnxn_kernel */
@@ -610,7 +606,7 @@ static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t* nb)
  *  Initializes members of the atomdata and nbparam structs and
  *  clears e/fshift output buffers.
  */
-static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t*                nb,
+static void nbnxn_ocl_init_const(gmx_nbnxm_gpu_t*                nb,
                                  const interaction_const_t*      ic,
                                  const PairlistParams&           listParams,
                                  const nbnxn_atomdata_t::Params& nbatParams)
@@ -621,14 +617,14 @@ static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t*                nb,
 
 
 //! This function is documented in the header file
-gmx_nbnxn_ocl_t* gpu_init(const gmx_device_info_t*   deviceInfo,
+gmx_nbnxm_gpu_t* gpu_init(const gmx_device_info_t*   deviceInfo,
                           const interaction_const_t* ic,
                           const PairlistParams&      listParams,
                           const nbnxn_atomdata_t*    nbat,
                           const int                  rank,
                           const gmx_bool             bLocalAndNonlocal)
 {
-    gmx_nbnxn_ocl_t*            nb;
+    gmx_nbnxm_gpu_t*            nb;
     cl_int                      cl_error;
     cl_command_queue_properties queue_properties;
 
@@ -732,7 +728,7 @@ gmx_nbnxn_ocl_t* gpu_init(const gmx_device_info_t*   deviceInfo,
 
 /*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
  */
-static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t* nb, int natoms_clear)
+static void nbnxn_ocl_clear_f(gmx_nbnxm_gpu_t* nb, int natoms_clear)
 {
     if (natoms_clear == 0)
     {
@@ -752,7 +748,7 @@ static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t* nb, int natoms_clear)
 }
 
 //! This function is documented in the header file
-void gpu_clear_outputs(gmx_nbnxn_ocl_t* nb, bool computeVirial)
+void gpu_clear_outputs(gmx_nbnxm_gpu_t* nb, bool computeVirial)
 {
     nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
     /* clear shift force array and energies if the outputs were
@@ -769,7 +765,7 @@ void gpu_clear_outputs(gmx_nbnxn_ocl_t* nb, bool computeVirial)
 }
 
 //! This function is documented in the header file
-void gpu_init_pairlist(gmx_nbnxn_ocl_t* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
+void gpu_init_pairlist(gmx_nbnxm_gpu_t* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
 {
     char sbuf[STRLEN];
     // Timing accumulation should happen only if there was work to do
@@ -830,7 +826,7 @@ void gpu_init_pairlist(gmx_nbnxn_ocl_t* nb, const NbnxnPairlistGpu* h_plist, con
 }
 
 //! This function is documented in the header file
-void gpu_upload_shiftvec(gmx_nbnxn_ocl_t* nb, const nbnxn_atomdata_t* nbatom)
+void gpu_upload_shiftvec(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbatom)
 {
     cl_atomdata_t*   adat = nb->atdat;
     cl_command_queue ls   = nb->stream[InteractionLocality::Local];
@@ -845,7 +841,7 @@ void gpu_upload_shiftvec(gmx_nbnxn_ocl_t* nb, const nbnxn_atomdata_t* nbatom)
 }
 
 //! This function is documented in the header file
-void gpu_init_atomdata(gmx_nbnxn_ocl_t* nb, const nbnxn_atomdata_t* nbat)
+void gpu_init_atomdata(gmx_nbnxm_gpu_t* nb, const nbnxn_atomdata_t* nbat)
 {
     cl_int           cl_error;
     int              nalloc, natoms;
@@ -1005,7 +1001,7 @@ static void free_gpu_device_runtime_data(gmx_device_runtime_data_t* runData)
 }
 
 //! This function is documented in the header file
-void gpu_free(gmx_nbnxn_ocl_t* nb)
+void gpu_free(gmx_nbnxm_gpu_t* nb)
 {
     if (nb == nullptr)
     {
@@ -1106,7 +1102,7 @@ void gpu_free(gmx_nbnxn_ocl_t* nb)
 }
 
 //! This function is documented in the header file
-gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(gmx_nbnxn_ocl_t* nb)
+gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(gmx_nbnxm_gpu_t* nb)
 {
     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
 }
@@ -1121,13 +1117,13 @@ void gpu_reset_timings(nonbonded_verlet_t* nbv)
 }
 
 //! This function is documented in the header file
-int gpu_min_ci_balanced(gmx_nbnxn_ocl_t* nb)
+int gpu_min_ci_balanced(gmx_nbnxm_gpu_t* nb)
 {
     return nb != nullptr ? gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
 }
 
 //! This function is documented in the header file
-gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t* nb)
+gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxm_gpu_t* nb)
 {
     return ((nb->nbparam->eeltype == eelOclEWALD_ANA) || (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
 }
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp
index 36c55d9037..3ea5cc186d 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp
@@ -167,7 +167,7 @@ static std::string makeDefinesForKernelTypes(bool bFastGen, int eeltype, int vdw
  *
  * Does not throw
  */
-void nbnxn_gpu_compile_kernels(gmx_nbnxn_ocl_t* nb)
+void nbnxn_gpu_compile_kernels(gmx_nbnxm_gpu_t* nb)
 {
     gmx_bool   bFastGen = TRUE;
     cl_program program  = nullptr;
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
index b71628443b..303968ea96 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -274,7 +274,7 @@ typedef struct Nbnxm::gpu_timers_t cl_timers_t;
 /*! \internal
  * \brief Main data structure for OpenCL nonbonded force calculations.
  */
-struct gmx_nbnxn_ocl_t
+struct gmx_nbnxm_gpu_t
 {
     const gmx_device_info_t*          dev_info;    /**< OpenCL device information    */
     struct gmx_device_runtime_data_t* dev_rundata; /**< OpenCL runtime data (context, kernels) */
diff --git a/src/gromacs/nbnxm/pairlist.cpp b/src/gromacs/nbnxm/pairlist.cpp
index 6dee69782e..b76ad3e9a7 100644
--- a/src/gromacs/nbnxm/pairlist.cpp
+++ b/src/gromacs/nbnxm/pairlist.cpp
@@ -54,6 +54,7 @@
 #include "gromacs/mdlib/gmx_omp_nthreads.h"
 #include "gromacs/mdtypes/group.h"
 #include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/pbc.h"
@@ -65,7 +66,6 @@
 #include "gromacs/utility/listoflists.h"
 #include "gromacs/utility/smalloc.h"
 
-#include "atomdata.h"
 #include "boundingboxes.h"
 #include "clusterdistancekerneltype.h"
 #include "gridset.h"
diff --git a/src/gromacs/nbnxm/pairsearch.h b/src/gromacs/nbnxm/pairsearch.h
index 4263d9ba5b..357e78a19a 100644
--- a/src/gromacs/nbnxm/pairsearch.h
+++ b/src/gromacs/nbnxm/pairsearch.h
@@ -56,12 +56,12 @@
 
 #include "gromacs/domdec/domdec.h"
 #include "gromacs/math/vectypes.h"
+#include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/timing/cyclecounter.h"
 #include "gromacs/utility/alignedallocator.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/real.h"
 
-#include "atomdata.h"
 #include "gridset.h"
 #include "pairlist.h"
 
diff --git a/src/gromacs/nbnxm/prunekerneldispatch.cpp b/src/gromacs/nbnxm/prunekerneldispatch.cpp
index 141b1ea903..574d9c143a 100644
--- a/src/gromacs/nbnxm/prunekerneldispatch.cpp
+++ b/src/gromacs/nbnxm/prunekerneldispatch.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -41,6 +41,7 @@
 #include "gromacs/utility/gmxassert.h"
 
 #include "clusterdistancekerneltype.h"
+#include "nbnxm_gpu.h"
 #include "pairlistset.h"
 #include "pairlistsets.h"
 #include "kernels_reference/kernel_ref_prune.h"
-- 
2.22.0