From 543f83db962d1c1faee55b32c0f82dadc9e411c9 Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Sat, 11 Nov 2017 12:50:20 -0700 Subject: [PATCH] Support pinning in HostAllocator We want the resize / reserve behaviour to handle page locking that is useful for efficient GPU transfer, while making it possible to avoid locking more pages than required for that vector. By embedding the pin()/unpin() behaviour into malloc() and free() for the allocation policy, this can be safely handled in all cases. Additionally, high-level code can now choose for any individual vector when and whether a pinning policy is required, and even manually pin and unpin in any special cases that might arise. When using the policy that does not support pinning, we now use AlignedAllocator, so that we minimize memory consumption. Change-Id: I807464222c7cc7718282b1e08204f563869322a0 --- src/gromacs/gpu_utils/CMakeLists.txt | 7 +- src/gromacs/gpu_utils/cudautils.cuh | 42 +++ src/gromacs/gpu_utils/hostallocator.cpp | 147 +++++++++-- src/gromacs/gpu_utils/hostallocator.h | 240 +++++++++++++----- src/gromacs/gpu_utils/pinning.cu | 124 +++++++++ .../gpu_utils/{hostallocator.cu => pinning.h} | 73 ++---- .../gpu_utils/tests/devicetransfers.cpp | 11 +- .../gpu_utils/tests/devicetransfers.cu | 6 +- src/gromacs/gpu_utils/tests/devicetransfers.h | 5 +- src/gromacs/gpu_utils/tests/hostallocator.cpp | 208 +++++++++++---- src/gromacs/mdlib/mdatoms.cpp | 13 +- src/gromacs/mdlib/mdatoms.h | 4 +- src/gromacs/utility/allocator.h | 6 + .../utility/tests/alignedallocator-impl.h | 129 ++++++++++ .../utility/tests/alignedallocator.cpp | 66 +---- 15 files changed, 825 insertions(+), 256 deletions(-) create mode 100644 src/gromacs/gpu_utils/pinning.cu rename src/gromacs/gpu_utils/{hostallocator.cu => pinning.h} (55%) create mode 100644 src/gromacs/utility/tests/alignedallocator-impl.h diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt index 27a190cf7a..04e9b752a1 100644 --- a/src/gromacs/gpu_utils/CMakeLists.txt +++ b/src/gromacs/gpu_utils/CMakeLists.txt @@ -32,10 +32,12 @@ # To help us fund GROMACS development, we humbly ask that you cite # the research papers on the package. Check out http://www.gromacs.org. +gmx_add_libgromacs_sources( + hostallocator.cpp + ) if(GMX_USE_OPENCL) gmx_add_libgromacs_sources( gpu_utils_ocl.cpp - hostallocator.cpp ocl_compiler.cpp ocl_caching.cpp oclutils.cpp @@ -43,14 +45,13 @@ if(GMX_USE_OPENCL) elseif(GMX_USE_CUDA) gmx_add_libgromacs_sources( cudautils.cu - hostallocator.cu gpu_utils.cu + pinning.cu pmalloc_cuda.cu ) else() gmx_add_libgromacs_sources( gpu_utils.cpp - hostallocator.cpp ) endif() diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh index f8d73c9697..dae3548fdd 100644 --- a/src/gromacs/gpu_utils/cudautils.cuh +++ b/src/gromacs/gpu_utils/cudautils.cuh @@ -42,9 +42,51 @@ #include #endif /* HAVE_NVML */ +#include + #include "gromacs/math/vec.h" #include "gromacs/math/vectypes.h" #include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/stringutil.h" + +namespace gmx +{ +namespace +{ + +/*! \brief Helper function to ensure no pending error silently + * disrupts error handling. + * + * Asserts in a debug build if an unhandled error is present. Issues a + * warning at run time otherwise. + * + * \todo This is similar to CU_CHECK_PREV_ERR, which should be + * consolidated. 
+ */ +static inline void ensureNoPendingCudaError(const char *errorMessage) +{ + // Ensure there is no pending error that would otherwise affect + // the behaviour of future error handling. + cudaError_t stat = cudaGetLastError(); + if (stat == cudaSuccess) + { + return; + } + + // If we would find an error in a release build, we do not know + // what is appropriate to do about it, so assert only for debug + // builds. + auto fullMessage = formatString("%s An unhandled error from a previous CUDA operation was detected. %s: %s", + errorMessage, cudaGetErrorName(stat), cudaGetErrorString(stat)); + GMX_ASSERT(stat == cudaSuccess, fullMessage.c_str()); + // TODO When we evolve a better logging framework, use that + // for release-build error reporting. + gmx_warning(fullMessage.c_str()); +} + +} // namespace +} // namespace enum class GpuApiCallBehavior; diff --git a/src/gromacs/gpu_utils/hostallocator.cpp b/src/gromacs/gpu_utils/hostallocator.cpp index 3ceaefeb7b..a23c0cb12c 100644 --- a/src/gromacs/gpu_utils/hostallocator.cpp +++ b/src/gromacs/gpu_utils/hostallocator.cpp @@ -34,12 +34,7 @@ */ /*! \internal \file * \brief Implements gmx::HostAllocationPolicy for allocating memory - * suitable for GPU transfers on OpenCL, and when no GPU - * implementation is used. - * - * \todo The same implementation can be used because we do not - * currently attempt to optimize the allocation of host-side buffers - * for OpenCL transfers, but this might be good to do. + * suitable for e.g. GPU transfers on CUDA. * * \author Mark Abraham */ @@ -47,39 +42,149 @@ #include "hostallocator.h" -#include +#include "config.h" + +#include + +#include #include "gromacs/utility/alignedallocator.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/stringutil.h" + +#include "pinning.h" namespace gmx { -HostAllocationPolicy::HostAllocationPolicy(Impl s) : allocateForGpu_(s) {} +//! Private implementation class. +class HostAllocationPolicy::Impl +{ + public: + /*! \brief Pointer to the last unfreed allocation, or nullptr + * if no allocation exists. + * + * Note that during e.g. std::vector.resize() a call to its + * allocator's allocate() function precedes the call to its + * allocator's deallocate() function for freeing the old + * buffer after the data has been copied from it. So in + * general, pointer_ will not match the argument received by + * free(). */ + void *pointer_ = nullptr; + //! Number of bytes in the last unfreed allocation. + std::size_t numBytes_ = 0; + //! The pointer to any storage that has been pinned, or nullptr if none has been pinned. + void *pinnedPointer_ = nullptr; + //! Whether this object is in mode where new allocations will be pinned by default. + PinningPolicy pinningPolicy_ = PinningPolicy::CannotBePinned; +}; -void * -HostAllocationPolicy::malloc(std::size_t bytes) const +HostAllocationPolicy::HostAllocationPolicy() : impl_(std::make_shared()) { - GMX_UNUSED_VALUE(allocateForGpu_); - // TODO if/when this is properly supported for OpenCL, we - // should explore whether it is needed, and if so what - // page size is desirable for alignment. - return AlignedAllocationPolicy::malloc(bytes); } -void -HostAllocationPolicy::free(void *buffer) const +std::size_t HostAllocationPolicy::alignment() { + return (impl_->pinningPolicy_ == PinningPolicy::CanBePinned ? 
+ PageAlignedAllocationPolicy::alignment() : + AlignedAllocationPolicy::alignment()); +} +void *HostAllocationPolicy::malloc(std::size_t bytes) const noexcept +{ + // A container could have a pinned allocation that is being + // extended, in which case we must un-pin while we still know the + // old pinned vector, and which also ensures we don't pin two + // buffers at the same time. If there's no allocation, or it isn't + // pinned, then attempting to unpin it is OK, too. + unpin(); + impl_->pointer_ = (impl_->pinningPolicy_ == PinningPolicy::CanBePinned ? + PageAlignedAllocationPolicy::malloc(bytes) : + AlignedAllocationPolicy::malloc(bytes)); + + if (impl_->pointer_ != nullptr) + { + impl_->numBytes_ = bytes; + } + pin(); + return impl_->pointer_; +} + +void HostAllocationPolicy::free(void *buffer) const noexcept +{ + unpin(); if (buffer == nullptr) { + // Nothing to do return; } - GMX_UNUSED_VALUE(allocateForGpu_); - AlignedAllocationPolicy::free(buffer); + if (impl_->pinningPolicy_ == PinningPolicy::CanBePinned) + { + PageAlignedAllocationPolicy::free(buffer); + } + else + { + AlignedAllocationPolicy::free(buffer); + } + impl_->pointer_ = nullptr; + impl_->numBytes_ = 0; } -HostAllocationPolicy makeHostAllocationPolicyForGpu() +PinningPolicy HostAllocationPolicy::pinningPolicy() const { - return HostAllocationPolicy(HostAllocationPolicy::Impl::AllocateForGpu); + return impl_->pinningPolicy_; +} + +void HostAllocationPolicy::setPinningPolicy(PinningPolicy pinningPolicy) +{ + if (GMX_GPU != GMX_GPU_CUDA) + { + GMX_RELEASE_ASSERT(pinningPolicy == PinningPolicy::CannotBePinned, + "A suitable build of GROMACS (e.g. with CUDA) is required for a " + "HostAllocationPolicy to be set to a mode that produces pinning."); + } + impl_->pinningPolicy_ = pinningPolicy; +} + +void HostAllocationPolicy::pin() const noexcept +{ + if (impl_->pinningPolicy_ == PinningPolicy::CannotBePinned || + impl_->pointer_ == nullptr || + impl_->pinnedPointer_ != nullptr) + { + // Do nothing if we're not in pinning mode, or the allocation + // is empty, or it is already pinned. + return; + } +#if GMX_GPU == GMX_GPU_CUDA + pinBuffer(impl_->pointer_, impl_->numBytes_); + impl_->pinnedPointer_ = impl_->pointer_; +#else + const char *errorMessage = "Could not register the host memory for pinning."; + + GMX_RELEASE_ASSERT(impl_->pinningPolicy_ == PinningPolicy::CannotBePinned, + formatString("%s This build configuration must only have pinning policy " + "that leads to no pinning.", errorMessage).c_str()); +#endif +} + +void HostAllocationPolicy::unpin() const noexcept +{ + if (impl_->pinnedPointer_ == nullptr) + { + return; + } + +#if GMX_GPU == GMX_GPU_CUDA + // Note that if the caller deactivated pinning mode, we still want + // to be able to unpin if the allocation is still pinned. + + unpinBuffer(impl_->pointer_); + impl_->pinnedPointer_ = nullptr; +#else + GMX_RELEASE_ASSERT(impl_->pinnedPointer_ == nullptr, + "Since the build configuration does not support pinning, then " + "the pinned pointer must be nullptr."); +#endif } } // namespace gmx diff --git a/src/gromacs/gpu_utils/hostallocator.h b/src/gromacs/gpu_utils/hostallocator.h index dae54f2144..a34c26ed2b 100644 --- a/src/gromacs/gpu_utils/hostallocator.h +++ b/src/gromacs/gpu_utils/hostallocator.h @@ -33,9 +33,11 @@ * the research papers on the package. Check out http://www.gromacs.org. */ /*! 
\libinternal \file
- * \brief Declares gmx::HostAllocationPolicy and gmx::HostAllocator,
- * which are used to make standard library containers that can
- * allocate memory suitable for GPU transfers.
+ * \brief Declares gmx::HostAllocationPolicy, gmx::HostAllocator, and
+ * gmx::HostVector, which are used to make/be standard library
+ * containers that can allocate memory suitable for transfers.
+ * Currently the only supported transfers using pinned memory are
+ * to CUDA GPUs, but other possibilities exist in future.
 *
 * \author Mark Abraham
 * \inlibraryapi
@@ -45,57 +47,115 @@
 
 #include <cstddef>
 
-#include "gromacs/utility/allocator.h"
+#include <memory>
+#include <vector>
+
+#include "gromacs/utility/alignedallocator.h"
+#include "gromacs/utility/exceptions.h"
 
 namespace gmx
 {
 
+/*! \brief Helper enum for pinning policy of the allocation of
+ * HostAllocationPolicy.
+ *
+ * For an efficient non-blocking transfer (e.g. to a GPU), the memory
+ * pages for a buffer need to be pinned to a physical page. Aligning
+ * such buffers to a physical page should minimize the number of pages
+ * that need to be pinned. However, some buffers that may be used for
+ * such transfers may also be used in either GROMACS builds or run
+ * paths that cannot use such a device, so the policy can be
+ * configured so that the resource consumption is no higher than
+ * required for correct, efficient operation in all cases. */
+enum class PinningPolicy : int
+{
+    CannotBePinned,     // Memory is not known to be suitable for pinning.
+    CanBePinned,        // Memory is suitable for efficient pinning, e.g. because it is
+                        // allocated to be page aligned, and will be pinned when non-empty.
+};
+
+//! Forward declaration of host allocation policy class.
+class HostAllocationPolicy;
+
+/*! \brief Memory allocator that uses HostAllocationPolicy.
+ *
+ * \tparam T Type of objects to allocate
+ *
+ * This convenience partial specialization can be used for the
+ * optional allocator template parameter in standard library
+ * containers whose memory may be used for e.g. GPU transfers. The
+ * memory will always be allocated according to the behavior of
+ * HostAllocationPolicy.
+ */
+template <class T>
+using HostAllocator = Allocator<T, HostAllocationPolicy>;
+
+//! Convenience alias for std::vector that uses HostAllocator.
+template <class T>
+using HostVector = std::vector<T, HostAllocator<T> >;
+
 /*! \libinternal
  * \brief Policy class for configuring gmx::Allocator, to manage
- * allocations of memory that is suitable for GPU transfers.
+ * allocations of memory that may be needed for e.g. GPU transfers.
 *
 * This allocator has state, so is most useful in cases where it is
 * not known at compile time whether the allocated memory will be
- * transferred to a GPU. It will increase the size of containers that
- * use it. Memory allocated will always be aligned by the GPU
- * framework, or by AlignedAllocationPolicy.
+ * transferred to some device. It will increase the size of containers
+ * that use it. If the GROMACS build is configured with CUDA support,
+ * then memory will be allocated with PageAlignedAllocator, and that
+ * page pinned to physical memory if the pinning mode has been
+ * activated. If pinning mode is deactivated, or the GROMACS build
+ * does not support CUDA, then the memory will be allocated with
+ * AlignedAllocator. The pin() and unpin() methods work with the CUDA
+ * build, and silently do nothing otherwise. In future, we may modify
+ * or generalize this to work differently in other cases.
* - * \todo Consider also having a stateless version of this policy, - * which might be slightly faster or more convenient to use in the - * cases where it is known at compile time that the allocation will be - * used to transfer to a GPU. + * The intended use is to configure gmx::Allocator with this class as + * its policy class, and then to use e.g. + * std::vector::get_allocator().getPolicy() to control whether the + * allocation policy should activate its pinning mode. The policy + * object can also be used to explicitly pin() and unpin() the buffer + * when it is using PinningPolicy::CanBePinned. The policy object is + * returned by value (as required by the C++ standard for + * get_allocator(), which copies a std::shared_ptr, so the policy + * object should be retrieved sparingly, e.g. only upon resize of the + * allocation. (Normal operation of the vector, e.g. during resize, + * incurs only the cost of the pointer indirection needed to consult + * the current state of the allocation policy.) + * + * \todo As a minor optimization, consider also having a stateless + * version of this policy, which might be slightly faster or more + * convenient to use in the cases where it is known at compile time + * that the allocation will be used to transfer to a GPU. */ class HostAllocationPolicy { public: - //! Helper construction enum - enum class Impl : int - { - AllocateAligned = 0, - AllocateForGpu = 1 - }; - //! Constructor. - explicit HostAllocationPolicy(Impl s = Impl::AllocateAligned); - /*! \brief Allocate GPU memory - * - * \param bytes Amount of memory (bytes) to allocate. It is - * valid to ask for 0 bytes, which will return a - * non-null pointer that is properly aligned in - * page-locked memory (but that you should not - * use). TODO check this. - * - * \return Valid pointer if the allocation worked, otherwise nullptr. - * - * The memory will always be allocated according to the requirements - * of the acceleration platform in use (e.g. CUDA). + //! Default constructor. + HostAllocationPolicy(); + /*! \brief Return the alignment size currently used by the active pinning policy. */ + std::size_t alignment(); + /*! \brief Allocate and perhaps pin page-aligned memory suitable for + * e.g. GPU transfers. + * + * Before attempting to allocate, unpin() is called. After a + * successful allocation, pin() is called. (Whether these do + * things depends on the PinningPolicy that is in effect.) + * + * \param bytes Amount of memory (bytes) to allocate. It is valid to ask for + * 0 bytes, which will return a non-null pointer that is properly + * aligned and padded (but that you should not use). + * + * \return Valid pointer if the allocation+optional pinning worked, otherwise nullptr. * * \note Memory allocated with this routine must be released * with gmx::HostAllocationPolicy::free(), and * absolutely not the system free(). + * + * Does not throw. */ - void * - malloc(std::size_t bytes) const; - /*! \brief Free GPU memory + void *malloc(std::size_t bytes) const noexcept; + /*! \brief Free the memory, after unpinning (if appropriate). * * \param buffer Memory pointer previously returned from gmx::HostAllocationPolicy::malloc() * @@ -103,36 +163,102 @@ class HostAllocationPolicy * obtained from gmx:HostAllocationPolicy::malloc(), * and absolutely not any pointers obtained the system * malloc(). + * + * Does not throw. */ - void - free(void *buffer) const; + void free(void *buffer) const noexcept; + /*! \brief Pin the allocation to physical memory, if appropriate. 
+         *
+         * If the allocation policy is not in pinning mode, or the
+         * allocation is empty, or the allocation is already pinned,
+         * then do nothing.
+         *
+         * Does not throw.
+         */
+        void pin() const noexcept;
+        /*! \brief Unpin the allocation, if appropriate.
+         *
+         * Regardless of the allocation policy, unpin the memory if
+         * previously pinned, otherwise do nothing.
+         *
+         * Does not throw.
+         */
+        void unpin() const noexcept;
+        /*! \brief Return the current pinning policy (which is semi-independent
+         * of whether the buffer is actually pinned).
+         *
+         * Does not throw.
+         */
+        PinningPolicy pinningPolicy() const;
+        //! Specify an allocator trait so that the stateful allocator should propagate.
+        using propagate_on_container_copy_assignment = std::true_type;
+        //! Specify an allocator trait so that the stateful allocator should propagate.
+        using propagate_on_container_move_assignment = std::true_type;
+        //! Specify an allocator trait so that the stateful allocator should propagate.
+        using propagate_on_container_swap = std::true_type;
    private:
+        /*! \brief Set the current pinning policy.
+         *
+         * Does not pin any current buffer. Use changePinningPolicy to
+         * orchestrate the necessary unpin, allocate, copy, and pin
+         * steps for effectively changing the pinning policy of a
+         * HostVector.
+         *
+         * Does not throw.
+         */
+        // cppcheck-suppress unusedPrivateFunction
+        void setPinningPolicy(PinningPolicy pinningPolicy);
+        /*! \brief Declare as a friend function the only supported way
+         * to change the pinning policy.
+         *
+         * When the pinning policy changes, we want the state of the
+         * allocation to match the new policy. However, that requires
+         * a copy and swap of the buffers, which can only take place
+         * at the level of the container. So we wrap the required
+         * operations in a helper friend function.
+         *
+         * Of course, if there is no allocation because the vector is
+         * empty, then nothing will change. */
+        template <class T> friend
+        void changePinningPolicy(HostVector<T> *v, PinningPolicy pinningPolicy);
+        //! Private implementation class.
+        class Impl;
        /*! \brief State of the allocator.
         *
-         * This could change through assignment of one policy to
-         * another, so isn't const. */
-        Impl allocateForGpu_;
+         * This could change through move- or copy-assignment of one
+         * policy to another, so isn't const. */
+        std::shared_ptr<Impl> impl_;
 };
 
-/*! \brief Convenience function
- *
- * The default construction is for non-GPU allocation, and this
- * function makes it less verbose to get allocation intended for use
- * with a GPU. */
-HostAllocationPolicy makeHostAllocationPolicyForGpu();
-
-/*! \brief Memory allocator for host-side memory for GPU transfers.
+/*! \brief Helper function for changing the pinning policy of a HostVector.
  *
- * \tparam T          Type of objects to allocate
- *
- * This convenience partial specialization can be used for the
- * optional allocator template parameter in standard library
- * containers whose memory will be used for GPU transfers. The memory
- * will always be allocated according to the behavior of
- * HostAllocationPolicy.
- */
+ * If the vector has contents, then a full reallocation and buffer
+ * copy are needed if the policy change requires tighter restrictions,
+ * and are desirable even if the policy change requires looser
+ * restrictions. That cost is OK, because GROMACS will do this
+ * operation very rarely (e.g. when auto-tuning and deciding to switch
+ * whether a task will run on a GPU, or not).
*/ template -using HostAllocator = Allocator; +void changePinningPolicy(HostVector *v, PinningPolicy pinningPolicy) +{ + // Do we have anything to do? + HostAllocationPolicy vAllocationPolicy = v->get_allocator().getPolicy(); + if (pinningPolicy == vAllocationPolicy.pinningPolicy()) + { + return; + } + // Make sure we never have two allocated buffers that are both pinned. + vAllocationPolicy.unpin(); + + // Construct a new vector that has the requested + // allocation+pinning policy, to swap into *v. If *v is empty, + // then no real work is done. + HostAllocator newAllocator; + newAllocator.getPolicy().setPinningPolicy(pinningPolicy); + HostVector newV(v->begin(), v->end(), newAllocator); + // Replace the contents of *v, including the stateful allocator. + v->swap(newV); + // The destructor of newV cleans up the memory formerly managed by *v. +} } // namespace gmx diff --git a/src/gromacs/gpu_utils/pinning.cu b/src/gromacs/gpu_utils/pinning.cu new file mode 100644 index 0000000000..fb95d90f24 --- /dev/null +++ b/src/gromacs/gpu_utils/pinning.cu @@ -0,0 +1,124 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief Implements functions for pinning memory to be suitable for + * efficient GPU transfers on CUDA. + * + * \author Mark Abraham + */ +#include "gmxpre.h" + +#include "pinning.h" + +#include + +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/utility/alignedallocator.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/stringutil.h" + +namespace gmx +{ + +//! Is \c ptr aligned on a boundary that is a multiple of \c bytes. 
+static inline bool isAligned(const void *ptr, size_t bytes) +{ + return (reinterpret_cast(ptr) % bytes) == 0; +} + +void pinBuffer(void *pointer, std::size_t numBytes) noexcept +{ + const char *errorMessage = "Could not register the host memory for page locking for GPU transfers."; + + GMX_ASSERT(isAligned(pointer, PageAlignedAllocationPolicy::alignment()), + formatString("%s Host memory needs to be page aligned.", errorMessage).c_str()); + + ensureNoPendingCudaError(errorMessage); + cudaError_t stat = cudaHostRegister(pointer, numBytes, cudaHostRegisterDefault); + + // These errors can only arise from a coding error somewhere. + GMX_RELEASE_ASSERT(stat != cudaErrorInvalidValue && + stat != cudaErrorNotSupported && + stat != cudaErrorHostMemoryAlreadyRegistered, + formatString("%s %s: %s", errorMessage, + cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str()); + + // We always handle the error, but if it's a type we didn't expect + // (e.g. because CUDA changes the set of errors it returns) then + // we should get a descriptive assertion in Debug mode so we know + // to fix our expectations. + GMX_ASSERT(stat != cudaErrorMemoryAllocation, + formatString("%s %s: %s which was an unexpected error", errorMessage, + cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str()); + + // It might be preferable to throw InternalError here, because the + // failing condition can only happen when GROMACS is used with a + // CUDA API that can return some other error code. But we can't + // engineer GROMACS to be forward-compatible with future CUDA + // versions, so if this proves to be a problem in practice, then + // GROMACS must be patched, or a supported CUDA version used. + GMX_RELEASE_ASSERT(stat == cudaSuccess, + formatString("%s %s: %s", errorMessage, + cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str()); +} + +void unpinBuffer(void *pointer) noexcept +{ + const char *errorMessage = "Could not unregister pinned host memory used for GPU transfers."; + + GMX_ASSERT(pointer != nullptr, + formatString("%s pointer should not be nullptr when pinned.", errorMessage).c_str()); + + ensureNoPendingCudaError(errorMessage); + cudaError_t stat = cudaHostUnregister(pointer); + // These errors can only arise from a coding error somewhere. + GMX_RELEASE_ASSERT(stat != cudaErrorInvalidValue && stat != cudaErrorHostMemoryNotRegistered, + formatString("%s %s: %s", errorMessage, + cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str()); + // If there's an error whose type we didn't expect (e.g. because a + // future CUDA changes the set of errors it returns) then we + // should assert, because our code is wrong. + // + // The approach differs from that in pin() because we might + // unpin() from a destructor, in which case any attempt to throw + // an uncaught exception would anyway terminate the program. A + // release assertion is a better behaviour than that. + GMX_RELEASE_ASSERT(stat == cudaSuccess, + formatString("%s %s: %s which was an unexpected error", errorMessage, + cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str()); +} + +} // namespace gmx diff --git a/src/gromacs/gpu_utils/hostallocator.cu b/src/gromacs/gpu_utils/pinning.h similarity index 55% rename from src/gromacs/gpu_utils/hostallocator.cu rename to src/gromacs/gpu_utils/pinning.h index bde92207da..267b4f392b 100644 --- a/src/gromacs/gpu_utils/hostallocator.cu +++ b/src/gromacs/gpu_utils/pinning.h @@ -33,69 +33,32 @@ * the research papers on the package. Check out http://www.gromacs.org. */ /*! 
\internal \file
- * \brief Implements gmx::HostAllocationPolicy for allocating memory
- * suitable for GPU transfers on CUDA.
+ * \brief Declares functions for pinning memory to be suitable for
+ * efficient GPU transfers on CUDA.
 *
 * \author Mark Abraham
 */
-#include "gmxpre.h"
-#include "hostallocator.h"
-
-#include 
-
-#include "gromacs/utility/alignedallocator.h"
+#include <cstddef>
 
 namespace gmx
 {
 
-HostAllocationPolicy::HostAllocationPolicy(Impl s) : allocateForGpu_(s) {}
-
-void *
-HostAllocationPolicy::malloc(std::size_t bytes) const
-{
-    void *buffer = nullptr;
-    if (allocateForGpu_ == Impl::AllocateForGpu)
-    {
-        if (bytes != 0)
-        {
-            // Alternatively, this could become a pair of
-            // e.g. PageAlignedAllocationPolicy and cudaHostRegister
-            // calls if that is useful for something.
-            cudaError_t stat = cudaMallocHost(&buffer, bytes, cudaHostAllocDefault);
-            // TODO Throw an exception upon failure, particularly
-            // for cudaErrorMemoryAllocation.
-            if (stat != cudaSuccess)
-            {
-                buffer = nullptr;
-            }
-        }
-    }
-    else
-    {
-        buffer = AlignedAllocationPolicy::malloc(bytes);
-    }
-    return buffer;
-}
-
-void
-HostAllocationPolicy::free(void *buffer) const
-{
-    if (buffer == nullptr)
-    {
-        return;
-    }
-    if (allocateForGpu_ == Impl::AllocateForGpu)
-    {
-        cudaFreeHost(buffer);
-        return;
-    }
-    AlignedAllocationPolicy::free(buffer);
-}
+/*! \brief Pin the allocation to physical memory.
+ *
+ * Requires that \c pointer is not nullptr.
+ *
+ * Does not throw.
+ */
+void pinBuffer(void *pointer, std::size_t numBytes) noexcept;
 
-HostAllocationPolicy makeHostAllocationPolicyForGpu()
-{
-    return HostAllocationPolicy(HostAllocationPolicy::Impl::AllocateForGpu);
-}
+/*! \brief Unpin the allocation.
+ *
+ * Requires that \c pointer is not nullptr and was previously pinned
+ * with pinBuffer().
+ *
+ * Does not throw.
+ */
+void unpinBuffer(void *pointer) noexcept;
 
 } // namespace gmx
diff --git a/src/gromacs/gpu_utils/tests/devicetransfers.cpp b/src/gromacs/gpu_utils/tests/devicetransfers.cpp
index 039f4a0583..733cac2245 100644
--- a/src/gromacs/gpu_utils/tests/devicetransfers.cpp
+++ b/src/gromacs/gpu_utils/tests/devicetransfers.cpp
@@ -42,15 +42,20 @@
 
 #include "devicetransfers.h"
 
+#include <algorithm>
+
 #include "gromacs/utility/arrayref.h"
 
 namespace gmx
 {
 
-void doDeviceTransfers(const gmx_gpu_info_t & /*gpuInfo*/,
-                       ArrayRef<const char> /*input*/,
-                       ArrayRef<char> /* output */)
+void doDeviceTransfers(const gmx_gpu_info_t & /*gpuInfo*/,
+                       ArrayRef<const char> input,
+                       ArrayRef<char> output)
 {
+    GMX_RELEASE_ASSERT(input.size() == output.size(), "Input and output must have matching size");
+    // We can't have any valid GPUs for this build configuration.
+ std::copy(input.begin(), input.end(), output.begin()); } } // namespace gmx diff --git a/src/gromacs/gpu_utils/tests/devicetransfers.cu b/src/gromacs/gpu_utils/tests/devicetransfers.cu index 940712d489..e2d6a0d2ba 100644 --- a/src/gromacs/gpu_utils/tests/devicetransfers.cu +++ b/src/gromacs/gpu_utils/tests/devicetransfers.cu @@ -79,8 +79,12 @@ void doDeviceTransfers(const gmx_gpu_info_t &gpuInfo, ArrayRef output) { GMX_RELEASE_ASSERT(input.size() == output.size(), "Input and output must have matching size"); + if (gpuInfo.n_dev == 0) + { + std::copy(input.begin(), input.end(), output.begin()); + return; + } cudaError_t status; - GMX_RELEASE_ASSERT(gpuInfo.n_dev > 0, "Must have a GPU device"); const auto &device = gpuInfo.gpu_dev[0]; int oldDeviceId; diff --git a/src/gromacs/gpu_utils/tests/devicetransfers.h b/src/gromacs/gpu_utils/tests/devicetransfers.h index 7ea88a1723..9ce7a42815 100644 --- a/src/gromacs/gpu_utils/tests/devicetransfers.h +++ b/src/gromacs/gpu_utils/tests/devicetransfers.h @@ -60,8 +60,9 @@ namespace gmx /*! \brief Helper function for GPU test code to be platform agnostic. * - * Transfers \c input to device 0, which must be present, and - * transfers it back into \c output. Both sizes must match. + * Transfers \c input to device 0, if present, and transfers it back + * into \c output. Both sizes must match. If no devices are present, + * do a simple host-side buffer copy instead. * * \throws InternalError Upon any GPU API error condition. */ void doDeviceTransfers(const gmx_gpu_info_t &gpuInfo, diff --git a/src/gromacs/gpu_utils/tests/hostallocator.cpp b/src/gromacs/gpu_utils/tests/hostallocator.cpp index 689987bec7..809cce2b5a 100644 --- a/src/gromacs/gpu_utils/tests/hostallocator.cpp +++ b/src/gromacs/gpu_utils/tests/hostallocator.cpp @@ -42,12 +42,16 @@ #include "gromacs/gpu_utils/hostallocator.h" +#include "config.h" + #include #include #include +#include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/math/vectypes.h" +#include "gromacs/utility/arrayref.h" #include "gromacs/utility/real.h" #include "devicetransfers.h" @@ -59,25 +63,20 @@ namespace gmx namespace { -//! The types used in testing. -typedef ::testing::Types TestTypes; - -//! Typed test fixture +/*! \internal \brief Typed test fixture for infrastructure for + * host-side memory used for GPU transfers. */ template -class HostAllocatorTest : public test::GpuTest +class HostMemoryTest : public test::GpuTest { public: //! Convenience type using ValueType = T; //! Convenience type - using AllocatorType = HostAllocator; - //! Convenience type - using VectorType = std::vector; - //! Convenience type using ViewType = ArrayRef; //! Convenience type using ConstViewType = ArrayRef; //! Prepare contents of a VectorType. + template void fillInput(VectorType *input) const; //! Compares input and output vectors. void compareVectors(ConstViewType input, @@ -87,25 +86,30 @@ class HostAllocatorTest : public test::GpuTest }; // Already documented -template -void HostAllocatorTest::fillInput(VectorType *input) const +template template +void HostMemoryTest::fillInput(VectorType *input) const { - input->push_back(1); - input->push_back(2); - input->push_back(3); + input->resize(3); + (*input)[0] = 1; + (*input)[1] = 2; + (*input)[2] = 3; } //! 
Initialization specialization for RVec -template <> -void HostAllocatorTest::fillInput(VectorType *input) const +template <> template +void HostMemoryTest::fillInput(VectorType *input) const { - input->push_back({1, 2, 3}); + input->reserve(3); + input->resize(3); + (*input)[0] = {1, 2, 3}; + (*input)[1] = {4, 5, 6}; + (*input)[2] = {7, 8, 9}; } // Already documented template -void HostAllocatorTest::compareVectors(ConstViewType input, - ConstViewType output) const +void HostMemoryTest::compareVectors(ConstViewType input, + ConstViewType output) const { for (size_t i = 0; i != input.size(); ++i) { @@ -115,8 +119,8 @@ void HostAllocatorTest::compareVectors(ConstViewType input, //! Comparison specialization for RVec template <> -void HostAllocatorTest::compareVectors(ConstViewType input, - ConstViewType output) const +void HostMemoryTest::compareVectors(ConstViewType input, + ConstViewType output) const { for (size_t i = 0; i != input.size(); ++i) { @@ -146,15 +150,8 @@ ArrayRef charArrayRefFromArray(T *data, size_t size) } template -void HostAllocatorTest::runTest(ConstViewType input, ViewType output) const +void HostMemoryTest::runTest(ConstViewType input, ViewType output) const { - // We can't do a test that does a transfer unless we have a - // compatible device. - if (!this->haveValidGpus()) - { - return; - } - // Convert the views of input and output to flat non-const chars, // so that there's no templating when we call doDeviceTransfers. auto inputRef = charArrayRefFromArray(input.data(), input.size()); @@ -164,6 +161,22 @@ void HostAllocatorTest::runTest(ConstViewType input, ViewType output) const this->compareVectors(input, output); } +//! The types used in testing. +typedef ::testing::Types TestTypes; + +//! Typed test fixture +template +class HostAllocatorTest : public HostMemoryTest +{ + public: + //! Convenience type + using ValueType = T; + //! Convenience type + using AllocatorType = HostAllocator; + //! Convenience type + using VectorType = std::vector; +}; + TYPED_TEST_CASE(HostAllocatorTest, TestTypes); // Note that in GoogleTest typed tests, the use of TestFixture:: and @@ -178,48 +191,132 @@ TYPED_TEST(HostAllocatorTest, EmptyMemoryAlwaysWorks) typename TestFixture::VectorType v; } -TYPED_TEST(HostAllocatorTest, TransfersUsingDefaultHostAllocatorWork) +TYPED_TEST(HostAllocatorTest, VectorsWithDefaultHostAllocatorAlwaysWorks) { typename TestFixture::VectorType input = {{1, 2, 3}}, output; output.resize(input.size()); +} + +// Several tests actually do CUDA transfers. This is not necessary +// because the state of page alignment or pinning is not currently +// relevant to the success of a CUDA transfer. CUDA checks happen only +// during cudaHostRegister and cudaHostUnregister. Such tests are of +// value only when this behaviour changes, if ever. + +TYPED_TEST(HostAllocatorTest, TransfersWithoutPinningWork) +{ + typename TestFixture::VectorType input; + this->fillInput(&input); + typename TestFixture::VectorType output; + output.resize(input.size()); this->runTest(input, output); } -TYPED_TEST(HostAllocatorTest, TransfersUsingNormalCpuHostAllocatorWork) +TYPED_TEST(HostAllocatorTest, FillInputAlsoWorksAfterCallingReserve) { - // Make an allocator with a 'normal CPU' allocation policy. This - // might be slower than another policy, but still works. 
- using AllocatorType = typename TestFixture::AllocatorType; - using AllocatorPolicyType = typename AllocatorType::allocation_policy; - AllocatorPolicyType policy(AllocatorPolicyType::Impl::AllocateAligned); - AllocatorType allocator(policy); + typename TestFixture::VectorType input; + input.reserve(3); + this->fillInput(&input); +} + +#if GMX_GPU == GMX_GPU_CUDA - typename TestFixture::VectorType input(allocator); +// Policy suitable for pinning is only supported for a CUDA build + +TYPED_TEST(HostAllocatorTest, TransfersWithPinningWorkWithCuda) +{ + typename TestFixture::VectorType input; + changePinningPolicy(&input, PinningPolicy::CanBePinned); this->fillInput(&input); - typename TestFixture::VectorType output(allocator); + typename TestFixture::VectorType output; + changePinningPolicy(&output, PinningPolicy::CanBePinned); output.resize(input.size()); this->runTest(input, output); } -TYPED_TEST(HostAllocatorTest, TransfersUsingGpuHostAllocatorWork) +//! Helper function for wrapping a call to isHostMemoryPinned. +template +bool isPinned(const VectorType &v) { - // Make an allocator with a 'for GPU' allocation policy. This - // should be more efficient, but we can't test that. - using AllocatorType = typename TestFixture::AllocatorType; - using AllocatorPolicyType = typename AllocatorType::allocation_policy; - AllocatorPolicyType policy(AllocatorPolicyType::Impl::AllocateForGpu); - AllocatorType allocator(policy); + void *data = const_cast(static_cast(v.data())); + return isHostMemoryPinned(data); +} + +TYPED_TEST(HostAllocatorTest, ManualPinningOperationsWorkWithCuda) +{ + typename TestFixture::VectorType input; + changePinningPolicy(&input, PinningPolicy::CanBePinned); + EXPECT_FALSE(isPinned(input)); + + // Unpin before allocation is fine, but does nothing. + input.get_allocator().getPolicy().unpin(); + EXPECT_FALSE(isPinned(input)); - typename TestFixture::VectorType input(allocator); + // Pin with no contents is fine, but does nothing. + input.get_allocator().getPolicy().pin(); + EXPECT_FALSE(isPinned(input)); + + // Fill some contents, which will be pinned because of the policy. this->fillInput(&input); - typename TestFixture::VectorType output(allocator); - output.resize(input.size()); + EXPECT_TRUE(isPinned(input)); - this->runTest(input, output); + // Unpin after pin is fine. + input.get_allocator().getPolicy().unpin(); + EXPECT_FALSE(isPinned(input)); + + // Repeated unpin should be a no-op. + input.get_allocator().getPolicy().unpin(); + + // Pin after unpin is fine. + input.get_allocator().getPolicy().pin(); + EXPECT_TRUE(isPinned(input)); + + // Repeated pin should be a no-op, and still pinned. + input.get_allocator().getPolicy().pin(); + EXPECT_TRUE(isPinned(input)); + + // Switching policy to CannotBePinned must unpin the buffer (via + // realloc and copy). + auto oldInputData = input.data(); + changePinningPolicy(&input, PinningPolicy::CannotBePinned); + EXPECT_FALSE(isPinned(input)); + // These cannot be equal as both had to be allocated at the same + // time for the contents to be able to be copied. + EXPECT_NE(oldInputData, input.data()); + + // Switching policy to CanBePinned must pin the buffer (via + // realloc and copy). + oldInputData = input.data(); + changePinningPolicy(&input, PinningPolicy::CanBePinned); + EXPECT_TRUE(isPinned(input)); + // These cannot be equal as both had to be allocated at the same + // time for the contents to be able to be copied. 
+ EXPECT_NE(oldInputData, input.data()); +} + +#else + +TYPED_TEST(HostAllocatorTest, ChangingPinningPolicyRequiresCuda) +{ + typename TestFixture::VectorType input; + EXPECT_DEATH(changePinningPolicy(&input, PinningPolicy::CanBePinned), + ".*A suitable build of GROMACS.* is required.*"); } +TYPED_TEST(HostAllocatorTest, ManualPinningOperationsWorkEvenWithoutCuda) +{ + typename TestFixture::VectorType input; + + // Since the buffer can't be pinned and isn't pinned, and the + // calling code can't be unhappy about this, these are OK. + input.get_allocator().getPolicy().pin(); + input.get_allocator().getPolicy().unpin(); +} + +#endif + TYPED_TEST(HostAllocatorTest, StatefulAllocatorUsesMemory) { // The HostAllocator has state, so a container using it will be @@ -229,5 +326,16 @@ TYPED_TEST(HostAllocatorTest, StatefulAllocatorUsesMemory) sizeof(typename TestFixture::VectorType)); } +//! Declare allocator types to test. +using AllocatorTypesToTest = ::testing::Types, + HostAllocator, + HostAllocator + >; + +TYPED_TEST_CASE(AllocatorTest, AllocatorTypesToTest); + } // namespace } // namespace + +// Includes tests common to all allocation policies. +#include "gromacs/utility/tests/alignedallocator-impl.h" diff --git a/src/gromacs/mdlib/mdatoms.cpp b/src/gromacs/mdlib/mdatoms.cpp index 31efd4e7bd..fe998ff9c5 100644 --- a/src/gromacs/mdlib/mdatoms.cpp +++ b/src/gromacs/mdlib/mdatoms.cpp @@ -43,6 +43,7 @@ #include #include "gromacs/compat/make_unique.h" +#include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/math/functions.h" #include "gromacs/mdlib/gmx_omp_nthreads.h" #include "gromacs/mdlib/qmmm.h" @@ -51,7 +52,6 @@ #include "gromacs/topology/mtop_lookup.h" #include "gromacs/topology/mtop_util.h" #include "gromacs/topology/topology.h" -#include "gromacs/utility/alignedallocator.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/smalloc.h" @@ -60,8 +60,8 @@ namespace gmx { -MDAtoms::MDAtoms(HostAllocationPolicy policy) - : mdatoms_(nullptr), chargeA_(policy) +MDAtoms::MDAtoms() + : mdatoms_(nullptr), chargeA_() { } @@ -81,10 +81,9 @@ std::unique_ptr makeMDAtoms(FILE *fp, const gmx_mtop_t &mtop, const t_inputrec &ir, bool useGpuForPme) { - auto policy = (useGpuForPme ? - makeHostAllocationPolicyForGpu() : - HostAllocationPolicy()); - auto mdAtoms = compat::make_unique(policy); + auto mdAtoms = compat::make_unique(); + // GPU transfers want to use the pinning mode. + changePinningPolicy(&mdAtoms->chargeA_, useGpuForPme ? PinningPolicy::CanBePinned : PinningPolicy::CannotBePinned); t_mdatoms *md; snew(md, 1); mdAtoms->mdatoms_.reset(md); diff --git a/src/gromacs/mdlib/mdatoms.h b/src/gromacs/mdlib/mdatoms.h index b537e0df2f..8e708b3e0d 100644 --- a/src/gromacs/mdlib/mdatoms.h +++ b/src/gromacs/mdlib/mdatoms.h @@ -69,11 +69,11 @@ class MDAtoms //! C-style mdatoms struct. unique_cptr mdatoms_; //! Memory for chargeA that can be set up for efficient GPU transfer. - std::vector < real, HostAllocator < real>> chargeA_; + HostVector chargeA_; public: // TODO make this private //! Constructor. - MDAtoms(HostAllocationPolicy policy); + MDAtoms(); //! Getter. t_mdatoms *mdatoms() { diff --git a/src/gromacs/utility/allocator.h b/src/gromacs/utility/allocator.h index 50ee1f0834..3d243adb71 100644 --- a/src/gromacs/utility/allocator.h +++ b/src/gromacs/utility/allocator.h @@ -201,6 +201,12 @@ class Allocator : public AllocationPolicy AllocationPolicy::free(p); } + //! Return the policy object for this allocator. + AllocationPolicy getPolicy() const + { + return *this; + } + /*! 
\brief Construct an object without allocating memory * * \tparam Args Variable-length list of types for constructor args diff --git a/src/gromacs/utility/tests/alignedallocator-impl.h b/src/gromacs/utility/tests/alignedallocator-impl.h new file mode 100644 index 0000000000..d592288200 --- /dev/null +++ b/src/gromacs/utility/tests/alignedallocator-impl.h @@ -0,0 +1,129 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2015,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * \brief Tests for allocators that offer a minimum alignment. + * + * This implementation header can be included in multiple modules + * tests, which is currently needed because gpu_utils is physically + * separate from the utility module. + * + * \author Erik Lindahl + * \author Mark Abraham + * \inlibraryapi + * \ingroup module_utility + */ +#ifndef GMX_UTILITY_TESTS_ALIGNEDALLOCATOR_IMPL_H +#define GMX_UTILITY_TESTS_ALIGNEDALLOCATOR_IMPL_H + +#include + +#include + +#include + +#include "gromacs/math/vectypes.h" +#include "gromacs/utility/real.h" + +namespace gmx +{ +namespace test +{ + +/*! \libinternal + * \brief Templated test fixture. */ +template +class AllocatorTest : public ::testing::Test +{ + public: + /*! \brief Return a bitmask for testing the alignment. + * + * e.g. for 128-byte alignment the mask is 128-1 - all of + * these bits should be zero in pointers that have the + * intended alignment. 
*/ + std::size_t mask(const T &allocator) + { + return allocator.getPolicy().alignment() - 1; + } +}; + +// NB need to use this->mask() because of GoogleTest quirks + +TYPED_TEST(AllocatorTest, AllocatorAlignAllocatesWithAlignment) +{ + using pointer = typename TypeParam::pointer; + TypeParam a; + pointer p = a.allocate(1000); + + EXPECT_EQ(0, reinterpret_cast(p) & this->mask(a)); + a.deallocate(p, 1000); +} + + +TYPED_TEST(AllocatorTest, VectorAllocatesAndResizesWithAlignment) +{ + using value_type = typename TypeParam::value_type; + std::vector v(10); + EXPECT_EQ(0, reinterpret_cast(v.data()) & this->mask(v.get_allocator())); + + // Reserve a few times to check things work ok, making sure we + // will trigger several reallocations on common vector + // implementations. + for (std::size_t i = 1000; i <= 10000; i += 1000) + { + v.resize(i); + EXPECT_EQ(0, reinterpret_cast(v.data()) & this->mask(v.get_allocator())); + } +} + +TYPED_TEST(AllocatorTest, VectorAllocatesAndReservesWithAlignment) +{ + using value_type = typename TypeParam::value_type; + std::vector v(10); + EXPECT_EQ(0, reinterpret_cast(v.data()) & this->mask(v.get_allocator())); + + // Reserve a few times to check things work ok, making sure we + // will trigger several reallocations on common vector + // implementations. + for (std::size_t i = 1000; i <= 10000; i += 1000) + { + v.reserve(i); + EXPECT_EQ(0, reinterpret_cast(v.data()) & this->mask(v.get_allocator())); + } +} + +} // namespace +} // namespace + +#endif diff --git a/src/gromacs/utility/tests/alignedallocator.cpp b/src/gromacs/utility/tests/alignedallocator.cpp index 09dc610157..11a695336f 100644 --- a/src/gromacs/utility/tests/alignedallocator.cpp +++ b/src/gromacs/utility/tests/alignedallocator.cpp @@ -33,7 +33,7 @@ * the research papers on the package. Check out http://www.gromacs.org. */ /*! \internal \file - * \brief Tests for gmx::AlignedAllocator + * \brief Tests for gmx::AlignedAllocator and gmx::PageAlignedAllocator. * * \author Erik Lindahl * \author Mark Abraham @@ -49,23 +49,11 @@ #include #include "gromacs/math/vectypes.h" -#include "gromacs/utility/real.h" namespace gmx { - -//! Templated test fixture. -template -class AllocatorTest : public ::testing::Test +namespace test { - public: - /*! \brief Bitmask for testing the alignment. - * - * e.g. for 128-byte alignment the mask is 128-1 - all of - * these bits should be zero in pointers that have the - * intended alignment. */ - std::size_t mask_ = T::allocation_policy::alignment()-1; -}; //! Declare allocator types to test. using AllocatorTypesToTest = ::testing::Types, @@ -75,52 +63,19 @@ using AllocatorTypesToTest = ::testing::Types, AlignedAllocator, PageAlignedAllocator >; -TYPED_TEST_CASE(AllocatorTest, AllocatorTypesToTest); -// NB need to use this->mask_ because of GoogleTest quirks - -TYPED_TEST(AllocatorTest, AllocatorAlignAllocatesWithAlignment) -{ - using pointer = typename TypeParam::pointer; - TypeParam a; - pointer p = a.allocate(1000); +TYPED_TEST_CASE(AllocatorTest, AllocatorTypesToTest); - EXPECT_EQ(0, reinterpret_cast(p) & this->mask_); - a.deallocate(p, 1000); -} +} // namespace +} // namespace +// Includes tests common to all allocation policies. 
+#include "gromacs/utility/tests/alignedallocator-impl.h" -TYPED_TEST(AllocatorTest, VectorAllocatesAndResizesWithAlignment) +namespace gmx { - using value_type = typename TypeParam::value_type; - std::vector v(10); - EXPECT_EQ(0, reinterpret_cast(v.data()) & this->mask_); - - // Reserve a few times to check things work ok, making sure we - // will trigger several reallocations on common vector - // implementations. - for (std::size_t i = 1000; i <= 10000; i += 1000) - { - v.resize(i); - EXPECT_EQ(0, reinterpret_cast(v.data()) & this->mask_); - } -} - -TYPED_TEST(AllocatorTest, VectorAllocatesAndReservesWithAlignment) +namespace test { - using value_type = typename TypeParam::value_type; - std::vector v(10); - EXPECT_EQ(0, reinterpret_cast(v.data()) & this->mask_); - - // Reserve a few times to check things work ok, making sure we - // will trigger several reallocations on common vector - // implementations. - for (std::size_t i = 1000; i <= 10000; i += 1000) - { - v.reserve(i); - EXPECT_EQ(0, reinterpret_cast(v.data()) & this->mask_); - } -} TYPED_TEST(AllocatorTest, StatelessAllocatorUsesNoMemory) { @@ -129,4 +84,5 @@ TYPED_TEST(AllocatorTest, StatelessAllocatorUsesNoMemory) sizeof(std::vector)); } -} +} // namespace +} // namespace -- 2.22.0