Set up event consumption counter in DomDecMpiTests
[alexxy/gromacs.git] / src / gromacs / domdec / tests / haloexchange_mpi.cpp
index be86dbe01f396d64451c7314287dac4e9ac3355d..93aad8c288d62356af6124ba40995e6075bc5f67 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 /*! \internal \file
  * \brief Tests for the halo exchange
  *
- *  The test sets up a 2D rank topology and performs a coordinate halo
- *  exchange (for both CPU and GPU codepaths), with 2 pulses in
- *  the first dimension and 1 pulse in the second. Each pulse involves
- *  a few non-contiguous indices. The sending rank, atom number and
- *  spatial 3D index are encoded in the x values, to allow correctness
- *  checking following the halo exchange.
+ *  The test sets up the rank topology and performs a coordinate halo
+ *  exchange (for both CPU and GPU codepaths) for several 1D and 2D
+ *  pulse configurations. Each pulse involves a few non-contiguous
+ *  indices. The sending rank, atom number and spatial 3D index are
+ *  encoded in the x values, to allow correctness checking following
+ *  the halo exchange.
  *
- * \todo Add more test variations
+ * \todo Add 3D case
  *
  * \author Alan Gray <alang@nvidia.com>
  * \ingroup module_domdec
 
 #include "gmxpre.h"
 
+#include "config.h"
+
 #include <array>
+#include <numeric>
+#include <vector>
 
 #include <gtest/gtest.h>
 
@@ -60,8 +64,8 @@
 #if GMX_GPU_CUDA
 #    include "gromacs/gpu_utils/device_stream.h"
 #    include "gromacs/gpu_utils/devicebuffer.h"
-#    include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #endif
+#include "gromacs/gpu_utils/gpueventsynchronizer.h"
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/mdtypes/inputrec.h"
 
@@ -108,7 +112,6 @@ void initHaloData(RVec* x, const int numHomeAtoms, const int numAtomsTotal)
     }
 }
 
-#if (GMX_GPU_CUDA && GMX_THREAD_MPI)
 /*! \brief Perform GPU halo exchange, including required setup and data transfers
  *
  * \param [in] dd             Domain decomposition object
@@ -116,8 +119,11 @@ void initHaloData(RVec* x, const int numHomeAtoms, const int numAtomsTotal)
  * \param [in] h_x            Atom coordinate data array on host
  * \param [in] numAtomsTotal  Total number of atoms, including halo
  */
-void gpuHalo(gmx_domdec_t* dd, matrix box, RVec* h_x, int numAtomsTotal)
+void gpuHalo(gmx_domdec_t* dd, matrix box, HostVector<RVec>* h_x, int numAtomsTotal)
 {
+#if (GMX_GPU_CUDA && GMX_THREAD_MPI)
+    // pin memory if possible
+    changePinningPolicy(h_x, PinningPolicy::PinnedIfSupported);
     // Set up GPU hardware environment and assign this MPI rank to a device
     int rank;
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
@@ -133,33 +139,70 @@ void gpuHalo(gmx_domdec_t* dd, matrix box, RVec* h_x, int numAtomsTotal)
     int                d_x_size_alloc = -1;
     reallocateDeviceBuffer(&d_x, numAtomsTotal, &d_x_size, &d_x_size_alloc, deviceContext);
 
-    copyToDeviceBuffer(&d_x, h_x, 0, numAtomsTotal, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyToDeviceBuffer(&d_x, h_x->data(), 0, numAtomsTotal, deviceStream, GpuApiCallBehavior::Sync, nullptr);
 
-    GpuEventSynchronizer coordinatesReadyOnDeviceEvent;
+    const int numPulses = std::accumulate(
+            dd->comm->cd.begin(), dd->comm->cd.end(), 0, [](const int a, const auto& b) {
+                return a + b.numPulses();
+            });
+    const int numExtraConsumptions = GMX_THREAD_MPI ? 1 : 0;
+    // Will be consumed once for each pulse, and, with tMPI, once more for dim=0,pulse=0 case
+    GpuEventSynchronizer coordinatesReadyOnDeviceEvent(numPulses + numExtraConsumptions,
+                                                       numPulses + numExtraConsumptions);
     coordinatesReadyOnDeviceEvent.markEvent(deviceStream);
 
+    std::array<std::vector<GpuHaloExchange>, DIM> gpuHaloExchange;
+
+    // Create halo exchange objects
+    for (int d = 0; d < dd->ndim; d++)
+    {
+        for (int pulse = 0; pulse < dd->comm->cd[d].numPulses(); pulse++)
+        {
+            gpuHaloExchange[d].push_back(
+                    GpuHaloExchange(dd, d, MPI_COMM_WORLD, deviceContext, pulse, nullptr));
+        }
+    }
+
     // Perform GPU halo exchange
     for (int d = 0; d < dd->ndim; d++)
     {
         for (int pulse = 0; pulse < dd->comm->cd[d].numPulses(); pulse++)
         {
-            GpuHaloExchange gpuHaloExchange(dd, d, MPI_COMM_WORLD, deviceContext, deviceStream,
-                                            deviceStream, pulse, nullptr);
-            gpuHaloExchange.reinitHalo(d_x, nullptr);
-            gpuHaloExchange.communicateHaloCoordinates(box, &coordinatesReadyOnDeviceEvent);
+            gpuHaloExchange[d][pulse].reinitHalo(d_x, nullptr);
+            gpuHaloExchange[d][pulse].communicateHaloCoordinates(box, &coordinatesReadyOnDeviceEvent);
         }
     }
+    // Barrier is needed to avoid other threads using events after its owner has exited and destroyed the context.
+    MPI_Barrier(MPI_COMM_WORLD);
 
-    GpuEventSynchronizer haloCompletedEvent;
-    haloCompletedEvent.markEvent(deviceStream);
-    haloCompletedEvent.waitForEvent();
+    deviceStream.synchronize();
 
     // Copy results back to host
-    copyFromDeviceBuffer(h_x, &d_x, 0, numAtomsTotal, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+    copyFromDeviceBuffer(
+            h_x->data(), &d_x, 0, numAtomsTotal, deviceStream, GpuApiCallBehavior::Sync, nullptr);
 
     freeDeviceBuffer(d_x);
-}
+#else
+    GMX_UNUSED_VALUE(dd);
+    GMX_UNUSED_VALUE(box);
+    GMX_UNUSED_VALUE(h_x);
+    GMX_UNUSED_VALUE(numAtomsTotal);
 #endif
+}
+
+/*! \brief Define 1D rank topology with 4 MPI tasks
+ *
+ * \param [in] dd  Domain decomposition object
+ */
+void define1dRankTopology(gmx_domdec_t* dd)
+{
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    const int numRanks = getNumberOfTestMpiRanks();
+    dd->neighbor[0][0] = (rank + 1) % numRanks;
+    dd->neighbor[0][1] = (rank == 0) ? (numRanks - 1) : rank - 1;
+}
 
 /*! \brief Define 2D rank topology with 4 MPI tasks
  *
@@ -205,12 +248,142 @@ void define2dRankTopology(gmx_domdec_t* dd)
     }
 }
 
+/*! \brief Define a 1D halo with 1 pulse
+ *
+ * \param [in] dd      Domain decomposition object
+ * \param [in] indvec  Vector of index vectors
+ */
+void define1dHaloWith1Pulse(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_t>* indvec)
+{
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    std::vector<int> indexvec;
+    gmx_domdec_ind_t ind;
+
+    dd->ndim     = 1;
+    int nzone    = 1;
+    int dimIndex = 0;
+
+    // Set up indices involved in halo
+    indexvec.clear();
+    indvec->clear();
+
+    dd->comm->cd[dimIndex].receiveInPlace = true;
+    dd->dim[dimIndex]                     = 0;
+    dd->ci[dimIndex]                      = rank;
+
+    // First pulse involves (arbitrary) indices 1 and 3
+    indexvec.push_back(1);
+    indexvec.push_back(3);
+
+    ind.index            = indexvec;
+    ind.nsend[nzone + 1] = 2;
+    ind.nrecv[nzone + 1] = 2;
+    indvec->push_back(ind);
+
+    dd->comm->cd[dimIndex].ind = *indvec;
+}
+
+/*! \brief Define a 1D halo with 2 pulses
+ *
+ * \param [in] dd      Domain decomposition object
+ * \param [in] indvec  Vector of index vectors
+ */
+void define1dHaloWith2Pulses(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_t>* indvec)
+{
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    std::vector<int> indexvec;
+    gmx_domdec_ind_t ind;
+
+    dd->ndim     = 1;
+    int nzone    = 1;
+    int dimIndex = 0;
+
+    // Set up indices involved in halo
+    indexvec.clear();
+    indvec->clear();
+
+    dd->comm->cd[dimIndex].receiveInPlace = true;
+    dd->dim[dimIndex]                     = 0;
+    dd->ci[dimIndex]                      = rank;
+
+    // First pulse involves (arbitrary) indices 1 and 3
+    indexvec.push_back(1);
+    indexvec.push_back(3);
+
+    ind.index            = indexvec;
+    ind.nsend[nzone + 1] = 2;
+    ind.nrecv[nzone + 1] = 2;
+    indvec->push_back(ind);
+
+    // Add another pulse with (arbitrary) indices 4,5,7
+    indexvec.clear();
+
+    indexvec.push_back(4);
+    indexvec.push_back(5);
+    indexvec.push_back(7);
+
+    ind.index            = indexvec;
+    ind.nsend[nzone + 1] = 3;
+    ind.nrecv[nzone + 1] = 3;
+    indvec->push_back(ind);
+
+    dd->comm->cd[dimIndex].ind = *indvec;
+}
+
+/*! \brief Define a 2D halo with 1 pulse in each dimension
+ *
+ * \param [in] dd      Domain decomposition object
+ * \param [in] indvec  Vector of index vectors
+ */
+void define2dHaloWith1PulseInEachDim(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_t>* indvec)
+{
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    std::vector<int> indexvec;
+    gmx_domdec_ind_t ind;
+
+    dd->ndim  = 2;
+    int nzone = 1;
+    for (int dimIndex = 0; dimIndex < dd->ndim; dimIndex++)
+    {
+
+        // Set up indices involved in halo
+        indexvec.clear();
+        indvec->clear();
+
+        dd->comm->cd[dimIndex].receiveInPlace = true;
+        dd->dim[dimIndex]                     = 0;
+        dd->ci[dimIndex]                      = rank;
+
+        // Single pulse involving (arbitrary) indices 1 and 3
+        indexvec.push_back(1);
+        indexvec.push_back(3);
+
+        ind.index            = indexvec;
+        ind.nsend[nzone + 1] = 2;
+        ind.nrecv[nzone + 1] = 2;
+        indvec->push_back(ind);
+
+        dd->comm->cd[dimIndex].ind = *indvec;
+
+        nzone += nzone;
+    }
+}
+
 /*! \brief Define a 2D halo with 2 pulses in the first dimension
  *
  * \param [in] dd      Domain decomposition object
  * \param [in] indvec  Vector of index vectors
  */
-void define2dHaloWith2PulsesInDim1(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_t> indvec)
+void define2dHaloWith2PulsesInDim1(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_t>* indvec)
 {
 
     int rank;
@@ -226,7 +399,7 @@ void define2dHaloWith2PulsesInDim1(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_
 
         // Set up indices involved in halo
         indexvec.clear();
-        indvec.clear();
+        indvec->clear();
 
         dd->comm->cd[dimIndex].receiveInPlace = true;
         dd->dim[dimIndex]                     = 0;
@@ -239,14 +412,12 @@ void define2dHaloWith2PulsesInDim1(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_
         ind.index            = indexvec;
         ind.nsend[nzone + 1] = 2;
         ind.nrecv[nzone + 1] = 2;
-        indvec.push_back(ind);
+        indvec->push_back(ind);
 
         if (dimIndex == 0) // Add another pulse with (arbitrary) indices 4,5,7
         {
             indexvec.clear();
 
-            dd->comm->cd[dimIndex].ind = indvec;
-
             indexvec.push_back(4);
             indexvec.push_back(5);
             indexvec.push_back(7);
@@ -254,15 +425,72 @@ void define2dHaloWith2PulsesInDim1(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_
             ind.index            = indexvec;
             ind.nsend[nzone + 1] = 3;
             ind.nrecv[nzone + 1] = 3;
-            indvec.push_back(ind);
+            indvec->push_back(ind);
         }
 
-        dd->comm->cd[dimIndex].ind = indvec;
+        dd->comm->cd[dimIndex].ind = *indvec;
 
         nzone += nzone;
     }
 }
 
+/*! \brief Check results for above-defined 1D halo with 1 pulse
+ *
+ * \param [in] x             Atom coordinate data array
+ * \param [in] dd            Domain decomposition object
+ * \param [in] numHomeAtoms  Number of home atoms
+ */
+void checkResults1dHaloWith1Pulse(const RVec* x, const gmx_domdec_t* dd, const int numHomeAtoms)
+{
+    // Check results are expected from values encoded in x data
+    for (int j = 0; j < DIM; j++)
+    {
+        // First Pulse in first dim: atoms 1 and 3 from forward horizontal neighbour
+        EXPECT_EQ(x[numHomeAtoms][j], encodedValue(dd->neighbor[0][0], 1, j));
+        EXPECT_EQ(x[numHomeAtoms + 1][j], encodedValue(dd->neighbor[0][0], 3, j));
+    }
+}
+
+/*! \brief Check results for above-defined 1D halo with 2 pulses
+ *
+ * \param [in] x             Atom coordinate data array
+ * \param [in] dd            Domain decomposition object
+ * \param [in] numHomeAtoms  Number of home atoms
+ */
+void checkResults1dHaloWith2Pulses(const RVec* x, const gmx_domdec_t* dd, const int numHomeAtoms)
+{
+    // Check results are expected from values encoded in x data
+    for (int j = 0; j < DIM; j++)
+    {
+        // First Pulse in first dim: atoms 1 and 3 from forward horizontal neighbour
+        EXPECT_EQ(x[numHomeAtoms][j], encodedValue(dd->neighbor[0][0], 1, j));
+        EXPECT_EQ(x[numHomeAtoms + 1][j], encodedValue(dd->neighbor[0][0], 3, j));
+        // Second Pulse in first dim: atoms 4,5,7 from forward horizontal neighbour
+        EXPECT_EQ(x[numHomeAtoms + 2][j], encodedValue(dd->neighbor[0][0], 4, j));
+        EXPECT_EQ(x[numHomeAtoms + 3][j], encodedValue(dd->neighbor[0][0], 5, j));
+        EXPECT_EQ(x[numHomeAtoms + 4][j], encodedValue(dd->neighbor[0][0], 7, j));
+    }
+}
+
+/*! \brief Check results for above-defined 2D halo with 1 pulse in each dimension
+ *
+ * \param [in] x             Atom coordinate data array
+ * \param [in] dd            Domain decomposition object
+ * \param [in] numHomeAtoms  Number of home atoms
+ */
+void checkResults2dHaloWith1PulseInEachDim(const RVec* x, const gmx_domdec_t* dd, const int numHomeAtoms)
+{
+    // Check results are expected from values encoded in x data
+    for (int j = 0; j < DIM; j++)
+    {
+        // First Pulse in first dim: atoms 1 and 3 from forward horizontal neighbour
+        EXPECT_EQ(x[numHomeAtoms][j], encodedValue(dd->neighbor[0][0], 1, j));
+        EXPECT_EQ(x[numHomeAtoms + 1][j], encodedValue(dd->neighbor[0][0], 3, j));
+        // First Pulse in second dim: atoms 1 and 3 from forward vertical neighbour
+        EXPECT_EQ(x[numHomeAtoms + 2][j], encodedValue(dd->neighbor[1][0], 1, j));
+        EXPECT_EQ(x[numHomeAtoms + 3][j], encodedValue(dd->neighbor[1][0], 3, j));
+    }
+}
 
 /*! \brief Check results for above-defined 2D halo with 2 pulses in the first dimension
  *
@@ -288,16 +516,128 @@ void checkResults2dHaloWith2PulsesInDim1(const RVec* x, const gmx_domdec_t* dd,
     }
 }
 
-TEST(HaloExchangeTest, Coordinates2dHaloWith2PulsesInDim1)
+TEST(HaloExchangeTest, Coordinates1dHaloWith1Pulse)
 {
-    GMX_MPI_TEST(4);
+    GMX_MPI_TEST(RequireRankCount<4>);
 
     // Set up atom data
     const int        numHomeAtoms  = 10;
-    const int        numHaloAtoms  = 7;
+    const int        numHaloAtoms  = 2;
+    const int        numAtomsTotal = numHomeAtoms + numHaloAtoms;
+    HostVector<RVec> h_x;
+    h_x.resize(numAtomsTotal);
+
+    initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+    // Set up dd
+    t_inputrec   ir;
+    gmx_domdec_t dd(ir);
+    dd.mpi_comm_all = MPI_COMM_WORLD;
+    gmx_domdec_comm_t comm;
+    dd.comm                      = &comm;
+    dd.unitCellInfo.haveScrewPBC = false;
+
+    DDAtomRanges atomRanges;
+    atomRanges.setEnd(DDAtomRanges::Type::Home, numHomeAtoms);
+    dd.comm->atomRanges = atomRanges;
+
+    define1dRankTopology(&dd);
+
+    std::vector<gmx_domdec_ind_t> indvec;
+    define1dHaloWith1Pulse(&dd, &indvec);
+
+    // Perform halo exchange
+    matrix box = { { 0., 0., 0. } };
+    dd_move_x(&dd, box, static_cast<ArrayRef<RVec>>(h_x), nullptr);
+
+    // Check results
+    checkResults1dHaloWith1Pulse(h_x.data(), &dd, numHomeAtoms);
+
+    if (GMX_GPU_CUDA && GMX_THREAD_MPI) // repeat with GPU halo codepath
+    {
+        // early return if no devices are available.
+        if (getTestHardwareEnvironment()->getTestDeviceList().empty())
+        {
+            return;
+        }
+
+        // Re-initialize input
+        initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+        // Perform GPU halo exchange
+        gpuHalo(&dd, box, &h_x, numAtomsTotal);
+
+        // Check results
+        checkResults1dHaloWith1Pulse(h_x.data(), &dd, numHomeAtoms);
+    }
+}
+
+TEST(HaloExchangeTest, Coordinates1dHaloWith2Pulses)
+{
+    GMX_MPI_TEST(RequireRankCount<4>);
+
+    // Set up atom data
+    const int        numHomeAtoms  = 10;
+    const int        numHaloAtoms  = 5;
+    const int        numAtomsTotal = numHomeAtoms + numHaloAtoms;
+    HostVector<RVec> h_x;
+    h_x.resize(numAtomsTotal);
+
+    initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+    // Set up dd
+    t_inputrec   ir;
+    gmx_domdec_t dd(ir);
+    dd.mpi_comm_all = MPI_COMM_WORLD;
+    gmx_domdec_comm_t comm;
+    dd.comm                      = &comm;
+    dd.unitCellInfo.haveScrewPBC = false;
+
+    DDAtomRanges atomRanges;
+    atomRanges.setEnd(DDAtomRanges::Type::Home, numHomeAtoms);
+    dd.comm->atomRanges = atomRanges;
+
+    define1dRankTopology(&dd);
+
+    std::vector<gmx_domdec_ind_t> indvec;
+    define1dHaloWith2Pulses(&dd, &indvec);
+
+    // Perform halo exchange
+    matrix box = { { 0., 0., 0. } };
+    dd_move_x(&dd, box, static_cast<ArrayRef<RVec>>(h_x), nullptr);
+
+    // Check results
+    checkResults1dHaloWith2Pulses(h_x.data(), &dd, numHomeAtoms);
+
+    if (GMX_GPU_CUDA && GMX_THREAD_MPI) // repeat with GPU halo codepath
+    {
+        // early return if no devices are available.
+        if (getTestHardwareEnvironment()->getTestDeviceList().empty())
+        {
+            return;
+        }
+
+        // Re-initialize input
+        initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+        // Perform GPU halo exchange
+        gpuHalo(&dd, box, &h_x, numAtomsTotal);
+
+        // Check results
+        checkResults1dHaloWith2Pulses(h_x.data(), &dd, numHomeAtoms);
+    }
+}
+
+
+TEST(HaloExchangeTest, Coordinates2dHaloWith1PulseInEachDim)
+{
+    GMX_MPI_TEST(RequireRankCount<4>);
+
+    // Set up atom data
+    const int        numHomeAtoms  = 10;
+    const int        numHaloAtoms  = 4;
     const int        numAtomsTotal = numHomeAtoms + numHaloAtoms;
     HostVector<RVec> h_x;
-    changePinningPolicy(&h_x, PinningPolicy::PinnedIfSupported);
     h_x.resize(numAtomsTotal);
 
     initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
@@ -317,25 +657,88 @@ TEST(HaloExchangeTest, Coordinates2dHaloWith2PulsesInDim1)
     define2dRankTopology(&dd);
 
     std::vector<gmx_domdec_ind_t> indvec;
-    define2dHaloWith2PulsesInDim1(&dd, indvec);
+    define2dHaloWith1PulseInEachDim(&dd, &indvec);
 
     // Perform halo exchange
     matrix box = { { 0., 0., 0. } };
     dd_move_x(&dd, box, static_cast<ArrayRef<RVec>>(h_x), nullptr);
 
     // Check results
-    checkResults2dHaloWith2PulsesInDim1(h_x.data(), &dd, numHomeAtoms);
+    checkResults2dHaloWith1PulseInEachDim(h_x.data(), &dd, numHomeAtoms);
+
+    if (GMX_GPU_CUDA && GMX_THREAD_MPI) // repeat with GPU halo codepath
+    {
+        // early return if no devices are available.
+        if (getTestHardwareEnvironment()->getTestDeviceList().empty())
+        {
+            return;
+        }
+
+        // Re-initialize input
+        initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+        // Perform GPU halo exchange
+        gpuHalo(&dd, box, &h_x, numAtomsTotal);
+
+        // Check results
+        checkResults2dHaloWith1PulseInEachDim(h_x.data(), &dd, numHomeAtoms);
+    }
+}
+
+TEST(HaloExchangeTest, Coordinates2dHaloWith2PulsesInDim1)
+{
+    GMX_MPI_TEST(RequireRankCount<4>);
+
+    // Set up atom data
+    const int        numHomeAtoms  = 10;
+    const int        numHaloAtoms  = 7;
+    const int        numAtomsTotal = numHomeAtoms + numHaloAtoms;
+    HostVector<RVec> h_x;
+    h_x.resize(numAtomsTotal);
 
-#if (GMX_GPU_CUDA && GMX_THREAD_MPI) // repeat with GPU halo codepath
-    // Re-initialize input
     initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
 
-    // Perform GPU halo exchange
-    gpuHalo(&dd, box, h_x.data(), numAtomsTotal);
+    // Set up dd
+    t_inputrec   ir;
+    gmx_domdec_t dd(ir);
+    dd.mpi_comm_all = MPI_COMM_WORLD;
+    gmx_domdec_comm_t comm;
+    dd.comm                      = &comm;
+    dd.unitCellInfo.haveScrewPBC = false;
+
+    DDAtomRanges atomRanges;
+    atomRanges.setEnd(DDAtomRanges::Type::Home, numHomeAtoms);
+    dd.comm->atomRanges = atomRanges;
+
+    define2dRankTopology(&dd);
+
+    std::vector<gmx_domdec_ind_t> indvec;
+    define2dHaloWith2PulsesInDim1(&dd, &indvec);
+
+    // Perform halo exchange
+    matrix box = { { 0., 0., 0. } };
+    dd_move_x(&dd, box, static_cast<ArrayRef<RVec>>(h_x), nullptr);
 
     // Check results
     checkResults2dHaloWith2PulsesInDim1(h_x.data(), &dd, numHomeAtoms);
-#endif
+
+    if (GMX_GPU_CUDA && GMX_THREAD_MPI) // repeat with GPU halo codepath
+    {
+        // early return if no devices are available.
+        if (getTestHardwareEnvironment()->getTestDeviceList().empty())
+        {
+            return;
+        }
+
+        // Re-initialize input
+        initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+        // Perform GPU halo exchange
+        gpuHalo(&dd, box, &h_x, numAtomsTotal);
+
+        // Check results
+        checkResults2dHaloWith2PulsesInDim1(h_x.data(), &dd, numHomeAtoms);
+    }
 }
 
 } // namespace