Set up event consumption counter in DomDecMpiTests
[alexxy/gromacs.git] / src / gromacs / domdec / tests / haloexchange_mpi.cpp
index 3237c12612f127c4bd85d95cdbde653846caf95a..93aad8c288d62356af6124ba40995e6075bc5f67 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 /*! \internal \file
  * \brief Tests for the halo exchange
  *
- *  The test sets up a 2D rank topology and performs a coordinate halo
- *  exchange (using the pre-existing CPU codepath), with 2 pulses in
- *  the first dimension and 1 pulse in the second. Each pulse involves
- *  a few non-contiguous indices. The sending rank, atom number and
- *  spatial 3D index are encoded in the x values, to allow correctness
- *  checking following the halo exchange.
+ *  The test sets up the rank topology and performs a coordinate halo
+ *  exchange (for both CPU and GPU codepaths) for several 1D and 2D
+ *  pulse configirations. Each pulse involves a few non-contiguous
+ *  indices. The sending rank, atom number and spatial 3D index are
+ *  encoded in the x values, to allow correctness checking following
+ *  the halo exchange.
  *
- * \todo Add more test variations
- * \todo Port to GPU codepath
+ * \todo Add 3D case
  *
  * \author Alan Gray <alang@nvidia.com>
  * \ingroup module_domdec
 
 #include "gmxpre.h"
 
+#include "config.h"
+
 #include <array>
+#include <numeric>
+#include <vector>
 
 #include <gtest/gtest.h>
 
 #include "gromacs/domdec/atomdistribution.h"
 #include "gromacs/domdec/domdec_internal.h"
 #include "gromacs/domdec/gpuhaloexchange.h"
+#if GMX_GPU_CUDA
+#    include "gromacs/gpu_utils/device_stream.h"
+#    include "gromacs/gpu_utils/devicebuffer.h"
+#endif
+#include "gromacs/gpu_utils/gpueventsynchronizer.h"
+#include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/mdtypes/inputrec.h"
 
 #include "testutils/mpitest.h"
+#include "testutils/test_hardware_environment.h"
 
 namespace gmx
 {
+namespace test
+{
 namespace
 {
 
@@ -100,6 +112,98 @@ void initHaloData(RVec* x, const int numHomeAtoms, const int numAtomsTotal)
     }
 }
 
+/*! \brief Perform GPU halo exchange, including required setup and data transfers
+ *
+ * \param [in] dd             Domain decomposition object
+ * \param [in] box            Box matrix
+ * \param [in] h_x            Atom coordinate data array on host
+ * \param [in] numAtomsTotal  Total number of atoms, including halo
+ */
+void gpuHalo(gmx_domdec_t* dd, matrix box, HostVector<RVec>* h_x, int numAtomsTotal)
+{
+#if (GMX_GPU_CUDA && GMX_THREAD_MPI)
+    // pin memory if possible
+    changePinningPolicy(h_x, PinningPolicy::PinnedIfSupported);
+    // Set up GPU hardware environment and assign this MPI rank to a device
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    int         numDevices = getTestHardwareEnvironment()->getTestDeviceList().size();
+    const auto& testDevice = getTestHardwareEnvironment()->getTestDeviceList()[rank % numDevices];
+    const auto& deviceContext = testDevice->deviceContext();
+    setActiveDevice(testDevice->deviceInfo());
+    DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
+
+    // Set up GPU buffer and copy input data from host
+    DeviceBuffer<RVec> d_x;
+    int                d_x_size       = -1;
+    int                d_x_size_alloc = -1;
+    reallocateDeviceBuffer(&d_x, numAtomsTotal, &d_x_size, &d_x_size_alloc, deviceContext);
+
+    copyToDeviceBuffer(&d_x, h_x->data(), 0, numAtomsTotal, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+
+    const int numPulses = std::accumulate(
+            dd->comm->cd.begin(), dd->comm->cd.end(), 0, [](const int a, const auto& b) {
+                return a + b.numPulses();
+            });
+    const int numExtraConsumptions = GMX_THREAD_MPI ? 1 : 0;
+    // Will be consumed once for each pulse, and, with tMPI, once more for dim=0,pulse=0 case
+    GpuEventSynchronizer coordinatesReadyOnDeviceEvent(numPulses + numExtraConsumptions,
+                                                       numPulses + numExtraConsumptions);
+    coordinatesReadyOnDeviceEvent.markEvent(deviceStream);
+
+    std::array<std::vector<GpuHaloExchange>, DIM> gpuHaloExchange;
+
+    // Create halo exchange objects
+    for (int d = 0; d < dd->ndim; d++)
+    {
+        for (int pulse = 0; pulse < dd->comm->cd[d].numPulses(); pulse++)
+        {
+            gpuHaloExchange[d].push_back(
+                    GpuHaloExchange(dd, d, MPI_COMM_WORLD, deviceContext, pulse, nullptr));
+        }
+    }
+
+    // Perform GPU halo exchange
+    for (int d = 0; d < dd->ndim; d++)
+    {
+        for (int pulse = 0; pulse < dd->comm->cd[d].numPulses(); pulse++)
+        {
+            gpuHaloExchange[d][pulse].reinitHalo(d_x, nullptr);
+            gpuHaloExchange[d][pulse].communicateHaloCoordinates(box, &coordinatesReadyOnDeviceEvent);
+        }
+    }
+    // Barrier is needed to avoid other threads using events after its owner has exited and destroyed the context.
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    deviceStream.synchronize();
+
+    // Copy results back to host
+    copyFromDeviceBuffer(
+            h_x->data(), &d_x, 0, numAtomsTotal, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+
+    freeDeviceBuffer(d_x);
+#else
+    GMX_UNUSED_VALUE(dd);
+    GMX_UNUSED_VALUE(box);
+    GMX_UNUSED_VALUE(h_x);
+    GMX_UNUSED_VALUE(numAtomsTotal);
+#endif
+}
+
+/*! \brief Define 1D rank topology with 4 MPI tasks
+ *
+ * \param [in] dd  Domain decomposition object
+ */
+void define1dRankTopology(gmx_domdec_t* dd)
+{
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    const int numRanks = getNumberOfTestMpiRanks();
+    dd->neighbor[0][0] = (rank + 1) % numRanks;
+    dd->neighbor[0][1] = (rank == 0) ? (numRanks - 1) : rank - 1;
+}
+
 /*! \brief Define 2D rank topology with 4 MPI tasks
  *
  *    -----
@@ -144,12 +248,142 @@ void define2dRankTopology(gmx_domdec_t* dd)
     }
 }
 
+/*! \brief Define a 1D halo with 1 pulses
+ *
+ * \param [in] dd      Domain decomposition object
+ * \param [in] indvec  Vector of index vectors
+ */
+void define1dHaloWith1Pulse(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_t>* indvec)
+{
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    std::vector<int> indexvec;
+    gmx_domdec_ind_t ind;
+
+    dd->ndim     = 1;
+    int nzone    = 1;
+    int dimIndex = 0;
+
+    // Set up indices involved in halo
+    indexvec.clear();
+    indvec->clear();
+
+    dd->comm->cd[dimIndex].receiveInPlace = true;
+    dd->dim[dimIndex]                     = 0;
+    dd->ci[dimIndex]                      = rank;
+
+    // First pulse involves (arbitrary) indices 1 and 3
+    indexvec.push_back(1);
+    indexvec.push_back(3);
+
+    ind.index            = indexvec;
+    ind.nsend[nzone + 1] = 2;
+    ind.nrecv[nzone + 1] = 2;
+    indvec->push_back(ind);
+
+    dd->comm->cd[dimIndex].ind = *indvec;
+}
+
+/*! \brief Define a 1D halo with 2 pulses
+ *
+ * \param [in] dd      Domain decomposition object
+ * \param [in] indvec  Vector of index vectors
+ */
+void define1dHaloWith2Pulses(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_t>* indvec)
+{
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    std::vector<int> indexvec;
+    gmx_domdec_ind_t ind;
+
+    dd->ndim     = 1;
+    int nzone    = 1;
+    int dimIndex = 0;
+
+    // Set up indices involved in halo
+    indexvec.clear();
+    indvec->clear();
+
+    dd->comm->cd[dimIndex].receiveInPlace = true;
+    dd->dim[dimIndex]                     = 0;
+    dd->ci[dimIndex]                      = rank;
+
+    // First pulse involves (arbitrary) indices 1 and 3
+    indexvec.push_back(1);
+    indexvec.push_back(3);
+
+    ind.index            = indexvec;
+    ind.nsend[nzone + 1] = 2;
+    ind.nrecv[nzone + 1] = 2;
+    indvec->push_back(ind);
+
+    // Add another pulse with (arbitrary) indices 4,5,7
+    indexvec.clear();
+
+    indexvec.push_back(4);
+    indexvec.push_back(5);
+    indexvec.push_back(7);
+
+    ind.index            = indexvec;
+    ind.nsend[nzone + 1] = 3;
+    ind.nrecv[nzone + 1] = 3;
+    indvec->push_back(ind);
+
+    dd->comm->cd[dimIndex].ind = *indvec;
+}
+
+/*! \brief Define a 2D halo with 1 pulse in each dimension
+ *
+ * \param [in] dd      Domain decomposition object
+ * \param [in] indvec  Vector of index vectors
+ */
+void define2dHaloWith1PulseInEachDim(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_t>* indvec)
+{
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    std::vector<int> indexvec;
+    gmx_domdec_ind_t ind;
+
+    dd->ndim  = 2;
+    int nzone = 1;
+    for (int dimIndex = 0; dimIndex < dd->ndim; dimIndex++)
+    {
+
+        // Set up indices involved in halo
+        indexvec.clear();
+        indvec->clear();
+
+        dd->comm->cd[dimIndex].receiveInPlace = true;
+        dd->dim[dimIndex]                     = 0;
+        dd->ci[dimIndex]                      = rank;
+
+        // Single pulse involving (arbitrary) indices 1 and 3
+        indexvec.push_back(1);
+        indexvec.push_back(3);
+
+        ind.index            = indexvec;
+        ind.nsend[nzone + 1] = 2;
+        ind.nrecv[nzone + 1] = 2;
+        indvec->push_back(ind);
+
+        dd->comm->cd[dimIndex].ind = *indvec;
+
+        nzone += nzone;
+    }
+}
+
 /*! \brief Define a 2D halo with 2 pulses in the first dimension
  *
  * \param [in] dd      Domain decomposition object
  * \param [in] indvec  Vector of index vectors
  */
-void define2dHaloWith2PulsesInDim1(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_t> indvec)
+void define2dHaloWith2PulsesInDim1(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_t>* indvec)
 {
 
     int rank;
@@ -165,7 +399,7 @@ void define2dHaloWith2PulsesInDim1(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_
 
         // Set up indices involved in halo
         indexvec.clear();
-        indvec.clear();
+        indvec->clear();
 
         dd->comm->cd[dimIndex].receiveInPlace = true;
         dd->dim[dimIndex]                     = 0;
@@ -178,14 +412,12 @@ void define2dHaloWith2PulsesInDim1(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_
         ind.index            = indexvec;
         ind.nsend[nzone + 1] = 2;
         ind.nrecv[nzone + 1] = 2;
-        indvec.push_back(ind);
+        indvec->push_back(ind);
 
         if (dimIndex == 0) // Add another pulse with (arbitrary) indices 4,5,7
         {
             indexvec.clear();
 
-            dd->comm->cd[dimIndex].ind = indvec;
-
             indexvec.push_back(4);
             indexvec.push_back(5);
             indexvec.push_back(7);
@@ -193,15 +425,73 @@ void define2dHaloWith2PulsesInDim1(gmx_domdec_t* dd, std::vector<gmx_domdec_ind_
             ind.index            = indexvec;
             ind.nsend[nzone + 1] = 3;
             ind.nrecv[nzone + 1] = 3;
-            indvec.push_back(ind);
+            indvec->push_back(ind);
         }
 
-        dd->comm->cd[dimIndex].ind = indvec;
+        dd->comm->cd[dimIndex].ind = *indvec;
 
         nzone += nzone;
     }
 }
 
+/*! \brief Check results for above-defined 1D halo with 1 pulse
+ *
+ * \param [in] x             Atom coordinate data array
+ * \param [in] dd            Domain decomposition object
+ * \param [in] numHomeAtoms  Number of home atoms
+ */
+void checkResults1dHaloWith1Pulse(const RVec* x, const gmx_domdec_t* dd, const int numHomeAtoms)
+{
+    // Check results are expected from values encoded in x data
+    for (int j = 0; j < DIM; j++)
+    {
+        // First Pulse in first dim: atoms 1 and 3 from forward horizontal neighbour
+        EXPECT_EQ(x[numHomeAtoms][j], encodedValue(dd->neighbor[0][0], 1, j));
+        EXPECT_EQ(x[numHomeAtoms + 1][j], encodedValue(dd->neighbor[0][0], 3, j));
+    }
+}
+
+/*! \brief Check results for above-defined 1D halo with 2 pulses
+ *
+ * \param [in] x             Atom coordinate data array
+ * \param [in] dd            Domain decomposition object
+ * \param [in] numHomeAtoms  Number of home atoms
+ */
+void checkResults1dHaloWith2Pulses(const RVec* x, const gmx_domdec_t* dd, const int numHomeAtoms)
+{
+    // Check results are expected from values encoded in x data
+    for (int j = 0; j < DIM; j++)
+    {
+        // First Pulse in first dim: atoms 1 and 3 from forward horizontal neighbour
+        EXPECT_EQ(x[numHomeAtoms][j], encodedValue(dd->neighbor[0][0], 1, j));
+        EXPECT_EQ(x[numHomeAtoms + 1][j], encodedValue(dd->neighbor[0][0], 3, j));
+        // Second Pulse in first dim: atoms 4,5,7 from forward horizontal neighbour
+        EXPECT_EQ(x[numHomeAtoms + 2][j], encodedValue(dd->neighbor[0][0], 4, j));
+        EXPECT_EQ(x[numHomeAtoms + 3][j], encodedValue(dd->neighbor[0][0], 5, j));
+        EXPECT_EQ(x[numHomeAtoms + 4][j], encodedValue(dd->neighbor[0][0], 7, j));
+    }
+}
+
+/*! \brief Check results for above-defined 2D halo with 1 pulse in each dimension
+ *
+ * \param [in] x             Atom coordinate data array
+ * \param [in] dd            Domain decomposition object
+ * \param [in] numHomeAtoms  Number of home atoms
+ */
+void checkResults2dHaloWith1PulseInEachDim(const RVec* x, const gmx_domdec_t* dd, const int numHomeAtoms)
+{
+    // Check results are expected from values encoded in x data
+    for (int j = 0; j < DIM; j++)
+    {
+        // First Pulse in first dim: atoms 1 and 3 from forward horizontal neighbour
+        EXPECT_EQ(x[numHomeAtoms][j], encodedValue(dd->neighbor[0][0], 1, j));
+        EXPECT_EQ(x[numHomeAtoms + 1][j], encodedValue(dd->neighbor[0][0], 3, j));
+        // First Pulse in second dim: atoms 1 and 3 from forward vertical neighbour
+        EXPECT_EQ(x[numHomeAtoms + 2][j], encodedValue(dd->neighbor[1][0], 1, j));
+        EXPECT_EQ(x[numHomeAtoms + 3][j], encodedValue(dd->neighbor[1][0], 3, j));
+    }
+}
+
 /*! \brief Check results for above-defined 2D halo with 2 pulses in the first dimension
  *
  * \param [in] x             Atom coordinate data array
@@ -226,17 +516,187 @@ void checkResults2dHaloWith2PulsesInDim1(const RVec* x, const gmx_domdec_t* dd,
     }
 }
 
+TEST(HaloExchangeTest, Coordinates1dHaloWith1Pulse)
+{
+    GMX_MPI_TEST(RequireRankCount<4>);
+
+    // Set up atom data
+    const int        numHomeAtoms  = 10;
+    const int        numHaloAtoms  = 2;
+    const int        numAtomsTotal = numHomeAtoms + numHaloAtoms;
+    HostVector<RVec> h_x;
+    h_x.resize(numAtomsTotal);
+
+    initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+    // Set up dd
+    t_inputrec   ir;
+    gmx_domdec_t dd(ir);
+    dd.mpi_comm_all = MPI_COMM_WORLD;
+    gmx_domdec_comm_t comm;
+    dd.comm                      = &comm;
+    dd.unitCellInfo.haveScrewPBC = false;
+
+    DDAtomRanges atomRanges;
+    atomRanges.setEnd(DDAtomRanges::Type::Home, numHomeAtoms);
+    dd.comm->atomRanges = atomRanges;
+
+    define1dRankTopology(&dd);
+
+    std::vector<gmx_domdec_ind_t> indvec;
+    define1dHaloWith1Pulse(&dd, &indvec);
+
+    // Perform halo exchange
+    matrix box = { { 0., 0., 0. } };
+    dd_move_x(&dd, box, static_cast<ArrayRef<RVec>>(h_x), nullptr);
+
+    // Check results
+    checkResults1dHaloWith1Pulse(h_x.data(), &dd, numHomeAtoms);
+
+    if (GMX_GPU_CUDA && GMX_THREAD_MPI) // repeat with GPU halo codepath
+    {
+        // early return if no devices are available.
+        if (getTestHardwareEnvironment()->getTestDeviceList().empty())
+        {
+            return;
+        }
+
+        // Re-initialize input
+        initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+        // Perform GPU halo exchange
+        gpuHalo(&dd, box, &h_x, numAtomsTotal);
+
+        // Check results
+        checkResults1dHaloWith1Pulse(h_x.data(), &dd, numHomeAtoms);
+    }
+}
+
+TEST(HaloExchangeTest, Coordinates1dHaloWith2Pulses)
+{
+    GMX_MPI_TEST(RequireRankCount<4>);
+
+    // Set up atom data
+    const int        numHomeAtoms  = 10;
+    const int        numHaloAtoms  = 5;
+    const int        numAtomsTotal = numHomeAtoms + numHaloAtoms;
+    HostVector<RVec> h_x;
+    h_x.resize(numAtomsTotal);
+
+    initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+    // Set up dd
+    t_inputrec   ir;
+    gmx_domdec_t dd(ir);
+    dd.mpi_comm_all = MPI_COMM_WORLD;
+    gmx_domdec_comm_t comm;
+    dd.comm                      = &comm;
+    dd.unitCellInfo.haveScrewPBC = false;
+
+    DDAtomRanges atomRanges;
+    atomRanges.setEnd(DDAtomRanges::Type::Home, numHomeAtoms);
+    dd.comm->atomRanges = atomRanges;
+
+    define1dRankTopology(&dd);
+
+    std::vector<gmx_domdec_ind_t> indvec;
+    define1dHaloWith2Pulses(&dd, &indvec);
+
+    // Perform halo exchange
+    matrix box = { { 0., 0., 0. } };
+    dd_move_x(&dd, box, static_cast<ArrayRef<RVec>>(h_x), nullptr);
+
+    // Check results
+    checkResults1dHaloWith2Pulses(h_x.data(), &dd, numHomeAtoms);
+
+    if (GMX_GPU_CUDA && GMX_THREAD_MPI) // repeat with GPU halo codepath
+    {
+        // early return if no devices are available.
+        if (getTestHardwareEnvironment()->getTestDeviceList().empty())
+        {
+            return;
+        }
+
+        // Re-initialize input
+        initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+        // Perform GPU halo exchange
+        gpuHalo(&dd, box, &h_x, numAtomsTotal);
+
+        // Check results
+        checkResults1dHaloWith2Pulses(h_x.data(), &dd, numHomeAtoms);
+    }
+}
+
+
+TEST(HaloExchangeTest, Coordinates2dHaloWith1PulseInEachDim)
+{
+    GMX_MPI_TEST(RequireRankCount<4>);
+
+    // Set up atom data
+    const int        numHomeAtoms  = 10;
+    const int        numHaloAtoms  = 4;
+    const int        numAtomsTotal = numHomeAtoms + numHaloAtoms;
+    HostVector<RVec> h_x;
+    h_x.resize(numAtomsTotal);
+
+    initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+    // Set up dd
+    t_inputrec   ir;
+    gmx_domdec_t dd(ir);
+    dd.mpi_comm_all = MPI_COMM_WORLD;
+    gmx_domdec_comm_t comm;
+    dd.comm                      = &comm;
+    dd.unitCellInfo.haveScrewPBC = false;
+
+    DDAtomRanges atomRanges;
+    atomRanges.setEnd(DDAtomRanges::Type::Home, numHomeAtoms);
+    dd.comm->atomRanges = atomRanges;
+
+    define2dRankTopology(&dd);
+
+    std::vector<gmx_domdec_ind_t> indvec;
+    define2dHaloWith1PulseInEachDim(&dd, &indvec);
+
+    // Perform halo exchange
+    matrix box = { { 0., 0., 0. } };
+    dd_move_x(&dd, box, static_cast<ArrayRef<RVec>>(h_x), nullptr);
+
+    // Check results
+    checkResults2dHaloWith1PulseInEachDim(h_x.data(), &dd, numHomeAtoms);
+
+    if (GMX_GPU_CUDA && GMX_THREAD_MPI) // repeat with GPU halo codepath
+    {
+        // early return if no devices are available.
+        if (getTestHardwareEnvironment()->getTestDeviceList().empty())
+        {
+            return;
+        }
+
+        // Re-initialize input
+        initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+        // Perform GPU halo exchange
+        gpuHalo(&dd, box, &h_x, numAtomsTotal);
+
+        // Check results
+        checkResults2dHaloWith1PulseInEachDim(h_x.data(), &dd, numHomeAtoms);
+    }
+}
 
 TEST(HaloExchangeTest, Coordinates2dHaloWith2PulsesInDim1)
 {
-    GMX_MPI_TEST(4);
+    GMX_MPI_TEST(RequireRankCount<4>);
 
     // Set up atom data
-    const int numHomeAtoms  = 10;
-    const int numHaloAtoms  = 7;
-    const int numAtomsTotal = numHomeAtoms + numHaloAtoms;
-    RVec      x[numAtomsTotal];
-    initHaloData(x, numHomeAtoms, numAtomsTotal);
+    const int        numHomeAtoms  = 10;
+    const int        numHaloAtoms  = 7;
+    const int        numAtomsTotal = numHomeAtoms + numHaloAtoms;
+    HostVector<RVec> h_x;
+    h_x.resize(numAtomsTotal);
+
+    initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
 
     // Set up dd
     t_inputrec   ir;
@@ -253,15 +713,34 @@ TEST(HaloExchangeTest, Coordinates2dHaloWith2PulsesInDim1)
     define2dRankTopology(&dd);
 
     std::vector<gmx_domdec_ind_t> indvec;
-    define2dHaloWith2PulsesInDim1(&dd, indvec);
+    define2dHaloWith2PulsesInDim1(&dd, &indvec);
 
     // Perform halo exchange
     matrix box = { { 0., 0., 0. } };
-    dd_move_x(&dd, box, static_cast<ArrayRef<RVec>>(x), nullptr);
+    dd_move_x(&dd, box, static_cast<ArrayRef<RVec>>(h_x), nullptr);
 
     // Check results
-    checkResults2dHaloWith2PulsesInDim1(x, &dd, numHomeAtoms);
+    checkResults2dHaloWith2PulsesInDim1(h_x.data(), &dd, numHomeAtoms);
+
+    if (GMX_GPU_CUDA && GMX_THREAD_MPI) // repeat with GPU halo codepath
+    {
+        // early return if no devices are available.
+        if (getTestHardwareEnvironment()->getTestDeviceList().empty())
+        {
+            return;
+        }
+
+        // Re-initialize input
+        initHaloData(h_x.data(), numHomeAtoms, numAtomsTotal);
+
+        // Perform GPU halo exchange
+        gpuHalo(&dd, box, &h_x, numAtomsTotal);
+
+        // Check results
+        checkResults2dHaloWith2PulsesInDim1(h_x.data(), &dd, numHomeAtoms);
+    }
 }
 
 } // namespace
+} // namespace test
 } // namespace gmx