pmeGPU->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGPU->common->nk[i]);
}
const int cellCount = c_pmeNeighborUnitcellCount;
- pmeGPU->common->fsh[XX].assign(pme->fshx, pme->fshx + cellCount * pme->nkx);
- pmeGPU->common->fsh[YY].assign(pme->fshy, pme->fshy + cellCount * pme->nky);
- pmeGPU->common->fsh[ZZ].assign(pme->fshz, pme->fshz + cellCount * pme->nkz);
- pmeGPU->common->nn[XX].assign(pme->nnx, pme->nnx + cellCount * pme->nkx);
- pmeGPU->common->nn[YY].assign(pme->nny, pme->nny + cellCount * pme->nky);
- pmeGPU->common->nn[ZZ].assign(pme->nnz, pme->nnz + cellCount * pme->nkz);
+ pmeGPU->common->fsh.resize(0);
+ pmeGPU->common->fsh.insert(pmeGPU->common->fsh.end(), pme->fshx, pme->fshx + cellCount * pme->nkx);
+ pmeGPU->common->fsh.insert(pmeGPU->common->fsh.end(), pme->fshy, pme->fshy + cellCount * pme->nky);
+ pmeGPU->common->fsh.insert(pmeGPU->common->fsh.end(), pme->fshz, pme->fshz + cellCount * pme->nkz);
+ pmeGPU->common->nn.resize(0);
+ pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx);
+ pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky);
+ pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
pmeGPU->common->runMode = pme->runMode;
}
/* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
pmeGPU->settings.performGPUGather = true;
+ pme_gpu_set_testing(pmeGPU, false);
+
// GPU initialization
init_gpu(mdlog, cr->nodeid, gpuInfo);
pmeGPU->deviceInfo = gpuInfo;
kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGPU->common->epsilon_r;
}
+void pme_gpu_transform_spline_atom_data_for_host(const pme_gpu_t *pmeGPU, const pme_atomcomm_t *atc, PmeSplineDataType type, int dimIndex)
+{
+    // The GPU atom spline data is currently laid out differently from the CPU layout.
+    // This function converts the data from the GPU layout to the CPU layout. Obviously, it is slow.
+ // It is only intended for testing purposes so far.
+ // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their performance
+ // (e.g. spreading on GPU, gathering on CPU).
+ GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
+ const uintmax_t threadIndex = 0;
+ const auto atomCount = pme_gpu_get_kernel_params_base_ptr(pmeGPU)->atoms.nAtoms;
+ const auto atomsPerWarp = pme_gpu_get_atom_spline_data_alignment(pmeGPU);
+ const auto pmeOrder = pmeGPU->common->pme_order;
+
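+    // h_splineBuffer is the CPU-layout destination; d_splineBuffer points to the host staging copy
+    // of the GPU-layout data (despite the d_ prefix it resides in host memory).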
+ real *h_splineBuffer;
+ float *d_splineBuffer;
+ switch (type)
+ {
+ case PmeSplineDataType::Values:
+ h_splineBuffer = atc->spline[threadIndex].theta[dimIndex];
+ d_splineBuffer = pmeGPU->staging.h_theta;
+ break;
+
+ case PmeSplineDataType::Derivatives:
+ h_splineBuffer = atc->spline[threadIndex].dtheta[dimIndex];
+ d_splineBuffer = pmeGPU->staging.h_dtheta;
+ break;
+
+ default:
+ GMX_THROW(gmx::InternalError("Unknown spline data type"));
+ }
+
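+    // GPU layout: spline data is grouped by warp, then by spline component, then by dimension,
+    // with the warp's atoms interleaved innermost:
+    //   gpuIndex = ((order * warp + component) * DIM + dim) * atomsPerWarp + atomInWarp.
+    // CPU layout: a separate array per dimension, indexed as cpuIndex = atom * order + component.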
+ for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
+ {
+ auto atomWarpIndex = atomIndex % atomsPerWarp;
+ auto warpIndex = atomIndex / atomsPerWarp;
+ for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
+ {
+ const auto gpuValueIndex = ((pmeOrder * warpIndex + orderIndex) * DIM + dimIndex) * atomsPerWarp + atomWarpIndex;
+ const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
+ GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder, "Atom spline data index out of bounds (while transforming GPU data layout for host)");
+ h_splineBuffer[cpuValueIndex] = d_splineBuffer[gpuValueIndex];
+ }
+ }
+}
+
+void pme_gpu_get_real_grid_sizes(const pme_gpu_t *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize)
+{
+    GMX_ASSERT(gridSize != nullptr, "Invalid gridSize output pointer");
+    GMX_ASSERT(paddedGridSize != nullptr, "Invalid paddedGridSize output pointer");
+    GMX_ASSERT(pmeGPU != nullptr, "Invalid PME GPU structure pointer");
+ auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU);
+ for (int i = 0; i < DIM; i++)
+ {
+ (*gridSize)[i] = kernelParamsPtr->grid.realGridSize[i];
+ (*paddedGridSize)[i] = kernelParamsPtr->grid.realGridSizePadded[i];
+ }
+}
+
void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr)
{
if (!pme_gpu_active(pme))
class MDLogger;
}
+//! Type of spline data
+enum class PmeSplineDataType
+{
+ Values, // theta
+ Derivatives, // dtheta
+}; //TODO move this into new and shiny pme.h (pme-types.h?)
+
/* Some general constants for PME GPU behaviour follow. */
/*! \brief \libinternal
CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
+/*! \libinternal \brief
+ * Copies the spread output spline data and gridline indices from the GPU to the host.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
/*! \libinternal \brief
* Waits for the grid copying to the host-side buffer after spreading to finish.
*
CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
+/* The PME stages themselves */
+
+/*! \libinternal \brief
+ * A GPU spline computation and charge spreading function.
+ *
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
+ * \param[out] h_grid The host-side grid buffer (used only if the result of the spread is expected on the host,
+ * e.g. testing or host-side FFT)
+ * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed.
+ * \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_spread(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu),
+ int CUDA_FUNC_ARGUMENT(gridIndex),
+ real *CUDA_FUNC_ARGUMENT(h_grid),
+ bool CUDA_FUNC_ARGUMENT(computeSplines),
+ bool CUDA_FUNC_ARGUMENT(spreadCharges)) CUDA_FUNC_TERM
+
+
/* The inlined convenience PME GPU status getters */
/*! \libinternal \brief
return pmeGPU->settings.performGPUSolve;
}
+/*! \libinternal \brief
+ * Enables or disables the testing mode.
+ * Testing mode only implies copying all the outputs, even the intermediate ones, to the host.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[in] testing Should the testing mode be enabled, or disabled.
+ */
+gmx_inline void pme_gpu_set_testing(pme_gpu_t *pmeGPU, bool testing)
+{
+ pmeGPU->settings.copyAllOutputs = testing;
+}
+
+/*! \libinternal \brief
+ * Tells if PME is in the testing mode.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \returns true if testing mode is enabled, false otherwise.
+ */
+gmx_inline bool pme_gpu_is_testing(const pme_gpu_t *pmeGPU)
+{
+ return pmeGPU->settings.copyAllOutputs;
+}
+
/* A block of C++ functions that live in pme-gpu-internal.cpp */
/*! \libinternal \brief
void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcForces,
const bool bCalcEnerVir);
+/*! \libinternal \brief
+ * Rearranges the atom spline data, copied from the GPU to the host after a pme_gpu_sync_spline_atom_data() call,
+ * to have the atom data layout of a single-threaded PME CPU run.
+ * Only used for test purposes so far.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[out] atc The PME CPU atom data structure (with a single-threaded layout).
+ * \param[in]  type       The spline data type (values or derivatives).
+ * \param[in] dimIndex Dimension index.
+ */
+void pme_gpu_transform_spline_atom_data_for_host(const pme_gpu_t *pmeGPU, const pme_atomcomm_t *atc,
+ PmeSplineDataType type, int dimIndex);
+
+/*! \libinternal \brief
+ * Gets the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[out] gridSize Pointer to the grid dimensions to fill in.
+ * \param[out] paddedGridSize Pointer to the padded grid dimensions to fill in.
+ */
+void pme_gpu_get_real_grid_sizes(const pme_gpu_t *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize);
+
/*! \libinternal \brief
* (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
*
/*! \brief Electrostatics parameter */
real epsilon_r;
/*! \brief Gridline indices - nnx, nny, nnz */
- std::vector<int> nn[DIM];
+ std::vector<int> nn;
/*! \brief Fractional shifts - fshx, fshy, fshz */
- std::vector<real> fsh[DIM];
+ std::vector<real> fsh;
/*! \brief Precomputed B-spline values */
std::vector<real> bsp_mod[DIM];
/*! \brief The PME codepath being taken */
#include "gmxpre.h"
+#include "config.h"
+
+#include <list>
+
#include "gromacs/ewald/pme.h"
+#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
#include "pme-gpu-internal.h"
#include "pme-grid.h"
return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU);
}
+bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error)
+{
+ std::list<std::string> errorReasons;
+ if (!EEL_PME(ir->coulombtype))
+ {
+ errorReasons.push_back("systems that do not use PME for electrostatics");
+ }
+ if (ir->pme_order != 4)
+ {
+ errorReasons.push_back("interpolation orders other than 4");
+ }
+ if (ir->efep != efepNO)
+ {
+ errorReasons.push_back("free energy calculations (multiple grids)");
+ }
+ if (EVDW_PME(ir->vdwtype))
+ {
+ errorReasons.push_back("Lennard-Jones PME");
+ }
+#if GMX_DOUBLE
+ {
+ errorReasons.push_back("double precision");
+ }
+#endif
+#if GMX_GPU != GMX_GPU_CUDA
+ {
+ errorReasons.push_back("non-CUDA build of GROMACS");
+ }
+#endif
+ if (ir->cutoff_scheme == ecutsGROUP)
+ {
+ errorReasons.push_back("group cutoff scheme");
+ }
+ if (EI_TPI(ir->eI))
+ {
+ errorReasons.push_back("test particle insertion");
+ }
+
+ bool inputSupported = errorReasons.empty();
+ if (!inputSupported && error)
+ {
+ std::string regressionTestMarker = "PME GPU does not support";
+ // this prefix is tested for in the regression tests script gmxtest.pl
+ *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
+ }
+ return inputSupported;
+}
+
void pme_gpu_reset_timings(const gmx_pme_t *pme)
{
if (pme_gpu_active(pme))
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013-2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements PME GPU spline calculation and charge spreading in CUDA.
+ * TODO: consider always pre-sorting particles (as in DD case).
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#include "gmxpre.h"
+
+#include <cassert>
+
+#include "gromacs/ewald/pme.h"
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+
+#include "pme.cuh"
+#include "pme-grid.h"
+#include "pme-timings.cuh"
+
+/*
+ * This define affects the spline calculation behaviour in the kernel.
+ * 0: a single GPU thread handles a single dimension of a single particle (calculating and storing (order) spline values and derivatives).
+ * 1: (order) threads do redundant work on this same task, each one stores only a single theta and single dtheta into global arrays.
+ * The only efficiency difference is fewer global store operations, countered by more redundant spline computation.
+ *
+ * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so).
+ */
+#define PME_GPU_PARALLEL_SPLINE 0
+
+
+//! Spreading max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime in most cases
+constexpr int c_spreadMaxWarpsPerBlock = 8;
+/* TODO: it has been observed that the kernel can be faster with smaller block sizes (2 or 4 warps)
+ * only on some GPUs (660Ti) with large enough grid (>= 48^3) due to load/store units being overloaded
+ * (ldst_fu_utilization metric maxed out in nvprof). Runtime block size choice might be nice to have.
+ * This has been tried on architectures up to Maxwell (GTX 750) and it would be good to revisit this.
+ */
+//! Spreading max block size in threads
+constexpr int c_spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * warp_size;
+
+//! Texture references for CC 2.x
+texture<int, 1, cudaReadModeElementType> gridlineIndicesTableTextureRef;
+texture<float, 1, cudaReadModeElementType> fractShiftsTableTextureRef;
+
+/*! Returns the reference to the gridlineIndices texture. */
+texture<int, 1, cudaReadModeElementType> &pme_gpu_get_gridline_texref()
+{
+ return gridlineIndicesTableTextureRef;
+}
+
+/*! Returns the reference to the fractShifts texture. */
+texture<float, 1, cudaReadModeElementType> &pme_gpu_get_fract_shifts_texref()
+{
+ return fractShiftsTableTextureRef;
+}
+
+/*! \brief
+ * General purpose function for loading atom-related data from global to shared memory.
+ *
+ * \tparam[in] T Data type (float/int/...)
+ * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
+ * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
+ * \param[in] kernelParams Input PME CUDA data in constant memory.
+ * \param[out] sm_destination Shared memory array for output.
+ * \param[in] gm_source Global memory array for input.
+ */
+template<typename T,
+ const int atomsPerBlock,
+ const int dataCountPerAtom>
+__device__ __forceinline__
+void pme_gpu_stage_atom_data(const pme_gpu_cuda_kernel_params_t kernelParams,
+ T * __restrict__ sm_destination,
+ const T * __restrict__ gm_source)
+{
+ static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
+ const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
+ const int localIndex = threadLocalIndex;
+ const int globalIndexBase = blockIdx.x * atomsPerBlock * dataCountPerAtom;
+ const int globalIndex = globalIndexBase + localIndex;
+ const int globalCheck = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
+ if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+ {
+ assert(isfinite(float(gm_source[globalIndex])));
+ sm_destination[localIndex] = gm_source[globalIndex];
+ }
+}
+
+/*! \brief
+ * PME GPU spline parameter and gridline indices calculation.
+ * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
+ * First stage of the whole kernel.
+ *
+ * \tparam[in] order PME interpolation order.
+ * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for
+ * in the sizes of the shared memory arrays.
+ * \param[in] kernelParams Input PME CUDA data in constant memory.
+ * \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory.
+ * \param[in] sm_coordinates Atom coordinates in the shared memory.
+ * \param[in] sm_coefficients Atom charges/coefficients in the shared memory.
+ * \param[out] sm_theta Atom spline values in the shared memory.
+ * \param[out] sm_gridlineIndices Atom gridline indices in the shared memory.
+ */
+template <const int order,
+ const int atomsPerBlock>
+__device__ __forceinline__ void calculate_splines(const pme_gpu_cuda_kernel_params_t kernelParams,
+ const int atomIndexOffset,
+ const float3 * __restrict__ sm_coordinates,
+ const float * __restrict__ sm_coefficients,
+ float * __restrict__ sm_theta,
+ int * __restrict__ sm_gridlineIndices)
+{
+ /* Global memory pointers for output */
+ float * __restrict__ gm_theta = kernelParams.atoms.d_theta;
+ float * __restrict__ gm_dtheta = kernelParams.atoms.d_dtheta;
+ int * __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
+
+ /* Fractional coordinates */
+ __shared__ float sm_fractCoords[atomsPerBlock * DIM];
+
+ /* Thread index w.r.t. block */
+ const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
+ + (threadIdx.y * blockDim.x) + threadIdx.x;
+    /* Warp index w.r.t. block - could probably be obtained more easily? */
+ const int warpIndex = threadLocalId / warp_size;
+ /* Thread index w.r.t. warp */
+ const int threadWarpIndex = threadLocalId % warp_size;
+ /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
+ const int atomWarpIndex = threadWarpIndex % PME_SPREADGATHER_ATOMS_PER_WARP;
+ /* Atom index w.r.t. block/shared memory */
+ const int atomIndexLocal = warpIndex * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex;
+
+ /* Atom index w.r.t. global memory */
+ const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+ /* Spline contribution index in one dimension */
+ const int orderIndex = threadWarpIndex / (PME_SPREADGATHER_ATOMS_PER_WARP * DIM);
+ /* Dimension index */
+ const int dimIndex = (threadWarpIndex - orderIndex * (PME_SPREADGATHER_ATOMS_PER_WARP * DIM)) / PME_SPREADGATHER_ATOMS_PER_WARP;
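+    /* With PME_GPU_PARALLEL_SPLINE == 0 only the threads with orderIndex == 0 pass the localCheck below
+     * and loop over all (order) spline components themselves;
+     * with PME_GPU_PARALLEL_SPLINE == 1 each of the (order) thread groups handles a single component.
+     */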
+
+ /* Multi-purpose index of rvec/ivec atom data */
+ const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
+
+ /* Spline parameters need a working buffer.
+ * With PME_GPU_PARALLEL_SPLINE == 0 it is just a local array of (order) values for some of the threads, which is fine;
+ * With PME_GPU_PARALLEL_SPLINE == 1 (order) times more threads are involved, so the shared memory is used to avoid overhead.
+ * The buffer's size, striding and indexing are adjusted accordingly.
+ * The buffer is accessed with SPLINE_DATA_PTR and SPLINE_DATA macros.
+ */
+#if PME_GPU_PARALLEL_SPLINE
+ const int splineDataStride = atomsPerBlock * DIM;
+ const int splineDataIndex = sharedMemoryIndex;
+ __shared__ float sm_splineData[splineDataStride * order];
+ float *splineDataPtr = sm_splineData;
+#else
+ const int splineDataStride = 1;
+ const int splineDataIndex = 0;
+ float splineData[splineDataStride * order];
+ float *splineDataPtr = splineData;
+#endif
+
+#define SPLINE_DATA_PTR(i) (splineDataPtr + ((i) * splineDataStride + splineDataIndex))
+#define SPLINE_DATA(i) (*SPLINE_DATA_PTR(i))
+
+ const int localCheck = (dimIndex < DIM) && (orderIndex < (PME_GPU_PARALLEL_SPLINE ? order : 1));
+ const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
+
+ if (localCheck && globalCheck)
+ {
+ /* Indices interpolation */
+
+ if (orderIndex == 0)
+ {
+ int tableIndex, tInt;
+ float n, t;
+ const float3 x = sm_coordinates[atomIndexLocal];
+ /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
+ * puts them into local memory(!) instead of accessing the constant memory directly.
+ * That's the reason for the switch, to unroll explicitly.
+ * The commented parts correspond to the 0 components of the recipbox.
+ */
+ switch (dimIndex)
+ {
+ case XX:
+ tableIndex = kernelParams.grid.tablesOffsets[XX];
+ n = kernelParams.grid.realGridSizeFP[XX];
+ t = x.x * kernelParams.step.recipBox[dimIndex][XX] + x.y * kernelParams.step.recipBox[dimIndex][YY] + x.z * kernelParams.step.recipBox[dimIndex][ZZ];
+ break;
+
+ case YY:
+ tableIndex = kernelParams.grid.tablesOffsets[YY];
+ n = kernelParams.grid.realGridSizeFP[YY];
+ t = /*x.x * kernelParams.step.recipbox[dimIndex][XX] + */ x.y * kernelParams.step.recipBox[dimIndex][YY] + x.z * kernelParams.step.recipBox[dimIndex][ZZ];
+ break;
+
+ case ZZ:
+ tableIndex = kernelParams.grid.tablesOffsets[ZZ];
+ n = kernelParams.grid.realGridSizeFP[ZZ];
+ t = /*x.x * kernelParams.step.recipbox[dimIndex][XX] + x.y * kernelParams.step.recipbox[dimIndex][YY] + */ x.z * kernelParams.step.recipBox[dimIndex][ZZ];
+ break;
+ }
+ const float shift = c_pmeMaxUnitcellShift;
+ /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
+ t = (t + shift) * n;
+ tInt = (int)t;
+ sm_fractCoords[sharedMemoryIndex] = t - tInt;
+ tableIndex += tInt;
+ assert(tInt >= 0);
+ assert(tInt < c_pmeNeighborUnitcellCount * n);
+
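+            /* tInt indexes the precomputed fractional shift and gridline index tables,
+             * which are sized c_pmeNeighborUnitcellCount * gridSize per dimension (hence the assert above);
+             * the remaining fractional part is corrected with the fetched shift below.
+             */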
+ // TODO have shared table for both parameters to share the fetch, as index is always same?
+ // TODO compare texture/LDG performance
+ sm_fractCoords[sharedMemoryIndex] += fetchFromParamLookupTable(kernelParams.grid.d_fractShiftsTable, kernelParams.fractShiftsTableTexture,
+ fractShiftsTableTextureRef, tableIndex);
+ sm_gridlineIndices[sharedMemoryIndex] = fetchFromParamLookupTable(kernelParams.grid.d_gridlineIndicesTable, kernelParams.gridlineIndicesTableTexture,
+ gridlineIndicesTableTextureRef, tableIndex);
+ gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] = sm_gridlineIndices[sharedMemoryIndex];
+ }
+
+ /* B-spline calculation */
+
+ const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
+ if (chargeCheck)
+ {
+ float div;
+ int o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1
+
+ const float dr = sm_fractCoords[sharedMemoryIndex];
+ assert(isfinite(dr));
+
+ /* dr is relative offset from lower cell limit */
+ *SPLINE_DATA_PTR(order - 1) = 0.0f;
+ *SPLINE_DATA_PTR(1) = dr;
+ *SPLINE_DATA_PTR(0) = 1.0f - dr;
+
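+            /* B-spline recursion: each iteration of the loop below raises the spline order by one,
+             * starting from the linear (order 2) interpolation weights initialized above.
+             */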
+#pragma unroll
+ for (int k = 3; k < order; k++)
+ {
+ div = 1.0f / (k - 1.0f);
+ *SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2);
+#pragma unroll
+ for (int l = 1; l < (k - 1); l++)
+ {
+ *SPLINE_DATA_PTR(k - l - 1) = div * ((dr + l) * SPLINE_DATA(k - l - 2) + (k - l - dr) * SPLINE_DATA(k - l - 1));
+ }
+ *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+ }
+
+ const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
+
+ /* Differentiation and storing the spline derivatives (dtheta) */
+#if !PME_GPU_PARALLEL_SPLINE
+ // With PME_GPU_PARALLEL_SPLINE == 1, o is already set to orderIndex;
+ // With PME_GPU_PARALLEL_SPLINE == 0, we loop o over range(order).
+#pragma unroll
+ for (o = 0; o < order; o++)
+#endif
+ {
+ const int thetaIndex = PME_SPLINE_THETA_STRIDE * (((o + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
+ const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+
+ const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0f) - SPLINE_DATA(o);
+ assert(isfinite(dtheta));
+ gm_dtheta[thetaGlobalIndex] = dtheta;
+ }
+
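+            /* The final recursion step, raising the spline from (order - 1) to (order),
+             * is done only now, since the derivatives above are computed from the lower-order values.
+             */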
+ div = 1.0f / (order - 1.0f);
+ *SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2);
+#pragma unroll
+ for (int k = 1; k < (order - 1); k++)
+ {
+ *SPLINE_DATA_PTR(order - k - 1) = div * ((dr + k) * SPLINE_DATA(order - k - 2) + (order - k - dr) * SPLINE_DATA(order - k - 1));
+ }
+ *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+
+ /* Storing the spline values (theta) */
+#if !PME_GPU_PARALLEL_SPLINE
+ // See comment for the loop above
+#pragma unroll
+ for (o = 0; o < order; o++)
+#endif
+ {
+ const int thetaIndex = PME_SPLINE_THETA_STRIDE * (((o + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
+ const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+
+ sm_theta[thetaIndex] = SPLINE_DATA(o);
+ assert(isfinite(sm_theta[thetaIndex]));
+ gm_theta[thetaGlobalIndex] = SPLINE_DATA(o);
+ }
+ }
+ }
+}
+
+/*! \brief
+ * Charge spreading onto the grid.
+ * This corresponds to the CPU function spread_coefficients_bsplines_thread().
+ * Second stage of the whole kernel.
+ *
+ * \tparam[in] order PME interpolation order.
+ * \tparam[in] wrapX A boolean which tells if the grid overlap in dimension X should be wrapped.
+ * \tparam[in] wrapY A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \param[in] kernelParams Input PME CUDA data in constant memory.
+ * \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory.
+ * \param[in]  sm_coefficients    Atom coefficients/charges in the shared memory.
+ * \param[in] sm_gridlineIndices Atom gridline indices in the shared memory.
+ * \param[in] sm_theta Atom spline values in the shared memory.
+ */
+template <
+ const int order, const bool wrapX, const bool wrapY>
+__device__ __forceinline__ void spread_charges(const pme_gpu_cuda_kernel_params_t kernelParams,
+ int atomIndexOffset,
+ const float * __restrict__ sm_coefficients,
+ const int * __restrict__ sm_gridlineIndices,
+ const float * __restrict__ sm_theta)
+{
+ /* Global memory pointer to the output grid */
+ float * __restrict__ gm_grid = kernelParams.grid.d_realGrid;
+
+
+ const int nx = kernelParams.grid.realGridSize[XX];
+ const int ny = kernelParams.grid.realGridSize[YY];
+ const int nz = kernelParams.grid.realGridSize[ZZ];
+ const int pny = kernelParams.grid.realGridSizePadded[YY];
+ const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
+
+ const int offx = 0, offy = 0, offz = 0; // unused for now
+
+ const int atomIndexLocal = threadIdx.z;
+ const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+
+ const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
+ const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
+ if (chargeCheck & globalCheck)
+ {
+ // Spline Y/Z coordinates
+ const int ithy = threadIdx.y;
+ const int ithz = threadIdx.x;
+ const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX] - offx;
+ int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy + ithy;
+ if (wrapY & (iy >= ny))
+ {
+ iy -= ny;
+ }
+ int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
+ if (iz >= nz)
+ {
+ iz -= nz;
+ }
+
+ /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
+ const int atomWarpIndex = atomIndexLocal % PME_SPREADGATHER_ATOMS_PER_WARP;
+        /* Warp index w.r.t. block - could probably be obtained more easily? */
+ const int warpIndex = atomIndexLocal / PME_SPREADGATHER_ATOMS_PER_WARP;
+ const int dimStride = PME_SPLINE_THETA_STRIDE * PME_SPREADGATHER_ATOMS_PER_WARP;
+ const int orderStride = dimStride * DIM;
+ const int thetaOffsetBase = orderStride * order * warpIndex + atomWarpIndex;
+
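+        /* Spline values in shared memory are laid out per warp as [order][dimension][atom-in-warp];
+         * dimStride and orderStride express that layout in array elements.
+         */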
+ const float thetaZ = sm_theta[thetaOffsetBase + ithz * orderStride + ZZ * dimStride];
+ const float thetaY = sm_theta[thetaOffsetBase + ithy * orderStride + YY * dimStride];
+ const float constVal = thetaZ * thetaY * sm_coefficients[atomIndexLocal];
+ assert(isfinite(constVal));
+ const int constOffset = iy * pnz + iz;
+ const float *sm_thetaX = sm_theta + (thetaOffsetBase + XX * dimStride);
+
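+        /* Only the X loop over the spline order remains; the Y and Z contributions
+         * are already folded into constVal and constOffset above.
+         */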
+#pragma unroll
+ for (int ithx = 0; (ithx < order); ithx++)
+ {
+ int ix = ixBase + ithx;
+ if (wrapX & (ix >= nx))
+ {
+ ix -= nx;
+ }
+ const int gridIndexGlobal = ix * pny * pnz + constOffset;
+ assert(isfinite(sm_thetaX[ithx * orderStride]));
+ assert(isfinite(gm_grid[gridIndexGlobal]));
+ atomicAdd(gm_grid + gridIndexGlobal, sm_thetaX[ithx * orderStride] * constVal);
+ }
+ }
+}
+
+/*! \brief
+ * A spline computation and charge spreading kernel function.
+ *
+ * \tparam[in] order PME interpolation order.
+ * \tparam[in] computeSplines A boolean which tells if the spline parameter and
+ * gridline indices' computation should be performed.
+ * \tparam[in] spreadCharges A boolean which tells if the charge spreading should be performed.
+ * \tparam[in] wrapX A boolean which tells if the grid overlap in dimension X should be wrapped.
+ * \tparam[in] wrapY A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \param[in] kernelParams Input PME CUDA data in constant memory.
+ */
+template <
+ const int order,
+ const bool computeSplines,
+ const bool spreadCharges,
+ const bool wrapX,
+ const bool wrapY
+ >
+__launch_bounds__(c_spreadMaxThreadsPerBlock)
+__global__ void pme_spline_and_spread_kernel(const pme_gpu_cuda_kernel_params_t kernelParams)
+{
+ const int atomsPerBlock = c_spreadMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM;
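+    // The block processes atomsPerBlock atoms; its threads are laid out as (order, order, atomsPerBlock),
+    // i.e. order^2 threads (spline Z and Y components) per atom in the spreading stage.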
+ // Gridline indices, ivec
+ __shared__ int sm_gridlineIndices[atomsPerBlock * DIM];
+ // Charges
+ __shared__ float sm_coefficients[atomsPerBlock];
+ // Spline values
+ __shared__ float sm_theta[atomsPerBlock * DIM * order];
+
+ const int atomIndexOffset = blockIdx.x * atomsPerBlock;
+
+ /* Staging coefficients/charges for both spline and spread */
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+
+ if (computeSplines)
+ {
+ /* Staging coordinates */
+ __shared__ float sm_coordinates[DIM * atomsPerBlock];
+ pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
+
+ __syncthreads();
+ calculate_splines<order, atomsPerBlock>(kernelParams, atomIndexOffset, (const float3 *)sm_coordinates,
+ sm_coefficients, sm_theta, sm_gridlineIndices);
+ }
+ else
+ {
+ /* Staging the data for spread
+ * (the data is assumed to be in GPU global memory with proper layout already,
+ * as in after running the spline kernel)
+ */
+ /* Spline data - only thetas (dthetas will only be needed in gather) */
+ pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(kernelParams, sm_theta, kernelParams.atoms.d_theta);
+ /* Gridline indices */
+ pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(kernelParams, sm_gridlineIndices, kernelParams.atoms.d_gridlineIndices);
+
+ __syncthreads();
+ }
+
+ /* Spreading */
+ if (spreadCharges)
+ {
+ spread_charges<order, wrapX, wrapY>(kernelParams, atomIndexOffset, sm_coefficients, sm_gridlineIndices, sm_theta);
+ }
+}
+
+void pme_gpu_spread(const pme_gpu_t *pmeGpu,
+ int gmx_unused gridIndex,
+ real *h_grid,
+ bool computeSplines,
+ bool spreadCharges)
+{
+ GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)");
+ cudaStream_t stream = pmeGpu->archSpecific->pmeStream;
+ const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
+ GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread");
+
+ const int order = pmeGpu->common->pme_order;
+ const int atomsPerBlock = c_spreadMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM;
+ // TODO: pick smaller block size in runtime if needed
+ // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
+ // If doing so, change atomsPerBlock in the kernels as well.
+ // TODO: test varying block sizes on modern arch-s as well
+ // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
+ //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
+ GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
+
+ dim3 nBlocks(pmeGpu->nAtomsPadded / atomsPerBlock);
+ dim3 dimBlock(order, order, atomsPerBlock);
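+    // One thread block spreads atomsPerBlock atoms; with padding enabled, nAtomsPadded is a multiple
+    // of PME_ATOM_DATA_ALIGNMENT, which the assert above requires to be divisible by atomsPerBlock.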
+
+ // These should later check for PME decomposition
+ const bool wrapX = true;
+ const bool wrapY = true;
+ GMX_UNUSED_VALUE(wrapX);
+ GMX_UNUSED_VALUE(wrapY);
+ switch (order)
+ {
+ case 4:
+ {
+ // TODO: cleaner unroll with some template trick?
+ if (computeSplines)
+ {
+ if (spreadCharges)
+ {
+ pme_gpu_start_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
+ pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
+ pme_gpu_stop_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
+ }
+ else
+ {
+ pme_gpu_start_timing(pmeGpu, gtPME_SPLINE);
+ pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
+ pme_gpu_stop_timing(pmeGpu, gtPME_SPLINE);
+ }
+ }
+ else
+ {
+ pme_gpu_start_timing(pmeGpu, gtPME_SPREAD);
+ pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+ CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
+ pme_gpu_stop_timing(pmeGpu, gtPME_SPREAD);
+ }
+ }
+ break;
+
+ default:
+ GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not tested!"));
+ }
+
+ const bool copyBackGrid = spreadCharges && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu));
+ if (copyBackGrid)
+ {
+ pme_gpu_copy_output_spread_grid(pmeGpu, h_grid);
+ }
+ const bool copyBackAtomData = computeSplines && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_gather(pmeGpu));
+ if (copyBackAtomData)
+ {
+ pme_gpu_copy_output_spread_atom_data(pmeGpu);
+ }
+}
int pme_gpu_get_atom_data_alignment(const pme_gpu_t *pmeGPU)
{
const int order = pmeGPU->common->pme_order;
- GMX_RELEASE_ASSERT(order > 0, "Invalid PME order");
+ GMX_ASSERT(order > 0, "Invalid PME order");
return PME_ATOM_DATA_ALIGNMENT;
}
int pme_gpu_get_atom_spline_data_alignment(const pme_gpu_t *pmeGPU)
{
const int order = pmeGPU->common->pme_order;
- GMX_RELEASE_ASSERT(order > 0, "Invalid PME order");
+ GMX_ASSERT(order > 0, "Invalid PME order");
return PME_SPREADGATHER_ATOMS_PER_WARP;
}
void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU)
{
- cudaStream_t stream = pmeGPU->archSpecific->pmeStream;
+ pme_gpu_free_fract_shifts(pmeGPU);
+
auto *kernelParamsPtr = pmeGPU->kernelParams.get();
const int nx = kernelParamsPtr->grid.realGridSize[XX];
const int newFractShiftsSize = cellCount * (nx + ny + nz);
- /* Two arrays, same size */
- int currentSizeTemp = pmeGPU->archSpecific->fractShiftsSize;
- int currentSizeTempAlloc = pmeGPU->archSpecific->fractShiftsSizeAlloc;
- cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fractShiftsTable, nullptr, sizeof(float),
-                        &currentSizeTemp, &currentSizeTempAlloc,
- newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true);
- float *fractShiftsArray = kernelParamsPtr->grid.d_fractShiftsTable;
- cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_gridlineIndicesTable, nullptr, sizeof(int),
- &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc,
- newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true);
- int *gridlineIndicesArray = kernelParamsPtr->grid.d_gridlineIndicesTable;
-
- /* TODO: pinning of the pmeGPU->common->fsh/nn host-side arrays */
+ initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
+ kernelParamsPtr->fractShiftsTableTexture,
+ &pme_gpu_get_fract_shifts_texref(),
+ pmeGPU->common->fsh.data(),
+ newFractShiftsSize,
+ pmeGPU->deviceInfo);
- for (int i = 0; i < DIM; i++)
- {
- kernelParamsPtr->grid.tablesOffsets[i] = gridDataOffset[i];
- cu_copy_H2D_async(fractShiftsArray + gridDataOffset[i], pmeGPU->common->fsh[i].data(),
- cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(float), stream);
- cu_copy_H2D_async(gridlineIndicesArray + gridDataOffset[i], pmeGPU->common->nn[i].data(),
- cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(int), stream);
- }
-
- pme_gpu_make_fract_shifts_textures(pmeGPU);
+ initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
+ kernelParamsPtr->gridlineIndicesTableTexture,
+ &pme_gpu_get_gridline_texref(),
+ pmeGPU->common->nn.data(),
+ newFractShiftsSize,
+ pmeGPU->deviceInfo);
}
void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU)
{
- pme_gpu_free_fract_shifts_textures(pmeGPU);
- /* Two arrays, same size */
- cu_free_buffered(pmeGPU->kernelParams->grid.d_fractShiftsTable);
- cu_free_buffered(pmeGPU->kernelParams->grid.d_gridlineIndicesTable, &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc);
+ auto *kernelParamsPtr = pmeGPU->kernelParams.get();
+ destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
+ kernelParamsPtr->fractShiftsTableTexture,
+ &pme_gpu_get_fract_shifts_texref(),
+ pmeGPU->deviceInfo);
+ destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
+ kernelParamsPtr->gridlineIndicesTableTexture,
+ &pme_gpu_get_gridline_texref(),
+ pmeGPU->deviceInfo);
}
void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU)
CU_RET_ERR(stat, "PME spread grid sync event record failure");
}
+void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *pmeGpu)
+{
+ const int alignment = pme_gpu_get_atom_spline_data_alignment(pmeGpu);
+ const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
+ const size_t splinesSize = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
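+    // The spline values/derivatives are copied at the padded size because of the per-warp GPU layout;
+    // the gridline indices are copied compactly, for the actual atom count only.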
+ auto *kernelParamsPtr = pmeGpu->kernelParams.get();
+ cu_copy_D2H_async(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
+ cu_copy_D2H_async(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
+ cu_copy_D2H_async(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
+ kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
+ cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSplineAtomDataD2H, pmeGpu->archSpecific->pmeStream);
+ CU_RET_ERR(stat, "PME spread atom data sync event record failure");
+}
+
void pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU)
{
cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSpreadGridD2H, 0);
#ifndef GMX_EWALD_PME_CUH
#define GMX_EWALD_PME_CUH
+#include "config.h"
+
#include <cassert>
#include <array>
*/
#define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
-//! The spread/gather integer constant which depends on the templated order parameter (2 atoms per warp for order == 4)
+/*! \brief
+ * The number of atoms processed by a single warp in spread/gather.
+ * This macro depends on the templated order parameter (2 atoms per warp for order 4).
+ * It is mostly used for spline data layout tweaked for coalesced access.
+ */
#define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
-//! Atom data alignment - has to be divisible both by spread and gather maximal atoms-per-block counts,
-//! which is asserted in case we use atom data padding at all.
-#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP);
+/*! \brief
+ * Atom data alignment (in terms of number of atoms).
+ * If the GPU atom data buffers are padded (c_usePadding == true),
+ * then the number of atoms which fits in the padded GPU buffers has to be divisible by this.
+ * The literal number (16) expresses maximum spread/gather block width in warps.
+ * Accordingly, spread and gather block widths in warps should be divisors of this
+ * (e.g. in the pme-spread.cu: constexpr int c_spreadMaxThreadsPerBlock = 8 * warp_size;).
+ * There are debug asserts for this divisibility.
+ */
+#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP)
/*! \brief \internal
* An inline CUDA function for checking the global atom data indices against the atom data array sizes.
int splineValuesSize;
/*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */
int splineValuesSizeAlloc;
- /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (actual) */
- int fractShiftsSize;
- /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (reserved) */
- int fractShiftsSizeAlloc;
/*! \brief The kernelParams.grid.realGrid float element count (actual) */
int realGridSize;
/*! \brief The kernelParams.grid.realGrid float element count (reserved) */
cudaTextureObject_t gridlineIndicesTableTexture;
};
-/* CUDA texture functions which will reside in respective kernel files
+/* CUDA texture reference functions which reside in respective kernel files
* (due to texture references having scope of a translation unit).
*/
-
-/*! \brief \internal
- * Creates/binds 2 textures used in the spline parameter computation.
- *
- * \param[in, out] pmeGPU The PME GPU structure.
- */
-inline void pme_gpu_make_fract_shifts_textures(pme_gpu_t gmx_unused *pmeGpu){};
-
-/*! \brief \internal
- * Frees/unbinds 2 textures used in the spline parameter computation.
- *
- * \param[in] pmeGPU The PME GPU structure.
- */
-inline void pme_gpu_free_fract_shifts_textures(const pme_gpu_t gmx_unused *pmeGpu){};
+#if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
+extern texture<int, 1, cudaReadModeElementType> gridlineIndicesTableTextureRef;
+extern texture<float, 1, cudaReadModeElementType> fractShiftsTableTextureRef;
+#endif
+/*! Returns the reference to the gridlineIndices texture. */
+texture<int, 1, cudaReadModeElementType> &pme_gpu_get_gridline_texref();
+/*! Returns the reference to the fractShifts texture. */
+texture<float, 1, cudaReadModeElementType> &pme_gpu_get_fract_shifts_texref();
#endif
/* A block of PME GPU functions */
+/*! \brief Checks whether the input system allows PME to run on a GPU.
+ * TODO: this mostly duplicates an internal PME assert function
+ * pme_gpu_check_restrictions(), except that one works with an already formed
+ * gmx_pme_t structure. Should that one go away/work with inputrec?
+ *
+ * \param[in] ir Input system.
+ * \param[out] error The error message if the input is not supported on GPU.
+ *
+ * \returns true if PME can run on GPU with this input, false otherwise.
+ */
+bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error);
+
/*! \brief
* Tells if PME is enabled to run on GPU (not necessarily active at the moment).
* For now, this decision is stored in the PME structure itself.
# the research papers on the package. Check out http://www.gromacs.org.
file(GLOB EWALD_TEST_SOURCES *.cpp)
+if (GMX_USE_CUDA)
+ file(GLOB EWALD_CUDA_SOURCES ../*.cu)
+endif()
+
gmx_add_unit_test(EwaldUnitTests ewald-test
- ${EWALD_TEST_SOURCES})
+ ${EWALD_TEST_SOURCES} ${EWALD_CUDA_SOURCES})
(inputForceTreatment == PmeGatherInputHandling::ReduceWith) ? "with reduction" : "without reduction"
));
- PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, inputAtomData.coordinates, inputAtomData.charges, box);
+ PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, mode.first, nullptr, inputAtomData.coordinates, inputAtomData.charges, box);
/* Setting some more inputs */
pmeSetRealGrid(pmeSafe.get(), mode.first, nonZeroGridValues);
));
/* Running the test */
- PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, box, ewaldCoeff_q, ewaldCoeff_lj);
+ PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, mode.first, nullptr, box, ewaldCoeff_q, ewaldCoeff_lj);
pmeSetComplexGrid(pmeSafe.get(), mode.first, gridOrdering.first, nonZeroGridValues);
const real cellVolume = box[0] * box[4] * box[8];
//FIXME - this is box[XX][XX] * box[YY][YY] * box[ZZ][ZZ], should be stored in the PME structure
inputRec.nkz = gridSize[ZZ];
inputRec.pme_order = pmeOrder;
inputRec.coulombtype = eelPME;
+ inputRec.epsilon_r = 1.0;
TestReferenceData refData;
- const std::map<CodePath, std::string> modesToTest = {{CodePath::CPU, "CPU"}};
+ const std::map<CodePath, std::string> modesToTest = {{CodePath::CPU, "CPU"},
+ {CodePath::CUDA, "CUDA"}};
+
const std::map<PmeSplineAndSpreadOptions, std::string> optionsToTest = {{PmeSplineAndSpreadOptions::SplineAndSpreadUnified, "spline computation and charge spreading (fused)"},
{PmeSplineAndSpreadOptions::SplineOnly, "spline computation"},
{PmeSplineAndSpreadOptions::SpreadOnly, "charge spreading"}};
+
+ // There is a subtle problem with multiple comparisons against same reference data:
+ // The subsequent (GPU) spreading runs at one point didn't actually copy the output grid into the proper buffer,
+ // but the reference data was already marked as checked (hasBeenChecked_) by the CPU run, so nothing failed.
+ // For now we will manually track that the count of the grid entries is the same on each run.
+ // This is just a hack for a single specific output though.
+    // A much better TODO would be to split the different code paths into separate tests,
+    // while making them use the same reference files.
+ bool gridValuesSizeAssigned = false;
+ size_t previousGridValuesSize;
+
for (const auto &mode : modesToTest)
{
- for (const auto &option : optionsToTest)
+ const bool supportedInput = pmeSupportsInputForMode(&inputRec, mode.first);
+ if (!supportedInput)
{
- /* Describing the test uniquely in case it fails */
-
- SCOPED_TRACE(formatString("Testing %s with %s for PME grid size %d %d %d"
- ", order %d, %zu atoms",
- option.second.c_str(), mode.second.c_str(),
- gridSize[XX], gridSize[YY], gridSize[ZZ],
- pmeOrder,
- atomCount));
+ /* Testing the failure for the unsupported input */
+ EXPECT_THROW(pmeInitAtoms(&inputRec, mode.first, nullptr, coordinates, charges, box), NotImplementedError);
+ continue;
+ }
- /* Running the test */
+ const auto contextsToTest = pmeEnv->getHardwareContexts(mode.first);
+ for (const auto &context : contextsToTest)
+ {
+ for (const auto &option : optionsToTest)
+ {
+ /* Describing the test uniquely in case it fails */
- PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, coordinates, charges, box);
+ SCOPED_TRACE(formatString("Testing %s with %s %sfor PME grid size %d %d %d"
+ ", order %d, %zu atoms",
+ option.second.c_str(), mode.second.c_str(),
+ context.getDescription().c_str(),
+ gridSize[XX], gridSize[YY], gridSize[ZZ],
+ pmeOrder,
+ atomCount));
- const bool computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
- const bool spreadCharges = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
+ /* Running the test */
- if (!computeSplines)
- {
- // Here we should set up the results of the spline computation so that the spread can run.
- // What is lazy and works is running the separate spline so that it will set it up for us:
- pmePerformSplineAndSpread(pmeSafe.get(), mode.first, true, false);
- // We know that it is tested in another iteration.
- // TODO: Clean alternative: read and set the reference gridline indices, spline params
- }
+ PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, mode.first, context.getDeviceInfo(), coordinates, charges, box);
- pmePerformSplineAndSpread(pmeSafe.get(), mode.first, computeSplines, spreadCharges);
+ const bool computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
+ const bool spreadCharges = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
- /* Outputs correctness check */
- /* All tolerances were picked empirically for single precision on CPU */
+ if (!computeSplines)
+ {
+ // Here we should set up the results of the spline computation so that the spread can run.
+ // What is lazy and works is running the separate spline so that it will set it up for us:
+ pmePerformSplineAndSpread(pmeSafe.get(), mode.first, true, false);
+ // We know that it is tested in another iteration.
+ // TODO: Clean alternative: read and set the reference gridline indices, spline params
+ }
- TestReferenceChecker rootChecker(refData.rootChecker());
+ pmePerformSplineAndSpread(pmeSafe.get(), mode.first, computeSplines, spreadCharges);
+ pmeFinalizeTest(pmeSafe.get(), mode.first);
- const auto maxGridSize = std::max(std::max(gridSize[XX], gridSize[YY]), gridSize[ZZ]);
- const auto ulpToleranceSplineValues = 2 * (pmeOrder - 2) * maxGridSize;
- /* 2 is empiric, the rest follows from the amount of operations */
+ /* Outputs correctness check */
+ /* All tolerances were picked empirically for single precision on CPU */
- if (computeSplines)
- {
- const char *dimString[] = { "X", "Y", "Z" };
+ TestReferenceChecker rootChecker(refData.rootChecker());
- /* Spline values */
- SCOPED_TRACE(formatString("Testing spline values with tolerance of %ld", ulpToleranceSplineValues));
- TestReferenceChecker splineValuesChecker(rootChecker.checkCompound("Splines", "Values"));
- splineValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineValues));
- for (int i = 0; i < DIM; i++)
- {
- auto splineValuesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Values, i);
- splineValuesChecker.checkSequence(splineValuesDim.begin(), splineValuesDim.end(), dimString[i]);
- }
+ const auto maxGridSize = std::max(std::max(gridSize[XX], gridSize[YY]), gridSize[ZZ]);
+ const auto ulpToleranceSplineValues = 2 * (pmeOrder - 2) * maxGridSize;
+ /* 2 is empiric, the rest follows from the amount of operations */
- /* Spline derivatives */
- const auto ulpToleranceSplineDerivatives = 4 * ulpToleranceSplineValues;
- /* 4 is just a wild guess since the derivatives are deltas of neighbor spline values which could differ greatly */
- SCOPED_TRACE(formatString("Testing spline derivatives with tolerance of %ld", ulpToleranceSplineDerivatives));
- TestReferenceChecker splineDerivativesChecker(rootChecker.checkCompound("Splines", "Derivatives"));
- splineDerivativesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineDerivatives));
- for (int i = 0; i < DIM; i++)
+ if (computeSplines)
{
- auto splineDerivativesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Derivatives, i);
- splineDerivativesChecker.checkSequence(splineDerivativesDim.begin(), splineDerivativesDim.end(), dimString[i]);
+ const char *dimString[] = { "X", "Y", "Z" };
+
+ /* Spline values */
+ SCOPED_TRACE(formatString("Testing spline values with tolerance of %ld", ulpToleranceSplineValues));
+ TestReferenceChecker splineValuesChecker(rootChecker.checkCompound("Splines", "Values"));
+ splineValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineValues));
+ for (int i = 0; i < DIM; i++)
+ {
+ auto splineValuesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Values, i);
+ splineValuesChecker.checkSequence(splineValuesDim.begin(), splineValuesDim.end(), dimString[i]);
+ }
+
+ /* Spline derivatives */
+ const auto ulpToleranceSplineDerivatives = 4 * ulpToleranceSplineValues;
+ /* 4 is just a wild guess since the derivatives are deltas of neighbor spline values which could differ greatly */
+ SCOPED_TRACE(formatString("Testing spline derivatives with tolerance of %ld", ulpToleranceSplineDerivatives));
+ TestReferenceChecker splineDerivativesChecker(rootChecker.checkCompound("Splines", "Derivatives"));
+ splineDerivativesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineDerivatives));
+ for (int i = 0; i < DIM; i++)
+ {
+ auto splineDerivativesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Derivatives, i);
+ splineDerivativesChecker.checkSequence(splineDerivativesDim.begin(), splineDerivativesDim.end(), dimString[i]);
+ }
+
+ /* Particle gridline indices */
+ auto gridLineIndices = pmeGetGridlineIndices(pmeSafe.get(), mode.first);
+ rootChecker.checkSequence(gridLineIndices.begin(), gridLineIndices.end(), "Gridline indices");
}
- /* Particle gridline indices */
- auto gridLineIndices = pmeGetGridlineIndices(pmeSafe.get(), mode.first);
- rootChecker.checkSequence(gridLineIndices.begin(), gridLineIndices.end(), "Gridline indices");
- }
-
- if (spreadCharges)
- {
- /* The wrapped grid */
- SparseRealGridValuesOutput nonZeroGridValues = pmeGetRealGrid(pmeSafe.get(), mode.first);
- TestReferenceChecker gridValuesChecker(rootChecker.checkCompound("NonZeroGridValues", "RealSpaceGrid"));
- const auto ulpToleranceGrid = 2 * ulpToleranceSplineValues * (int)(ceil(sqrt(atomCount)));
- /* 2 is empiric; sqrt(atomCount) assumes all the input charges may spread onto the same cell */
- SCOPED_TRACE(formatString("Testing grid values with tolerance of %ld", ulpToleranceGrid));
- gridValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceGrid));
- for (const auto &point : nonZeroGridValues)
+ if (spreadCharges)
{
- gridValuesChecker.checkReal(point.second, point.first.c_str());
+ /* The wrapped grid */
+ SparseRealGridValuesOutput nonZeroGridValues = pmeGetRealGrid(pmeSafe.get(), mode.first);
+ TestReferenceChecker gridValuesChecker(rootChecker.checkCompound("NonZeroGridValues", "RealSpaceGrid"));
+ const auto ulpToleranceGrid = 2 * ulpToleranceSplineValues * (int)(ceil(sqrt(atomCount)));
+ /* 2 is empiric; sqrt(atomCount) assumes all the input charges may spread onto the same cell */
+ SCOPED_TRACE(formatString("Testing grid values with tolerance of %ld", ulpToleranceGrid));
+ if (!gridValuesSizeAssigned)
+ {
+ previousGridValuesSize = nonZeroGridValues.size();
+ gridValuesSizeAssigned = true;
+ }
+ else
+ {
+ EXPECT_EQ(previousGridValuesSize, nonZeroGridValues.size());
+ }
+
+ gridValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceGrid));
+ for (const auto &point : nonZeroGridValues)
+ {
+ gridValuesChecker.checkReal(point.second, point.first.c_str());
+ }
}
}
}
};
-/*! \brief Test for PME B-spline moduli computation */
+/*! \brief Test for spline parameter computation and charge spreading. */
TEST_P(PmeSplineAndSpreadTest, ReproducesOutputs)
{
EXPECT_NO_THROW(runTest());
/* Valid input instances */
//! A couple of valid inputs for boxes.
-static std::vector<Matrix3x3> const sampleBoxes
+static std::vector<Matrix3x3> const c_sampleBoxes
{
// normal box
Matrix3x3 {{
};
//! A couple of valid inputs for grid sizes.
-static std::vector<IVec> const sampleGridSizes
+static std::vector<IVec> const c_sampleGridSizes
{
IVec {
16, 12, 14
};
//! Random charges
-static std::vector<real> const sampleChargesFull
+static std::vector<real> const c_sampleChargesFull
{
4.95f, 3.11f, 3.97f, 1.08f, 2.09f, 1.1f, 4.13f, 3.31f, 2.8f, 5.83f, 5.09f, 6.1f, 2.86f, 0.24f, 5.76f, 5.19f, 0.72f
};
//! 1 charge
-static auto const sampleCharges1 = ChargesVector::fromVector(sampleChargesFull.begin(), sampleChargesFull.begin() + 1);
+static auto const c_sampleCharges1 = ChargesVector::fromVector(c_sampleChargesFull.begin(), c_sampleChargesFull.begin() + 1);
//! 2 charges
-static auto const sampleCharges2 = ChargesVector::fromVector(sampleChargesFull.begin() + 1, sampleChargesFull.begin() + 3);
+static auto const c_sampleCharges2 = ChargesVector::fromVector(c_sampleChargesFull.begin() + 1, c_sampleChargesFull.begin() + 3);
//! 13 charges
-static auto const sampleCharges13 = ChargesVector::fromVector(sampleChargesFull.begin() + 3, sampleChargesFull.begin() + 16);
+static auto const c_sampleCharges13 = ChargesVector::fromVector(c_sampleChargesFull.begin() + 3, c_sampleChargesFull.begin() + 16);
//! Random coordinate vectors
-static CoordinatesVector const sampleCoordinatesFull
+static CoordinatesVector const c_sampleCoordinatesFull
{
{
5.59f, 1.37f, 0.95f
}
};
//! 1 coordinate vector
-static CoordinatesVector const sampleCoordinates1(sampleCoordinatesFull.begin(), sampleCoordinatesFull.begin() + 1);
+static CoordinatesVector const c_sampleCoordinates1(c_sampleCoordinatesFull.begin(), c_sampleCoordinatesFull.begin() + 1);
//! 2 coordinate vectors
-static CoordinatesVector const sampleCoordinates2(sampleCoordinatesFull.begin() + 1, sampleCoordinatesFull.begin() + 3);
+static CoordinatesVector const c_sampleCoordinates2(c_sampleCoordinatesFull.begin() + 1, c_sampleCoordinatesFull.begin() + 3);
//! 13 coordinate vectors
-static CoordinatesVector const sampleCoordinates13(sampleCoordinatesFull.begin() + 3, sampleCoordinatesFull.begin() + 16);
+static CoordinatesVector const c_sampleCoordinates13(c_sampleCoordinatesFull.begin() + 3, c_sampleCoordinatesFull.begin() + 16);
//! moved out from instantiations for readability
-auto inputBoxes = ::testing::ValuesIn(sampleBoxes);
+auto c_inputBoxes = ::testing::ValuesIn(c_sampleBoxes);
//! moved out from instantiations for readability
-auto inputPmeOrders = ::testing::Range(3, 5 + 1);
+auto c_inputPmeOrders = ::testing::Range(3, 5 + 1);
//! Moved out from the instantiations for readability
-auto inputGridSizes = ::testing::ValuesIn(sampleGridSizes);
+auto c_inputGridSizes = ::testing::ValuesIn(c_sampleGridSizes);
-/*! \brief Instantiation of the PME spline computation test with valid input and 1 atom */
-INSTANTIATE_TEST_CASE_P(SaneInput1, PmeSplineAndSpreadTest, ::testing::Combine(inputBoxes, inputPmeOrders, inputGridSizes,
- ::testing::Values(sampleCoordinates1),
- ::testing::Values(sampleCharges1)
+/*! \brief Instantiation of the test with valid input and 1 atom */
+INSTANTIATE_TEST_CASE_P(SaneInput1, PmeSplineAndSpreadTest, ::testing::Combine(c_inputBoxes, c_inputPmeOrders, c_inputGridSizes,
+ ::testing::Values(c_sampleCoordinates1),
+ ::testing::Values(c_sampleCharges1)
));
-/*! \brief Instantiation of the PME spline computation test with valid input and 2 atoms */
-INSTANTIATE_TEST_CASE_P(SaneInput2, PmeSplineAndSpreadTest, ::testing::Combine(inputBoxes, inputPmeOrders, inputGridSizes,
- ::testing::Values(sampleCoordinates2),
- ::testing::Values(sampleCharges2)
+
+/*! \brief Instantiation of the test with valid input and 2 atoms */
+INSTANTIATE_TEST_CASE_P(SaneInput2, PmeSplineAndSpreadTest, ::testing::Combine(c_inputBoxes, c_inputPmeOrders, c_inputGridSizes,
+ ::testing::Values(c_sampleCoordinates2),
+ ::testing::Values(c_sampleCharges2)
));
-/*! \brief Instantiation of the PME spline computation test with valid input and 13 atoms */
-INSTANTIATE_TEST_CASE_P(SaneInput13, PmeSplineAndSpreadTest, ::testing::Combine(inputBoxes, inputPmeOrders, inputGridSizes,
- ::testing::Values(sampleCoordinates13),
- ::testing::Values(sampleCharges13)
+/*! \brief Instantiation of the test with valid input and 13 atoms */
+INSTANTIATE_TEST_CASE_P(SaneInput13, PmeSplineAndSpreadTest, ::testing::Combine(c_inputBoxes, c_inputPmeOrders, c_inputGridSizes,
+ ::testing::Values(c_sampleCoordinates13),
+ ::testing::Values(c_sampleCharges13)
));
}
}
#include <cstring>
-#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme-gather.h"
+#include "gromacs/ewald/pme-gpu-internal.h"
#include "gromacs/ewald/pme-grid.h"
#include "gromacs/ewald/pme-internal.h"
#include "gromacs/ewald/pme-solve.h"
namespace test
{
+bool pmeSupportsInputForMode(const t_inputrec *inputRec, CodePath mode)
+{
+ bool implemented;
+ switch (mode)
+ {
+ case CodePath::CPU:
+ implemented = true;
+ break;
+
+ case CodePath::CUDA:
+ implemented = pme_gpu_supports_input(inputRec, nullptr);
+ break;
+
+ default:
+ GMX_THROW(InternalError("Test not implemented for this mode"));
+ }
+ return implemented;
+}
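+/* A sketch of the intended use in a test body; inputRecPointer is just a placeholder name
+ * for the test's t_inputrec, and the loop over hardware contexts is omitted:
+ *
+ *     if (!pmeSupportsInputForMode(inputRecPointer, mode))
+ *     {
+ *         return; // nothing to test for this code path with this input
+ *     }
+ */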
+
//! PME initialization - internal
static PmeSafePointer pmeInitInternal(const t_inputrec *inputRec,
+ CodePath mode,
+ gmx_device_info_t *gpuInfo,
size_t atomCount,
const Matrix3x3 &box,
real ewaldCoeff_q = 1.0f,
{
gmx_pme_t *pmeDataRaw = nullptr;
const MDLogger dummyLogger;
- const auto runMode = PmeRunMode::CPU;
+ const auto runMode = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::GPU;
t_commrec dummyCommrec = {0};
gmx_pme_init(&pmeDataRaw, &dummyCommrec, 1, 1, inputRec, atomCount, false, false, true,
- ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, nullptr, dummyLogger);
+ ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, gpuInfo, dummyLogger);
PmeSafePointer pme(pmeDataRaw); // taking ownership
// TODO get rid of this with proper matrix type
}
const char *boxError = check_box(-1, boxTemp);
GMX_RELEASE_ASSERT(boxError == nullptr, boxError);
- invertBoxMatrix(boxTemp, pme->recipbox);
+
+ switch (mode)
+ {
+ case CodePath::CPU:
+ invertBoxMatrix(boxTemp, pme->recipbox);
+ break;
+
+ case CodePath::CUDA:
+ pme_gpu_update_input_box(pme->gpu, boxTemp);
+ break;
+ default:
+ GMX_THROW(InternalError("Test not implemented for this mode"));
+ }
return pme;
}
//! Simple PME initialization based on input, no atom data
PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
+ CodePath mode,
+ gmx_device_info_t *gpuInfo,
const Matrix3x3 &box,
real ewaldCoeff_q,
real ewaldCoeff_lj
)
{
- return pmeInitInternal(inputRec, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
+ return pmeInitInternal(inputRec, mode, gpuInfo, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
// hiding the fact that PME actually needs to know the number of atoms in advance
}
//! PME initialization with atom data
PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec,
+ CodePath mode,
+ gmx_device_info_t *gpuInfo,
const CoordinatesVector &coordinates,
const ChargesVector &charges,
const Matrix3x3 &box
{
const size_t atomCount = coordinates.size();
GMX_RELEASE_ASSERT(atomCount == charges.size(), "Mismatch in atom data");
- PmeSafePointer pmeSafe = pmeInitInternal(inputRec, atomCount, box);
- pme_atomcomm_t *atc = &(pmeSafe->atc[0]);
- atc->x = const_cast<rvec *>(as_rvec_array(coordinates.data()));
- atc->coefficient = const_cast<real *>(charges.data());
- /* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
+ PmeSafePointer pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, atomCount, box);
+ pme_atomcomm_t *atc = nullptr;
+
+ switch (mode)
+ {
+ case CodePath::CPU:
+ atc = &(pmeSafe->atc[0]);
+ atc->x = const_cast<rvec *>(as_rvec_array(coordinates.data()));
+ atc->coefficient = const_cast<real *>(charges.data());
+ /* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
+ break;
+
+ case CodePath::CUDA:
+ pme_gpu_set_testing(pmeSafe->gpu, true);
+ gmx_pme_reinit_atoms(pmeSafe.get(), atomCount, charges.data());
+ pme_gpu_copy_input_coordinates(pmeSafe->gpu, as_rvec_array(coordinates.data()));
+ break;
+
+ default:
+ GMX_THROW(InternalError("Test not implemented for this mode"));
+ }
+
return pmeSafe;
}
//! Getting local PME real grid dimensions
static void pmeGetRealGridSizesInternal(const gmx_pme_t *pme,
+ CodePath mode,
IVec &gridSize,
IVec &paddedGridSize)
{
const size_t gridIndex = 0;
IVec gridOffsetUnused;
- gmx_parallel_3dfft_real_limits(pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
+ switch (mode)
+ {
+ case CodePath::CPU:
+ gmx_parallel_3dfft_real_limits(pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
+ break;
+
+ case CodePath::CUDA:
+ pme_gpu_get_real_grid_sizes(pme->gpu, &gridSize, &paddedGridSize);
+ break;
+
+ default:
+ GMX_THROW(InternalError("Test not implemented for this mode"));
+ }
}
//! Getting local PME complex grid pointer for test I/O
}
//! Getting the PME grid memory buffer and its sizes - template definition
-template<typename ValueType> static void pmeGetGridAndSizesInternal(const gmx_pme_t *, ValueType * &, IVec &, IVec &)
+template<typename ValueType> static void pmeGetGridAndSizesInternal(const gmx_pme_t *, CodePath, ValueType * &, IVec &, IVec &)
{
GMX_THROW(InternalError("Deleted function call"));
// explicitly deleting general template does not compile in clang/icc, see https://llvm.org/bugs/show_bug.cgi?id=17537
}
//! Getting the PME real grid memory buffer and its sizes
-template<> void pmeGetGridAndSizesInternal<real>(const gmx_pme_t *pme, real * &grid, IVec &gridSize, IVec &paddedGridSize)
+template<> void pmeGetGridAndSizesInternal<real>(const gmx_pme_t *pme, CodePath mode, real * &grid, IVec &gridSize, IVec &paddedGridSize)
{
grid = pmeGetRealGridInternal(pme);
- pmeGetRealGridSizesInternal(pme, gridSize, paddedGridSize);
+ pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSize);
}
//! Getting the PME complex grid memory buffer and its sizes
-template<> void pmeGetGridAndSizesInternal<t_complex>(const gmx_pme_t *pme, t_complex * &grid, IVec &gridSize, IVec &paddedGridSize)
+template<> void pmeGetGridAndSizesInternal<t_complex>(const gmx_pme_t *pme, CodePath, t_complex * &grid, IVec &gridSize, IVec &paddedGridSize)
{
grid = pmeGetComplexGridInternal(pme);
pmeGetComplexGridSizesInternal(pme, gridSize, paddedGridSize);
}
//! PME spline calculation and charge spreading
-void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qualifiers
+void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qualifiers elsewhere
bool computeSplines, bool spreadCharges)
{
GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
const size_t gridIndex = 0;
const bool computeSplinesForZeroCharges = true;
real *fftgrid = spreadCharges ? pme->fftgrid[gridIndex] : nullptr;
+ real *pmegrid = pme->pmegrid[gridIndex].grid.grid;
switch (mode)
{
fftgrid, computeSplinesForZeroCharges, gridIndex);
if (spreadCharges && !pme->bUseThreads)
{
- wrap_periodic_pmegrid(pme, pme->pmegrid[gridIndex].grid.grid);
- copy_pmegrid_to_fftgrid(pme, pme->pmegrid[gridIndex].grid.grid, fftgrid, gridIndex);
+ wrap_periodic_pmegrid(pme, pmegrid);
+ copy_pmegrid_to_fftgrid(pme, pmegrid, fftgrid, gridIndex);
}
break;
+ case CodePath::CUDA:
+ pme_gpu_spread(pme->gpu, gridIndex, fftgrid, computeSplines, spreadCharges);
+ break;
+
default:
GMX_THROW(InternalError("Test not implemented for this mode"));
}
}
}
+//! PME test finalization before fetching the outputs
+void pmeFinalizeTest(const gmx_pme_t *pme, CodePath mode)
+{
+ switch (mode)
+ {
+ case CodePath::CPU:
+ break;
+
+ case CodePath::CUDA:
+ pme_gpu_synchronize(pme->gpu);
+ break;
+
+ default:
+ GMX_THROW(InternalError("Test not implemented for this mode"));
+ }
+}
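+/* A sketch of the intended call order in a test (a minimal example; error checking omitted):
+ *
+ *     pmePerformSplineAndSpread(pme, mode, true, true); // compute splines and spread charges
+ *     pmeFinalizeTest(pme, mode);                       // make sure any GPU work has completed
+ *     // ...only now fetch the outputs (spline data, gridline indices, grid values)
+ */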
+
//! Setting atom spline values/derivatives to be used in spread/gather
void pmeSetSplineData(const gmx_pme_t *pme, CodePath mode,
const SplineParamsDimVector &splineValues, PmeSplineDataType type, int dimIndex)
GMX_RELEASE_ASSERT(atomCount == gridLineIndices.size(), "Mismatch in gridline indices size");
IVec paddedGridSizeUnused, gridSize;
- pmeGetRealGridSizesInternal(pme, gridSize, paddedGridSizeUnused);
+ pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSizeUnused);
for (const auto &index : gridLineIndices)
{
{
IVec gridSize, paddedGridSize;
ValueType *grid;
- pmeGetGridAndSizesInternal<ValueType>(pme, grid, gridSize, paddedGridSize);
+ pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
switch (mode)
{
SplineParamsDimVector result;
switch (mode)
{
+ case CodePath::CUDA:
+ pme_gpu_transform_spline_atom_data_for_host(pme->gpu, atc, type, dimIndex);
+ // fallthrough
+
case CodePath::CPU:
result = SplineParamsDimVector::fromArray(sourceBuffer, dimSize);
break;
GridLineIndicesVector gridLineIndices;
switch (mode)
{
+ case CodePath::CUDA:
+ gridLineIndices = GridLineIndicesVector::fromArray(reinterpret_cast<IVec *>(pme->gpu->staging.h_gridlineIndices), atomCount);
+ break;
+
case CodePath::CPU:
gridLineIndices = GridLineIndicesVector::fromArray(reinterpret_cast<IVec *>(atc->idx), atomCount);
break;
{
IVec gridSize, paddedGridSize;
ValueType *grid;
- pmeGetGridAndSizesInternal<ValueType>(pme, grid, gridSize, paddedGridSize);
+ pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
SparseGridValuesOutput<ValueType> gridValues;
switch (mode)
{
+ case CodePath::CUDA: // intentional absence of break
case CodePath::CPU:
gridValues.clear();
for (int ix = 0; ix < gridSize[XX]; ix++)
#include <map>
#include <vector>
+#include <gtest/gtest.h>
+
#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-gpu-internal.h"
#include "gromacs/math/gmxcomplex.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/unique_cptr.h"
-struct t_inputrec;
+#include "testhardwarecontexts.h"
namespace gmx
{
typedef ArrayRef<RVec> ForcesVector;
//! Gridline indices
typedef ConstArrayRef<IVec> GridLineIndicesVector;
-//! Type of spline data
-enum class PmeSplineDataType
-{
- Values, // theta
- Derivatives, // dtheta
-};
/*! \brief Spline parameters (theta or dtheta).
* A reference to a single dimension's spline data; this means (atomCount * pmeOrder) values or derivatives.
*/
typedef SparseGridValuesOutput<t_complex> SparseComplexGridValuesOutput;
//! TODO: make proper C++ matrix for the whole Gromacs, get rid of this
typedef std::array<real, DIM * DIM> Matrix3x3;
-//! PME code path being tested
-enum class CodePath
-{
- CPU, // serial CPU code
-};
//! PME gathering input forces treatment
enum class PmeGatherInputHandling
{
//! PME solver results - reciprocal energy and virial
typedef std::tuple<real, Matrix3x3> PmeSolveOutput;
+// Misc.
+
+//! Tells if this generally valid PME input is supported for this mode
+bool pmeSupportsInputForMode(const t_inputrec *inputRec, CodePath mode);
+
// PME stages
+// TODO: currently PME initializations do not store CodePath. They probably should (unless we need mixed CPU-GPU execution?).
//! Simple PME initialization (no atom data)
PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
+ CodePath mode = CodePath::CPU,
+ gmx_device_info_t *gpuInfo = nullptr,
const Matrix3x3 &box = {{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}},
real ewaldCoeff_q = 0.0f, real ewaldCoeff_lj = 0.0f);
-//! PME initialization with atom data and system box; lacks Ewald coefficients
+//! PME initialization with atom data and system box
PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec,
+ CodePath mode,
+ gmx_device_info_t *gpuInfo,
const CoordinatesVector &coordinates,
const ChargesVector &charges,
const Matrix3x3 &box
//! PME force gathering
void pmePerformGather(gmx_pme_t *pme, CodePath mode,
PmeGatherInputHandling inputTreatment, ForcesVector &forces);
+//! PME test finalization before fetching the outputs
+void pmeFinalizeTest(const gmx_pme_t *pme, CodePath mode);
// PME state setters
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Implements test environment class which performs hardware enumeration for unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "testhardwarecontexts.h"
+
+#include "config.h"
+
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/taskassignment/hardwareassign.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/loggerbuilder.h"
+
+namespace gmx
+{
+namespace test
+{
+
+//! This constructs the test environment
+PmeTestEnvironment * const pmeEnv = (PmeTestEnvironment *)::testing::AddGlobalTestEnvironment(new PmeTestEnvironment);
+
+//! Simple hardware initialization
+void PmeTestEnvironment::hardwareInit()
+{
+ unique_cptr<t_commrec, done_commrec> commrec(init_commrec());
+ gmx_init_intranode_counters(commrec.get());
+ LoggerBuilder builder;
+ LoggerOwner logOwner(builder.build());
+ MDLogger log(logOwner.logger());
+ hardwareInfo_.reset(gmx_detect_hardware(log, commrec.get()));
+}
+
+void PmeTestEnvironment::SetUp()
+{
+ TestHardwareContext emptyContext("", nullptr);
+ hardwareContextsByMode_[CodePath::CPU].push_back(emptyContext);
+
+ hardwareInit();
+
+ // Constructing contexts for all compatible GPUs - will be empty on non-GPU builds
+ TestHardwareContexts gpuContexts;
+ const auto compatibleGpus = getCompatibleGpus(hardwareInfo_->gpu_info);
+ for (int gpuIndex : compatibleGpus)
+ {
+ char stmp[200] = {};
+ get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
+ std::string description = "(GPU " + std::string(stmp) + ") ";
+ auto *gpuInfo = reinterpret_cast<gmx_device_info_t *>(reinterpret_cast<char *>(hardwareInfo_->gpu_info.gpu_dev) + gpuIndex * sizeof_gpu_dev_info());
+ //TODO move previous line to gpu_utils and reuse in hardwareassign.cpp
+ gpuContexts.emplace_back(description.c_str(), gpuInfo);
+ }
+#if GMX_GPU == GMX_GPU_CUDA
+ hardwareContextsByMode_[CodePath::CUDA] = gpuContexts;
+#endif
+}
+
+}
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_EWALD_TEST_HARDWARE_CONTEXTS_H
+#define GMX_EWALD_TEST_HARDWARE_CONTEXTS_H
+
+/*! \internal \file
+ * \brief
+ * Describes test environment class which performs hardware enumeration for unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include <list>
+#include <map>
+
+#include <gtest/gtest.h>
+
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/hardware/gpu_hw_info.h"
+#include "gromacs/utility/unique_cptr.h"
+
+namespace gmx
+{
+namespace test
+{
+//! Hardware code path being tested
+enum class CodePath
+{
+ CPU,
+ CUDA
+};
+
+/*! \internal \brief
+ * A structure to describe a hardware context - an abstraction over
+ * gmx_device_info_t with a human-readable string.
+ * TODO: currently this does not know which CodePath it belongs to.
+ * It probably should! That would save us one loop in all the PME tests.
+ */
+struct TestHardwareContext
+{
+ //! Readable description
+ std::string description_;
+ //! Device information pointer
+ gmx_device_info_t *deviceInfo_;
+
+ public:
+ //! Returns a human-readable context description line
+ std::string getDescription() const { return description_; }
+ //! Returns the device info pointer
+ gmx_device_info_t *getDeviceInfo() const { return deviceInfo_; }
+ //! Constructs the context
+ TestHardwareContext(const char *description, gmx_device_info_t *deviceInfo) : description_(description), deviceInfo_(deviceInfo) {}
+};
+
+//! A list of hardware contexts
+typedef std::list<TestHardwareContext> TestHardwareContexts;
+
+/*! \internal \brief
+ * This class performs one-time test initialization (enumerating the hardware)
+ */
+class PmeTestEnvironment : public ::testing::Environment
+{
+ private:
+ //! General hardware info
+ unique_cptr<gmx_hw_info_t, gmx_hardware_info_free> hardwareInfo_;
+ //! Storage of hardware contexts
+ std::map<CodePath, TestHardwareContexts> hardwareContextsByMode_;
+ //! Simple GPU initialization, allowing PME to work on a GPU
+ void hardwareInit();
+
+ public:
+ //! Default destructor
+ ~PmeTestEnvironment() = default;
+ //! This is called by the GTest framework once to query the hardware
+ void SetUp();
+ //! Get available hardware contexts for a given code path
+ const TestHardwareContexts &getHardwareContexts(CodePath mode) { return hardwareContextsByMode_.at(mode); }
+};
+
+//! The test environment
+extern PmeTestEnvironment * const pmeEnv;
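+
+/* A sketch of how a test is expected to use this environment; runSingleTest() here is a
+ * hypothetical per-context helper, not something declared in this header:
+ *
+ *     for (const auto &context : pmeEnv->getHardwareContexts(mode))
+ *     {
+ *         SCOPED_TRACE("Testing on " + context.getDescription());
+ *         runSingleTest(context.getDeviceInfo());
+ *     }
+ */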
+}
+}
+#endif
#endif
}
+/*! \brief Fetch the value by \p index from the texture object or reference.
+ * Fetching from the object is the preferred behaviour on CC >= 3.0.
+ *
+ * \tparam[in] T Raw data type
+ * \param[in] texObj Table texture object
+ * \param[in] texRef Table texture reference
+ * \param[in] index Non-negative element index
+ * \returns The value from the table at \p index
+ */
+template <typename T>
+static __forceinline__ __device__
+T fetchFromTexture(const cudaTextureObject_t texObj,
+ const struct texture<T, 1, cudaReadModeElementType> texRef,
+ int index)
+{
+ assert(index >= 0);
+ assert(!c_disableCudaTextures);
+ T result;
+#if GMX_PTX_ARCH >= 300 // Preferring texture objects on any new arch
+ GMX_UNUSED_VALUE(texRef);
+ result = tex1Dfetch<T>(texObj, index);
+#else
+ GMX_UNUSED_VALUE(texObj);
+ result = tex1Dfetch(texRef, index);
+#endif
+ return result;
+}
+
+/*! \brief Fetch the value by \p index from the parameter lookup table.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texture references.
+ *
+ * \tparam[in] T Raw data type
+ * \param[in] d_ptr Device pointer to the raw table memory
+ * \param[in] texObj Table texture object
+ * \param[in] texRef Table texture reference
+ * \param[in] index Non-negative element index
+ * \returns The value from the table at \p index
+ */
+template <typename T>
+static __forceinline__ __device__
+T fetchFromParamLookupTable(const T *d_ptr,
+ const cudaTextureObject_t texObj,
+ const struct texture<T, 1, cudaReadModeElementType> texRef,
+ int index)
+{
+ assert(index >= 0);
+ T result;
+#if DISABLE_CUDA_TEXTURES
+ GMX_UNUSED_VALUE(texObj);
+ GMX_UNUSED_VALUE(texRef);
+ result = LDG(d_ptr + index);
+#else
+ GMX_UNUSED_VALUE(d_ptr);
+ result = fetchFromTexture<T>(texObj, texRef, index);
+#endif
+ return result;
+}
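+
+/* A minimal usage sketch; the names below are hypothetical placeholders rather than an
+ * existing kernel parameter layout:
+ *
+ *     const float tabValue = fetchFromParamLookupTable(params.d_table, params.tableTexObj,
+ *                                                      tableTexRef, tableIndex);
+ */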
+
#endif /* GMX_GPU_UTILS_CUDA_KERNEL_UTILS_CUH */
*/
template void initParamLookupTable<float>(float * &, cudaTextureObject_t &, const texture<float, 1, cudaReadModeElementType> *, const float *, int, const gmx_device_info_t *);
template void destroyParamLookupTable<float>(float *, cudaTextureObject_t, const texture<float, 1, cudaReadModeElementType> *, const gmx_device_info_t *);
+template void initParamLookupTable<int>(int * &, cudaTextureObject_t &, const texture<int, 1, cudaReadModeElementType> *, const int *, int, const gmx_device_info_t *);
+template void destroyParamLookupTable<int>(int *, cudaTextureObject_t, const texture<int, 1, cudaReadModeElementType> *, const gmx_device_info_t *);
bool bAsync);
// TODO: the 2 functions below are pretty much a constructor/destructor of a simple
-// GPU table object. We just need to add a templated __device__ table data fetching to complete it.
+// GPU table object. There is also an almost self-contained fetchFromParamLookupTable()
+// in cuda_kernel_utils.cuh. They could all live in a separate class/struct file,
+// provided that storing static texture references in there does not pose problems.
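+// A rough sketch of the shape such an object might take (hypothetical, not part of this change):
+//     template <typename T>
+//     struct GpuParamLookupTable
+//     {
+//         T                  *d_ptr;   //!< raw device buffer
+//         cudaTextureObject_t texObj;  //!< texture object, where supported
+//     };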
/*! \brief Initialize parameter lookup table.
*
}
}
-/*! \brief Filter the compatible GPUs
- *
- * This function filters gpu_info.gpu_dev for compatible GPUs based
- * on the previously run compatibility tests.
- *
- * \param[in] gpu_info Information detected about GPUs, including compatibility
- * \return vector of IDs of GPUs already recorded as compatible */
-static std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
+std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
{
// Possible minor over-allocation here, but not important for anything
std::vector<int> compatibleGpus;
*/
std::vector<int> parseGpuTaskAssignment(const std::string &gpuTaskAssignment);
+/*! \brief Filter the compatible GPUs
+ *
+ * This function filters gpu_info.gpu_dev for compatible GPUs based
+ * on the previously run compatibility tests.
+ *
+ * \param[in] gpu_info Information detected about GPUs, including compatibility
+ * \return vector of IDs of GPUs already recorded as compatible */
+std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info);
+
/*! \brief Assign PP ranks to valid GPU IDs.
*
* Will return a validated mapping from PP ranks (ie tasks that can