-Dorder=4
-DthreadsPerAtom=16
-Dc_pmeMaxUnitcellShift=2
- -Dc_usePadding=true
-Dc_skipNeutralAtoms=false
-Dc_virialAndEnergyCount=7
-Dc_spreadWorkGroupSize=${SPREAD_WG_SIZE}
return (pme != nullptr) && (pme_run_mode(pme) != PmeRunMode::CPU);
}
-/*! \brief Returns the size of the padding needed by GPU version of PME in the coordinates array.
+/*! \brief Returns the block size requirement
+ *
+ * The GPU version of PME requires that the coordinates array have a
+ * size divisible by the returned number.
*
* \param[in] pme The PME data structure.
*/
-GPU_FUNC_QUALIFIER int pme_gpu_get_padding_size(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+GPU_FUNC_QUALIFIER int pme_gpu_get_block_size(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
GPU_FUNC_TERM_WITH_RETURN(0);
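For context, a minimal sketch (hypothetical caller code, not part of this change) of how a consumer of pme_gpu_get_block_size() satisfies the divisibility requirement when sizing its coordinate buffer:

    // Round the atom count up to the next multiple of the required block size.
    // A return value of 0 means PME does not run on a GPU, so no padding is needed.
    const int blockSize = pme_gpu_get_block_size(pme);
    const int numAtomsPadded =
            (blockSize > 0) ? ((numAtoms + blockSize - 1) / blockSize) * blockSize : numAtoms;
    coordinatesBuffer.resize(numAtomsPadded); // hypothetical buffer

This mirrors the rounding that StatePropagatorDataGpu::Impl performs with its allocationBlockSizeDivisor further down in this patch.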
// The following functions are all the PME GPU entry points,
const int localGridlineIndicesIndex = threadLocalId;
const int globalGridlineIndicesIndex =
(int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
- const int globalCheckIndices =
- pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
- if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+ if (localGridlineIndicesIndex < gridlineIndicesSize)
{
sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
/* Staging the spline parameters, DIM * order * atomsPerBlock threads */
const int localSplineParamsIndex = threadLocalId;
const int globalSplineParamsIndex = (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
- const int globalCheckSplineParams = pme_gpu_check_atom_data_index(
- globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
- if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+ if (localSplineParamsIndex < splineParamsSize)
{
sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
float fy = 0.0F;
float fz = 0.0F;
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
- if (chargeCheck & globalCheck)
+ if (chargeCheck)
{
const int nx = kernelParams.grid.realGridSize[XX];
const int ny = kernelParams.grid.realGridSize[YY];
/* Calculating the final forces with no component branching, atomsPerBlock threads */
const int forceIndexLocal = threadLocalId;
const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
- const int calcIndexCheck = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
- if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
+ if (forceIndexLocal < atomsPerBlock)
{
const float3 atomForces = vload3(forceIndexLocal, sm_forces);
const float negCoefficient = -gm_coefficients[forceIndexGlobal];
{
const int outputIndexLocal = i * iterThreads + threadLocalId;
const int outputIndexGlobal = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
- const int globalOutputCheck =
- pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
- if (globalOutputCheck)
- {
- const float outputForceComponent = sm_forces[outputIndexLocal];
- gm_forces[outputIndexGlobal] = outputForceComponent;
- }
+ const float outputForceComponent = sm_forces[outputIndexLocal];
+ gm_forces[outputIndexGlobal] = outputForceComponent;
}
}
}
/* Read splines */
const int localGridlineIndicesIndex = threadLocalId;
const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
- const int globalCheckIndices = pme_gpu_check_atom_data_index(
- globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
- if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+ if (localGridlineIndicesIndex < gridlineIndicesSize)
{
sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
threadLocalId
+ i * threadLocalIdMax; /* i will always be zero for order*order threads per atom */
int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
- int globalCheckSplineParams = pme_gpu_check_atom_data_index(
- globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
- if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+ if (localSplineParamsIndex < splineParamsSize)
{
sm_theta[localSplineParamsIndex] = gm_theta[globalSplineParamsIndex];
sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
// Coordinates
__shared__ float3 sm_coordinates[atomsPerBlock];
/* Staging coefficients/charges */
- pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, gm_coefficients);
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, gm_coefficients);
/* Staging coordinates */
- pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
+ pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
__syncthreads();
atomX = sm_coordinates[atomIndexLocal];
atomCharge = sm_coefficients[atomIndexLocal];
float fy = 0.0f;
float fz = 0.0f;
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
- if (chargeCheck & globalCheck)
+ if (chargeCheck)
{
const int nx = kernelParams.grid.realGridSize[XX];
const int ny = kernelParams.grid.realGridSize[YY];
/* Calculating the final forces with no component branching, atomsPerBlock threads */
const int forceIndexLocal = threadLocalId;
const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
- const int calcIndexCheck = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
- if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
+ if (forceIndexLocal < atomsPerBlock)
{
const float3 atomForces = sm_forces[forceIndexLocal];
const float negCoefficient = -gm_coefficients[forceIndexGlobal];
#pragma unroll
for (int i = 0; i < numIter; i++)
{
- int outputIndexLocal = i * iterThreads + threadLocalId;
- int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
- const int globalOutputCheck =
- pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
- if (globalOutputCheck)
- {
- const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
- gm_forces[outputIndexGlobal] = outputForceComponent;
- }
+ int outputIndexLocal = i * iterThreads + threadLocalId;
+ int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
+ const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
+ gm_forces[outputIndexGlobal] = outputForceComponent;
}
}
}
}
}
-int pme_gpu_get_padding_size(const gmx_pme_t* pme)
+int pme_gpu_get_block_size(const gmx_pme_t* pme)
{
    if (!pme || !pme_gpu_active(pme))
    {
        return 0;
    }
else
{
- return pme_gpu_get_atom_data_alignment(pme->gpu);
+ return pme_gpu_get_atom_data_block_size();
}
}
* Instead of templated parameters this file expects following defines during compilation:
* - order - PME interpolation order;
* - atomsPerWarp - number of atoms processed by a warp (fixed for spread and gather kernels to be the same);
- * - c_usePadding and c_skipNeutralAtoms - same as in pme_gpu_constants.h.
+ * - c_skipNeutralAtoms - same as in pme_gpu_constants.h.
*
* \author Aleksei Iupinov <a.yupinov@gmail.com>
* \ingroup module_ewald
return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
}
-/*! \brief
- * A function for checking the global atom data indices against the atom data array sizes.
- *
- * \param[in] nAtomData The atom data array element count.
- * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
- *
- * This is called from the spline_and_spread and gather PME kernels.
- * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding being true.
- */
-inline int pme_gpu_check_atom_data_index(const size_t atomDataIndex, const size_t nAtomData)
-{
- return c_usePadding ? 1 : (atomDataIndex < nAtomData);
-}
-
/*! \brief
* A function for optionally skipping neutral charges, depending on c_skipNeutralAtoms.
*
return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
}
-/*! \internal \brief
- * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
- *
- * \param[in] atomDataIndex The atom data index.
- * \param[in] nAtomData The atom data array element count.
- * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
- *
- * This is called from the spline_and_spread and gather PME kernels.
- * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding enabled.
- */
-int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
-{
- return c_usePadding ? 1 : (atomDataIndex < nAtomData);
-}
-
/*! \internal \brief
* An inline CUDA function for skipping the zero-charge atoms.
*
* General purpose function for loading atom-related data from global to shared memory.
*
* \tparam[in] T Data type (float/int/...)
- * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
- * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
- * \param[in] kernelParams Input PME CUDA data in constant memory.
+ * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be
+ * accounted for in the size of the shared memory array.
+ * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for
+ * an rvec coordinates array).
* \param[out] sm_destination Shared memory array for output.
* \param[in] gm_source Global memory array for input.
*/
template<typename T, const int atomsPerBlock, const int dataCountPerAtom>
-__device__ __forceinline__ void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams kernelParams,
- T* __restrict__ sm_destination,
+__device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_destination,
const T* __restrict__ gm_source)
{
- static_assert(c_usePadding,
- "With padding disabled, index checking should be fixed to account for spline "
- "theta/dtheta pr-warp alignment");
const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
const int localIndex = threadLocalIndex;
const int globalIndexBase = blockIndex * atomsPerBlock * dataCountPerAtom;
const int globalIndex = globalIndexBase + localIndex;
- const int globalCheck =
- pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
- if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+ if (localIndex < atomsPerBlock * dataCountPerAtom)
{
assertIsFinite(gm_source[globalIndex]);
sm_destination[localIndex] = gm_source[globalIndex];
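With the kernelParams argument gone from pme_gpu_stage_atom_data(), call sites pass only the two pointers; for example, the charge staging in the CUDA spread kernel (updated later in this patch) now reads:

    /* Stage one float per atom: charges from global to shared memory */
    pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, kernelParams.atoms.d_coefficients);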
/* Atom index w.r.t. block/shared memory */
const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
- /* Atom index w.r.t. global memory */
- const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
/* Spline contribution index in one dimension */
const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x;
const int orderIndex = threadLocalIdXY / DIM;
float splineData[order];
const int localCheck = (dimIndex < DIM) && (orderIndex < 1);
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
/* we have 4 threads per atom, but can only use 3 here for the dimensions */
- if (localCheck && globalCheck)
+ if (localCheck)
{
/* Indices interpolation */
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/* General settings for PME GPU behaviour */
-/*! \brief
- * false: The atom data GPU buffers are sized precisely according to the number of atoms.
- * (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp).
- * The atom index checks in the spread/gather code potentially hinder the performance.
- * true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms
- * fitting in is divisible by c_pmeAtomDataAlignment.
- * The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen.
- * Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
- * \todo Estimate performance differences
- */
-constexpr bool c_usePadding = true;
-
/*! \brief
* false: Atoms with zero charges are processed by PME. Could introduce some overhead.
* true: Atoms with zero charges are not processed by PME. Adds branching to the spread/gather.
- * Atom data alignment (in terms of number of atoms).
+ * Atom data block size (in terms of number of atoms).
* This is the least common multiple of number of atoms processed by
* a single block/workgroup of the spread and gather kernels.
- * If the GPU atom data buffers are padded (c_usePadding == true),
- * Then the numbers of atoms which would fit in the padded GPU buffers have to be divisible by this.
- * There are debug asserts for this divisibility in pme_gpu_spread() and pme_gpu_gather().
+ * The GPU atom data buffers must be padded, which means that
+ * the number of atoms used for determining the size of the memory
+ * allocation must be divisible by this.
*/
-constexpr int c_pmeAtomDataAlignment = 64;
+constexpr int c_pmeAtomDataBlockSize = 64;
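As a worked example of the invariant this constant now enforces (numbers are illustrative): with c_pmeAtomDataBlockSize = 64, a system of 1000 atoms is allocated as

    nAtomsAlloc = ((1000 + 64 - 1) / 64) * 64 = 1024   // 24 zero-padded entries

and the per-kernel atomsPerBlock must divide 64, which the GMX_ASSERTs updated elsewhere in this patch continue to check.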
/*
* The execution widths for PME GPU kernels, used both on host and device for correct scheduling.
return kernelParamsPtr;
}
-int pme_gpu_get_atom_data_alignment(const PmeGpu* /*unused*/)
+int pme_gpu_get_atom_data_block_size()
{
- // TODO: this can be simplified, as c_pmeAtomDataAlignment is now constant
- if (c_usePadding)
- {
- return c_pmeAtomDataAlignment;
- }
- else
- {
- return 0;
- }
+ return c_pmeAtomDataBlockSize;
}
int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
- if (c_usePadding)
+
+ const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
+ const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
+ if (paddingCount > 0)
{
- const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
- const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
- if (paddingCount > 0)
- {
- clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
- paddingCount, pmeGpu->archSpecific->pmeStream_);
- }
+ clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
+ paddingCount, pmeGpu->archSpecific->pmeStream_);
}
}
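This unconditional clearing is what allows the kernels above to drop their bounds checks: the padded coefficient entries are zero, so the pme_gpu_check_atom_charge() test (or the zero charge itself) guarantees that padded atoms contribute nothing to the grid. Continuing the illustrative numbers from above:

    paddingIndex = nAtoms             = 1000
    paddingCount = nAtomsAlloc - 1000 = 24   // these trailing coefficients are zeroed asynchronously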
void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
{
- const int order = pmeGpu->common->pme_order;
- const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu);
- const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
- const int newSplineDataSize = DIM * order * nAtomsPadded;
+ const int order = pmeGpu->common->pme_order;
+ const int newSplineDataSize = DIM * order * pmeGpu->nAtomsAlloc;
GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
/* Two arrays of the same size */
const bool shouldRealloc = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
{
- const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu);
- const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
- const size_t splinesCount = DIM * nAtomsPadded * pmeGpu->common->pme_order;
+ const size_t splinesCount = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order;
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
{
- const int alignment = pme_gpu_get_atoms_per_warp(pmeGpu);
- const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
- const size_t splinesCount = DIM * nAtomsPadded * pmeGpu->common->pme_order;
+ const size_t splinesCount = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order;
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
- if (c_usePadding)
- {
- // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
- clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0,
- pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream_);
- clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
- pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
- pmeGpu->archSpecific->pmeStream_);
- clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
- pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
- pmeGpu->archSpecific->pmeStream_);
- }
+
+ // TODO: could clear only the padding and not the whole thing, but this is test-exclusive code anyway
+ clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * DIM,
+ pmeGpu->archSpecific->pmeStream_);
+ clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
+ pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
+ pmeGpu->archSpecific->pmeStream_);
+ clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
+ pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
+ pmeGpu->archSpecific->pmeStream_);
+
copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
{
auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
kernelParamsPtr->atoms.nAtoms = nAtoms;
- const int alignment = pme_gpu_get_atom_data_alignment(pmeGpu);
- pmeGpu->nAtomsPadded = ((nAtoms + alignment - 1) / alignment) * alignment;
- const int nAtomsAlloc = c_usePadding ? pmeGpu->nAtomsPadded : nAtoms;
- const bool haveToRealloc =
- (pmeGpu->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */
- pmeGpu->nAtomsAlloc = nAtomsAlloc;
+ const int blockSize = pme_gpu_get_atom_data_block_size();
+ const int nAtomsNewPadded = ((nAtoms + blockSize - 1) / blockSize) * blockSize;
+ const bool haveToRealloc = (pmeGpu->nAtomsAlloc < nAtomsNewPadded);
+ pmeGpu->nAtomsAlloc = nAtomsNewPadded;
#if GMX_DOUBLE
GMX_RELEASE_ASSERT(false, "Only single precision supported");
// TODO: test varying block sizes on modern arch-s as well
// TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
//(for spline data mostly)
- GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
+ GMX_ASSERT(!(c_pmeAtomDataBlockSize % atomsPerBlock),
"inconsistent atom data padding vs. spreading block size");
// Ensure that coordinates are ready on the device before launching spread;
xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
}
- const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+ const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock;
auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
KernelLaunchConfig config;
const int atomsPerBlock = useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
: blockSize / c_pmeSpreadGatherThreadsPerAtom;
- GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
+ GMX_ASSERT(!(c_pmeAtomDataBlockSize % atomsPerBlock),
"inconsistent atom data padding vs. gathering block size");
- const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+ const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock;
auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
const int order = pmeGpu->common->pme_order;
};
/*! \libinternal \brief
- * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
- * Depends on CUDA-specific block sizes, needed for the atom data padding.
+ * Returns the block size requirement
*
- * \param[in] pmeGpu The PME GPU structure.
- * \returns Number of atoms in a single GPU atom data chunk.
+ * The GPU version of PME requires that the coordinates array have a
+ * size divisible by the returned number.
+ *
+ * \returns Number of atoms in a single GPU atom data chunk, which
+ * determines the minimum divisor of the size of the memory allocated.
*/
-int pme_gpu_get_atom_data_alignment(const PmeGpu* pmeGpu);
+int pme_gpu_get_atom_data_block_size();
/*! \libinternal \brief
* Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
// forwarding from pme_grid.h, used for spline computation table sizes only
"-Dc_pmeMaxUnitcellShift=%f "
// forwarding PME behavior constants from pme_gpu_constants.h
- "-Dc_usePadding=%d "
"-Dc_skipNeutralAtoms=%d "
"-Dc_virialAndEnergyCount=%d "
// forwarding kernel work sizes
// decomposition parameter placeholders
"-DwrapX=true -DwrapY=true ",
warpSize, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom,
- static_cast<float>(c_pmeMaxUnitcellShift), static_cast<int>(c_usePadding),
- static_cast<int>(c_skipNeutralAtoms), c_virialAndEnergyCount, spreadWorkGroupSize,
- solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ);
+ static_cast<float>(c_pmeMaxUnitcellShift), static_cast<int>(c_skipNeutralAtoms),
+ c_virialAndEnergyCount, spreadWorkGroupSize, solveMaxWorkGroupSize,
+ gatherWorkGroupSize, DIM, XX, YY, ZZ);
try
{
/* TODO when we have a proper MPI-aware logging module,
PmeGpuStaging staging;
-/*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment.
+/*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataBlockSize.
- * Used for kernel scheduling.
- * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying.
- * TODO: this and the next member represent a memory allocation/padding properties -
- * what a container type should do ideally.
- */
- int nAtomsPadded;
- /*! \brief Number of local atoms, padded to be divisible by c_pmeAtomDataAlignment
- * if c_usePadding is true.
+ *
* Used only as a basic size for almost all the atom data allocations
* (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP).
- * This should be the same as (c_usePadding ? nAtomsPadded : kernelParams.atoms.nAtoms).
* kernelParams.atoms.nAtoms is the actual atom count to be used for most data copying.
+ *
+ * TODO: memory allocation/padding properties should be handled by
+ * something like a container
*/
int nAtomsAlloc;
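With nAtomsPadded gone, every atom-data allocation is expressed directly in terms of nAtomsAlloc; for instance, the spline data sizing elsewhere in this patch reduces to:

    // One theta/dtheta value per dimension, per interpolation-order index,
    // per (padded) atom.
    const int newSplineDataSize = DIM * order * pmeGpu->nAtomsAlloc;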
"Device context can not be nullptr when building GPU propagator data object.");
// TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
// This should be made safer.
- stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
- &deviceStream, *deviceContext, GpuApiCallBehavior::Async,
- pme_gpu_get_padding_size(pme), wcycle);
+ stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(&deviceStream, *deviceContext,
+ GpuApiCallBehavior::Async,
+ pme_gpu_get_block_size(pme), wcycle);
}
clear_nrnb(mynrnb);
/*! \brief
* General purpose function for loading atom-related data from global to shared memory.
*
- * \param[in] kernelParams Input PME GPU data in constant memory.
* \param[out] sm_destination Local memory array for output.
* \param[in] gm_source Global memory array for input.
* \param[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
*
*/
-inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelParams,
- __local float* __restrict__ sm_destination,
+inline void pme_gpu_stage_atom_data(__local float* __restrict__ sm_destination,
__global const float* __restrict__ gm_source,
const int dataCountPerAtom)
{
const int localIndex = threadLocalIndex;
const int globalIndexBase = (int)get_group_id(XX) * atomsPerBlock * dataCountPerAtom;
const int globalIndex = globalIndexBase + localIndex;
- const int globalCheck =
- pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
- if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+ if (localIndex < atomsPerBlock * dataCountPerAtom)
{
assert(isfinite(float(gm_source[globalIndex])));
sm_destination[localIndex] = gm_source[globalIndex];
/* Atom index w.r.t. block/shared memory */
const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
- /* Atom index w.r.t. global memory */
- const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
/* Spline contribution index in one dimension */
const int orderIndex = threadWarpIndex / (atomsPerWarp * DIM);
/* Dimension index */
# define SPLINE_DATA(i) (*SPLINE_DATA_PTR(i))
const int localCheck = (dimIndex < DIM) && (orderIndex < (PME_GPU_PARALLEL_SPLINE ? order : 1));
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
- if (localCheck && globalCheck)
+ if (localCheck)
{
/* Indices interpolation */
if (orderIndex == 0)
const int offy = 0;
const int offz = 0;
- const int atomIndexLocal = get_local_id(ZZ);
- const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+ const int atomIndexLocal = get_local_id(ZZ);
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
- if (chargeCheck & globalCheck)
+ if (chargeCheck)
{
// Spline Y/Z coordinates
const int ithy = get_local_id(YY);
const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
/* Staging coefficients/charges for both spline and spread */
- pme_gpu_stage_atom_data(kernelParams, sm_coefficients, gm_coefficients, 1);
+ pme_gpu_stage_atom_data(sm_coefficients, gm_coefficients, 1);
if (computeSplines)
{
/* Staging coordinates */
- pme_gpu_stage_atom_data(kernelParams, sm_coordinates, gm_coordinates, DIM);
+ pme_gpu_stage_atom_data(sm_coordinates, gm_coordinates, DIM);
barrier(CLK_LOCAL_MEM_FENCE);
calculate_splines(kernelParams, atomIndexOffset, sm_coordinates, sm_coefficients, sm_theta,
* as in after running the spline kernel)
*/
/* Spline data - only thetas (dthetas will only be needed in gather) */
- pme_gpu_stage_atom_data(kernelParams, sm_theta, gm_theta, DIM * order);
+ pme_gpu_stage_atom_data(sm_theta, gm_theta, DIM * order);
/* Gridline indices - they're actually int and not float, but C99 is angry about overloads */
- pme_gpu_stage_atom_data(kernelParams, (__local float*)sm_gridlineIndices,
+ pme_gpu_stage_atom_data((__local float*)sm_gridlineIndices,
(__global const float*)gm_gridlineIndices, DIM);
barrier(CLK_LOCAL_MEM_FENCE);
* \tparam[in] useOrderThreads Whether we should use order threads per atom (order*order used if false).
*
* \param[in] kernelParams Input PME CUDA data in constant memory.
- * \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory.
* \param[in] atomCharge Atom charge/coefficient of atom processed by thread.
* \param[in] sm_gridlineIndices Atom gridline indices in the shared memory.
* \param[in] sm_theta Atom spline values in the shared memory.
*/
template<const int order, const bool wrapX, const bool wrapY, const bool useOrderThreads>
__device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams,
- int atomIndexOffset,
const float* atomCharge,
const int* __restrict__ sm_gridlineIndices,
const float* __restrict__ sm_theta)
const int offx = 0, offy = 0, offz = 0; // unused for now
- const int atomIndexLocal = threadIdx.z;
- const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+ const int atomIndexLocal = threadIdx.z;
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
const int chargeCheck = pme_gpu_check_atom_charge(*atomCharge);
- if (chargeCheck & globalCheck)
+ if (chargeCheck)
{
// Spline Z coordinates
const int ithz = threadIdx.x;
if (c_useAtomDataPrefetch)
{
__shared__ float sm_coefficients[atomsPerBlock];
- pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients,
- kernelParams.atoms.d_coefficients);
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, kernelParams.atoms.d_coefficients);
__syncthreads();
atomCharge = sm_coefficients[atomIndexLocal];
}
__shared__ float3 sm_coordinates[atomsPerBlock];
/* Staging coordinates */
- pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
+ pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
__syncthreads();
atomX = sm_coordinates[atomIndexLocal];
}
* as in after running the spline kernel)
*/
/* Spline data - only thetas (dthetas will only be needed in gather) */
- pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(kernelParams, sm_theta,
- kernelParams.atoms.d_theta);
+ pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(sm_theta, kernelParams.atoms.d_theta);
/* Gridline indices */
- pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(kernelParams, sm_gridlineIndices,
+ pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(sm_gridlineIndices,
kernelParams.atoms.d_gridlineIndices);
__syncthreads();
/* Spreading */
if (spreadCharges)
{
- spread_charges<order, wrapX, wrapY, useOrderThreads>(
- kernelParams, atomIndexOffset, &atomCharge, sm_gridlineIndices, sm_theta);
+ spread_charges<order, wrapX, wrapY, useOrderThreads>(kernelParams, &atomCharge,
+ sm_gridlineIndices, sm_theta);
}
}
// restrict one from using other constructor here.
return std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(&pme), deviceContext,
GpuApiCallBehavior::Sync,
- pme_gpu_get_padding_size(&pme), nullptr);
+ pme_gpu_get_block_size(&pme), nullptr);
}
//! PME initialization with atom data
fr->nbv->gpu_nbv != nullptr
? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
: nullptr;
- const int paddingSize = pme_gpu_get_padding_size(fr->pmedata);
GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
? GpuApiCallBehavior::Async
: GpuApiCallBehavior::Sync;
"Device context can not be nullptr when building GPU propagator data object.");
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
pmeStream, localStream, nonLocalStream, *deviceContext, transferKind,
- paddingSize, wcycle);
+ pme_gpu_get_block_size(fr->pmedata), wcycle);
fr->stateGpu = stateGpu.get();
}
* \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
* \param[in] deviceContext Device context, nullptr allowed.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] allocationBlockSizeDivisor Determines the padding size for the coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
StatePropagatorDataGpu(const DeviceStream* pmeStream,
const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle);
/*! \brief Constructor to use in PME-only rank and in tests.
* \param[in] pmeStream Device PME stream, nullptr is not allowed.
* \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] allocationBlockSizeDivisor Determines the padding size for the coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
StatePropagatorDataGpu(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle);
//! Move constructor
const DeviceStream* /* nonLocalStream */,
const DeviceContext& /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
- int /* paddingSize */,
+ int /* allocationBlockSizeDivisor */,
gmx_wallcycle* /* wcycle */) :
impl_(nullptr)
{
StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream */,
const DeviceContext& /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
- int /* paddingSize */,
+ int /* allocationBlockSizeDivisor */,
gmx_wallcycle* /* wcycle */) :
impl_(nullptr)
{
* \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
* \param[in] deviceContext Device context, nullptr allowed.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] allocationBlockSizeDivisor Determines the padding size for the coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
Impl(const DeviceStream* pmeStream,
const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle);
/*! \brief Constructor to use in PME-only rank and in tests.
* \param[in] pmeStream Device PME stream, nullptr is not allowed.
* \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
- * \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] allocationBlockSizeDivisor Determines the padding size for coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
Impl(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle);
~Impl();
const DeviceContext& deviceContext_;
//! Default GPU calls behavior
GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async;
- //! Padding size for the coordinates buffer
- int paddingSize_ = 0;
+ //! Required minimum divisor of the allocation size of the coordinates buffer
+ int allocationBlockSizeDivisor_ = 0;
//! Number of local atoms
int numAtomsLocal_ = -1;
const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle) :
deviceContext_(deviceContext),
transferKind_(transferKind),
- paddingSize_(paddingSize),
+ allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
wcycle_(wcycle)
{
static_assert(GMX_GPU != GMX_GPU_NONE,
StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle) :
deviceContext_(deviceContext),
transferKind_(transferKind),
- paddingSize_(paddingSize),
+ allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
wcycle_(wcycle)
{
static_assert(GMX_GPU != GMX_GPU_NONE,
numAtomsAll_ = numAtomsAll;
int numAtomsPadded;
- if (paddingSize_ > 0)
+ if (allocationBlockSizeDivisor_ > 0)
{
- numAtomsPadded = ((numAtomsAll_ + paddingSize_ - 1) / paddingSize_) * paddingSize_;
+ numAtomsPadded = ((numAtomsAll_ + allocationBlockSizeDivisor_ - 1) / allocationBlockSizeDivisor_)
+ * allocationBlockSizeDivisor_;
}
else
{
const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle) :
- impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle))
+ impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
{
}
StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize,
+ int allocationBlockSizeDivisor,
gmx_wallcycle* wcycle) :
- impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle))
+ impl_(new Impl(pmeStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
{
}
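Taken together, a caller constructing the propagator on a PME-only rank now threads the divisor through as shown earlier in this patch; abbreviated here for reference:

    stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(&deviceStream, *deviceContext,
                                                             GpuApiCallBehavior::Async,
                                                             pme_gpu_get_block_size(pme), wcycle);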