endif()
set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
+set_property(GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES)
set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE)
_gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN})
endfunction ()
+# TODO Reconsider this, as the CUDA driver API is probably a simpler
+# approach, at least for the build system. See Redmine #2530
+function (gmx_compile_cpp_as_cuda)
+ _gmx_add_files_to_property(GMX_LIBGROMACS_GPU_IMPL_SOURCES ${ARGN})
+endfunction ()
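+# A hypothetical call site (sketch only; the file name is illustrative,
+# real call sites live in the per-module CMakeLists.txt files):
+#   gmx_compile_cpp_as_cuda(pme-gpu-internal.cpp)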
+
function (gmx_install_headers)
if (NOT GMX_BUILD_MDRUN_ONLY)
file(RELATIVE_PATH _dest ${PROJECT_SOURCE_DIR}/src ${CMAKE_CURRENT_LIST_DIR})
REMOTE_HASH)
list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE})
+# Mark some shared GPU implementation files to compile with CUDA if needed
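+# (With FindCUDA, setting CUDA_SOURCE_PROPERTY_FORMAT to OBJ makes
+# cuda_add_library() hand these .cpp files to nvcc for compilation.)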
+if (GMX_USE_CUDA)
+ get_property(LIBGROMACS_GPU_IMPL_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES)
+ set_source_files_properties(${LIBGROMACS_GPU_IMPL_SOURCES} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+endif()
+
# set up CUDA compilation with clang
if (GMX_CLANG_CUDA)
foreach (_file ${LIBGROMACS_SOURCES})
get_filename_component(_ext ${_file} EXT)
- if (${_ext} STREQUAL ".cu")
+ get_source_file_property(_cuda_source_format ${_file} CUDA_SOURCE_PROPERTY_FORMAT)
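+ # The property is "NOTFOUND" (falsy in CMake) unless the file was marked
+ # via gmx_compile_cpp_as_cuda(), so plain C++ files are skipped here.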
+ if (${_ext} STREQUAL ".cu" OR _cuda_source_format)
gmx_compile_cuda_file_with_clang(${_file})
endif()
endforeach()
#ifndef GMX_EWALD_PME_GPU_TYPES_H
#define GMX_EWALD_PME_GPU_TYPES_H
+#include "gromacs/gpu_utils/devicebuffer.h"
+
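+/* A rough sketch of why this one include suffices for all builds (an
+ * assumption inferred from the DeviceBuffer usage below, not a verbatim
+ * copy of devicebuffer.h): under CUDA a DeviceBuffer<T> is a plain device
+ * pointer, while the OpenCL build wraps a typed cl_mem handle, e.g.
+ *   #if GMX_GPU == GMX_GPU_CUDA
+ *   template<typename ValueType> using DeviceBuffer = ValueType *;
+ *   #else // OpenCL: TypedClMemory<ValueType> wraps a cl_mem
+ *   template<typename ValueType> using DeviceBuffer = TypedClMemory<ValueType>;
+ *   #endif
+ * so the structures below stay GPU-framework agnostic.
+ */
+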
/* What follows is all the PME GPU function arguments,
* sorted into several device-side structures depending on the update rate.
* This is GPU agnostic (float3 replaced by float[3], etc.).
float elFactor;
/*! \brief Virial and energy GPU array. Size is c_virialAndEnergyCount (7) floats.
* The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */
- float *d_virialAndEnergy;
+ DeviceBuffer<float> d_virialAndEnergy;
};
/*! \internal \brief
/*! \brief Fourier grid dimensions (padded). This counts the complex numbers! */
int complexGridSizePadded[DIM];
- /* Grid pointers */
+ /* Grid arrays */
/*! \brief Real space grid. */
- float *d_realGrid;
- /*! \brief Complex grid - used in FFT/solve. If inplace cuFFT is used, then it is the same pointer as realGrid. */
- float *d_fourierGrid;
+ DeviceBuffer<float> d_realGrid;
+ /*! \brief Complex grid - used in FFT/solve. If in-place cuFFT is used, then it is the same handle as realGrid. */
+ DeviceBuffer<float> d_fourierGrid;
/*! \brief Ewald solving factor = (M_PI / pme->ewaldcoeff_q)^2 */
float ewaldFactor;
/*! \brief Grid spline values as in pme->bsp_mod
* (laid out sequentially (XXX....XYYY......YZZZ.....Z))
*/
- float *d_splineModuli;
+ DeviceBuffer<float> d_splineModuli;
/*! \brief Offsets for X/Y/Z components of d_splineModuli */
int splineValuesOffset[DIM];
/*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */
- float *d_fractShiftsTable;
+ DeviceBuffer<float> d_fractShiftsTable;
/*! \brief Gridline indices lookup table
* (modulo lookup table as in pme->nnx/nny/nnz, laid out sequentially (XXX....XYYY......YZZZ.....Z)) */
- int *d_gridlineIndicesTable;
+ DeviceBuffer<int> d_gridlineIndicesTable;
/*! \brief Offsets for X/Y/Z components of d_fractShiftsTable and d_gridlineIndicesTable */
- int tablesOffsets[DIM];
+ int tablesOffsets[DIM];
};
/*! \internal \brief
{
/*! \brief Number of local atoms */
int nAtoms;
- /*! \brief Pointer to the global GPU memory with input rvec atom coordinates.
+ /*! \brief Global GPU memory array handle with input rvec atom coordinates.
* The coordinates themselves change and need to be copied to the GPU for every PME computation,
* but reallocation happens only at DD.
*/
- float *d_coordinates;
- /*! \brief Pointer to the global GPU memory with input atom charges.
+ DeviceBuffer<float> d_coordinates;
+ /*! \brief Global GPU memory array handle with input atom charges.
 * The charges only need to be reallocated and copied to the GPU at each DD step.
*/
- float *d_coefficients;
- /*! \brief Pointer to the global GPU memory with input/output rvec atom forces.
+ DeviceBuffer<float> d_coefficients;
+ /*! \brief Global GPU memory array handle with input/output rvec atom forces.
* The forces change and need to be copied from (and possibly to) the GPU for every PME computation,
* but reallocation happens only at DD.
*/
- float *d_forces;
- /*! \brief Pointer to the global GPU memory with ivec atom gridline indices.
+ DeviceBuffer<float> d_forces;
+ /*! \brief Global GPU memory array handle with ivec atom gridline indices.
* Computed on GPU in the spline calculation part.
*/
- int *d_gridlineIndices;
+ DeviceBuffer<int> d_gridlineIndices;
/* B-spline parameters are computed entirely on GPU for every PME computation, not copied.
* Unless we want to try something like GPU spread + CPU gather?
*/
- /*! \brief Pointer to the global GPU memory with B-spline values */
- float *d_theta;
- /*! \brief Pointer to the global GPU memory with B-spline derivative values */
- float *d_dtheta;
+ /*! \brief Global GPU memory array handle with B-spline values */
+ DeviceBuffer<float> d_theta;
+ /*! \brief Global GPU memory array handle with B-spline derivative values */
+ DeviceBuffer<float> d_dtheta;
};
/*! \internal \brief
#include "pme.cuh"
#include "pme-3dfft.cuh"
+#include "pme-gpu-program-impl.h"
#include "pme-grid.h"
int pme_gpu_get_atom_data_alignment(const PmeGpu *pmeGpu)
void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu)
{
const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
- cudaError_t stat = cudaMalloc((void **)&pmeGpu->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize);
- CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial");
+ allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount, pmeGpu->archSpecific->context);
pmalloc((void **)&pmeGpu->staging.h_virialAndEnergy, energyAndVirialSize);
}
void pme_gpu_free_energy_virial(PmeGpu *pmeGpu)
{
- cudaError_t stat = cudaFree(pmeGpu->kernelParams->constants.d_virialAndEnergy);
- CU_RET_ERR(stat, "cudaFree failed on PME energy and virial");
- pmeGpu->kernelParams->constants.d_virialAndEnergy = nullptr;
+ freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy);
pfree(pmeGpu->staging.h_virialAndEnergy);
pmeGpu->staging.h_virialAndEnergy = nullptr;
}
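+// A sketch of the assumed CUDA-side behavior of the helpers used above
+// (based on gpu_utils/devicebuffer.cuh; hedged, not verbatim):
+//   allocateDeviceBuffer(&buf, numValues, context); // cudaMalloc of numValues * sizeof(T); the context is a no-op in CUDA builds
+//   freeDeviceBuffer(&buf);                         // cudaFree if the handle is non-null, then reset it to nullptr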
pmeGpu->kernelParams->grid.realGridSize[ZZ];
const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
- &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+ &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->context);
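+ // Assumed signature (hedged): reallocateDeviceBuffer(&buffer, requestedNumValues,
+ // &currentNumValues, &currentMaxNumValues, context) - it only grows the device
+ // allocation when the request exceeds the current capacity. Passing the context
+ // instead of the stream presumably lets the same call serve OpenCL, where buffer
+ // creation needs a cl_context rather than a queue.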
if (shouldRealloc)
{
/* Reallocate the host buffer */
const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
- &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+ &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->context);
pmeGpu->staging.h_forces.reserve(pmeGpu->nAtomsAlloc);
pmeGpu->staging.h_forces.resize(pmeGpu->kernelParams->atoms.nAtoms);
}
const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
- &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+ &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->context);
if (c_usePadding)
{
const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
- &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->pmeStream);
+ &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->context);
copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
0, pmeGpu->kernelParams->atoms.nAtoms,
pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
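+ // Assumed signature (hedged): copyToDeviceBuffer(&buffer, hostPtr, startingOffset,
+ // numValues, stream, transferKind, timingEvent) - an H2D copy on the given stream,
+ // synchronous or asynchronous depending on transferKind; nullptr skips timing.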
int currentSizeTemp = pmeGpu->archSpecific->splineDataSize;
int currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
- &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->pmeStream);
+ &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->context);
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
- &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->pmeStream);
+ &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->context);
// the host side reallocation
if (shouldRealloc)
{
const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
- &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+ &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context);
pfree(pmeGpu->staging.h_gridlineIndices);
pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
}
{
/* 2 separate grids */
reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
- &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+ &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->context);
reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
- &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+ &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
}
else
{
/* A single buffer so that any grid will fit */
const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newGridsSize,
- &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+ &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
kernelParamsPtr->grid.d_fourierGrid = kernelParamsPtr->grid.d_realGrid;
pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
// the size might get used later for copying the grid
// TODO: Consider turning on by default when we can detect nr of streams.
pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+ // TODO: this is just a convenient reuse, because programHandle_ is currently in charge of creating the context
+ pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context;
+
// Prepare to use the device that this PME task was assigned earlier.
CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");