From 379b2954fd3efbd1b2c724964932c2ef03078939 Mon Sep 17 00:00:00 2001 From: Aleksei Iupinov Date: Mon, 26 Sep 2016 12:53:29 +0200 Subject: [PATCH] PME GPU/CUDA data framework. This patch adds most of the PME GPU data handling routines (pme.h and pme-gpu.cpp / pme.cu(h) / pme-gpu-internal.h/cpp), data structures used on host and on device (pme-gpu-types.h / pme.cuh). The PME GPU kernels and their host code will live in separate files. There is also cuFFT code (pme-3dfft.cu(h)) and optional CUDA timing events (pme-timings.cu(h)) included. Currently this is a dead code, PME GPU is not actually getting initialized as the corresponding enum is always set for CPU path, which is asserted. Change-Id: I9b03f54a2412885e25ea27f17bf2dca1b01f9f78 --- src/gromacs/CMakeLists.txt | 1 + src/gromacs/ewald/CMakeLists.txt | 4 + src/gromacs/ewald/pme-3dfft.cu | 146 +++++ src/gromacs/ewald/pme-3dfft.cuh | 74 +++ src/gromacs/ewald/pme-gpu-internal.cpp | 397 +++++++++++++ src/gromacs/ewald/pme-gpu-internal.h | 551 ++++++++++++++++++ src/gromacs/ewald/pme-gpu-types.h | 356 +++++++++++ src/gromacs/ewald/pme-gpu.cpp | 114 ++++ src/gromacs/ewald/pme-grid.cpp | 2 + src/gromacs/ewald/pme-grid.h | 13 +- src/gromacs/ewald/pme-internal.h | 108 +++- src/gromacs/ewald/pme-load-balancing.cpp | 11 +- src/gromacs/ewald/pme-only.cpp | 18 +- src/gromacs/ewald/pme-pp.cpp | 6 +- src/gromacs/ewald/pme-timings.cu | 112 ++++ src/gromacs/ewald/pme-timings.cuh | 66 +++ src/gromacs/ewald/pme.cpp | 60 +- src/gromacs/ewald/pme.cu | 515 ++++++++++++++++ src/gromacs/ewald/pme.cuh | 243 ++++++++ src/gromacs/ewald/pme.h | 75 ++- src/gromacs/ewald/tests/pmetestcommon.cpp | 11 +- src/gromacs/gpu_utils/gpuregiontimer.h | 20 +- src/gromacs/gpu_utils/gpuregiontimer_ocl.h | 3 +- src/gromacs/mdlib/mdsetup.cpp | 8 + src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu | 18 +- .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 7 +- .../mdlib/nbnxn_cuda/nbnxn_cuda_types.h | 6 +- src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h | 5 +- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp | 18 +- .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp | 7 +- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h | 20 +- src/gromacs/mdlib/sim_util.cpp | 13 +- src/gromacs/mdlib/sim_util.h | 1 + src/gromacs/timing/gpu_timing.h | 52 +- src/gromacs/timing/wallcycle.cpp | 76 ++- src/gromacs/timing/wallcycle.h | 3 +- src/gromacs/timing/wallcyclereporting.h | 8 +- src/programs/mdrun/md.cpp | 14 +- src/programs/mdrun/runner.cpp | 6 +- 39 files changed, 3026 insertions(+), 142 deletions(-) create mode 100644 src/gromacs/ewald/pme-3dfft.cu create mode 100644 src/gromacs/ewald/pme-3dfft.cuh create mode 100644 src/gromacs/ewald/pme-gpu-internal.cpp create mode 100644 src/gromacs/ewald/pme-gpu-internal.h create mode 100644 src/gromacs/ewald/pme-gpu-types.h create mode 100644 src/gromacs/ewald/pme-gpu.cpp create mode 100644 src/gromacs/ewald/pme-timings.cu create mode 100644 src/gromacs/ewald/pme-timings.cuh create mode 100644 src/gromacs/ewald/pme.cu create mode 100644 src/gromacs/ewald/pme.cuh diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt index ba8f6d2a55..333405f21e 100644 --- a/src/gromacs/CMakeLists.txt +++ b/src/gromacs/CMakeLists.txt @@ -174,6 +174,7 @@ if (GMX_USE_CUDA) else() add_library(libgromacs ${LIBGROMACS_SOURCES}) endif() + target_link_libraries(libgromacs PRIVATE ${CUDA_CUFFT_LIBRARIES}) else() add_library(libgromacs ${LIBGROMACS_SOURCES}) endif() diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt index 8d1904c469..05210bda31 100644 --- 
a/src/gromacs/ewald/CMakeLists.txt +++ b/src/gromacs/ewald/CMakeLists.txt @@ -34,6 +34,10 @@ file(GLOB EWALD_SOURCES *.cpp) set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${EWALD_SOURCES} PARENT_SCOPE) +if (GMX_USE_CUDA) + file(GLOB EWALD_CUDA_SOURCES *.cu) + gmx_add_libgromacs_sources(${EWALD_CUDA_SOURCES}) +endif() if (BUILD_TESTING) add_subdirectory(tests) diff --git a/src/gromacs/ewald/pme-3dfft.cu b/src/gromacs/ewald/pme-3dfft.cu new file mode 100644 index 0000000000..38f2e75cb0 --- /dev/null +++ b/src/gromacs/ewald/pme-3dfft.cu @@ -0,0 +1,146 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Implements CUDA FFT routines for PME GPU. 
+ * + * \author Aleksei Iupinov + */ + +#include "gmxpre.h" + +#include "pme-3dfft.cuh" + +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/gmxassert.h" + +#include "pme.cuh" +#include "pme-gpu-types.h" + +static void handleCufftError(cufftResult_t status, const char *msg) +{ + if (status != CUFFT_SUCCESS) + { + gmx_fatal(FARGS, "%s (error code %d)\n", msg, status); + } +} + +GpuParallel3dFft::GpuParallel3dFft(const pme_gpu_t *pmeGPU) +{ + const pme_gpu_cuda_kernel_params_t *kernelParamsPtr = pmeGPU->kernelParams.get(); + ivec realGridSize, realGridSizePadded, complexGridSizePadded; + for (int i = 0; i < DIM; i++) + { + realGridSize[i] = kernelParamsPtr->grid.realGridSize[i]; + realGridSizePadded[i] = kernelParamsPtr->grid.realGridSizePadded[i]; + complexGridSizePadded[i] = kernelParamsPtr->grid.complexGridSizePadded[i]; + } + + GMX_RELEASE_ASSERT(!pme_gpu_uses_dd(pmeGPU), "FFT decomposition not implemented"); + + const int complexGridSizePaddedTotal = complexGridSizePadded[XX] * complexGridSizePadded[YY] * complexGridSizePadded[ZZ]; + const int realGridSizePaddedTotal = realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ]; + + realGrid_ = (cufftReal *)kernelParamsPtr->grid.d_realGrid; + GMX_RELEASE_ASSERT(realGrid_, "Bad (null) input real-space grid"); + complexGrid_ = (cufftComplex *)kernelParamsPtr->grid.d_fourierGrid; + GMX_RELEASE_ASSERT(complexGrid_, "Bad (null) input complex grid"); + + cufftResult_t result; + /* Commented code for a simple 3D grid with no padding */ + /* + result = cufftPlan3d(&planR2C_, realGridSize[XX], realGridSize[YY], realGridSize[ZZ], CUFFT_R2C); + handleCufftError(result, "cufftPlan3d R2C plan failure"); + + result = cufftPlan3d(&planC2R_, realGridSize[XX], realGridSize[YY], realGridSize[ZZ], CUFFT_C2R); + handleCufftError(result, "cufftPlan3d C2R plan failure"); + */ + + const int rank = 3, batch = 1; + result = cufftPlanMany(&planR2C_, rank, realGridSize, + realGridSizePadded, 1, realGridSizePaddedTotal, + complexGridSizePadded, 1, complexGridSizePaddedTotal, + CUFFT_R2C, + batch); + handleCufftError(result, "cufftPlanMany R2C plan failure"); + + result = cufftPlanMany(&planC2R_, rank, realGridSize, + complexGridSizePadded, 1, complexGridSizePaddedTotal, + realGridSizePadded, 1, realGridSizePaddedTotal, + CUFFT_C2R, + batch); + handleCufftError(result, "cufftPlanMany C2R plan failure"); + + cudaStream_t stream = pmeGPU->archSpecific->pmeStream; + GMX_RELEASE_ASSERT(stream, "Using the default CUDA stream for PME cuFFT"); + + result = cufftSetStream(planR2C_, stream); + handleCufftError(result, "cufftSetStream R2C failure"); + + result = cufftSetStream(planC2R_, stream); + handleCufftError(result, "cufftSetStream C2R failure"); +} + +GpuParallel3dFft::~GpuParallel3dFft() +{ + cufftResult_t result; + result = cufftDestroy(planR2C_); + handleCufftError(result, "cufftDestroy R2C failure"); + result = cufftDestroy(planC2R_); + handleCufftError(result, "cufftDestroy C2R failure"); +} + +void GpuParallel3dFft::perform3dFft(gmx_fft_direction dir) +{ + cufftResult_t result; + if (dir == GMX_FFT_REAL_TO_COMPLEX) + { + result = cufftExecR2C(planR2C_, realGrid_, complexGrid_); + handleCufftError(result, "cuFFT R2C execution failure"); + } + else + { + result = cufftExecC2R(planC2R_, complexGrid_, realGrid_); + handleCufftError(result, "cuFFT C2R execution failure"); + } +} + +inline void pme_gpu_3dfft(const pme_gpu_t *pmeGPU, gmx_fft_direction dir, int grid_index) +{ + int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? 
gtPME_FFT_R2C : gtPME_FFT_C2R; + pme_gpu_start_timing(pmeGPU, timerId); + pmeGPU->archSpecific->fftSetup[grid_index]->perform3dFft(dir); + pme_gpu_stop_timing(pmeGPU, timerId); +} diff --git a/src/gromacs/ewald/pme-3dfft.cuh b/src/gromacs/ewald/pme-3dfft.cuh new file mode 100644 index 0000000000..775c3241d3 --- /dev/null +++ b/src/gromacs/ewald/pme-3dfft.cuh @@ -0,0 +1,74 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \libinternal \file + * \brief Defines the CUDA 3D-FFT functions for PME. + * + * \author Aleksei Iupinov + */ + +#ifndef GMX_EWALD_PME_3DFFT_CUH +#define GMX_EWALD_PME_3DFFT_CUH + +#include // for the cufft types + +#include "gromacs/fft/fft.h" // for the enum gmx_fft_direction + +struct pme_gpu_t; + +/*! \brief \internal A 3D FFT class for performing R2C/C2R transforms + * \todo Make this class actually parallel over multiple GPUs + */ +class GpuParallel3dFft +{ + cufftHandle planR2C_; + cufftHandle planC2R_; + cufftReal *realGrid_; + cufftComplex *complexGrid_; + + public: + /*! \brief + * Constructs CUDA FFT plans for performing 3D FFT on a PME grid. + * + * \param[in] pmeGPU The PME GPU structure. + */ + GpuParallel3dFft(const pme_gpu_t *pmeGPU); + /*! \brief Destroys CUDA FFT plans. */ + ~GpuParallel3dFft(); + /*! \brief Performs the FFT transform in given direction */ + void perform3dFft(gmx_fft_direction dir); +}; + +#endif diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp new file mode 100644 index 0000000000..3349b63b79 --- /dev/null +++ b/src/gromacs/ewald/pme-gpu-internal.cpp @@ -0,0 +1,397 @@ +/* + * This file is part of the GROMACS molecular simulation package. 
+ * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * + * \brief This file contains internal function implementations + * for performing the PME calculations on GPU. + * + * \author Aleksei Iupinov + * \ingroup module_ewald + */ + +#include "gmxpre.h" + +#include "pme-gpu-internal.h" + +#include "config.h" + +#include +#include + +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/math/invertmatrix.h" +#include "gromacs/math/units.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/stringutil.h" + +#include "pme-grid.h" +#include "pme-internal.h" + +/*! \internal \brief + * Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns The pointer to the kernel parameters. 
+ */ +static pme_gpu_kernel_params_base_t *pme_gpu_get_kernel_params_base_ptr(const pme_gpu_t *pmeGPU) +{ + // reinterpret_cast is needed because the derived CUDA structure is not known in this file + auto *kernelParamsPtr = reinterpret_cast<pme_gpu_kernel_params_base_t *>(pmeGPU->kernelParams.get()); + return kernelParamsPtr; +} + +void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial) +{ + GMX_ASSERT(energy, "Invalid energy output pointer in PME GPU"); + unsigned int j = 0; + virial[XX][XX] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + virial[YY][YY] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + virial[ZZ][ZZ] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + virial[XX][YY] = virial[YY][XX] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + virial[XX][ZZ] = virial[ZZ][XX] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + virial[YY][ZZ] = virial[ZZ][YY] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++]; + *energy = 0.5f * pmeGPU->staging.h_virialAndEnergy[j++]; +} + +void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box) +{ + auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU); + kernelParamsPtr->step.boxVolume = box[XX][XX] * box[YY][YY] * box[ZZ][ZZ]; + GMX_ASSERT(kernelParamsPtr->step.boxVolume != 0.0f, "Zero volume of the unit cell"); + +#if GMX_DOUBLE + GMX_THROW(gmx::NotImplementedError("PME is implemented for single-precision only on GPU")); +#else + matrix recipBox; + gmx::invertBoxMatrix(box, recipBox); + /* The GPU recipBox is transposed as compared to the CPU recipBox. + * Spread uses matrix columns (while solve and gather use rows). + * There is no particular reason for this; it might be further rethought/optimized for better access patterns. + */ + const real newRecipBox[DIM][DIM] = + { + {recipBox[XX][XX], recipBox[YY][XX], recipBox[ZZ][XX]}, + { 0.0, recipBox[YY][YY], recipBox[ZZ][YY]}, + { 0.0, 0.0, recipBox[ZZ][ZZ]} + }; + memcpy(kernelParamsPtr->step.recipBox, newRecipBox, sizeof(matrix)); +#endif +} + +void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates) +{ + pme_gpu_copy_input_coordinates(pmeGPU, h_coordinates); + if (needToUpdateBox) + { + pme_gpu_update_input_box(pmeGPU, box); + } +} + +/*! \brief \libinternal + * The PME GPU reinitialization function that is called both at the end of any MD step and on any load balancing step. + * + * \param[in] pmeGPU The PME GPU structure. + */ +static void pme_gpu_reinit_step(const pme_gpu_t *pmeGPU) +{ + pme_gpu_clear_grids(pmeGPU); + pme_gpu_clear_energy_virial(pmeGPU); +} + +void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcF, const bool bCalcEnerVir) +{ + /* Needed for copy back as well as timing events */ + pme_gpu_synchronize(pmeGPU); + + if (bCalcF && pme_gpu_performs_gather(pmeGPU)) + { + pme_gpu_sync_output_forces(pmeGPU); + } + if (bCalcEnerVir && pme_gpu_performs_solve(pmeGPU)) + { + pme_gpu_sync_output_energy_virial(pmeGPU); + } + pme_gpu_update_timings(pmeGPU); + pme_gpu_reinit_step(pmeGPU); +} + +/*! \brief \libinternal + * (Re-)initializes all the PME GPU data related to the grid size and cut-off. + * + * \param[in] pmeGPU The PME GPU structure.
+ */ +static void pme_gpu_reinit_grids(pme_gpu_t *pmeGPU) +{ + auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU); + kernelParamsPtr->grid.ewaldFactor = (M_PI * M_PI) / (pmeGPU->common->ewaldcoeff_q * pmeGPU->common->ewaldcoeff_q); + + /* The grid size variants */ + for (int i = 0; i < DIM; i++) + { + kernelParamsPtr->grid.realGridSize[i] = pmeGPU->common->nk[i]; + kernelParamsPtr->grid.realGridSizeFP[i] = (float)kernelParamsPtr->grid.realGridSize[i]; + kernelParamsPtr->grid.realGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i]; + + // The complex grid currently uses no padding; + // if it starts to do so, then another test should be added for that + kernelParamsPtr->grid.complexGridSize[i] = kernelParamsPtr->grid.realGridSize[i]; + kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i]; + } + /* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */ + if (!pme_gpu_performs_FFT(pmeGPU)) + { + // This allows for GPU spreading grid and CPU fftgrid to have the same layout, so that we can copy the data directly + kernelParamsPtr->grid.realGridSizePadded[ZZ] = (kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2; + } + + /* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */ + kernelParamsPtr->grid.complexGridSize[ZZ] /= 2; + kernelParamsPtr->grid.complexGridSize[ZZ]++; + kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ]; + + pme_gpu_realloc_and_copy_fract_shifts(pmeGPU); + pme_gpu_realloc_and_copy_bspline_values(pmeGPU); + pme_gpu_realloc_grids(pmeGPU); + pme_gpu_reinit_3dfft(pmeGPU); +} + +/* Several GPU functions that refer to the CPU PME data live here. + * We would like to keep these away from the GPU-framework specific code for clarity, + * as well as compilation issues with MPI. + */ + +/*! \brief \libinternal + * Copies everything useful from the PME CPU to the PME GPU structure. + * The goal is to minimize interaction with the PME CPU structure in the GPU code. + * + * \param[in] pme The PME structure. + */ +static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme) +{ + /* TODO: Consider refactoring the CPU PME code to use the same structure, + * so that this function becomes 2 lines */ + pme_gpu_t *pmeGPU = pme->gpu; + pmeGPU->common->ngrids = pme->ngrids; + pmeGPU->common->epsilon_r = pme->epsilon_r; + pmeGPU->common->ewaldcoeff_q = pme->ewaldcoeff_q; + pmeGPU->common->nk[XX] = pme->nkx; + pmeGPU->common->nk[YY] = pme->nky; + pmeGPU->common->nk[ZZ] = pme->nkz; + pmeGPU->common->pmegrid_n[XX] = pme->pmegrid_nx; + pmeGPU->common->pmegrid_n[YY] = pme->pmegrid_ny; + pmeGPU->common->pmegrid_n[ZZ] = pme->pmegrid_nz; + pmeGPU->common->pme_order = pme->pme_order; + for (int i = 0; i < DIM; i++) + { + pmeGPU->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGPU->common->nk[i]); + } + const int cellCount = c_pmeNeighborUnitcellCount; + pmeGPU->common->fsh[XX].assign(pme->fshx, pme->fshx + cellCount * pme->nkx); + pmeGPU->common->fsh[YY].assign(pme->fshy, pme->fshy + cellCount * pme->nky); + pmeGPU->common->fsh[ZZ].assign(pme->fshz, pme->fshz + cellCount * pme->nkz); + pmeGPU->common->nn[XX].assign(pme->nnx, pme->nnx + cellCount * pme->nkx); + pmeGPU->common->nn[YY].assign(pme->nny, pme->nny + cellCount * pme->nky); + pmeGPU->common->nn[ZZ].assign(pme->nnz, pme->nnz + cellCount * pme->nkz); + pmeGPU->common->runMode = pme->runMode; +} + +/*! 
\brief \libinternal + * Checks whether PME with the given inputs can run on a GPU. + * + * \param[in] pme The PME structure. + * \param[out] error The error message if the input is not supported on GPU. + * \returns True if this PME input can run on a GPU, false otherwise. + */ +static bool pme_gpu_check_restrictions(const gmx_pme_t *pme, std::string *error) +{ + std::list<std::string> errorReasons; + if (pme->nnodes != 1) + { + errorReasons.push_back("PME decomposition"); + } + if (pme->pme_order != 4) + { + errorReasons.push_back("interpolation orders other than 4"); + } + if (pme->bFEP) + { + errorReasons.push_back("free energy calculations (multiple grids)"); + } + if (pme->doLJ) + { + errorReasons.push_back("Lennard-Jones PME"); + } +#if GMX_DOUBLE + { + errorReasons.push_back("double precision"); + } +#endif +#if GMX_GPU != GMX_GPU_CUDA + { + errorReasons.push_back("non-CUDA build of GROMACS"); + } +#endif + + bool inputSupported = errorReasons.empty(); + if (!inputSupported && error) + { + std::string regressionTestMarker = "PME GPU does not support"; + // this prefix is tested for in the regression tests script gmxtest.pl + *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + "."; + } + return inputSupported; +} + +/*! \libinternal \brief + * Initializes the PME GPU data at the beginning of the run. + * + * \param[in,out] pme The PME structure. + * \param[in,out] gpuInfo The GPU information structure. + * \param[in] mdlog The logger. + * \param[in] cr The communication structure. + */ +static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, + const t_commrec *cr) +{ + std::string errorString; + bool canRunOnGpu = pme_gpu_check_restrictions(pme, &errorString); + if (!canRunOnGpu) + { + GMX_THROW(gmx::NotImplementedError(errorString)); + } + + pme->gpu = new pme_gpu_t(); + pme_gpu_t *pmeGPU = pme->gpu; + pmeGPU->common = std::shared_ptr<pme_shared_t>(new pme_shared_t()); + + /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */ + /* A convenience variable. */ + pmeGPU->settings.useDecomposition = (pme->nnodes == 1); + /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout.
*/ + pmeGPU->settings.performGPUGather = true; + + // GPU initialization + init_gpu(mdlog, cr->nodeid, gpuInfo); + pmeGPU->deviceInfo = gpuInfo; + + pme_gpu_init_internal(pmeGPU); + pme_gpu_init_sync_events(pmeGPU); + pme_gpu_alloc_energy_virial(pmeGPU); + + pme_gpu_copy_common_data_from(pme); + + GMX_ASSERT(pmeGPU->common->epsilon_r != 0.0f, "PME GPU: bad electrostatic coefficient"); + + auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU); + kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGPU->common->epsilon_r; +} + +void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr) +{ + if (!pme_gpu_active(pme)) + { + return; + } + + if (!pme->gpu) + { + /* First-time initialization */ + pme_gpu_init(pme, gpuInfo, mdlog, cr); + } + else + { + /* After this call nothing in the GPU code should refer to the gmx_pme_t *pme itself - until the next pme_gpu_reinit */ + pme_gpu_copy_common_data_from(pme); + } + /* GPU FFT will only get used for a single rank.*/ + pme->gpu->settings.performGPUFFT = (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme_gpu_uses_dd(pme->gpu); + pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU); + + pme_gpu_reinit_grids(pme->gpu); + pme_gpu_reinit_step(pme->gpu); +} + +void pme_gpu_destroy(pme_gpu_t *pmeGPU) +{ + /* Free lots of data */ + pme_gpu_free_energy_virial(pmeGPU); + pme_gpu_free_bspline_values(pmeGPU); + pme_gpu_free_forces(pmeGPU); + pme_gpu_free_coordinates(pmeGPU); + pme_gpu_free_coefficients(pmeGPU); + pme_gpu_free_spline_data(pmeGPU); + pme_gpu_free_grid_indices(pmeGPU); + pme_gpu_free_fract_shifts(pmeGPU); + pme_gpu_free_grids(pmeGPU); + + pme_gpu_destroy_3dfft(pmeGPU); + pme_gpu_destroy_sync_events(pmeGPU); + + /* Free the GPU-framework specific data last */ + pme_gpu_destroy_specific(pmeGPU); + + delete pmeGPU; +} + +void pme_gpu_reinit_atoms(pme_gpu_t *pmeGPU, const int nAtoms, const real *charges) +{ + auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU); + kernelParamsPtr->atoms.nAtoms = nAtoms; + const int alignment = pme_gpu_get_atom_data_alignment(pmeGPU); + pmeGPU->nAtomsPadded = ((nAtoms + alignment - 1) / alignment) * alignment; + const int nAtomsAlloc = c_usePadding ? pmeGPU->nAtomsPadded : nAtoms; + const bool haveToRealloc = (pmeGPU->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */ + pmeGPU->nAtomsAlloc = nAtomsAlloc; + +#if GMX_DOUBLE + GMX_RELEASE_ASSERT(false, "Only single precision supported"); + GMX_UNUSED_VALUE(charges); +#else + pme_gpu_realloc_and_copy_input_coefficients(pmeGPU, reinterpret_cast(charges)); + /* Could also be checked for haveToRealloc, but the copy always needs to be performed */ +#endif + + if (haveToRealloc) + { + pme_gpu_realloc_coordinates(pmeGPU); + pme_gpu_realloc_forces(pmeGPU); + pme_gpu_realloc_spline_data(pmeGPU); + pme_gpu_realloc_grid_indices(pmeGPU); + } +} diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h new file mode 100644 index 0000000000..03fe8de852 --- /dev/null +++ b/src/gromacs/ewald/pme-gpu-internal.h @@ -0,0 +1,551 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. 
+ * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * + * \brief This file contains internal function definitions for performing the PME calculations on GPU. + * These are not meant to be exposed outside of the PME GPU code. + * As of now, their bodies are still in the common pme-gpu.cpp files. + * + * \author Aleksei Iupinov + * \ingroup module_ewald + */ + +#ifndef GMX_EWALD_PME_GPU_INTERNAL_H +#define GMX_EWALD_PME_GPU_INTERNAL_H + +#include "gromacs/gpu_utils/gpu_macros.h" // for the CUDA_FUNC_ macros + +#include "pme-gpu-types.h" // for the inline functions accessing pme_gpu_t members + +struct gmx_hw_info_t; +struct gmx_gpu_opt_t; +struct gmx_pme_t; // only used in pme_gpu_reinit +struct t_commrec; +struct gmx_wallclock_gpu_pme_t; +struct pme_atomcomm_t; + +namespace gmx +{ +class MDLogger; +} + +/* Some general constants for PME GPU behaviour follow. */ + +/*! \brief \libinternal + * false: The atom data GPU buffers are sized precisely according to the number of atoms. + * (Except GPU spline data layout which is regardless intertwined for 2 atoms per warp). + * The atom index checks in the spread/gather code potentially hinder the performance. + * true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms + * fitting in is divisible by PME_ATOM_DATA_ALIGNMENT. + * The atom index checks are not performed. There should be a performance win, but how big is it, remains to be seen. + * Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now). + * \todo Estimate performance differences + */ +const bool c_usePadding = true; + +/*! \brief \libinternal + * false: Atoms with zero charges are processed by PME. Could introduce some overhead. + * true: Atoms with zero charges are not processed by PME. Adds branching to the spread/gather. + * Could be good for performance in specific systems with lots of neutral atoms. + * \todo Estimate performance differences. + */ +const bool c_skipNeutralAtoms = false; + +/*! \brief \libinternal + * Number of PME solve output floating point numbers. + * 6 for symmetric virial matrix + 1 for reciprocal energy. 
+ */ +const int c_virialAndEnergyCount = 7; + +/* A block of CUDA-only functions that live in pme.cu */ + +/*! \libinternal \brief + * Returns the number of atoms per chunk in the atom charges/coordinates data layout. + * Depends on CUDA-specific block sizes, needed for the atom data padding. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns Number of atoms in a single GPU atom data chunk. + */ +CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1) + +/*! \libinternal \brief + * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns Number of atoms in a single GPU atom spline data chunk. + */ +CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_spline_data_alignment(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1) + +/*! \libinternal \brief + * Synchronizes the current step, waiting for the GPU kernels/transfers to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Allocates the fixed size energy and virial buffer both on GPU and CPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the energy and virial memory both on GPU and CPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Clears the energy and virial memory on GPU with 0. + * Should be called at the end of the energy/virial calculation step. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates and copies the pre-computed B-spline values to the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers). + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the GPU buffer for the PME forces. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the GPU buffer for the PME forces. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces). + * To be called e.g. after the bonded calculations. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] h_forces The input forces rvec buffer. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + const float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM + +/*! 
\libinternal \brief + * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[out] h_forces The output forces rvec buffer. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Waits for the PME GPU output forces copying to the CPU buffer to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed). + * + * \param[in] pmeGPU The PME GPU structure. + * + * Needs to be called on every DD step/in the beginning. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Copies the input coordinates from the CPU buffer onto the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] h_coordinates Input coordinates (XYZ rvec array). + * + * Needs to be called every MD step. The coordinates are then used in the spline calculation. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + const rvec *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the coordinates on the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer. + * Clears the padded part if needed. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] h_coefficients The input atom charges/coefficients. + * + * Does not need to be done every MD step, only whenever the local charges change. + * (So, in the beginning of the run, or on DD step). + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + const float *CUDA_FUNC_ARGUMENT(h_coefficients)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the charges/coefficients on the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the buffers on the GPU and the host for the atoms spline data. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the buffers on the GPU for the atoms spline data. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the buffers on the GPU and the host for the particle gridline indices. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the buffer on the GPU for the particle gridline indices. + * + * \param[in] pmeGPU The PME GPU structure. 
+ */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Clears the real space grid on the GPU. + * Should be called at the end of each MD step. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Frees the pre-computed fractional coordinates' shifts on the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Waits for the output virial/energy copying to the intermediate CPU buffer to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Copies the input real-space grid from the host to the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] h_grid The host-side grid buffer. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Copies the output real-space grid from the GPU to the host. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[out] h_grid The host-side grid buffer. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Waits for the grid copying to the host-side buffer after spreading to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Waits for the atom data copying to the intermediate host-side buffer after spline computation to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_sync_spline_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Waits for the grid copying to the host-side buffer after solving to finish. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_sync_solve_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Does the one-time GPU-framework specific PME initialization. + * For CUDA, the PME stream is created with the highest priority. + * + * \param[in] pmeGPU The PME GPU structure. 
+ */ +CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Destroys the PME GPU-framework specific data. + * Should be called last in the PME GPU destructor. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Initializes the PME GPU synchronization events. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Destroys the PME GPU synchronization events. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Initializes the CUDA FFT structures. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Destroys the CUDA FFT structures. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/* Several CUDA event-based timing functions that live in pme-timings.cu */ + +/*! \libinternal \brief + * Finalizes all the PME GPU stage timings for the current step. Should be called at the end of every step. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \brief + * Resets the PME GPU timings. To be called at the reset step. + * + * \param[in] pmeGPU The PME GPU structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM + +/*! \libinternal \brief + * Copies the PME GPU timings to the gmx_wallclock_gpu_t structure (for log output). To be called at the run end. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] timings The gmx_wallclock_gpu_pme_t structure. + */ +CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU), + gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM + +/* The inlined convenience PME GPU status getters */ + +/*! \libinternal \brief + * Tells if PME runs on multiple GPUs with the decomposition. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns True if PME runs on multiple GPUs, false otherwise. + */ +gmx_inline bool pme_gpu_uses_dd(const pme_gpu_t *pmeGPU) +{ + return !pmeGPU->settings.useDecomposition; +} + +/*! \libinternal \brief + * Tells if PME performs the gathering stage on GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns True if the gathering is performed on GPU, false otherwise. + */ +gmx_inline bool pme_gpu_performs_gather(const pme_gpu_t *pmeGPU) +{ + return pmeGPU->settings.performGPUGather; +} + +/*! \libinternal \brief + * Tells if PME performs the FFT stages on GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns True if FFT is performed on GPU, false otherwise. + */ +gmx_inline bool pme_gpu_performs_FFT(const pme_gpu_t *pmeGPU) +{ + return pmeGPU->settings.performGPUFFT; +} + +/*! \libinternal \brief + * Tells if PME performs the grid (un-)wrapping on GPU. + * + * \param[in] pmeGPU The PME GPU structure. 
+ * \returns True if (un-)wrapping is performed on GPU, false otherwise. + */ +gmx_inline bool pme_gpu_performs_wrapping(const pme_gpu_t *pmeGPU) +{ + return pmeGPU->settings.useDecomposition; +} + +/*! \libinternal \brief + * Tells if PME performs the grid solving on GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \returns True if solving is performed on GPU, false otherwise. + */ +gmx_inline bool pme_gpu_performs_solve(const pme_gpu_t *pmeGPU) +{ + return pmeGPU->settings.performGPUSolve; +} + +/* A block of C++ functions that live in pme-gpu-internal.cpp */ + +/*! \libinternal \brief + * Returns the output virial and energy of the PME solving. + * Should be called after pme_gpu_finish_step. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[out] energy The output energy. + * \param[out] virial The output virial matrix. + */ +void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial); + +/*! \libinternal \brief + * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_start_step(). + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] box The unit cell box. + */ +void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box); + +/*! \libinternal \brief + * Starts the PME GPU step (copies coordinates onto GPU, possibly sets the unit cell parameters). + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] needToUpdateBox Tells if the stored unit cell parameters should be updated from \p box. + * \param[in] box The unit cell box which does not necessarily change every step (only with pressure coupling enabled). + * Would only be used if \p needToUpdateBox is true. + * \param[in] h_coordinates Input coordinates (XYZ rvec array). + */ +void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates); + +/*! \libinternal \brief + * Finishes the PME GPU step, waiting for the output forces and/or energy/virial to be copied to the host. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] bCalcForces The left-over flag from the CPU code which tells the function to copy the forces to the CPU side. Should be passed to the launch call instead. FIXME + * \param[in] bCalcEnerVir The left-over flag from the CPU code which tells the function to copy the energy/virial to the CPU side. Should be passed to the launch call instead. + */ +void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcForces, + const bool bCalcEnerVir); + +/*! \libinternal \brief + * (Re-)initializes the PME GPU data at the beginning of the run or on DLB. + * + * \param[in,out] pme The PME structure. + * \param[in,out] gpuInfo The GPU information structure. + * \param[in] mdlog The logger. + * \param[in] cr The communication structure. + * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs. + */ +void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr); + +/*! \libinternal \brief + * Destroys the PME GPU data at the end of the run. + * + * \param[in] pmeGPU The PME GPU structure. + */ +void pme_gpu_destroy(pme_gpu_t *pmeGPU); + +/*! \libinternal \brief + * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU. + * + * \param[in] pmeGPU The PME GPU structure. + * \param[in] nAtoms The number of particles. + * \param[in] charges The pointer to the host-side array of particle charges. 
+ * + * This is a function that should only be called in the beginning of the run and on domain decomposition. + * Should be called before the pme_gpu_set_io_ranges. + */ +void pme_gpu_reinit_atoms(pme_gpu_t *pmeGPU, + const int nAtoms, + const real *charges); + +#endif diff --git a/src/gromacs/ewald/pme-gpu-types.h b/src/gromacs/ewald/pme-gpu-types.h new file mode 100644 index 0000000000..1bf3a88c1a --- /dev/null +++ b/src/gromacs/ewald/pme-gpu-types.h @@ -0,0 +1,356 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \libinternal \file + * \brief Defines the GPU-agnostic PME GPU data structures + * (the host-side PME GPU data, and the GPU function parameters). + * \todo Due to Gerrit workflow and time constraints, some renaming/refactoring + * which does not impair the performance will be performed once + * most of the initial PME CUDA implementation is merged + * into the master branch (likely, after release 2017). + * This should include: + * -- bringing the structure/function names up to guidelines + * ---- pme_gpu_settings_t -> PmeGpuTasks + * -- refining GPU notation application (#2053) + * -- renaming coefficients to charges (?) + * + * \author Aleksei Iupinov + * \ingroup module_ewald + */ + +#ifndef GMX_EWALD_PME_GPU_TYPES_H +#define GMX_EWALD_PME_GPU_TYPES_H + +#include "config.h" + +#include +#include + +#include "gromacs/math/vectypes.h" +#include "gromacs/utility/basedefinitions.h" + +struct gmx_hw_info; +struct gmx_device_info_t; + +/*! \brief Possible PME codepaths + * \todo: make this enum class with gmx_pme_t C++ refactoring + */ +enum PmeRunMode +{ + CPU, //!< Whole PME step is done on CPU + GPU, //!< Whole PME step is done on GPU + Hybrid, //!< Mixed mode: only spread and gather run on GPU; FFT and solving are done on CPU. +}; + +#if GMX_GPU == GMX_GPU_CUDA + +struct pme_gpu_cuda_t; +/*! 
\brief A typedef for including the GPU host data by pointer */ +typedef pme_gpu_cuda_t pme_gpu_specific_t; + +struct pme_gpu_cuda_kernel_params_t; +/*! \brief A typedef for including the GPU kernel arguments data by pointer */ +typedef pme_gpu_cuda_kernel_params_t pme_gpu_kernel_params_t; + +#else + +/*! \brief A dummy typedef for the GPU host data placeholder on non-GPU builds */ +typedef int pme_gpu_specific_t; +/*! \brief A dummy typedef for the GPU kernel arguments data placeholder on non-GPU builds */ +typedef int pme_gpu_kernel_params_t; + +#endif + +/* What follows is all the PME GPU function arguments, + * sorted into several device-side structures depending on the update rate. + * This is GPU agnostic (float3 replaced by float[3], etc.). + * The GPU-framework specifics (e.g. cudaTextureObject_t handles) are described + * in the larger structure pme_gpu_cuda_kernel_params_t in the pme.cuh. + */ + +/*! \internal \brief + * A GPU data structure for storing the constant PME data. + * This only has to be initialized once. + */ +struct pme_gpu_const_params_t +{ + /*! \brief Electrostatics coefficient = ONE_4PI_EPS0 / pme->epsilon_r */ + float elFactor; + /*! \brief Virial and energy GPU array. Size is PME_GPU_ENERGY_AND_VIRIAL_COUNT (7) floats. + * The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */ + float *d_virialAndEnergy; +}; + +/*! \internal \brief + * A GPU data structure for storing the PME data related to the grid sizes and cut-off. + * This only has to be updated at every DD step. + */ +struct pme_gpu_grid_params_t +{ + /* Grid sizes */ + /*! \brief Real-space grid data dimensions. */ + int realGridSize[DIM]; + /*! \brief Real-space grid dimensions, only converted to floating point. */ + float realGridSizeFP[DIM]; + /*! \brief Real-space grid dimensions (padded). The padding as compared to realGridSize includes the (order - 1) overlap. */ + int realGridSizePadded[DIM]; /* Is major dimension of this ever used in kernels? */ + /*! \brief Fourier grid dimensions. This counts the complex numbers! */ + int complexGridSize[DIM]; + /*! \brief Fourier grid dimensions (padded). This counts the complex numbers! */ + int complexGridSizePadded[DIM]; + + /* Grid pointers */ + /*! \brief Real space grid. */ + float *d_realGrid; + /*! \brief Complex grid - used in FFT/solve. If inplace cuFFT is used, then it is the same pointer as realGrid. */ + float *d_fourierGrid; + + /*! \brief Ewald solving factor = (M_PI / pme->ewaldcoeff_q)^2 */ + float ewaldFactor; + + /*! \brief Grid spline values as in pme->bsp_mod + * (laid out sequentially (XXX....XYYY......YZZZ.....Z)) + */ + float *d_splineModuli; + /*! \brief Offsets for X/Y/Z components of d_splineModuli */ + int splineValuesOffset[DIM]; + + /*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */ + float *d_fractShiftsTable; + /*! \brief Gridline indices lookup table + * (modulo lookup table as in pme->nnx/nny/nnz, laid out sequentially (XXX....XYYY......YZZZ.....Z)) */ + int *d_gridlineIndicesTable; + /*! \brief Offsets for X/Y/Z components of d_fractShiftsTable and d_gridlineIndicesTable */ + int tablesOffsets[DIM]; +}; + +/*! \internal \brief + * A GPU data structure for storing the PME data of the atoms, local to this process' domain partition. + * This only has to be updated every DD step. + */ +struct pme_gpu_atom_params_t +{ + /*! \brief Number of local atoms */ + int nAtoms; + /*! 
\brief Pointer to the global GPU memory with input rvec atom coordinates. + * The coordinates themselves change and need to be copied to the GPU every MD step, + * but reallocation happens only at DD. + */ + float *d_coordinates; + /*! \brief Pointer to the global GPU memory with input atom charges. + * The charges only need to be reallocated and copied to the GPU at DD step. + */ + float *d_coefficients; + /*! \brief Pointer to the global GPU memory with input/output rvec atom forces. + * The forces change and need to be copied from (and possibly to) the GPU every MD step, + * but reallocation happens only at DD. + */ + float *d_forces; + /*! \brief Pointer to the global GPU memory with ivec atom gridline indices. + * Computed on GPU in the spline calculation part. + */ + int *d_gridlineIndices; + + /* B-spline parameters are computed entirely on GPU every MD step, not copied. + * Unless we want to try something like GPU spread + CPU gather? + */ + /*! \brief Pointer to the global GPU memory with B-spline values */ + float *d_theta; + /*! \brief Pointer to the global GPU memory with B-spline derivative values */ + float *d_dtheta; +}; + +/*! \internal \brief + * A GPU data structure for storing the PME data which might change every MD step. + */ +struct pme_gpu_step_params_t +{ + /* The box parameters. The box only changes size each step with pressure coupling enabled. */ + /*! \brief + * Reciprocal (inverted unit cell) box. + * + * The box is transposed as compared to the CPU pme->recipbox. + * Basically, spread uses matrix columns (while solve and gather use rows). + * This storage format might be not the most optimal since the box is always triangular so there are zeroes. + */ + float recipBox[DIM][DIM]; + /*! \brief The unit cell volume for solving. */ + float boxVolume; +}; + +/*! \internal \brief + * A single structure encompassing almost all the PME data used in GPU kernels on device. + * This is inherited by the GPU framework-specific structure + * (pme_gpu_cuda_kernel_params_t in pme.cuh). + * This way, most code preparing the kernel parameters can be GPU-agnostic by casting + * the kernel parameter data pointer to pme_gpu_kernel_params_base_t. + */ +struct pme_gpu_kernel_params_base_t +{ + /*! \brief Constant data that is set once. */ + pme_gpu_const_params_t constants; + /*! \brief Data dependent on the grid size/cutoff. */ + pme_gpu_grid_params_t grid; + /*! \brief Data dependent on the DD and local atoms. */ + pme_gpu_atom_params_t atoms; + /*! \brief Data that possibly changes on every MD step. */ + pme_gpu_step_params_t step; +}; + +/* Here are the host-side structures */ + +/*! \internal \brief + * The PME GPU settings structure, included in the main PME GPU structure by value. + */ +struct pme_gpu_settings_t +{ + /* Permanent settings set on initialization */ + /*! \brief A boolean which tells if the solving is performed on GPU. Currently always true */ + bool performGPUSolve; + /*! \brief A boolean which tells if the gathering is performed on GPU. Currently always true */ + bool performGPUGather; + /*! \brief A boolean which tells if the FFT is performed on GPU. Currently true for a single MPI rank. */ + bool performGPUFFT; + /*! \brief A convenience boolean which tells if PME decomposition is used. */ + bool useDecomposition; + /*! \brief A boolean which tells if any PME GPU stage should copy all of its outputs to the host. + * Only intended to be used by the test framework. + */ + bool copyAllOutputs; + /*! 
\brief Various computation flags for the current step, corresponding to the GMX_PME_ flags in pme.h. */
+ int stepFlags;
+};
+
+/*! \internal \brief
+ * The PME GPU intermediate buffers structure, included in the main PME GPU structure by value.
+ * Buffers are managed by the PME GPU module.
+ */
+struct pme_gpu_staging_t
+{
+ /*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT. */
+ float *h_virialAndEnergy;
+ /*! \brief B-spline values intermediate host-side buffer. */
+ float *h_splineModuli;
+
+ /*! \brief Pointer to the host memory with B-spline values. Only used for host-side gather, or unit tests */
+ float *h_theta;
+ /*! \brief Pointer to the host memory with B-spline derivative values. Only used for host-side gather, or unit tests */
+ float *h_dtheta;
+ /*! \brief Pointer to the host memory with ivec atom gridline indices. Only used for host-side gather, or unit tests */
+ int *h_gridlineIndices;
+};
+
+/*! \internal \brief
+ * The PME GPU structure for all the data copied directly from the CPU PME structure.
+ * The copying is done when the CPU PME structure is already (re-)initialized
+ * (pme_gpu_reinit is called at the end of gmx_pme_init).
+ * All the variables here are named almost the same way as in gmx_pme_t.
+ * The types are different: pointers are replaced by vectors.
+ * TODO: use the shared data with the PME CPU.
+ * Included in the main PME GPU structure by value.
+ */
+struct pme_shared_t
+{
+ /*! \brief Grid count - currently always 1 on GPU */
+ int ngrids;
+ /*! \brief Grid dimensions - nkx, nky, nkz */
+ int nk[DIM];
+ /*! \brief Padded grid dimensions - pmegrid_nx, pmegrid_ny, pmegrid_nz
+ * TODO: find out if these are really needed for the CPU FFT compatibility.
+ */
+ int pmegrid_n[DIM];
+ /*! \brief PME interpolation order */
+ int pme_order;
+ /*! \brief Ewald splitting coefficient for Coulomb */
+ real ewaldcoeff_q;
+ /*! \brief Electrostatics parameter */
+ real epsilon_r;
+ /*! \brief Gridline indices - nnx, nny, nnz */
+ std::vector<int> nn[DIM];
+ /*! \brief Fractional shifts - fshx, fshy, fshz */
+ std::vector<real> fsh[DIM];
+ /*! \brief Precomputed B-spline values */
+ std::vector<real> bsp_mod[DIM];
+ /*! \brief The PME codepath being taken */
+ PmeRunMode runMode;
+};
+
+/*! \internal \brief
+ * The main PME GPU host structure, included in the PME CPU structure by pointer.
+ */
+struct pme_gpu_t
+{
+ /*! \brief The information copied once per reinit from the CPU structure. */
+ std::shared_ptr<pme_shared_t> common; // TODO: make the CPU structure use the same type
+
+ /*! \brief The settings. */
+ pme_gpu_settings_t settings;
+
+ /*! \brief The host-side buffers.
+ * The device-side buffers are buried in kernelParams, but that will have to change.
+ */
+ pme_gpu_staging_t staging;
+
+ /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT.
+ * Used for kernel scheduling.
+ * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying.
+ * TODO: this and the next member represent memory allocation/padding properties -
+ * what a container type should do ideally.
+ */
+ int nAtomsPadded;
+ /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT
+ * if c_usePadding is true.
+ * Used only as a basic size for almost all the atom data allocations
+ * (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP).
+ * This should be the same as (c_usePadding ? nAtomsPadded : kernelParams.atoms.nAtoms).
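+ * For example, with the order-4, 32-wide-warp defaults PME_ATOM_DATA_ALIGNMENT is 16 * 2 = 32,
+ * so 1000 local atoms (an illustrative count, not taken from this patch) would give nAtomsAlloc = 1024.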
+ * kernelParams.atoms.nAtoms is the actual atom count to be used for most data copying. + */ + int nAtomsAlloc; + + /*! \brief A pointer to the device used during the execution. */ + gmx_device_info_t *deviceInfo; + + /*! \brief A single structure encompassing all the PME data used on GPU. + * Its value is the only argument to all the PME GPU kernels. + * \todo Test whether this should be copied to the constant GPU memory once per MD step + * (or even less often with no box updates) instead of being an argument. + */ + std::shared_ptr kernelParams; + + /*! \brief The pointer to GPU-framework specific host-side data, such as CUDA streams and events. */ + std::shared_ptr archSpecific; /* FIXME: make it an unique_ptr */ +}; + +#endif diff --git a/src/gromacs/ewald/pme-gpu.cpp b/src/gromacs/ewald/pme-gpu.cpp new file mode 100644 index 0000000000..e830c53d8d --- /dev/null +++ b/src/gromacs/ewald/pme-gpu.cpp @@ -0,0 +1,114 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Implements high-level PME GPU functions which do not require GPU framework-specific code. 
+ * + * \author Aleksei Iupinov + * \ingroup module_ewald + */ + +#include "gmxpre.h" + +#include "gromacs/ewald/pme.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/gmxassert.h" + +#include "pme-gpu-internal.h" +#include "pme-grid.h" +#include "pme-internal.h" +#include "pme-solve.h" + +bool pme_gpu_task_enabled(const gmx_pme_t *pme) +{ + return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU); +} + +void pme_gpu_reset_timings(const gmx_pme_t *pme) +{ + if (pme_gpu_active(pme)) + { + pme_gpu_reset_timings(pme->gpu); + } +} + +void pme_gpu_get_timings(const gmx_pme_t *pme, gmx_wallclock_gpu_pme_t *timings) +{ + if (pme_gpu_active(pme)) + { + pme_gpu_get_timings(pme->gpu, timings); + } +} + +void pme_gpu_get_results(const gmx_pme_t *pme, + gmx_wallcycle_t wcycle, + matrix vir_q, + real *energy_q) +{ + GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); + + const bool haveComputedEnergyAndVirial = pme->gpu->settings.stepFlags & GMX_PME_CALC_ENER_VIR; + const bool haveComputedForces = pme->gpu->settings.stepFlags & GMX_PME_CALC_F; + + // The wallcycle hierarchy is messy - we pause ewcFORCE just to exclude the PME GPU sync time + wallcycle_stop(wcycle, ewcFORCE); + + wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER); + pme_gpu_finish_step(pme->gpu, haveComputedForces, haveComputedEnergyAndVirial); + wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); + + wallcycle_start_nocount(wcycle, ewcFORCE); + + if (haveComputedEnergyAndVirial) + { + if (pme->doCoulomb) + { + if (pme_gpu_performs_solve(pme->gpu)) + { + pme_gpu_get_energy_virial(pme->gpu, energy_q, vir_q); + } + else + { + get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy_q, vir_q); + } + } + else + { + *energy_q = 0; + } + } + /* No additional haveComputedForces code since forces are copied to the output host buffer with no transformation. */ +} diff --git a/src/gromacs/ewald/pme-grid.cpp b/src/gromacs/ewald/pme-grid.cpp index 5c01efa499..82479cbf56 100644 --- a/src/gromacs/ewald/pme-grid.cpp +++ b/src/gromacs/ewald/pme-grid.cpp @@ -50,6 +50,8 @@ #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/smalloc.h" +#include "pme-internal.h" + #ifdef DEBUG_PME #include "gromacs/fileio/pdbio.h" #include "gromacs/utility/cstringutil.h" diff --git a/src/gromacs/ewald/pme-grid.h b/src/gromacs/ewald/pme-grid.h index 7f398a24f9..e5e4f62368 100644 --- a/src/gromacs/ewald/pme-grid.h +++ b/src/gromacs/ewald/pme-grid.h @@ -41,8 +41,7 @@ #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/real.h" -#include "pme-internal.h" - +struct gmx_pme_t; /*! 
\brief * We allow coordinates to be out the unit-cell by up to 2 box lengths, @@ -56,24 +55,26 @@ constexpr int c_pmeMaxUnitcellShift = 2; */ constexpr int c_pmeNeighborUnitcellCount = 2*c_pmeMaxUnitcellShift + 1; +struct pmegrid_t; +struct pmegrids_t; #if GMX_MPI void -gmx_sum_qgrid_dd(struct gmx_pme_t *pme, real *grid, int direction); +gmx_sum_qgrid_dd(gmx_pme_t *pme, real *grid, int direction); #endif int copy_pmegrid_to_fftgrid(const gmx_pme_t *pme, real *pmegrid, real *fftgrid, int grid_index); int -copy_fftgrid_to_pmegrid(struct gmx_pme_t *pme, const real *fftgrid, real *pmegrid, int grid_index, +copy_fftgrid_to_pmegrid(gmx_pme_t *pme, const real *fftgrid, real *pmegrid, int grid_index, int nthread, int thread); void wrap_periodic_pmegrid(const gmx_pme_t *pme, real *pmegrid); void -unwrap_periodic_pmegrid(struct gmx_pme_t *pme, real *pmegrid); +unwrap_periodic_pmegrid(gmx_pme_t *pme, real *pmegrid); void pmegrid_init(pmegrid_t *grid, @@ -110,6 +111,6 @@ reuse_pmegrids(const pmegrids_t *oldgrid, pmegrids_t *newgrid); /* This function is called from gmx_pme_do() only from debugging code that is commented out. */ void -dump_local_fftgrid(struct gmx_pme_t *pme, const real *fftgrid); +dump_local_fftgrid(gmx_pme_t *pme, const real *fftgrid); #endif diff --git a/src/gromacs/ewald/pme-internal.h b/src/gromacs/ewald/pme-internal.h index fbb0dfe37a..0feb977419 100644 --- a/src/gromacs/ewald/pme-internal.h +++ b/src/gromacs/ewald/pme-internal.h @@ -64,11 +64,14 @@ #include "gromacs/timing/walltime_accounting.h" #include "gromacs/utility/gmxmpi.h" +#include "pme-gpu-types.h" + //! A repeat of typedef from parallel_3dfft.h typedef struct gmx_parallel_3dfft *gmx_parallel_3dfft_t; struct t_commrec; struct t_inputrec; +struct pme_gpu_t; //@{ //! Grid indices for A state for charge and Lennard-Jones C6 @@ -167,7 +170,7 @@ typedef struct { } splinedata_t; /*! \brief Data structure for coordinating transfer between PP and PME ranks*/ -typedef struct { +struct pme_atomcomm_t{ int dimind; /* The index of the dimension, 0=x, 1=y */ int nslab; int nodeid; @@ -203,20 +206,20 @@ typedef struct { int *thread_idx; /* Which thread should spread which coefficient */ thread_plist_t *thread_plist; splinedata_t *spline; -} pme_atomcomm_t; +}; /*! \brief Data structure for a single PME grid */ -typedef struct { +struct pmegrid_t{ ivec ci; /* The spatial location of this grid */ ivec n; /* The used size of *grid, including order-1 */ ivec offset; /* The grid offset from the full node grid */ int order; /* PME spreading order */ ivec s; /* The allocated size of *grid, s >= n */ real *grid; /* The grid local thread, size n */ -} pmegrid_t; +}; /*! \brief Data structures for PME grids */ -typedef struct { +struct pmegrids_t{ pmegrid_t grid; /* The full node grid (non thread-local) */ int nthread; /* The number of threads operating on this grid */ ivec nc; /* The local spatial decomposition over the threads */ @@ -224,7 +227,7 @@ typedef struct { real *grid_all; /* Allocated array for the grids in *grid_th */ int *g2t[DIM]; /* The grid to thread index */ ivec nthread_comm; /* The number of threads to communicate with */ -} pmegrids_t; +}; /*! \brief Data structure for spline-interpolation working buffers */ struct pme_spline_work; @@ -233,7 +236,7 @@ struct pme_spline_work; struct pme_solve_work_t; /*! 
\brief Master PME data structure */ -typedef struct gmx_pme_t { +struct gmx_pme_t { int ndecompdim; /* The number of decomposition dimensions */ int nodeid; /* Our nodeid in mpi->mpi_comm */ int nodeid_major; @@ -264,7 +267,26 @@ typedef struct gmx_pme_t { real ewaldcoeff_lj; /* Ewald splitting coefficient for r^-6 */ real epsilon_r; - class EwaldBoxZScaler *boxScaler; /*! The scaling data Ewald uses with walls (set at pme_init constant for the entire run) */ + + enum PmeRunMode runMode; /* Which codepath is the PME runner taking - CPU, GPU, mixed; + * TODO: this is the information that should be owned by the task scheduler, + * and ideally not be duplicated here. + */ + + pme_gpu_t *gpu; /* A pointer to the GPU data. + * TODO: this should be unique or a shared pointer. + * Currently in practice there is a single gmx_pme_t instance while a code + * is partially set up for many of them. The PME tuning calls gmx_pme_reinit() + * which fully reinitializes the one and only PME structure anew while maybe + * keeping the old grid buffers if they were already large enough. + * This small choice should be made clear in the later refactoring - + * do we store many PME objects for different grid sizes, + * or a single PME object that handles different grid sizes gracefully. + */ + + + class EwaldBoxZScaler *boxScaler; /**< The scaling data Ewald uses with walls (set at pme_init constant for the entire run) */ + int ljpme_combination_rule; /* Type of combination rule in LJ-PME */ @@ -279,6 +301,7 @@ typedef struct gmx_pme_t { * This can probably be done in a better way * but this simple hack works for now */ + /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */ int pmegrid_nx, pmegrid_ny, pmegrid_nz; /* pmegrid_nz might be larger than strictly necessary to ensure @@ -329,10 +352,40 @@ typedef struct gmx_pme_t { /* Work data for sum_qgrid */ real * sum_qgrid_tmp; real * sum_qgrid_dd_tmp; -} t_gmx_pme_t; +}; //! @endcond +/*! \brief + * Finds out if PME is currently running on GPU. + * TODO: should this be removed eventually? + * + * \param[in] pme The PME structure. + * \returns True if PME runs on GPU currently, false otherwise. + */ +inline bool pme_gpu_active(const gmx_pme_t *pme) +{ + return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU); +} + +/*! \brief Check restrictions on pme_order and the PME grid nkx,nky,nkz. + * + * With bFatal=TRUE, a fatal error is generated on violation, + * bValidSettings=NULL can be passed. + * With bFatal=FALSE, *bValidSettings reports the validity of the settings. + * bUseThreads tells if any MPI rank doing PME uses more than 1 threads. + * If at calling you bUseThreads is unknown, pass TRUE for conservative + * checking. + * + * TODO: the GPU restrictions are checked separately during pme_gpu_init(). + */ +void gmx_pme_check_restrictions(int pme_order, + int nkx, int nky, int nkz, + int nnodes_major, + gmx_bool bUseThreads, + gmx_bool bFatal, + gmx_bool *bValidSettings); + /*! \brief Initialize the PME-only side of the PME <-> PP communication */ gmx_pme_pp_t gmx_pme_pp_init(t_commrec *cr); @@ -349,11 +402,33 @@ enum { /*! \brief Called by PME-only ranks to receive coefficients and coordinates * - * The return value is used to control further processing, with meanings: - * pmerecvqxX: all parameters set, chargeA and chargeB can be NULL - * pmerecvqxFINISH: no parameters set - * pmerecvqxSWITCHGRID: only grid_size and *ewaldcoeff are set - * pmerecvqxRESETCOUNTERS: *step is set + * \param[in,out] pme_pp PME-PP communication structure. 
+ * \param[out] natoms Number of received atoms. + * \param[out] chargeA State A charges, if received. + * \param[out] chargeB State B charges, if received. + * \param[out] sqrt_c6A State A coefficients, if received. + * \param[out] sqrt_c6B State B coefficients, if received. + * \param[out] sigmaA State A coefficients, if received. + * \param[out] sigmaB State B coefficients, if received. + * \param[out] box System box, if received. + * \param[out] x Atoms' coordinates, if received. + * \param[out] f Atoms' PME forces, if received. + * \param[out] maxshift_x Maximum shift in X direction, if received. + * \param[out] maxshift_y Maximum shift in Y direction, if received. + * \param[out] lambda_q Free-energy lambda for electrostatics, if received. + * \param[out] lambda_lj Free-energy lambda for Lennard-Jones, if received. + * \param[out] bEnerVir Set to true if this is an energy/virial calculation step, otherwise set to false. + * \param[out] step MD integration step number. + * \param[out] grid_size PME grid size, if received. + * \param[out] ewaldcoeff_q Ewald cut-off parameter for electrostatics, if received. + * \param[out] ewaldcoeff_lj Ewald cut-off parameter for Lennard-Jones, if received. + * \param[out] atomSetChanged Set to true only if the local domain atom data (charges/coefficients) + * has been received (after DD) and should be reinitialized. Otherwise not changed. + * + * \retval pmerecvqxX All parameters were set, chargeA and chargeB can be NULL. + * \retval pmerecvqxFINISH No parameters were set. + * \retval pmerecvqxSWITCHGRID Only grid_size and *ewaldcoeff were set. + * \retval pmerecvqxRESETCOUNTERS *step was set. */ int gmx_pme_recv_coeffs_coords(struct gmx_pme_pp *pme_pp, int *natoms, @@ -365,7 +440,10 @@ int gmx_pme_recv_coeffs_coords(struct gmx_pme_pp *pme_pp, real *lambda_q, real *lambda_lj, gmx_bool *bEnerVir, gmx_int64_t *step, - ivec grid_size, real *ewaldcoeff_q, real *ewaldcoeff_lj); + ivec grid_size, + real *ewaldcoeff_q, + real *ewaldcoeff_lj, + bool *atomSetChanged); /*! \brief Send the PME mesh force, virial and energy to the PP-only nodes */ void gmx_pme_send_force_vir_ener(struct gmx_pme_pp *pme_pp, diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp index d42337c153..dc85a081cf 100644 --- a/src/gromacs/ewald/pme-load-balancing.cpp +++ b/src/gromacs/ewald/pme-load-balancing.cpp @@ -845,7 +845,14 @@ pme_load_balance(pme_load_balancing_t *pme_lb, if (!pme_lb->bSepPMERanks) { - if (pme_lb->setup[pme_lb->cur].pmedata == nullptr) + /* FIXME: + * CPU PME keeps a list of allocated pmedata's, that's why pme_lb->setup[pme_lb->cur].pmedata is not always nullptr. + * GPU PME, however, currently needs the gmx_pme_reinit always called on load balancing + * (pme_gpu_reinit might be not sufficiently decoupled from gmx_pme_init). + * This can lead to a lot of reallocations for PME GPU. + * Would be nicer if the allocated grid list was hidden within a single pmedata structure. + */ + if ((pme_lb->setup[pme_lb->cur].pmedata == nullptr) || pme_gpu_task_enabled(pme_lb->setup[pme_lb->cur].pmedata)) { /* Generate a new PME data structure, * copying part of the old pointers. 
@@ -914,7 +921,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb, int n_prev; double cycles_prev; - assert(pme_lb != NULL); + assert(pme_lb != nullptr); if (!pme_lb->bActive) { diff --git a/src/gromacs/ewald/pme-only.cpp b/src/gromacs/ewald/pme-only.cpp index 66a7728b24..75eb9fe2a0 100644 --- a/src/gromacs/ewald/pme-only.cpp +++ b/src/gromacs/ewald/pme-only.cpp @@ -122,8 +122,13 @@ static void gmx_pmeonly_switch(int *npmedata, struct gmx_pme_t ***pmedata, pme->nky == grid_size[YY] && pme->nkz == grid_size[ZZ]) { + /* Here we have found an existing PME data structure that suits us. + * However, in the GPU case, we have to reinitialize it - there's only one GPU structure. + * This should not cause actual GPU reallocations, at least (the allocated buffers are never shrunk). + * So, just some grid size updates in the GPU kernel parameters. + */ + gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size, ewaldcoeff_q, ewaldcoeff_lj); *pme_ret = pme; - return; } @@ -183,6 +188,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme, do { /* Domain decomposition */ + bool atomSetChanged = false; ret = gmx_pme_recv_coeffs_coords(pme_pp, &natoms, &chargeA, &chargeB, @@ -193,7 +199,10 @@ int gmx_pmeonly(struct gmx_pme_t *pme, &lambda_q, &lambda_lj, &bEnerVir, &step, - grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj); + grid_switch, + &ewaldcoeff_q, + &ewaldcoeff_lj, + &atomSetChanged); if (ret == pmerecvqxSWITCHGRID) { @@ -201,6 +210,11 @@ int gmx_pmeonly(struct gmx_pme_t *pme, gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, ewaldcoeff_q, ewaldcoeff_lj, cr, ir, &pme); } + if (atomSetChanged) + { + gmx_pme_reinit_atoms(pme, natoms, chargeA); + } + if (ret == pmerecvqxRESETCOUNTERS) { /* Reset the cycle and flop counters */ diff --git a/src/gromacs/ewald/pme-pp.cpp b/src/gromacs/ewald/pme-pp.cpp index 24819755e2..6a1de5199c 100644 --- a/src/gromacs/ewald/pme-pp.cpp +++ b/src/gromacs/ewald/pme-pp.cpp @@ -436,7 +436,8 @@ int gmx_pme_recv_coeffs_coords(struct gmx_pme_pp *pme_pp, gmx_int64_t *step, ivec grid_size, real *ewaldcoeff_q, - real *ewaldcoeff_lj) + real *ewaldcoeff_lj, + bool *atomSetChanged) { int status = -1; int nat = 0; @@ -493,6 +494,8 @@ int gmx_pme_recv_coeffs_coords(struct gmx_pme_pp *pme_pp, if (cnb.flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA)) { + *atomSetChanged = true; + /* Receive the send counts from the other PP nodes */ for (int sender = 0; sender < pme_pp->nnode; sender++) { @@ -644,6 +647,7 @@ int gmx_pme_recv_coeffs_coords(struct gmx_pme_pp *pme_pp, GMX_UNUSED_VALUE(grid_size); GMX_UNUSED_VALUE(ewaldcoeff_q); GMX_UNUSED_VALUE(ewaldcoeff_lj); + GMX_UNUSED_VALUE(atomSetChanged); status = pmerecvqxX; #endif diff --git a/src/gromacs/ewald/pme-timings.cu b/src/gromacs/ewald/pme-timings.cu new file mode 100644 index 0000000000..9fbe39dde9 --- /dev/null +++ b/src/gromacs/ewald/pme-timings.cu @@ -0,0 +1,112 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. 
+ * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Implements PME GPU timing events in CUDA. + * + * \author Aleksei Iupinov + */ + +#include "gmxpre.h" + +#include "pme-timings.cuh" + +#include "gromacs/utility/gmxassert.h" + +#include "pme.cuh" + +/*! \brief \internal + * Tells if CUDA-based performance tracking is enabled for PME. + * + * \param[in] pme The PME data structure. + * \returns True if timings are enabled, false otherwise. + */ +gmx_inline bool pme_gpu_timings_enabled(const pme_gpu_t *pmeGPU) +{ + return pmeGPU->archSpecific->useTiming; +} + +void pme_gpu_start_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId) +{ + if (pme_gpu_timings_enabled(pmeGPU)) + { + GMX_ASSERT(PMEStageId < pmeGPU->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index"); + pmeGPU->archSpecific->timingEvents[PMEStageId].openTimingRegion(pmeGPU->archSpecific->pmeStream); + } +} + +void pme_gpu_stop_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId) +{ + if (pme_gpu_timings_enabled(pmeGPU)) + { + GMX_ASSERT(PMEStageId < pmeGPU->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index"); + pmeGPU->archSpecific->timingEvents[PMEStageId].closeTimingRegion(pmeGPU->archSpecific->pmeStream); + } +} + +void pme_gpu_get_timings(const pme_gpu_t *pmeGPU, gmx_wallclock_gpu_pme_t *timings) +{ + if (pme_gpu_timings_enabled(pmeGPU)) + { + GMX_RELEASE_ASSERT(timings, "Null GPU timing pointer"); + for (size_t i = 0; i < pmeGPU->archSpecific->timingEvents.size(); i++) + { + timings->timing[i].t = pmeGPU->archSpecific->timingEvents[i].getTotalTime(); + timings->timing[i].c = pmeGPU->archSpecific->timingEvents[i].getCallCount(); + } + } +} + +void pme_gpu_update_timings(const pme_gpu_t *pmeGPU) +{ + if (pme_gpu_timings_enabled(pmeGPU)) + { + for (size_t i = 0; i < pmeGPU->archSpecific->timingEvents.size(); i++) + { + pmeGPU->archSpecific->timingEvents[i].getLastRangeTime(); + } + } +} + +void pme_gpu_reset_timings(const pme_gpu_t *pmeGPU) +{ + if (pme_gpu_timings_enabled(pmeGPU)) + { + for (size_t i = 0; i < pmeGPU->archSpecific->timingEvents.size(); i++) + { + pmeGPU->archSpecific->timingEvents[i].reset(); + } + } +} diff --git a/src/gromacs/ewald/pme-timings.cuh b/src/gromacs/ewald/pme-timings.cuh new file mode 100644 index 0000000000..8c94e7f817 --- /dev/null +++ b/src/gromacs/ewald/pme-timings.cuh @@ -0,0 +1,66 @@ +/* + * This file is part of the GROMACS molecular simulation package. 
+ * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \libinternal \file + * \brief Defines PME GPU timing functions. + * + * \author Aleksei Iupinov + */ + +#ifndef GMX_EWALD_PME_TIMINGS_CUH +#define GMX_EWALD_PME_TIMINGS_CUH + +#include "gromacs/gpu_utils/gpuregiontimer.cuh" +#include "gromacs/timing/gpu_timing.h" // TODO: move include to the source files + +struct pme_gpu_t; + +/*! \libinternal \brief + * Starts timing the certain PME GPU stage during a single step (if timings are enabled). + * + * \param[in] pmeGPU The PME GPU data structure. + * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h + */ +void pme_gpu_start_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId); + +/*! \libinternal \brief + * Stops timing the certain PME GPU stage during a single step (if timings are enabled). + * + * \param[in] pmeGPU The PME GPU data structure. 
+ * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h + */ +void pme_gpu_stop_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId); + +#endif diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp index fdd9b4a69b..cdd0a41f3d 100644 --- a/src/gromacs/ewald/pme.cpp +++ b/src/gromacs/ewald/pme.cpp @@ -104,6 +104,7 @@ #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/gmxmpi.h" #include "gromacs/utility/gmxomp.h" +#include "gromacs/utility/logger.h" #include "gromacs/utility/real.h" #include "gromacs/utility/smalloc.h" #include "gromacs/utility/stringutil.h" @@ -111,6 +112,7 @@ #include "calculate-spline-moduli.h" #include "pme-gather.h" +#include "pme-gpu-internal.h" #include "pme-grid.h" #include "pme-internal.h" #include "pme-redistribute.h" @@ -503,18 +505,22 @@ static int div_round_up(int enumerator, int denominator) return (enumerator + denominator - 1)/denominator; } -int gmx_pme_init(struct gmx_pme_t **pmedata, - t_commrec * cr, - int nnodes_major, - int nnodes_minor, - const t_inputrec * ir, - int homenr, - gmx_bool bFreeEnergy_q, - gmx_bool bFreeEnergy_lj, - gmx_bool bReproducible, - real ewaldcoeff_q, - real ewaldcoeff_lj, - int nthread) +int gmx_pme_init(struct gmx_pme_t **pmedata, + t_commrec *cr, + int nnodes_major, + int nnodes_minor, + const t_inputrec *ir, + int homenr, + gmx_bool bFreeEnergy_q, + gmx_bool bFreeEnergy_lj, + gmx_bool bReproducible, + real ewaldcoeff_q, + real ewaldcoeff_lj, + int nthread, + PmeRunMode runMode, + pme_gpu_t *pmeGPU, + gmx_device_info_t *gpuInfo, + const gmx::MDLogger &mdlog) { int use_threads, sum_use_threads, i; ivec ndata; @@ -745,6 +751,9 @@ int gmx_pme_init(struct gmx_pme_t **pmedata, snew(pme->bsp_mod[YY], pme->nky); snew(pme->bsp_mod[ZZ], pme->nkz); + pme->gpu = pmeGPU; /* Carrying over the single GPU structure */ + pme->runMode = runMode; + /* The required size of the interpolation grid, including overlap. * The allocated size (pmegrid_n?) might be slightly larger. */ @@ -755,7 +764,6 @@ int gmx_pme_init(struct gmx_pme_t **pmedata, pme->pmegrid_nz_base = pme->nkz; pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1; set_grid_alignment(&pme->pmegrid_nz, pme->pme_order); - pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major]; pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor]; pme->pmegrid_start_iz = 0; @@ -846,6 +854,8 @@ int gmx_pme_init(struct gmx_pme_t **pmedata, pme->lb_buf2 = nullptr; pme->lb_buf_nalloc = 0; + pme_gpu_reinit(pme.get(), gpuInfo, mdlog, cr); + pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx); // no exception was thrown during the init, so we hand over the PME structure handle @@ -891,8 +901,14 @@ int gmx_pme_reinit(struct gmx_pme_t **pmedata, try { + const gmx::MDLogger dummyLogger; + // This is reinit which is currently only changing grid size/coefficients, + // so we don't expect the actual logging. + // TODO: when PME is an object, it should take reference to mdlog on construction and save it. 
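+        // The GPU device info argument is passed as nullptr below: pme_src->gpu is carried over,
+        // and is assumed to keep referring to the device it was originally initialized with.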
ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor, - &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread); + &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj, + pme_src->nthread, pme_src->runMode, pme_src->gpu, nullptr, dummyLogger); + //TODO this is mostly passing around current values } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; @@ -985,6 +1001,8 @@ int gmx_pme_do(struct gmx_pme_t *pme, real *dvdlambda_q, real *dvdlambda_lj, int flags) { + GMX_ASSERT(pme->runMode == PmeRunMode::CPU, "gmx_pme_do should not be called on the GPU PME run."); + int d, i, j, npme, grid_index, max_grid_index; int n_d; pme_atomcomm_t *atc = nullptr; @@ -1738,5 +1756,19 @@ void gmx_pme_destroy(gmx_pme_t *pme) sfree(pme->sum_qgrid_tmp); sfree(pme->sum_qgrid_dd_tmp); + if (pme_gpu_active(pme) && pme->gpu) + { + pme_gpu_destroy(pme->gpu); + } + sfree(pme); } + +void gmx_pme_reinit_atoms(const gmx_pme_t *pme, const int nAtoms, const real *charges) +{ + if (pme_gpu_active(pme)) + { + pme_gpu_reinit_atoms(pme->gpu, nAtoms, charges); + } + // TODO: handle the CPU case here; handle the whole t_mdatoms +} diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu new file mode 100644 index 0000000000..bd67a01ee7 --- /dev/null +++ b/src/gromacs/ewald/pme.cu @@ -0,0 +1,515 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief This file contains internal CUDA function implementations + * for performing the PME calculations on GPU. 
+ * + * \author Aleksei Iupinov + */ + +#include "gmxpre.h" + +#include + +/* The rest */ +#include "pme.h" + +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/pmalloc_cuda.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/smalloc.h" + +#include "pme.cuh" +#include "pme-3dfft.cuh" +#include "pme-grid.h" + +int pme_gpu_get_atom_data_alignment(const pme_gpu_t *pmeGPU) +{ + const int order = pmeGPU->common->pme_order; + GMX_RELEASE_ASSERT(order > 0, "Invalid PME order"); + return PME_ATOM_DATA_ALIGNMENT; +} + +int pme_gpu_get_atom_spline_data_alignment(const pme_gpu_t *pmeGPU) +{ + const int order = pmeGPU->common->pme_order; + GMX_RELEASE_ASSERT(order > 0, "Invalid PME order"); + return PME_SPREADGATHER_ATOMS_PER_WARP; +} + +void pme_gpu_synchronize(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaStreamSynchronize(pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!"); +} + +void pme_gpu_alloc_energy_virial(const pme_gpu_t *pmeGPU) +{ + const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float); + cudaError_t stat = cudaMalloc((void **)&pmeGPU->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize); + CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial"); + pmalloc((void **)&pmeGPU->staging.h_virialAndEnergy, energyAndVirialSize); +} + +void pme_gpu_free_energy_virial(pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaFree(pmeGPU->kernelParams->constants.d_virialAndEnergy); + CU_RET_ERR(stat, "cudaFree failed on PME energy and virial"); + pmeGPU->kernelParams->constants.d_virialAndEnergy = nullptr; + pfree(pmeGPU->staging.h_virialAndEnergy); + pmeGPU->staging.h_virialAndEnergy = nullptr; +} + +void pme_gpu_clear_energy_virial(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->constants.d_virialAndEnergy, 0, + c_virialAndEnergyCount * sizeof(float), pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME energy/virial cudaMemsetAsync error"); +} + +void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *pmeGPU) +{ + const int splineValuesOffset[DIM] = { + 0, + pmeGPU->kernelParams->grid.realGridSize[XX], + pmeGPU->kernelParams->grid.realGridSize[XX] + pmeGPU->kernelParams->grid.realGridSize[YY] + }; + memcpy((void *)&pmeGPU->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset)); + + const int newSplineValuesSize = pmeGPU->kernelParams->grid.realGridSize[XX] + + pmeGPU->kernelParams->grid.realGridSize[YY] + + pmeGPU->kernelParams->grid.realGridSize[ZZ]; + const bool shouldRealloc = (newSplineValuesSize > pmeGPU->archSpecific->splineValuesSize); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->grid.d_splineModuli, nullptr, sizeof(float), + &pmeGPU->archSpecific->splineValuesSize, &pmeGPU->archSpecific->splineValuesSizeAlloc, newSplineValuesSize, pmeGPU->archSpecific->pmeStream, true); + if (shouldRealloc) + { + /* Reallocate the host buffer */ + pfree(pmeGPU->staging.h_splineModuli); + pmalloc((void **)&pmeGPU->staging.h_splineModuli, newSplineValuesSize * sizeof(float)); + } + for (int i = 0; i < DIM; i++) + { + memcpy(pmeGPU->staging.h_splineModuli + splineValuesOffset[i], pmeGPU->common->bsp_mod[i].data(), pmeGPU->common->bsp_mod[i].size() * sizeof(float)); + } + /* TODO: pin original buffer instead! 
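+     * (e.g. pinning the pmeGPU->common->bsp_mod storage with cudaHostRegister could make the
+     * h_splineModuli staging copy unnecessary - a possible approach, not part of this change)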
*/ + cu_copy_H2D_async(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli, + newSplineValuesSize * sizeof(float), pmeGPU->archSpecific->pmeStream); +} + +void pme_gpu_free_bspline_values(const pme_gpu_t *pmeGPU) +{ + pfree(pmeGPU->staging.h_splineModuli); + cu_free_buffered(pmeGPU->kernelParams->grid.d_splineModuli, &pmeGPU->archSpecific->splineValuesSize, + &pmeGPU->archSpecific->splineValuesSizeAlloc); +} + +void pme_gpu_realloc_forces(const pme_gpu_t *pmeGPU) +{ + const size_t newForcesSize = pmeGPU->nAtomsAlloc * DIM; + GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU"); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_forces, nullptr, sizeof(float), + &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc, newForcesSize, pmeGPU->archSpecific->pmeStream, true); +} + +void pme_gpu_free_forces(const pme_gpu_t *pmeGPU) +{ + cu_free_buffered(pmeGPU->kernelParams->atoms.d_forces, &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc); +} + +void pme_gpu_copy_input_forces(const pme_gpu_t *pmeGPU, const float *h_forces) +{ + GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU"); + const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float); + GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU"); + cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_forces, const_cast(h_forces), forcesSize, pmeGPU->archSpecific->pmeStream); +} + +void pme_gpu_copy_output_forces(const pme_gpu_t *pmeGPU, float *h_forces) +{ + GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU"); + const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float); + GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU"); + cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream); + cudaError_t stat = cudaEventRecord(pmeGPU->archSpecific->syncForcesD2H, pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME gather forces synchronization failure"); +} + +void pme_gpu_sync_output_forces(const pme_gpu_t *pmeGPU) +{ + cudaStream_t s = pmeGPU->archSpecific->pmeStream; + cudaError_t stat = cudaStreamWaitEvent(s, pmeGPU->archSpecific->syncForcesD2H, 0); + CU_RET_ERR(stat, "Error while waiting for the PME GPU forces"); +} + +void pme_gpu_realloc_coordinates(const pme_gpu_t *pmeGPU) +{ + const size_t newCoordinatesSize = pmeGPU->nAtomsAlloc * DIM; + GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU"); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coordinates, nullptr, sizeof(float), + &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc, newCoordinatesSize, pmeGPU->archSpecific->pmeStream, true); + if (c_usePadding) + { + const size_t paddingIndex = DIM * pmeGPU->kernelParams->atoms.nAtoms; + const size_t paddingCount = DIM * pmeGPU->nAtomsAlloc - paddingIndex; + if (paddingCount > 0) + { + cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coordinates + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME failed to clear the padded coordinates"); + } + } +} + +void pme_gpu_copy_input_coordinates(const pme_gpu_t *pmeGPU, const rvec *h_coordinates) +{ + GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU"); +#if GMX_DOUBLE + GMX_RELEASE_ASSERT(false, "Only single precision is supported"); + GMX_UNUSED_VALUE(h_coordinates); +#else + 
cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coordinates, const_cast(h_coordinates), + pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->archSpecific->pmeStream); +#endif +} + +void pme_gpu_free_coordinates(const pme_gpu_t *pmeGPU) +{ + cu_free_buffered(pmeGPU->kernelParams->atoms.d_coordinates, &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc); +} + +void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *pmeGPU, const float *h_coefficients) +{ + GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU"); + const size_t newCoefficientsSize = pmeGPU->nAtomsAlloc; + GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU"); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coefficients, nullptr, sizeof(float), + &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc, + newCoefficientsSize, pmeGPU->archSpecific->pmeStream, true); + cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coefficients, const_cast(h_coefficients), + pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->archSpecific->pmeStream); + if (c_usePadding) + { + const size_t paddingIndex = pmeGPU->kernelParams->atoms.nAtoms; + const size_t paddingCount = pmeGPU->nAtomsAlloc - paddingIndex; + if (paddingCount > 0) + { + cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coefficients + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME failed to clear the padded charges"); + } + } +} + +void pme_gpu_free_coefficients(const pme_gpu_t *pmeGPU) +{ + cu_free_buffered(pmeGPU->kernelParams->atoms.d_coefficients, &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc); +} + +void pme_gpu_realloc_spline_data(const pme_gpu_t *pmeGPU) +{ + const int order = pmeGPU->common->pme_order; + const int alignment = pme_gpu_get_atom_spline_data_alignment(pmeGPU); + const size_t nAtomsPadded = ((pmeGPU->nAtomsAlloc + alignment - 1) / alignment) * alignment; + const int newSplineDataSize = DIM * order * nAtomsPadded; + GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU"); + /* Two arrays of the same size */ + const bool shouldRealloc = (newSplineDataSize > pmeGPU->archSpecific->splineDataSize); + int currentSizeTemp = pmeGPU->archSpecific->splineDataSize; + int currentSizeTempAlloc = pmeGPU->archSpecific->splineDataSizeAlloc; + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_theta, nullptr, sizeof(float), + ¤tSizeTemp, ¤tSizeTempAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_dtheta, nullptr, sizeof(float), + &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true); + // the host side reallocation + if (shouldRealloc) + { + pfree(pmeGPU->staging.h_theta); + pmalloc((void **)&pmeGPU->staging.h_theta, newSplineDataSize * sizeof(float)); + pfree(pmeGPU->staging.h_dtheta); + pmalloc((void **)&pmeGPU->staging.h_dtheta, newSplineDataSize * sizeof(float)); + } +} + +void pme_gpu_free_spline_data(const pme_gpu_t *pmeGPU) +{ + /* Two arrays of the same size */ + cu_free_buffered(pmeGPU->kernelParams->atoms.d_theta); + cu_free_buffered(pmeGPU->kernelParams->atoms.d_dtheta, &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc); + pfree(pmeGPU->staging.h_theta); + pfree(pmeGPU->staging.h_dtheta); +} + +void 
pme_gpu_realloc_grid_indices(const pme_gpu_t *pmeGPU) +{ + const size_t newIndicesSize = DIM * pmeGPU->nAtomsAlloc; + GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU"); + cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_gridlineIndices, nullptr, sizeof(int), + &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc, newIndicesSize, pmeGPU->archSpecific->pmeStream, true); + pfree(pmeGPU->staging.h_gridlineIndices); + pmalloc((void **)&pmeGPU->staging.h_gridlineIndices, newIndicesSize * sizeof(int)); +} + +void pme_gpu_free_grid_indices(const pme_gpu_t *pmeGPU) +{ + cu_free_buffered(pmeGPU->kernelParams->atoms.d_gridlineIndices, &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc); + pfree(pmeGPU->staging.h_gridlineIndices); +} + +void pme_gpu_realloc_grids(pme_gpu_t *pmeGPU) +{ + auto *kernelParamsPtr = pmeGPU->kernelParams.get(); + const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] * + kernelParamsPtr->grid.realGridSizePadded[YY] * + kernelParamsPtr->grid.realGridSizePadded[ZZ]; + const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] * + kernelParamsPtr->grid.complexGridSizePadded[YY] * + kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2; + // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float + if (pmeGPU->archSpecific->performOutOfPlaceFFT) + { + /* 2 separate grids */ + cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fourierGrid, nullptr, sizeof(float), + &pmeGPU->archSpecific->complexGridSize, &pmeGPU->archSpecific->complexGridSizeAlloc, + newComplexGridSize, pmeGPU->archSpecific->pmeStream, true); + cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float), + &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc, + newRealGridSize, pmeGPU->archSpecific->pmeStream, true); + } + else + { + /* A single buffer so that any grid will fit */ + const int newGridsSize = std::max(newRealGridSize, newComplexGridSize); + cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float), + &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc, + newGridsSize, pmeGPU->archSpecific->pmeStream, true); + kernelParamsPtr->grid.d_fourierGrid = kernelParamsPtr->grid.d_realGrid; + pmeGPU->archSpecific->complexGridSize = pmeGPU->archSpecific->realGridSize; + // the size might get used later for copying the grid + } +} + +void pme_gpu_free_grids(const pme_gpu_t *pmeGPU) +{ + if (pmeGPU->archSpecific->performOutOfPlaceFFT) + { + cu_free_buffered(pmeGPU->kernelParams->grid.d_fourierGrid); + } + cu_free_buffered(pmeGPU->kernelParams->grid.d_realGrid, + &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc); +} + +void pme_gpu_clear_grids(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->grid.d_realGrid, 0, + pmeGPU->archSpecific->realGridSize * sizeof(float), pmeGPU->archSpecific->pmeStream); + /* Should the complex grid be cleared in some weird case? 
*/ + CU_RET_ERR(stat, "cudaMemsetAsync on the PME grid error"); +} + +void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU) +{ + cudaStream_t stream = pmeGPU->archSpecific->pmeStream; + auto *kernelParamsPtr = pmeGPU->kernelParams.get(); + + const int nx = kernelParamsPtr->grid.realGridSize[XX]; + const int ny = kernelParamsPtr->grid.realGridSize[YY]; + const int nz = kernelParamsPtr->grid.realGridSize[ZZ]; + const int cellCount = c_pmeNeighborUnitcellCount; + const int gridDataOffset[DIM] = {0, cellCount * nx, cellCount * (nx + ny)}; + + memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset)); + + const int newFractShiftsSize = cellCount * (nx + ny + nz); + + /* Two arrays, same size */ + int currentSizeTemp = pmeGPU->archSpecific->fractShiftsSize; + int currentSizeTempAlloc = pmeGPU->archSpecific->fractShiftsSizeAlloc; + cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fractShiftsTable, nullptr, sizeof(float), + ¤tSizeTemp, ¤tSizeTempAlloc, + newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true); + float *fractShiftsArray = kernelParamsPtr->grid.d_fractShiftsTable; + cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_gridlineIndicesTable, nullptr, sizeof(int), + &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc, + newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true); + int *gridlineIndicesArray = kernelParamsPtr->grid.d_gridlineIndicesTable; + + /* TODO: pinning of the pmeGPU->common->fsh/nn host-side arrays */ + + for (int i = 0; i < DIM; i++) + { + kernelParamsPtr->grid.tablesOffsets[i] = gridDataOffset[i]; + cu_copy_H2D_async(fractShiftsArray + gridDataOffset[i], pmeGPU->common->fsh[i].data(), + cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(float), stream); + cu_copy_H2D_async(gridlineIndicesArray + gridDataOffset[i], pmeGPU->common->nn[i].data(), + cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(int), stream); + } + + pme_gpu_make_fract_shifts_textures(pmeGPU); +} + +void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU) +{ + pme_gpu_free_fract_shifts_textures(pmeGPU); + /* Two arrays, same size */ + cu_free_buffered(pmeGPU->kernelParams->grid.d_fractShiftsTable); + cu_free_buffered(pmeGPU->kernelParams->grid.d_gridlineIndicesTable, &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc); +} + +void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncEnerVirD2H, 0); + CU_RET_ERR(stat, "Error while waiting for PME solve output"); + + for (int j = 0; j < c_virialAndEnergyCount; j++) + { + GMX_ASSERT(std::isfinite(pmeGPU->staging.h_virialAndEnergy[j]), "PME GPU produces incorrect energy/virial."); + } +} + +void pme_gpu_copy_input_gather_grid(const pme_gpu_t *pmeGpu, float *h_grid) +{ + const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float); + cu_copy_H2D_async(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->archSpecific->pmeStream); +} + +void pme_gpu_copy_output_spread_grid(const pme_gpu_t *pmeGpu, float *h_grid) +{ + const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float); + cu_copy_D2H_async(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->archSpecific->pmeStream); + cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME spread grid sync event record failure"); +} + +void 
pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSpreadGridD2H, 0); + CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host"); +} + +void pme_gpu_sync_spline_atom_data(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSplineAtomDataD2H, 0); + CU_RET_ERR(stat, "Error while waiting for the PME GPU atom data to be copied to the host"); +} + +void pme_gpu_sync_solve_grid(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSolveGridD2H, 0); + CU_RET_ERR(stat, "Error while waiting for the PME GPU solve grid to be copied to the host"); + //should check for pme_gpu_performs_solve(pmeGPU) +} + +void pme_gpu_init_internal(pme_gpu_t *pmeGPU) +{ + /* Allocate the target-specific structures */ + pmeGPU->archSpecific.reset(new pme_gpu_specific_t()); + pmeGPU->kernelParams.reset(new pme_gpu_kernel_params_t()); + + pmeGPU->archSpecific->performOutOfPlaceFFT = true; + /* This should give better performance, according to the cuFFT documentation. + * The performance seems to be the same though. + * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7). + */ + + pmeGPU->archSpecific->useTiming = (getenv("GMX_DISABLE_CUDA_TIMING") == nullptr) && + (getenv("GMX_DISABLE_GPU_TIMING") == nullptr); + /* TODO: multiple CUDA streams on same GPU cause nonsense cudaEvent_t timings. + * This should probably also check for gpuId exclusivity? + */ + + /* Creating a PME CUDA stream */ + cudaError_t stat; + int highest_priority, lowest_priority; + stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority); + CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed"); + stat = cudaStreamCreateWithPriority(&pmeGPU->archSpecific->pmeStream, + cudaStreamDefault, //cudaStreamNonBlocking, + highest_priority); + CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed"); +} + +void pme_gpu_destroy_specific(const pme_gpu_t *pmeGPU) +{ + /* Destroy the CUDA stream */ + cudaError_t stat = cudaStreamDestroy(pmeGPU->archSpecific->pmeStream); + CU_RET_ERR(stat, "PME cudaStreamDestroy error"); +} + +void pme_gpu_init_sync_events(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat; + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on syncEnerVirD2H failed"); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on syncForcesD2H failed"); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on syncSpreadGridD2H failed"); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on syncSplineAtomDataD2H failed"); + stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, cudaEventDisableTiming); + CU_RET_ERR(stat, "cudaEventCreate on syncSolveGridD2H failed"); +} + +void pme_gpu_destroy_sync_events(const pme_gpu_t *pmeGPU) +{ + cudaError_t stat; + stat = cudaEventDestroy(pmeGPU->archSpecific->syncEnerVirD2H); + CU_RET_ERR(stat, "cudaEventDestroy failed on syncEnerVirD2H"); + stat = cudaEventDestroy(pmeGPU->archSpecific->syncForcesD2H); + 
CU_RET_ERR(stat, "cudaEventDestroy failed on syncForcesD2H"); + stat = cudaEventDestroy(pmeGPU->archSpecific->syncSpreadGridD2H); + CU_RET_ERR(stat, "cudaEventDestroy failed on syncSpreadGridD2H"); + stat = cudaEventDestroy(pmeGPU->archSpecific->syncSplineAtomDataD2H); + CU_RET_ERR(stat, "cudaEventDestroy failed on syncSplineAtomDataD2H"); + stat = cudaEventDestroy(pmeGPU->archSpecific->syncSolveGridD2H); + CU_RET_ERR(stat, "cudaEventDestroy failed on syncSolveGridD2H"); +} + +void pme_gpu_reinit_3dfft(const pme_gpu_t *pmeGPU) +{ + if (pme_gpu_performs_FFT(pmeGPU)) + { + pmeGPU->archSpecific->fftSetup.resize(0); + for (int i = 0; i < pmeGPU->common->ngrids; i++) + { + pmeGPU->archSpecific->fftSetup.push_back(std::unique_ptr(new GpuParallel3dFft(pmeGPU))); + } + } +} + +void pme_gpu_destroy_3dfft(const pme_gpu_t *pmeGPU) +{ + pmeGPU->archSpecific->fftSetup.resize(0); +} diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh new file mode 100644 index 0000000000..16b4b9aae8 --- /dev/null +++ b/src/gromacs/ewald/pme.cuh @@ -0,0 +1,243 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief This file defines the PME CUDA-specific data structure, + * various compile-time constants shared among the PME CUDA kernels, + * and also names some PME CUDA memory management routines. + * TODO: consider changing defines into variables where possible; have inline getters. 
+ *
+ * \author Aleksei Iupinov
+ */
+
+#ifndef GMX_EWALD_PME_CUH
+#define GMX_EWALD_PME_CUH
+
+#include
+
+#include
+
+#include "gromacs/gpu_utils/cuda_arch_utils.cuh" // for warp_size
+
+#include "pme-gpu-internal.h" // for the general PME GPU behaviour defines
+#include "pme-timings.cuh"
+
+class GpuParallel3dFft;
+
+/* Some defines for PME behaviour follow */
+
+/*
+    Here is a current memory layout for the theta/dtheta B-spline float parameter arrays.
+    This is the data in global memory used both by spreading and gathering kernels (with same scheduling).
+    This example has PME order 4 and 2 particles per warp/data chunk.
+    Each particle has 16 threads assigned to it, each thread works on 4 non-sequential global grid contributions.
+
+    ----------------------------------------------------------------------------
+    particles 0, 1                                        | particles 2, 3 | ...
+    ----------------------------------------------------------------------------
+    order index 0           | index 1 | index 2 | index 3 | order index 0 .....
+    ----------------------------------------------------------------------------
+    tx0 tx1 ty0 ty1 tz0 tz1 | ..........
+    ----------------------------------------------------------------------------
+
+    Each data chunk for a single warp is 24 floats. This goes both for theta and dtheta.
+    24 = 2 particles per warp * order 4 * 3 dimensions. 48 floats (1.5 warp size) per warp in total.
+    I have also tried intertwining theta and dtheta in a single array (they are used in pairs in the gathering stage anyway)
+    and it didn't seem to make a performance difference.
+
+    The corresponding defines follow.
+ */
+
+/* This is the distance between the neighbour theta elements - would be 2 for the intertwining layout */
+#define PME_SPLINE_THETA_STRIDE 1
+
+/*! \brief
+ * The number of GPU threads used for computing spread/gather contributions of a single atom as a function of the PME order.
+ * The assumption is currently that any thread processes only a single atom's contributions.
+ */
+#define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
+
+//! The spread/gather integer constant which depends on the templated order parameter (2 atoms per warp for order == 4)
+#define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
+
+//! Atom data alignment - has to be divisible both by spread and gather maximal atoms-per-block counts,
+//! which is asserted in case we use atom data padding at all.
+#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP)
+
+/*! \brief \internal
+ * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
+ *
+ * \param[in] atomDataIndex The atom data index.
+ * \param[in] nAtomData     The atom data array element count.
+ * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
+ *
+ * This is called from the spline_and_spread and gather PME kernels.
+ * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding enabled.
+ */
+int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
+{
+    return c_usePadding ? 1 : (atomDataIndex < nAtomData);
+}
+
+/*! \brief \internal
+ * An inline CUDA function for skipping the zero-charge atoms.
+ *
+ * \returns Non-0 if atom should be processed, 0 otherwise.
+ * \param[in] coefficient The atom charge.
+ *
+ * This is called from the spline_and_spread and gather PME kernels.
+ */
+int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
+{
+    assert(isfinite(coefficient));
+    return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+}
+
+/*! \brief \internal
+ * The main PME CUDA-specific host data structure, included in the PME GPU structure by the archSpecific pointer.
+ */
+struct pme_gpu_cuda_t
+{
+    /*! \brief The CUDA stream where everything related to the PME happens. */
+    cudaStream_t pmeStream;
+
+    /* Synchronization events */
+    /*! \brief Triggered after the energy/virial have been copied to the host (after the solving stage). */
+    cudaEvent_t syncEnerVirD2H;
+    /*! \brief Triggered after the output forces have been copied to the host (after the gathering stage). */
+    cudaEvent_t syncForcesD2H;
+    /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
+    cudaEvent_t syncSpreadGridD2H;
+    /*! \brief Triggered after the atom spline data has been copied to the host (after the spline computation). */
+    cudaEvent_t syncSplineAtomDataD2H;
+    /*! \brief Triggered after the grid has been copied to the host (after the solving stage). */
+    cudaEvent_t syncSolveGridD2H;
+
+    // TODO: consider moving some things below into the non-CUDA struct.
+
+    /* Settings which are set at the start of the run */
+    /*! \brief A boolean which tells whether the complex and real grids for cuFFT are different or the same. Currently true. */
+    bool performOutOfPlaceFFT;
+    /*! \brief A boolean which tells if the CUDA timing events are enabled.
+     *  True by default, disabled by setting the environment variable GMX_DISABLE_CUDA_TIMING.
+     *  FIXME: this should also be disabled if any other GPU task is running concurrently on the same device,
+     *  as CUDA events on multiple streams are untrustworthy.
+     */
+    bool useTiming;
+
+    std::vector<std::unique_ptr<GpuParallel3dFft> > fftSetup;
+
+    std::array timingEvents;
+
+    /* GPU arrays element counts (not the array sizes in bytes!).
+     * They might be larger than the actual meaningful data sizes.
+     * These are paired: the actual element count + the maximum element count that can fit in the current allocated memory.
+     * These integer pairs are mostly meaningful for the cu_realloc/free_buffered calls.
+     * As such, if cu_realloc/free_buffered is refactored, they can be freely changed, too.
+     * The only exceptions are realGridSize and complexGridSize which are also used for grid clearing/copying.
+     * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well.
+     */
+    /*! \brief The kernelParams.atoms.coordinates float element count (actual) */
+    int coordinatesSize;
+    /*! \brief The kernelParams.atoms.coordinates float element count (reserved) */
+    int coordinatesSizeAlloc;
+    /*! \brief The kernelParams.atoms.forces float element count (actual) */
+    int forcesSize;
+    /*! \brief The kernelParams.atoms.forces float element count (reserved) */
+    int forcesSizeAlloc;
+    /*! \brief The kernelParams.atoms.gridlineIndices int element count (actual) */
+    int gridlineIndicesSize;
+    /*! \brief The kernelParams.atoms.gridlineIndices int element count (reserved) */
+    int gridlineIndicesSizeAlloc;
+    /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (actual) */
+    int splineDataSize;
+    /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */
+    int splineDataSizeAlloc;
+    /*! \brief The kernelParams.atoms.coefficients float element count (actual) */
+    int coefficientsSize;
+    /*!
\brief The kernelParams.atoms.coefficients float element count (reserved) */ + int coefficientsSizeAlloc; + /*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */ + int splineValuesSize; + /*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */ + int splineValuesSizeAlloc; + /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (actual) */ + int fractShiftsSize; + /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (reserved) */ + int fractShiftsSizeAlloc; + /*! \brief The kernelParams.grid.realGrid float element count (actual) */ + int realGridSize; + /*! \brief The kernelParams.grid.realGrid float element count (reserved) */ + int realGridSizeAlloc; + /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */ + int complexGridSize; + /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (reserved) */ + int complexGridSizeAlloc; +}; + + +/*! \brief \internal + * A single structure encompassing all the PME data used in CUDA kernels. + * This inherits from pme_gpu_kernel_params_base_t and adds a couple cudaTextureObject_t handles, + * which we would like to avoid in plain C++. + */ +struct pme_gpu_cuda_kernel_params_t : pme_gpu_kernel_params_base_t +{ + /* These are CUDA texture objects, related to the grid size. */ + /*! \brief CUDA texture object for accessing grid.d_fractShiftsTable */ + cudaTextureObject_t fractShiftsTableTexture; + /*! \brief CUDA texture object for accessing grid.d_gridlineIndicesTable */ + cudaTextureObject_t gridlineIndicesTableTexture; +}; + +/* CUDA texture functions which will reside in respective kernel files + * (due to texture references having scope of a translation unit). + */ + +/*! \brief \internal + * Creates/binds 2 textures used in the spline parameter computation. + * + * \param[in, out] pmeGPU The PME GPU structure. + */ +inline void pme_gpu_make_fract_shifts_textures(pme_gpu_t gmx_unused *pmeGpu){}; + +/*! \brief \internal + * Frees/unbinds 2 textures used in the spline parameter computation. + * + * \param[in] pmeGPU The PME GPU structure. + */ +inline void pme_gpu_free_fract_shifts_textures(const pme_gpu_t gmx_unused *pmeGpu){}; + +#endif diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 1b6c0551db..ad25ca3abb 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -52,15 +52,24 @@ #include "gromacs/gmxlib/nrnb.h" #include "gromacs/math/vectypes.h" -#include "gromacs/mdtypes/forcerec.h" #include "gromacs/mdtypes/interaction_const.h" #include "gromacs/timing/wallcycle.h" #include "gromacs/timing/walltime_accounting.h" #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/real.h" +#include "pme-gpu-types.h" + struct t_commrec; struct t_inputrec; +struct pme_gpu_t; +struct gmx_wallclock_gpu_pme_t; +struct gmx_device_info_t; + +namespace gmx +{ +class MDLogger; +} enum { GMX_SUM_GRID_FORWARD, GMX_SUM_GRID_BACKWARD @@ -95,7 +104,11 @@ int gmx_pme_init(struct gmx_pme_t **pmedata, struct t_commrec *cr, gmx_bool bFreeEnergy_q, gmx_bool bFreeEnergy_lj, gmx_bool bReproducible, real ewaldcoeff_q, real ewaldcoeff_lj, - int nthread); + int nthread, + PmeRunMode runMode, + pme_gpu_t *pmeGPU, + gmx_device_info_t *gpuInfo, + const gmx::MDLogger &mdlog); /*! 
\brief Destroys the PME data structure.*/ void gmx_pme_destroy(gmx_pme_t *pme); @@ -115,7 +128,7 @@ void gmx_pme_destroy(gmx_pme_t *pme); #define GMX_PME_DO_ALL_F (GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_F) //@} -/*! \brief Do a PME calculation for the long range electrostatics and/or LJ. +/*! \brief Do a PME calculation on a CPU for the long range electrostatics and/or LJ. * * The meaning of \p flags is defined above, and determines which * parts of the calculation are performed. @@ -183,4 +196,60 @@ void gmx_pme_receive_f(struct t_commrec *cr, real *dvdlambda_q, real *dvdlambda_lj, float *pme_cycles); +/*! \brief + * This function updates the local atom data on GPU after DD (charges, coordinates, etc.). + * TODO: it should update the PME CPU atom data as well. + * (currently PME CPU call gmx_pme_do() gets passed the input pointers each step). + * + * \param[in] pme The PME structure. + * \param[in] nAtoms The number of particles. + * \param[in] charges The pointer to the array of particle charges. + */ +void gmx_pme_reinit_atoms(const gmx_pme_t *pme, const int nAtoms, const real *charges); + +/* A block of PME GPU functions */ + +/*! \brief + * Tells if PME is enabled to run on GPU (not necessarily active at the moment). + * For now, this decision is stored in the PME structure itself. + * FIXME: this is an information that should be managed by the task scheduler. + * As soon as such functionality appears, this function should be removed from this module. + * + * \param[in] pme The PME data structure. + * \returns true if PME can run on GPU, false otherwise. + */ +bool pme_gpu_task_enabled(const gmx_pme_t *pme); + +/*! \brief + * Resets the PME GPU timings. To be called at the reset step. + * + * \param[in] pme The PME structure. + */ +void pme_gpu_reset_timings(const gmx_pme_t *pme); + +/*! \brief + * Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end. + * + * \param[in] pme The PME structure. + * \param[in] timings The gmx_wallclock_gpu_pme_t structure. + */ +void pme_gpu_get_timings(const gmx_pme_t *pme, + gmx_wallclock_gpu_pme_t *timings); + +/* The main PME GPU functions */ + +/*! \brief + * Gets the output forces and virial/energy if corresponding flags are (were?) passed in. + * + * \param[in] pme The PME data structure. + * \param[in] wcycle The wallclock counter. + * \param[out] vir_q The output virial matrix. + * \param[out] energy_q The output energy. 
+ */ +void pme_gpu_get_results(const gmx_pme_t *pme, + gmx_wallcycle_t wcycle, + matrix vir_q, + real *energy_q); + + #endif diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index 475b84412f..d1cafc59d4 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -53,9 +53,11 @@ #include "gromacs/ewald/pme-spread.h" #include "gromacs/fft/parallel_3dfft.h" #include "gromacs/math/invertmatrix.h" +#include "gromacs/mdtypes/commrec.h" #include "gromacs/pbcutil/pbc.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/logger.h" #include "gromacs/utility/stringutil.h" namespace gmx @@ -71,9 +73,12 @@ static PmeSafePointer pmeInitInternal(const t_inputrec *inputRec, real ewaldCoeff_lj = 1.0f ) { - gmx_pme_t *pmeDataRaw = nullptr; - gmx_pme_init(&pmeDataRaw, nullptr, 1, 1, inputRec, - atomCount, false, false, true, ewaldCoeff_q, ewaldCoeff_lj, 1); + gmx_pme_t *pmeDataRaw = nullptr; + const MDLogger dummyLogger; + const auto runMode = PmeRunMode::CPU; + t_commrec dummyCommrec = {0}; + gmx_pme_init(&pmeDataRaw, &dummyCommrec, 1, 1, inputRec, atomCount, false, false, true, + ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, nullptr, dummyLogger); PmeSafePointer pme(pmeDataRaw); // taking ownership // TODO get rid of this with proper matrix type diff --git a/src/gromacs/gpu_utils/gpuregiontimer.h b/src/gromacs/gpu_utils/gpuregiontimer.h index 2cbbe575d2..e453a5e18b 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer.h +++ b/src/gromacs/gpu_utils/gpuregiontimer.h @@ -77,6 +77,7 @@ template struct GpuTraits /*! \libinternal \brief * This is a GPU region timing implementation interface. * It should provide methods for measuring the last timespan. + * Copying/assignment is disabled since the underlying timing events are owned by this. */ template class GpuRegionTimerImpl { @@ -85,8 +86,15 @@ template class GpuRegionTimerImpl using CommandEvent = typename GpuTraits::CommandEvent; public: + GpuRegionTimerImpl() = default; ~GpuRegionTimerImpl() = default; + //! No copying + GpuRegionTimerImpl(const GpuRegionTimerImpl &) = delete; + //! No assignment + GpuRegionTimerImpl &operator=(GpuRegionTimerImpl &&) = delete; + //! Moving is disabled but can be considered in the future if needed + GpuRegionTimerImpl(GpuRegionTimerImpl &&) = delete; /*! \brief Will be called before the region start. */ inline void openTimingRegion(CommandStream) = 0; @@ -121,21 +129,17 @@ template class GpuRegionTimerWrapper Idle, Recording, Stopped - } debugState_; + } debugState_ = TimerState::Idle; //! The number of times the timespan has been measured - unsigned int callCount_; + unsigned int callCount_ = 0; //! The accumulated duration of the timespans measured (milliseconds) - double totalMilliseconds_; + double totalMilliseconds_ = 0.0; //! The underlying region timer implementation GpuRegionTimerImpl impl_; public: - GpuRegionTimerWrapper() : impl_() - { - reset(); - } - ~GpuRegionTimerWrapper() = default; + /*! \brief * To be called before the region start. 
* diff --git a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h index e0b4b3f87f..fc64d04e96 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h +++ b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h @@ -75,8 +75,7 @@ template <> class GpuRegionTimerImpl size_t currentEvent_ = 0; public: - GpuRegionTimerImpl() = default; - ~GpuRegionTimerImpl() = default; + inline void openTimingRegion(CommandStream){} inline void closeTimingRegion(CommandStream){} diff --git a/src/gromacs/mdlib/mdsetup.cpp b/src/gromacs/mdlib/mdsetup.cpp index bbc42e5762..1876ab2f9e 100644 --- a/src/gromacs/mdlib/mdsetup.cpp +++ b/src/gromacs/mdlib/mdsetup.cpp @@ -38,6 +38,7 @@ #include "gromacs/domdec/domdec.h" #include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/pme.h" #include "gromacs/listed-forces/manage-threading.h" #include "gromacs/mdlib/mdatoms.h" #include "gromacs/mdlib/shellfc.h" @@ -139,4 +140,11 @@ void mdAlgorithmsSetupAtomData(t_commrec *cr, } setup_bonded_threading(fr, &top->idef); + + gmx_pme_reinit_atoms(fr->pmedata, numHomeAtoms, mdatoms->chargeA); + /* This handles the PP+PME rank case where fr->pmedata is valid. + * For PME-only ranks, gmx_pmeonly() has its own call to gmx_pme_reinit_atoms(). + * TODO: this only handles the GPU logic so far, should handle CPU as well. + * TODO: this also does not account for TPI. + */ } diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index ed6d41064d..3f79587105 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -774,9 +774,9 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, * \param[inout] timings GPU task timing data * \param[in] iloc interaction locality */ -static void countPruneKernelTime(cu_timers_t *timers, - gmx_wallclock_gpu_t *timings, - const int iloc) +static void countPruneKernelTime(cu_timers_t *timers, + gmx_wallclock_gpu_nbnxn_t *timings, + const int iloc) { // We might have not done any pruning (e.g. if we skipped with empty domains). if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc]) @@ -821,13 +821,13 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb, gmx_incons(stmp); } - cu_plist_t *plist = nb->plist[iloc]; - cu_timers_t *timers = nb->timers; - struct gmx_wallclock_gpu_t *timings = nb->timings; - nb_staging nbst = nb->nbst; + cu_plist_t *plist = nb->plist[iloc]; + cu_timers_t *timers = nb->timers; + struct gmx_wallclock_gpu_nbnxn_t *timings = nb->timings; + nb_staging nbst = nb->nbst; - bool bCalcEner = flags & GMX_FORCE_ENERGY; - bool bCalcFshift = flags & GMX_FORCE_VIRIAL; + bool bCalcEner = flags & GMX_FORCE_ENERGY; + bool bCalcFshift = flags & GMX_FORCE_VIRIAL; /* turn energy calculation always on/off (for debugging/testing only) */ bCalcEner = (bCalcEner || always_ener) && !never_ener; diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 9500e4b78f..e01e2a66fa 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -397,7 +397,7 @@ static void init_timers(cu_timers_t *t, bool bUseTwoStreams) } /*! Initializes the timings data structure. */ -static void init_timings(gmx_wallclock_gpu_t *t) +static void init_timings(gmx_wallclock_gpu_nbnxn_t *t) { int i, j; @@ -825,9 +825,10 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) } } -gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb) +//! 
This function is documented in the header file +gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb) { - return (nb != NULL && nb->bDoTime) ? nb->timings : NULL; + return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr; } void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv) diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h index 0e0be3c1e1..56e8270e60 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h @@ -274,9 +274,9 @@ struct gmx_nbnxn_cuda_t * concurrent streams, so we won't time if both l/nl work is done on GPUs. * Timer init/uninit is still done even with timing off so only the condition * setting bDoTime needs to be change if this CUDA "feature" gets fixed. */ - bool bDoTime; /**< True if event-based timing is enabled. */ - cu_timers_t *timers; /**< CUDA event-based timers. */ - gmx_wallclock_gpu_t *timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */ + bool bDoTime; /**< True if event-based timing is enabled. */ + cu_timers_t *timers; /**< CUDA event-based timers. */ + gmx_wallclock_gpu_nbnxn_t *timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */ }; #ifdef __cplusplus diff --git a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h index 841574e399..3211a9ec26 100644 --- a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h +++ b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h @@ -55,8 +55,9 @@ struct nonbonded_verlet_group_t; struct nbnxn_pairlist_t; struct nbnxn_atomdata_t; struct NbnxnListParameters; -struct gmx_wallclock_gpu_t; +struct gmx_wallclock_gpu_nbnxn_t; struct gmx_gpu_info_t; +struct gmx_device_info_t; /** Initializes the data structures related to GPU nonbonded calculations. */ GPU_FUNC_QUALIFIER @@ -104,7 +105,7 @@ void nbnxn_gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM /** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */ GPU_FUNC_QUALIFIER -struct gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(NULL) +struct gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(nullptr) /** Resets nonbonded GPU timings. */ GPU_FUNC_QUALIFIER diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp index 2e1102cef1..881688d14f 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp @@ -972,9 +972,9 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, * \param[inout] timings GPU task timing data * \param[in] iloc interaction locality */ -static void countPruneKernelTime(cl_timers_t *timers, - gmx_wallclock_gpu_t *timings, - const int iloc) +static void countPruneKernelTime(cl_timers_t *timers, + gmx_wallclock_gpu_nbnxn_t *timings, + const int iloc) { // We might have not done any pruning (e.g. if we skipped with empty domains). 
if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc]) @@ -1024,13 +1024,13 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb, gmx_incons(stmp); } - cl_plist_t *plist = nb->plist[iloc]; - cl_timers_t *timers = nb->timers; - struct gmx_wallclock_gpu_t *timings = nb->timings; - cl_nb_staging nbst = nb->nbst; + cl_plist_t *plist = nb->plist[iloc]; + cl_timers_t *timers = nb->timers; + struct gmx_wallclock_gpu_nbnxn_t *timings = nb->timings; + cl_nb_staging nbst = nb->nbst; - bool bCalcEner = flags & GMX_FORCE_ENERGY; - int bCalcFshift = flags & GMX_FORCE_VIRIAL; + bool bCalcEner = flags & GMX_FORCE_ENERGY; + int bCalcFshift = flags & GMX_FORCE_VIRIAL; /* turn energy calculation always on/off (for debugging/testing only) */ bCalcEner = (bCalcEner || always_ener) && !never_ener; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index da63c733c4..4da8909363 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -554,7 +554,7 @@ static void init_timers(cl_timers_t *t, /*! \brief Initializes the timings data structure. */ -static void init_timings(gmx_wallclock_gpu_t *t) +static void init_timings(gmx_wallclock_gpu_nbnxn_t *t) { int i, j; @@ -1239,11 +1239,10 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) } } - //! This function is documented in the header file -gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb) +gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb) { - return (nb != NULL && nb->bDoTime) ? nb->timings : NULL; + return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr; } //! This function is documented in the header file diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h index 0e9d527cee..4e1b64c6c0 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h @@ -359,16 +359,16 @@ struct gmx_nbnxn_ocl_t cl_command_queue stream[2]; /**< local and non-local GPU queues */ /** events used for synchronization */ - cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel - is done (and the local transfer can proceed) */ - cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in - the local stream that need to precede the - non-local force calculations are done - (e.g. f buffer 0-ing, local x/q H2D) */ - - cl_bool bDoTime; /**< True if event-based timing is enabled. */ - cl_timers_t *timers; /**< OpenCL event-based timers. */ - struct gmx_wallclock_gpu_t *timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */ + cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel + is done (and the local transfer can proceed) */ + cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in + the local stream that need to precede the + non-local force calculations are done + (e.g. f buffer 0-ing, local x/q H2D) */ + + cl_bool bDoTime; /**< True if event-based timing is enabled. */ + cl_timers_t *timers; /**< OpenCL event-based timers. */ + struct gmx_wallclock_gpu_nbnxn_t *timings; /**< Timing data. 
TODO: deprecate this and query timers for accumulated data instead */ }; #ifdef __cplusplus diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index b0267051a3..5278ad6873 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -2436,6 +2436,7 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr, t_nrnb nrnb[], gmx_wallcycle_t wcycle, gmx_walltime_accounting_t walltime_accounting, nonbonded_verlet_t *nbv, + gmx_pme_t *pme, gmx_bool bWriteStat) { t_nrnb *nrnb_tot = nullptr; @@ -2522,11 +2523,17 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr, if (printReport) { - struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr; - + auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr; + gmx_wallclock_gpu_pme_t pme_gpu_timings = {}; + if (pme_gpu_task_enabled(pme)) + { + pme_gpu_get_timings(pme, &pme_gpu_timings); + } wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme, elapsed_time_over_all_ranks, - wcycle, cycle_sum, gputimes); + wcycle, cycle_sum, + nbnxn_gpu_timings, + &pme_gpu_timings); if (EI_DYNAMICS(inputrec->eI)) { diff --git a/src/gromacs/mdlib/sim_util.h b/src/gromacs/mdlib/sim_util.h index cc3bd29c10..310e9859b5 100644 --- a/src/gromacs/mdlib/sim_util.h +++ b/src/gromacs/mdlib/sim_util.h @@ -126,6 +126,7 @@ void finish_run(FILE *log, const gmx::MDLogger &mdlog, t_commrec *cr, t_nrnb nrnb[], gmx_wallcycle_t wcycle, gmx_walltime_accounting_t walltime_accounting, nonbonded_verlet_t *nbv, + gmx_pme_t *pme, gmx_bool bWriteStat); void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr); diff --git a/src/gromacs/timing/gpu_timing.h b/src/gromacs/timing/gpu_timing.h index e9ec67d855..214b0120eb 100644 --- a/src/gromacs/timing/gpu_timing.h +++ b/src/gromacs/timing/gpu_timing.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2014,2015,2017, by the GROMACS development team, led by + * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -47,26 +47,52 @@ extern "C" { #endif -/*! \internal \brief Nonbonded kernel time and call count. */ -struct gmx_nbnxn_kernel_timing_data_t +/*! \internal \brief GPU kernel time and call count. */ +struct gmx_kernel_timing_data_t { double t; /**< Accumulated lapsed time */ int c; /**< Number of calls corresponding to the elapsed time */ }; -/*! \internal \brief GPU timings for kernels and H2d/D2H transfers. */ -struct gmx_wallclock_gpu_t +/*! \internal \brief + * + * PME GPU stages timing events indices, corresponding to the string in PMEStageNames in wallcycle.cpp. + */ +enum +{ + gtPME_SPLINE = 0, + gtPME_SPREAD, + gtPME_SPLINEANDSPREAD, + gtPME_FFT_R2C, + gtPME_SOLVE, + gtPME_FFT_C2R, + gtPME_GATHER, + gtPME_EVENT_COUNT /* not a stage ID but a static array size */ +}; + +/*! \internal \brief GPU timings for PME. */ +struct gmx_wallclock_gpu_pme_t +{ + /* A separate PME structure to avoid refactoring the NB code for gmx_wallclock_gpu_t later + * TODO: devise a better GPU timing data structuring. + */ + /*! \brief Array of PME GPU timing data. */ + gmx_kernel_timing_data_t timing[gtPME_EVENT_COUNT]; +}; + +/*! \internal \brief GPU NB timings for kernels and H2d/D2H transfers. 
*/ +struct gmx_wallclock_gpu_nbnxn_t { - gmx_nbnxn_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four + gmx_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four versions of the nonbonded kernels: force-only, force+energy, force+pruning, and force+energy+pruning */ - gmx_nbnxn_kernel_timing_data_t pruneTime; /**< table containing the timings of the 1st pass prune-only kernels */ - gmx_nbnxn_kernel_timing_data_t dynamicPruneTime; /**< table containing the timings of dynamic prune-only kernels */ - double nb_h2d_t; /**< host to device transfer time in nb calculation */ - double nb_d2h_t; /**< device to host transfer time in nb calculation */ - int nb_c; /**< total call count of the nonbonded gpu operations */ - double pl_h2d_t; /**< pair search step host to device transfer time */ - int pl_h2d_c; /**< pair search step host to device transfer call count */ + gmx_kernel_timing_data_t pruneTime; /**< table containing the timings of the 1st pass prune-only kernels */ + gmx_kernel_timing_data_t dynamicPruneTime; /**< table containing the timings of dynamic prune-only kernels */ + double nb_h2d_t; /**< host to device transfer time in nb calculation */ + double nb_d2h_t; /**< device to host transfer time in nb calculation */ + int nb_c; /**< total call count of the nonbonded gpu operations */ + double pl_h2d_t; /**< pair search step host to device transfer time */ + int pl_h2d_c; /**< pair search step host to device transfer call count */ }; #ifdef __cplusplus diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp index efa296bdff..2ad431a09d 100644 --- a/src/gromacs/timing/wallcycle.cpp +++ b/src/gromacs/timing/wallcycle.cpp @@ -105,7 +105,7 @@ static const char *wcn[ewcNR] = "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.", "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh", "PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec", - "PME wait for PP", "Wait + Recv. PME F", "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.", + "PME wait for PP", "Wait + Recv. PME F", "Wait PME GPU spread", "Wait PME GPU gather", "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.", "Vsite spread", "COM pull force", "Write traj.", "Update", "Constraints", "Comm. energies", "Enforced rotation", "Add rot. 
forces", "Position swapping", "IMD", "Test" @@ -128,6 +128,18 @@ static const char *wcsn[ewcsNR] = "NB F buffer ops.", }; +/* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */ +static const char *PMEStageNames[] = +{ + "Spline", + "Spread", + "Spline/spread", + "FFT r2c", + "Solve", + "FFT c2r", + "Gather", +}; + gmx_bool wallcycle_have_counter(void) { return gmx_cycles_have_counter(); @@ -717,9 +729,10 @@ static void print_header(FILE *fplog, int nrank_pp, int nth_pp, int nrank_pme, i void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int npme, int nth_pp, int nth_pme, double realtime, gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum, - struct gmx_wallclock_gpu_t *gpu_t) + const gmx_wallclock_gpu_nbnxn_t *gpu_nbnxn_t, + const gmx_wallclock_gpu_pme_t *gpu_pme_t) { - double tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k; + double tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio; double c2t, c2t_pp, c2t_pme = 0; int i, j, npp, nth_tot; char buf[STRLEN]; @@ -876,25 +889,31 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np } /* print GPU timing summary */ - if (gpu_t) + double tot_gpu = 0.0; + if (gpu_pme_t) + { + for (size_t k = 0; k < gtPME_EVENT_COUNT; k++) + { + tot_gpu += gpu_pme_t->timing[k].t; + } + } + if (gpu_nbnxn_t) { const char *k_log_str[2][2] = { {"Nonbonded F kernel", "Nonbonded F+ene k."}, {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."} }; - - tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t; + tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t; /* add up the kernel timings */ - tot_k = 0.0; for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { - tot_k += gpu_t->ktime[i][j].t; + tot_gpu += gpu_nbnxn_t->ktime[i][j].t; } } - tot_gpu += tot_k + gpu_t->pruneTime.t; + tot_gpu += gpu_nbnxn_t->pruneTime.t; tot_cpu_overlap = wc->wcc[ewcFORCE].c; if (wc->wcc[ewcPMEMESH].n > 0) @@ -907,44 +926,57 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np fprintf(fplog, " Computing: Count Wall t (s) ms/step %c\n", '%'); fprintf(fplog, "%s\n", hline); print_gputimes(fplog, "Pair list H2D", - gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu); + gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu); print_gputimes(fplog, "X / q H2D", - gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu); + gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu); for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { - if (gpu_t->ktime[i][j].c) + if (gpu_nbnxn_t->ktime[i][j].c) { print_gputimes(fplog, k_log_str[i][j], - gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu); + gpu_nbnxn_t->ktime[i][j].c, gpu_nbnxn_t->ktime[i][j].t, tot_gpu); } } } - if (gpu_t->pruneTime.c) + if (gpu_pme_t) { - print_gputimes(fplog, "Pruning kernel", gpu_t->pruneTime.c, gpu_t->pruneTime.t, tot_gpu); + for (size_t k = 0; k < gtPME_EVENT_COUNT; k++) + { + if (gpu_pme_t->timing[k].c) + { + print_gputimes(fplog, PMEStageNames[k], + gpu_pme_t->timing[k].c, + gpu_pme_t->timing[k].t, + tot_gpu); + } + } } - print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu); + if (gpu_nbnxn_t->pruneTime.c) + { + print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu); + } + print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu); fprintf(fplog, "%s\n", hline); - print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu); + print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, 
tot_gpu); fprintf(fplog, "%s\n", hline); - if (gpu_t->dynamicPruneTime.c) + if (gpu_nbnxn_t->dynamicPruneTime.c) { /* We print the dynamic pruning kernel timings after a separator * and avoid adding it to tot_gpu as this is not in the force * overlap. We print the fraction as relative to the rest. */ - print_gputimes(fplog, "*Dynamic pruning", gpu_t->dynamicPruneTime.c, gpu_t->dynamicPruneTime.t, tot_gpu); + print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c, gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu); fprintf(fplog, "%s\n", hline); } - gpu_cpu_ratio = tot_gpu/tot_cpu_overlap; - if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0) + if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0) { + // FIXME the code below is not updated for PME on GPU fprintf(fplog, "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = %.3f\n", - tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n, + tot_gpu/gpu_nbnxn_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n, gpu_cpu_ratio); } diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index 8ad165514a..01cec41b8f 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -52,13 +52,12 @@ enum { ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU, ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH, ewcPME_REDISTXF, ewcPME_SPREAD, ewcPME_GATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE, - ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS, + ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_PME_SPREAD, ewcWAIT_GPU_PME_GATHER, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS, ewcVSITESPREAD, ewcPULLPOT, ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD, ewcTEST, ewcNR }; - enum { ewcsDD_REDIST, ewcsDD_GRID, ewcsDD_SETUPCOMM, ewcsDD_MAKETOP, ewcsDD_MAKECONSTR, ewcsDD_TOPOTHER, diff --git a/src/gromacs/timing/wallcyclereporting.h b/src/gromacs/timing/wallcyclereporting.h index e84243fe8c..632ea53d29 100644 --- a/src/gromacs/timing/wallcyclereporting.h +++ b/src/gromacs/timing/wallcyclereporting.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2008, The GROMACS development team. - * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -54,7 +54,8 @@ class MDLogger; } typedef struct gmx_wallcycle *gmx_wallcycle_t; -typedef struct gmx_wallclock_gpu_t gmx_wallclock_gpu_t; +struct gmx_wallclock_gpu_nbnxn_t; +struct gmx_wallclock_gpu_pme_t; typedef std::array WallcycleCounts; /* Convenience typedef */ @@ -66,7 +67,8 @@ WallcycleCounts wallcycle_sum(struct t_commrec *cr, gmx_wallcycle_t wc); void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int npme, int nth_pp, int nth_pme, double realtime, gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum, - struct gmx_wallclock_gpu_t *gpu_t); + const gmx_wallclock_gpu_nbnxn_t *gpu_nbnxn_t, + const gmx_wallclock_gpu_pme_t *gpu_pme_t); /* Print the cycle and time accounting */ #endif diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp index 70f87b5390..e96e080667 100644 --- a/src/programs/mdrun/md.cpp +++ b/src/programs/mdrun/md.cpp @@ -163,7 +163,8 @@ static void reset_all_counters(FILE *fplog, const gmx::MDLogger &mdlog, t_commre gmx_int64_t *step_rel, t_inputrec *ir, gmx_wallcycle_t wcycle, t_nrnb *nrnb, gmx_walltime_accounting_t walltime_accounting, - struct nonbonded_verlet_t *nbv) + struct nonbonded_verlet_t *nbv, + struct gmx_pme_t *pme) { char sbuf[STEPSTRSIZE]; @@ -175,6 +176,15 @@ static void reset_all_counters(FILE *fplog, const gmx::MDLogger &mdlog, t_commre if (use_GPU(nbv)) { nbnxn_gpu_reset_timings(nbv); + } + + if (pme_gpu_task_enabled(pme)) + { + pme_gpu_reset_timings(pme); + } + + if (use_GPU(nbv) || pme_gpu_task_enabled(pme)) + { resetGpuProfiler(); } @@ -1861,7 +1871,7 @@ double gmx::do_md(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog, "mdrun -resetstep.", step); } reset_all_counters(fplog, mdlog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting, - use_GPU(fr->nbv) ? fr->nbv : nullptr); + use_GPU(fr->nbv) ? fr->nbv : nullptr, fr->pmedata); wcycle_set_reset_counters(wcycle, -1); if (!(cr->duty & DUTY_PME)) { diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp index 9255754315..316c2e3f28 100644 --- a/src/programs/mdrun/runner.cpp +++ b/src/programs/mdrun/runner.cpp @@ -1080,11 +1080,14 @@ int Mdrunner::mdrunner() { try { + gmx_device_info_t *pmeGpuInfo = nullptr; + auto runMode = PmeRunMode::CPU; status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec, mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed, mdrunOptions.reproducible, ewaldcoeff_q, ewaldcoeff_lj, - nthreads_pme); + nthreads_pme, + runMode, nullptr, pmeGpuInfo, mdlog); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; if (status != 0) @@ -1186,6 +1189,7 @@ int Mdrunner::mdrunner() finish_run(fplog, mdlog, cr, inputrec, nrnb, wcycle, walltime_accounting, fr ? fr->nbv : nullptr, + fr ? fr->pmedata : nullptr, EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr)); // Free PME data -- 2.22.0
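The per-warp theta/dtheta layout documented in pme.cuh above boils down to a bit of integer arithmetic. The following standalone host-side sketch is not part of the patch; the constants and variable names are illustrative and simply mirror the example in that comment (PME order 4, warp size 32).

// Standalone illustration (not GROMACS code): the sizing arithmetic behind
// PME_SPREADGATHER_THREADS_PER_ATOM and PME_SPREADGATHER_ATOMS_PER_WARP
// for the example used in the pme.cuh layout comment.
#include <cstdio>

int main()
{
    const int order              = 4;
    const int warpSize           = 32;                        // warp_size from cuda_arch_utils.cuh
    const int threadsPerAtom     = order * order;             // 16 threads handle one atom
    const int atomsPerWarp       = warpSize / threadsPerAtom; // 2 atoms per warp/data chunk
    const int thetaFloatsPerWarp = atomsPerWarp * order * 3;  // 24 floats for theta (same for dtheta)
    std::printf("threads/atom: %d, atoms/warp: %d, theta floats/warp: %d, theta+dtheta: %d\n",
                threadsPerAtom, atomsPerWarp, thetaFloatsPerWarp, 2 * thetaFloatsPerWarp);
    return 0;
}

This reproduces the numbers quoted in the comment: 24 floats per warp for theta, 48 (1.5 warp sizes) for theta and dtheta together.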
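pme.cuh also states that pme_gpu_check_atom_data_index() and pme_gpu_check_atom_charge() are called from the spline_and_spread and gather kernels, which live in separate files not shown here. A minimal, hypothetical kernel fragment of that guard pattern is sketched below; the kernel name, parameters, and the trivial store are assumptions for illustration only, and the fragment assumes pme.cuh is on the include path.

// Hypothetical usage sketch (not part of the patch): how a spread/gather-style
// kernel is expected to use the inline guards defined in pme.cuh.
#include "pme.cuh"

__global__ void exampleSpreadGuard(const float *gm_coefficients, int nAtoms, float *gm_result)
{
    const int atomIndex = blockIdx.x * blockDim.x + threadIdx.x;
    /* Out-of-range atoms are skipped, unless c_usePadding guarantees that the
     * padded entries are valid to read (in which case the check compiles away). */
    if (!pme_gpu_check_atom_data_index(atomIndex, nAtoms))
    {
        return;
    }
    const float coefficient = gm_coefficients[atomIndex];
    /* Optionally skip neutral atoms - they contribute nothing to the grid. */
    if (!pme_gpu_check_atom_charge(coefficient))
    {
        return;
    }
    /* The spline computation and grid spreading would follow here. */
    gm_result[atomIndex] = coefficient;
}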
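Finally, the new gmx_wallclock_gpu_pme_t structure is just a fixed array of per-stage time/call counters indexed by the gtPME_* enum, and wallcycle_print() reduces it to a total before printing. A host-side paraphrase of that loop is below; the helper function name is not part of the patch.

// Paraphrase of the accumulation done in wallcycle_print(): sum the
// per-stage PME GPU timings into a single total.
#include "gromacs/timing/gpu_timing.h"

static double pmeGpuTotalTime(const gmx_wallclock_gpu_pme_t &pmeTimings)
{
    double total = 0.0;
    for (int k = 0; k < gtPME_EVENT_COUNT; k++)
    {
        total += pmeTimings.timing[k].t; // accumulated time of stage k
    }
    return total;
}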