--- a/src/gromacs/CMakeLists.txt
+++ b/src/gromacs/CMakeLists.txt
+ target_link_libraries(libgromacs PRIVATE ${CUDA_CUFFT_LIBRARIES})
else()
add_library(libgromacs ${LIBGROMACS_SOURCES})
endif()
--- a/src/gromacs/ewald/CMakeLists.txt
+++ b/src/gromacs/ewald/CMakeLists.txt
file(GLOB EWALD_SOURCES *.cpp)
set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${EWALD_SOURCES} PARENT_SCOPE)
+if (GMX_USE_CUDA)
+ file(GLOB EWALD_CUDA_SOURCES *.cu)
+ gmx_add_libgromacs_sources(${EWALD_CUDA_SOURCES})
+endif()
if (BUILD_TESTING)
add_subdirectory(tests)
--- /dev/null
+++ b/src/gromacs/ewald/pme-3dfft.cu
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements CUDA FFT routines for PME GPU.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#include "gmxpre.h"
+
+#include "pme-3dfft.cuh"
+
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+
+#include "pme.cuh"
+#include "pme-gpu-types.h"
+
+static void handleCufftError(cufftResult_t status, const char *msg)
+{
+ if (status != CUFFT_SUCCESS)
+ {
+ gmx_fatal(FARGS, "%s (error code %d)\n", msg, status);
+ }
+}
+
+GpuParallel3dFft::GpuParallel3dFft(const pme_gpu_t *pmeGPU)
+{
+ const pme_gpu_cuda_kernel_params_t *kernelParamsPtr = pmeGPU->kernelParams.get();
+ ivec realGridSize, realGridSizePadded, complexGridSizePadded;
+ for (int i = 0; i < DIM; i++)
+ {
+ realGridSize[i] = kernelParamsPtr->grid.realGridSize[i];
+ realGridSizePadded[i] = kernelParamsPtr->grid.realGridSizePadded[i];
+ complexGridSizePadded[i] = kernelParamsPtr->grid.complexGridSizePadded[i];
+ }
+
+ GMX_RELEASE_ASSERT(!pme_gpu_uses_dd(pmeGPU), "FFT decomposition not implemented");
+
+ const int complexGridSizePaddedTotal = complexGridSizePadded[XX] * complexGridSizePadded[YY] * complexGridSizePadded[ZZ];
+ const int realGridSizePaddedTotal = realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ];
+
+ realGrid_ = (cufftReal *)kernelParamsPtr->grid.d_realGrid;
+ GMX_RELEASE_ASSERT(realGrid_, "Bad (null) input real-space grid");
+ complexGrid_ = (cufftComplex *)kernelParamsPtr->grid.d_fourierGrid;
+ GMX_RELEASE_ASSERT(complexGrid_, "Bad (null) input complex grid");
+
+ cufftResult_t result;
+ /* Commented code for a simple 3D grid with no padding */
+ /*
+ result = cufftPlan3d(&planR2C_, realGridSize[XX], realGridSize[YY], realGridSize[ZZ], CUFFT_R2C);
+ handleCufftError(result, "cufftPlan3d R2C plan failure");
+
+ result = cufftPlan3d(&planC2R_, realGridSize[XX], realGridSize[YY], realGridSize[ZZ], CUFFT_C2R);
+ handleCufftError(result, "cufftPlan3d C2R plan failure");
+ */
+
+ const int rank = 3, batch = 1;
+ result = cufftPlanMany(&planR2C_, rank, realGridSize,
+ realGridSizePadded, 1, realGridSizePaddedTotal,
+ complexGridSizePadded, 1, complexGridSizePaddedTotal,
+ CUFFT_R2C,
+ batch);
+ handleCufftError(result, "cufftPlanMany R2C plan failure");
+
+ result = cufftPlanMany(&planC2R_, rank, realGridSize,
+ complexGridSizePadded, 1, complexGridSizePaddedTotal,
+ realGridSizePadded, 1, realGridSizePaddedTotal,
+ CUFFT_C2R,
+ batch);
+ handleCufftError(result, "cufftPlanMany C2R plan failure");
+
+ cudaStream_t stream = pmeGPU->archSpecific->pmeStream;
+ GMX_RELEASE_ASSERT(stream, "Using the default CUDA stream for PME cuFFT");
+
+ result = cufftSetStream(planR2C_, stream);
+ handleCufftError(result, "cufftSetStream R2C failure");
+
+ result = cufftSetStream(planC2R_, stream);
+ handleCufftError(result, "cufftSetStream C2R failure");
+}
+
+GpuParallel3dFft::~GpuParallel3dFft()
+{
+ cufftResult_t result;
+ result = cufftDestroy(planR2C_);
+ handleCufftError(result, "cufftDestroy R2C failure");
+ result = cufftDestroy(planC2R_);
+ handleCufftError(result, "cufftDestroy C2R failure");
+}
+
+void GpuParallel3dFft::perform3dFft(gmx_fft_direction dir)
+{
+ cufftResult_t result;
+ if (dir == GMX_FFT_REAL_TO_COMPLEX)
+ {
+ result = cufftExecR2C(planR2C_, realGrid_, complexGrid_);
+ handleCufftError(result, "cuFFT R2C execution failure");
+ }
+ else
+ {
+ result = cufftExecC2R(planC2R_, complexGrid_, realGrid_);
+ handleCufftError(result, "cuFFT C2R execution failure");
+ }
+}
+
+/*! \brief Performs the 3D FFT on the given PME GPU grid in the given direction, wrapped in timing events. */
+void pme_gpu_3dfft(const pme_gpu_t *pmeGPU, gmx_fft_direction dir, int grid_index)
+{
+ int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;
+ pme_gpu_start_timing(pmeGPU, timerId);
+ pmeGPU->archSpecific->fftSetup[grid_index]->perform3dFft(dir);
+ pme_gpu_stop_timing(pmeGPU, timerId);
+}
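For readers less familiar with cuFFT's advanced data layout, here is a minimal standalone sketch of the same cufftPlanMany embedding, with illustrative sizes rather than the ones computed above: for a 48x48x48 real grid, the real storage minor dimension is padded to (48/2 + 1) * 2 = 50 floats, and the complex grid holds 48x48x25 elements, since n real values transform into n/2 + 1 complex values in the minor dimension.

// Sketch only: plans an R2C transform over a padded grid, mirroring the
// inembed/onembed usage above. Sizes are illustrative; error handling is minimal.
#include <cstdio>
#include <cufft.h>

int main()
{
    int       n[3]       = {48, 48, 48}; // logical real-grid dimensions
    int       realPad[3] = {48, 48, 50}; // real storage, minor dimension padded to (48/2 + 1) * 2
    int       cplxPad[3] = {48, 48, 25}; // complex storage, (48/2 + 1) elements in minor dimension
    const int realTotal  = realPad[0] * realPad[1] * realPad[2];
    const int cplxTotal  = cplxPad[0] * cplxPad[1] * cplxPad[2];

    cufftHandle planR2C;
    cufftResult result = cufftPlanMany(&planR2C, 3, n,
                                       realPad, 1, realTotal, // input embedding, stride, distance
                                       cplxPad, 1, cplxTotal, // output embedding, stride, distance
                                       CUFFT_R2C, 1);         // transform type, batch count
    if (result != CUFFT_SUCCESS)
    {
        fprintf(stderr, "cufftPlanMany R2C failed (error code %d)\n", result);
        return 1;
    }
    cufftDestroy(planR2C);
    return 0;
}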
--- /dev/null
+++ b/src/gromacs/ewald/pme-3dfft.cuh
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Defines the CUDA 3D-FFT functions for PME.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#ifndef GMX_EWALD_PME_3DFFT_CUH
+#define GMX_EWALD_PME_3DFFT_CUH
+
+#include <cufft.h> // for the cufft types
+
+#include "gromacs/fft/fft.h" // for the enum gmx_fft_direction
+
+struct pme_gpu_t;
+
+/*! \brief \internal A 3D FFT class for performing R2C/C2R transforms
+ * \todo Make this class actually parallel over multiple GPUs
+ */
+class GpuParallel3dFft
+{
+ cufftHandle planR2C_;
+ cufftHandle planC2R_;
+ cufftReal *realGrid_;
+ cufftComplex *complexGrid_;
+
+ public:
+ /*! \brief
+ * Constructs CUDA FFT plans for performing 3D FFT on a PME grid.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+ GpuParallel3dFft(const pme_gpu_t *pmeGPU);
+ /*! \brief Destroys CUDA FFT plans. */
+ ~GpuParallel3dFft();
+ /*! \brief Performs the FFT transform in the given direction */
+ void perform3dFft(gmx_fft_direction dir);
+};
+
+#endif
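For orientation, a minimal usage sketch of this class (assuming a fully initialized pmeGPU structure; not part of the change itself):

GpuParallel3dFft fft(pmeGPU);              // builds both cuFFT plans on the PME stream
fft.perform3dFft(GMX_FFT_REAL_TO_COMPLEX); // spread grid -> Fourier grid
/* ... solve in Fourier space, enqueued on the same stream ... */
fft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL); // Fourier grid -> real grid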
--- /dev/null
+++ b/src/gromacs/ewald/pme-gpu-internal.cpp
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *
+ * \brief This file contains internal function implementations
+ * for performing the PME calculations on GPU.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "pme-gpu-internal.h"
+
+#include "config.h"
+
+#include <list>
+#include <string>
+
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/math/invertmatrix.h"
+#include "gromacs/math/units.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/logger.h"
+#include "gromacs/utility/stringutil.h"
+
+#include "pme-grid.h"
+#include "pme-internal.h"
+
+/*! \internal \brief
+ * Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \returns The pointer to the kernel parameters.
+ */
+static pme_gpu_kernel_params_base_t *pme_gpu_get_kernel_params_base_ptr(const pme_gpu_t *pmeGPU)
+{
+ // reinterpret_cast is needed because the derived CUDA structure is not known in this file
+ auto *kernelParamsPtr = reinterpret_cast<pme_gpu_kernel_params_base_t *>(pmeGPU->kernelParams.get());
+ return kernelParamsPtr;
+}
+
+void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial)
+{
+ GMX_ASSERT(energy, "Invalid energy output pointer in PME GPU");
+ unsigned int j = 0;
+ virial[XX][XX] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++];
+ virial[YY][YY] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++];
+ virial[ZZ][ZZ] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++];
+ virial[XX][YY] = virial[YY][XX] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++];
+ virial[XX][ZZ] = virial[ZZ][XX] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++];
+ virial[YY][ZZ] = virial[ZZ][YY] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++];
+ *energy = 0.5f * pmeGPU->staging.h_virialAndEnergy[j];
+}
+
+void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box)
+{
+ auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU);
+ kernelParamsPtr->step.boxVolume = box[XX][XX] * box[YY][YY] * box[ZZ][ZZ];
+ GMX_ASSERT(kernelParamsPtr->step.boxVolume != 0.0f, "Zero volume of the unit cell");
+
+#if GMX_DOUBLE
+ GMX_THROW(gmx::NotImplementedError("PME is implemented for single-precision only on GPU"));
+#else
+ matrix recipBox;
+ gmx::invertBoxMatrix(box, recipBox);
+ /* The GPU recipBox is transposed as compared to the CPU recipBox.
+ * Spread uses matrix columns (while solve and gather use rows).
+ * There is no particular reason for this; it might be further rethought/optimized for better access patterns.
+ */
+ const real newRecipBox[DIM][DIM] =
+ {
+ {recipBox[XX][XX], recipBox[YY][XX], recipBox[ZZ][XX]},
+ { 0.0, recipBox[YY][YY], recipBox[ZZ][YY]},
+ { 0.0, 0.0, recipBox[ZZ][ZZ]}
+ };
+ memcpy(kernelParamsPtr->step.recipBox, newRecipBox, sizeof(matrix));
+#endif
+}
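To make the transposition concrete, a worked example with illustrative numbers (not taken from the change): for a lower-triangular box, the inverse is also lower-triangular, and the GPU stores its transpose.

box          = {{6, 0, 0}, {0, 6, 0}, {3, 0, 4}}            (rows are box vectors)
recipBox     = {{1/6, 0, 0}, {0, 1/6, 0}, {-1/8, 0, 1/4}}   (CPU layout: lower-triangular inverse)
GPU recipBox = {{1/6, 0, -1/8}, {0, 1/6, 0}, {0, 0, 1/4}}   (transpose: upper-triangular, as stored above)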
+
+void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates)
+{
+ pme_gpu_copy_input_coordinates(pmeGPU, h_coordinates);
+ if (needToUpdateBox)
+ {
+ pme_gpu_update_input_box(pmeGPU, box);
+ }
+}
+
+/*! \brief \libinternal
+ * The PME GPU reinitialization function that is called at the end of every MD step and also on any load balancing step.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+static void pme_gpu_reinit_step(const pme_gpu_t *pmeGPU)
+{
+ pme_gpu_clear_grids(pmeGPU);
+ pme_gpu_clear_energy_virial(pmeGPU);
+}
+
+void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcF, const bool bCalcEnerVir)
+{
+ /* Needed for copy back as well as timing events */
+ pme_gpu_synchronize(pmeGPU);
+
+ if (bCalcF && pme_gpu_performs_gather(pmeGPU))
+ {
+ pme_gpu_sync_output_forces(pmeGPU);
+ }
+ if (bCalcEnerVir && pme_gpu_performs_solve(pmeGPU))
+ {
+ pme_gpu_sync_output_energy_virial(pmeGPU);
+ }
+ pme_gpu_update_timings(pmeGPU);
+ pme_gpu_reinit_step(pmeGPU);
+}
+
+/*! \brief \libinternal
+ * (Re-)initializes all the PME GPU data related to the grid size and cut-off.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+static void pme_gpu_reinit_grids(pme_gpu_t *pmeGPU)
+{
+ auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU);
+ kernelParamsPtr->grid.ewaldFactor = (M_PI * M_PI) / (pmeGPU->common->ewaldcoeff_q * pmeGPU->common->ewaldcoeff_q);
+
+ /* The grid size variants */
+ for (int i = 0; i < DIM; i++)
+ {
+ kernelParamsPtr->grid.realGridSize[i] = pmeGPU->common->nk[i];
+ kernelParamsPtr->grid.realGridSizeFP[i] = (float)kernelParamsPtr->grid.realGridSize[i];
+ kernelParamsPtr->grid.realGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
+
+ // The complex grid currently uses no padding;
+ // if it starts to do so, then another test should be added for that
+ kernelParamsPtr->grid.complexGridSize[i] = kernelParamsPtr->grid.realGridSize[i];
+ kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
+ }
+ /* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
+ if (!pme_gpu_performs_FFT(pmeGPU))
+ {
+ // This allows the GPU spreading grid and the CPU fftgrid to have the same layout, so that we can copy the data directly
+ kernelParamsPtr->grid.realGridSizePadded[ZZ] = (kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2;
+ }
+
+ /* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
+ kernelParamsPtr->grid.complexGridSize[ZZ] /= 2;
+ kernelParamsPtr->grid.complexGridSize[ZZ]++;
+ kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ];
+
+ pme_gpu_realloc_and_copy_fract_shifts(pmeGPU);
+ pme_gpu_realloc_and_copy_bspline_values(pmeGPU);
+ pme_gpu_realloc_grids(pmeGPU);
+ pme_gpu_reinit_3dfft(pmeGPU);
+}
+
+/* Several GPU functions that refer to the CPU PME data live here.
+ * We would like to keep these away from the GPU-framework specific code for clarity,
+ * as well as to avoid compilation issues with MPI.
+ */
+
+/*! \brief \libinternal
+ * Copies everything useful from the PME CPU to the PME GPU structure.
+ * The goal is to minimize interaction with the PME CPU structure in the GPU code.
+ *
+ * \param[in] pme The PME structure.
+ */
+static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme)
+{
+ /* TODO: Consider refactoring the CPU PME code to use the same structure,
+ * so that this function becomes 2 lines */
+ pme_gpu_t *pmeGPU = pme->gpu;
+ pmeGPU->common->ngrids = pme->ngrids;
+ pmeGPU->common->epsilon_r = pme->epsilon_r;
+ pmeGPU->common->ewaldcoeff_q = pme->ewaldcoeff_q;
+ pmeGPU->common->nk[XX] = pme->nkx;
+ pmeGPU->common->nk[YY] = pme->nky;
+ pmeGPU->common->nk[ZZ] = pme->nkz;
+ pmeGPU->common->pmegrid_n[XX] = pme->pmegrid_nx;
+ pmeGPU->common->pmegrid_n[YY] = pme->pmegrid_ny;
+ pmeGPU->common->pmegrid_n[ZZ] = pme->pmegrid_nz;
+ pmeGPU->common->pme_order = pme->pme_order;
+ for (int i = 0; i < DIM; i++)
+ {
+ pmeGPU->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGPU->common->nk[i]);
+ }
+ const int cellCount = c_pmeNeighborUnitcellCount;
+ pmeGPU->common->fsh[XX].assign(pme->fshx, pme->fshx + cellCount * pme->nkx);
+ pmeGPU->common->fsh[YY].assign(pme->fshy, pme->fshy + cellCount * pme->nky);
+ pmeGPU->common->fsh[ZZ].assign(pme->fshz, pme->fshz + cellCount * pme->nkz);
+ pmeGPU->common->nn[XX].assign(pme->nnx, pme->nnx + cellCount * pme->nkx);
+ pmeGPU->common->nn[YY].assign(pme->nny, pme->nny + cellCount * pme->nky);
+ pmeGPU->common->nn[ZZ].assign(pme->nnz, pme->nnz + cellCount * pme->nkz);
+ pmeGPU->common->runMode = pme->runMode;
+}
+
+/*! \brief \libinternal
+ * Checks whether PME with the given inputs can run on a GPU.
+ *
+ * \param[in] pme The PME structure.
+ * \param[out] error The error message if the input is not supported on GPU.
+ * \returns True if this PME input can run on a GPU, false otherwise.
+ */
+static bool pme_gpu_check_restrictions(const gmx_pme_t *pme, std::string *error)
+{
+ std::list<std::string> errorReasons;
+ if (pme->nnodes != 1)
+ {
+ errorReasons.push_back("PME decomposition");
+ }
+ if (pme->pme_order != 4)
+ {
+ errorReasons.push_back("interpolation orders other than 4");
+ }
+ if (pme->bFEP)
+ {
+ errorReasons.push_back("free energy calculations (multiple grids)");
+ }
+ if (pme->doLJ)
+ {
+ errorReasons.push_back("Lennard-Jones PME");
+ }
+#if GMX_DOUBLE
+ {
+ errorReasons.push_back("double precision");
+ }
+#endif
+#if GMX_GPU != GMX_GPU_CUDA
+ {
+ errorReasons.push_back("non-CUDA build of GROMACS");
+ }
+#endif
+
+ bool inputSupported = errorReasons.empty();
+ if (!inputSupported && error)
+ {
+ std::string regressionTestMarker = "PME GPU does not support";
+ // this prefix is tested for in the regression tests script gmxtest.pl
+ *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
+ }
+ return inputSupported;
+}
+
+/*! \libinternal \brief
+ * Initializes the PME GPU data at the beginning of the run.
+ *
+ * \param[in,out] pme The PME structure.
+ * \param[in,out] gpuInfo The GPU information structure.
+ * \param[in] mdlog The logger.
+ * \param[in] cr The communication structure.
+ */
+static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog,
+ const t_commrec *cr)
+{
+ std::string errorString;
+ bool canRunOnGpu = pme_gpu_check_restrictions(pme, &errorString);
+ if (!canRunOnGpu)
+ {
+ GMX_THROW(gmx::NotImplementedError(errorString));
+ }
+
+ pme->gpu = new pme_gpu_t();
+ pme_gpu_t *pmeGPU = pme->gpu;
+ pmeGPU->common = std::make_shared<pme_shared_t>();
+
+ /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */
+ /* A convenience variable. */
+ pmeGPU->settings.useDecomposition = (pme->nnodes != 1);
+ /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
+ pmeGPU->settings.performGPUGather = true;
+
+ // GPU initialization
+ init_gpu(mdlog, cr->nodeid, gpuInfo);
+ pmeGPU->deviceInfo = gpuInfo;
+
+ pme_gpu_init_internal(pmeGPU);
+ pme_gpu_init_sync_events(pmeGPU);
+ pme_gpu_alloc_energy_virial(pmeGPU);
+
+ pme_gpu_copy_common_data_from(pme);
+
+ GMX_ASSERT(pmeGPU->common->epsilon_r != 0.0f, "PME GPU: bad electrostatic coefficient");
+
+ auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU);
+ kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGPU->common->epsilon_r;
+}
+
+void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr)
+{
+ if (!pme_gpu_active(pme))
+ {
+ return;
+ }
+
+ if (!pme->gpu)
+ {
+ /* First-time initialization */
+ pme_gpu_init(pme, gpuInfo, mdlog, cr);
+ }
+ else
+ {
+ /* After this call nothing in the GPU code should refer to the gmx_pme_t *pme itself - until the next pme_gpu_reinit */
+ pme_gpu_copy_common_data_from(pme);
+ }
+ /* GPU FFT will only get used for a single rank. */
+ pme->gpu->settings.performGPUFFT = (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme_gpu_uses_dd(pme->gpu);
+ pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU);
+
+ pme_gpu_reinit_grids(pme->gpu);
+ pme_gpu_reinit_step(pme->gpu);
+}
+
+void pme_gpu_destroy(pme_gpu_t *pmeGPU)
+{
+ /* Free lots of data */
+ pme_gpu_free_energy_virial(pmeGPU);
+ pme_gpu_free_bspline_values(pmeGPU);
+ pme_gpu_free_forces(pmeGPU);
+ pme_gpu_free_coordinates(pmeGPU);
+ pme_gpu_free_coefficients(pmeGPU);
+ pme_gpu_free_spline_data(pmeGPU);
+ pme_gpu_free_grid_indices(pmeGPU);
+ pme_gpu_free_fract_shifts(pmeGPU);
+ pme_gpu_free_grids(pmeGPU);
+
+ pme_gpu_destroy_3dfft(pmeGPU);
+ pme_gpu_destroy_sync_events(pmeGPU);
+
+ /* Free the GPU-framework specific data last */
+ pme_gpu_destroy_specific(pmeGPU);
+
+ delete pmeGPU;
+}
+
+void pme_gpu_reinit_atoms(pme_gpu_t *pmeGPU, const int nAtoms, const real *charges)
+{
+ auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU);
+ kernelParamsPtr->atoms.nAtoms = nAtoms;
+ const int alignment = pme_gpu_get_atom_data_alignment(pmeGPU);
+ pmeGPU->nAtomsPadded = ((nAtoms + alignment - 1) / alignment) * alignment;
+ const int nAtomsAlloc = c_usePadding ? pmeGPU->nAtomsPadded : nAtoms;
+ const bool haveToRealloc = (pmeGPU->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */
+ pmeGPU->nAtomsAlloc = nAtomsAlloc;
+
+#if GMX_DOUBLE
+ GMX_RELEASE_ASSERT(false, "Only single precision supported");
+ GMX_UNUSED_VALUE(charges);
+#else
+ pme_gpu_realloc_and_copy_input_coefficients(pmeGPU, reinterpret_cast<const float *>(charges));
+ /* This could be gated on haveToRealloc, but the copy always needs to be performed anyway */
+#endif
+
+ if (haveToRealloc)
+ {
+ pme_gpu_realloc_coordinates(pmeGPU);
+ pme_gpu_realloc_forces(pmeGPU);
+ pme_gpu_realloc_spline_data(pmeGPU);
+ pme_gpu_realloc_grid_indices(pmeGPU);
+ }
+}
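The nAtomsPadded computation above is the usual round-up-to-multiple idiom; a tiny self-contained check of the arithmetic (the alignment value here is illustrative, not the actual PME_ATOM_DATA_ALIGNMENT):

#include <cassert>

// Rounds n up to the next multiple of alignment, as in pme_gpu_reinit_atoms().
static int roundUpToMultiple(int n, int alignment)
{
    return ((n + alignment - 1) / alignment) * alignment;
}

int main()
{
    assert(roundUpToMultiple(0, 8)  == 0);
    assert(roundUpToMultiple(1, 8)  == 8);
    assert(roundUpToMultiple(8, 8)  == 8);
    assert(roundUpToMultiple(13, 8) == 16);
    return 0;
}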
--- /dev/null
+++ b/src/gromacs/ewald/pme-gpu-internal.h
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *
+ * \brief This file contains internal function definitions for performing the PME calculations on GPU.
+ * These are not meant to be exposed outside of the PME GPU code.
+ * As of now, their bodies are still in the common pme-gpu.cpp files.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#ifndef GMX_EWALD_PME_GPU_INTERNAL_H
+#define GMX_EWALD_PME_GPU_INTERNAL_H
+
+#include "gromacs/gpu_utils/gpu_macros.h" // for the CUDA_FUNC_ macros
+
+#include "pme-gpu-types.h" // for the inline functions accessing pme_gpu_t members
+
+struct gmx_hw_info_t;
+struct gmx_gpu_opt_t;
+struct gmx_pme_t; // only used in pme_gpu_reinit
+struct t_commrec;
+struct gmx_wallclock_gpu_pme_t;
+struct pme_atomcomm_t;
+
+namespace gmx
+{
+class MDLogger;
+}
+
+/* Some general constants for PME GPU behaviour follow. */
+
+/*! \brief \libinternal
+ * false: The atom data GPU buffers are sized precisely according to the number of atoms.
+ * (Except the GPU spline data layout, which is intertwined for 2 atoms per warp regardless).
+ * The atom index checks in the spread/gather code potentially hinder the performance.
+ * true: The atom data GPU buffers are padded with zeroes so that the possible number of atoms
+ * fitting in is divisible by PME_ATOM_DATA_ALIGNMENT.
+ * The atom index checks are not performed. There should be a performance win, but how big it is remains to be seen.
+ * Additional cudaMemsetAsync calls are done occasionally (only charges/coordinates; spline data is always recalculated now).
+ * \todo Estimate performance differences
+ */
+const bool c_usePadding = true;
+
+/*! \brief \libinternal
+ * false: Atoms with zero charges are processed by PME. Could introduce some overhead.
+ * true: Atoms with zero charges are not processed by PME. Adds branching to the spread/gather.
+ * Could be good for performance in specific systems with lots of neutral atoms.
+ * \todo Estimate performance differences.
+ */
+const bool c_skipNeutralAtoms = false;
+
+/*! \brief \libinternal
+ * Number of PME solve output floating point numbers.
+ * 6 for symmetric virial matrix + 1 for reciprocal energy.
+ */
+const int c_virialAndEnergyCount = 7;
+
+/* A block of CUDA-only functions that live in pme.cu */
+
+/*! \libinternal \brief
+ * Returns the number of atoms per chunk in the atom charges/coordinates data layout.
+ * Depends on CUDA-specific block sizes, needed for the atom data padding.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \returns Number of atoms in a single GPU atom data chunk.
+ */
+CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_data_alignment(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
+
+/*! \libinternal \brief
+ * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \returns Number of atoms in a single GPU atom spline data chunk.
+ */
+CUDA_FUNC_QUALIFIER int pme_gpu_get_atom_spline_data_alignment(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM_WITH_RETURN(1)
+
+/*! \libinternal \brief
+ * Synchronizes the current step, waiting for the GPU kernels/transfers to finish.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Allocates the fixed size energy and virial buffer both on GPU and CPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_alloc_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Frees the energy and virial memory both on GPU and CPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_free_energy_virial(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Clears the energy and virial memory on GPU with 0.
+ * Should be called at the end of the energy/virial calculation step.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_clear_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Reallocates and copies the pre-computed B-spline values to the GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Reallocates the GPU buffer for the PME forces.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Frees the GPU buffer for the PME forces.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered forces).
+ * To be called e.g. after the bonded calculations.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[in] h_forces The input forces rvec buffer.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
+ const float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[out] h_forces The output forces rvec buffer.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
+ float *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Waits for the PME GPU output forces copying to the CPU buffer to finish.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_forces(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ *
+ * Needs to be called at the beginning of the run and then on every DD step.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_realloc_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Copies the input coordinates from the CPU buffer onto the GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[in] h_coordinates Input coordinates (XYZ rvec array).
+ *
+ * Needs to be called every MD step. The coordinates are then used in the spline calculation.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
+ const rvec *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Frees the coordinates on the GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_free_coordinates(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
+ * Clears the padded part if needed.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[in] h_coefficients The input atom charges/coefficients.
+ *
+ * Does not need to be done every MD step, only whenever the local charges change.
+ * (So, at the beginning of the run, or on a DD step.)
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
+ const float *CUDA_FUNC_ARGUMENT(h_coefficients)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Frees the charges/coefficients on the GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_free_coefficients(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Reallocates the buffers on the GPU and the host for the atoms spline data.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_realloc_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Frees the buffers on the GPU for the atoms spline data.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_free_spline_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Reallocates the buffers on the GPU and the host for the particle gridline indices.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Frees the buffer on the GPU for the particle gridline indices.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_free_grid_indices(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_realloc_grids(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_free_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Clears the real space grid on the GPU.
+ * Should be called at the end of each MD step.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_clear_grids(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Frees the pre-computed fractional coordinates' shifts on the GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Waits for the output virial/energy copying to the intermediate CPU buffer to finish.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_energy_virial(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Copies the input real-space grid from the host to the GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[in] h_grid The host-side grid buffer.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
+ float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Copies the output real-space grid from the GPU to the host.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[out] h_grid The host-side grid buffer.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
+ float *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Waits for the grid copying to the host-side buffer after spreading to finish.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_sync_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Waits for the atom data copying to the intermediate host-side buffer after spline computation to finish.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_sync_spline_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Waits for the grid copying to the host-side buffer after solving to finish.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_sync_solve_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Does the one-time GPU-framework specific PME initialization.
+ * For CUDA, the PME stream is created with the highest priority.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_init_internal(pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Destroys the PME GPU-framework specific data.
+ * Should be called last in the PME GPU destructor.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_destroy_specific(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Initializes the PME GPU synchronization events.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_init_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Destroys the PME GPU synchronization events.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_destroy_sync_events(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Initializes the CUDA FFT structures.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_reinit_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Destroys the CUDA FFT structures.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_destroy_3dfft(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/* Several CUDA event-based timing functions that live in pme-timings.cu */
+
+/*! \libinternal \brief
+ * Finalizes all the PME GPU stage timings for the current step. Should be called at the end of every step.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_update_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \brief
+ * Resets the PME GPU timings. To be called at the reset step.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
+/*! \libinternal \brief
+ * Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[in] timings The gmx_wallclock_gpu_pme_t structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
+ gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
+
+/* The inlined convenience PME GPU status getters */
+
+/*! \libinternal \brief
+ * Tells if PME runs on multiple GPUs with the decomposition.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \returns True if PME runs on multiple GPUs, false otherwise.
+ */
+gmx_inline bool pme_gpu_uses_dd(const pme_gpu_t *pmeGPU)
+{
+ return pmeGPU->settings.useDecomposition;
+}
+
+/*! \libinternal \brief
+ * Tells if PME performs the gathering stage on GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \returns True if the gathering is performed on GPU, false otherwise.
+ */
+gmx_inline bool pme_gpu_performs_gather(const pme_gpu_t *pmeGPU)
+{
+ return pmeGPU->settings.performGPUGather;
+}
+
+/*! \libinternal \brief
+ * Tells if PME performs the FFT stages on GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \returns True if FFT is performed on GPU, false otherwise.
+ */
+gmx_inline bool pme_gpu_performs_FFT(const pme_gpu_t *pmeGPU)
+{
+ return pmeGPU->settings.performGPUFFT;
+}
+
+/*! \libinternal \brief
+ * Tells if PME performs the grid (un-)wrapping on GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \returns True if (un-)wrapping is performed on GPU, false otherwise.
+ */
+gmx_inline bool pme_gpu_performs_wrapping(const pme_gpu_t *pmeGPU)
+{
+ return !pmeGPU->settings.useDecomposition;
+}
+
+/*! \libinternal \brief
+ * Tells if PME performs the grid solving on GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \returns True if solving is performed on GPU, false otherwise.
+ */
+gmx_inline bool pme_gpu_performs_solve(const pme_gpu_t *pmeGPU)
+{
+ return pmeGPU->settings.performGPUSolve;
+}
+
+/* A block of C++ functions that live in pme-gpu-internal.cpp */
+
+/*! \libinternal \brief
+ * Returns the output virial and energy of the PME solving.
+ * Should be called after pme_gpu_finish_step.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[out] energy The output energy.
+ * \param[out] virial The output virial matrix.
+ */
+void pme_gpu_get_energy_virial(const pme_gpu_t *pmeGPU, real *energy, matrix virial);
+
+/*! \libinternal \brief
+ * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_start_step().
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[in] box The unit cell box.
+ */
+void pme_gpu_update_input_box(pme_gpu_t *pmeGPU, const matrix box);
+
+/*! \libinternal \brief
+ * Starts the PME GPU step (copies coordinates onto GPU, possibly sets the unit cell parameters).
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[in] needToUpdateBox Tells if the stored unit cell parameters should be updated from \p box.
+ * \param[in] box The unit cell box which does not necessarily change every step (only with pressure coupling enabled).
+ * Would only be used if \p needToUpdateBox is true.
+ * \param[in] h_coordinates Input coordinates (XYZ rvec array).
+ */
+void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix box, const rvec *h_coordinates);
+
+/*! \libinternal \brief
+ * Finishes the PME GPU step, waiting for the output forces and/or energy/virial to be copied to the host.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[in] bCalcForces The left-over flag from the CPU code which tells the function to copy the forces to the CPU side. Should be passed to the launch call instead. FIXME
+ * \param[in] bCalcEnerVir The left-over flag from the CPU code which tells the function to copy the energy/virial to the CPU side. Should be passed to the launch call instead.
+ */
+void pme_gpu_finish_step(const pme_gpu_t *pmeGPU, const bool bCalcForces,
+ const bool bCalcEnerVir);
+
+/*! \libinternal \brief
+ * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
+ *
+ * \param[in,out] pme The PME structure.
+ * \param[in,out] gpuInfo The GPU information structure.
+ * \param[in] mdlog The logger.
+ * \param[in] cr The communication structure.
+ * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
+ */
+void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr);
+
+/*! \libinternal \brief
+ * Destroys the PME GPU data at the end of the run.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ */
+void pme_gpu_destroy(pme_gpu_t *pmeGPU);
+
+/*! \libinternal \brief
+ * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
+ *
+ * \param[in] pmeGPU The PME GPU structure.
+ * \param[in] nAtoms The number of particles.
+ * \param[in] charges The pointer to the host-side array of particle charges.
+ *
+ * This function should only be called at the beginning of the run and on domain decomposition steps.
+ * It should be called before pme_gpu_set_io_ranges.
+ */
+void pme_gpu_reinit_atoms(pme_gpu_t *pmeGPU,
+ const int nAtoms,
+ const real *charges);
+
+#endif
--- /dev/null
+++ b/src/gromacs/ewald/pme-gpu-types.h
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Defines the GPU-agnostic PME GPU data structures
+ * (the host-side PME GPU data, and the GPU function parameters).
+ * \todo Due to Gerrit workflow and time constraints, some renaming/refactoring
+ * which does not impair the performance will be performed once
+ * most of the initial PME CUDA implementation is merged
+ * into the master branch (likely, after release 2017).
+ * This should include:
+ * -- bringing the structure/function names up to guidelines
+ * ---- pme_gpu_settings_t -> PmeGpuTasks
+ * -- refining GPU notation application (#2053)
+ * -- renaming coefficients to charges (?)
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#ifndef GMX_EWALD_PME_GPU_TYPES_H
+#define GMX_EWALD_PME_GPU_TYPES_H
+
+#include "config.h"
+
+#include <memory>
+#include <vector>
+
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/basedefinitions.h"
+
+struct gmx_hw_info_t;
+struct gmx_device_info_t;
+
+/*! \brief Possible PME codepaths
+ * \todo: make this enum class with gmx_pme_t C++ refactoring
+ */
+enum PmeRunMode
+{
+ CPU, //!< Whole PME step is done on CPU
+ GPU, //!< Whole PME step is done on GPU
+ Hybrid, //!< Mixed mode: only spread and gather run on GPU; FFT and solving are done on CPU.
+};
+
+#if GMX_GPU == GMX_GPU_CUDA
+
+struct pme_gpu_cuda_t;
+/*! \brief A typedef for including the GPU host data by pointer */
+typedef pme_gpu_cuda_t pme_gpu_specific_t;
+
+struct pme_gpu_cuda_kernel_params_t;
+/*! \brief A typedef for including the GPU kernel arguments data by pointer */
+typedef pme_gpu_cuda_kernel_params_t pme_gpu_kernel_params_t;
+
+#else
+
+/*! \brief A dummy typedef for the GPU host data placeholder on non-GPU builds */
+typedef int pme_gpu_specific_t;
+/*! \brief A dummy typedef for the GPU kernel arguments data placeholder on non-GPU builds */
+typedef int pme_gpu_kernel_params_t;
+
+#endif
+
+/* What follows is all the PME GPU function arguments,
+ * sorted into several device-side structures depending on the update rate.
+ * This is GPU agnostic (float3 replaced by float[3], etc.).
+ * The GPU-framework specifics (e.g. cudaTextureObject_t handles) are described
+ * in the larger structure pme_gpu_cuda_kernel_params_t in the pme.cuh.
+ */
+
+/*! \internal \brief
+ * A GPU data structure for storing the constant PME data.
+ * This only has to be initialized once.
+ */
+struct pme_gpu_const_params_t
+{
+ /*! \brief Electrostatics coefficient = ONE_4PI_EPS0 / pme->epsilon_r */
+ float elFactor;
+ /*! \brief Virial and energy GPU array. Size is c_virialAndEnergyCount (7) floats.
+ * The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */
+ float *d_virialAndEnergy;
+};
+
+/*! \internal \brief
+ * A GPU data structure for storing the PME data related to the grid sizes and cut-off.
+ * This only has to be updated at every DD step.
+ */
+struct pme_gpu_grid_params_t
+{
+ /* Grid sizes */
+ /*! \brief Real-space grid data dimensions. */
+ int realGridSize[DIM];
+ /*! \brief Real-space grid dimensions, only converted to floating point. */
+ float realGridSizeFP[DIM];
+ /*! \brief Real-space grid dimensions (padded). The padding as compared to realGridSize includes the (order - 1) overlap. */
+ int realGridSizePadded[DIM]; /* Is major dimension of this ever used in kernels? */
+ /*! \brief Fourier grid dimensions. This counts the complex numbers! */
+ int complexGridSize[DIM];
+ /*! \brief Fourier grid dimensions (padded). This counts the complex numbers! */
+ int complexGridSizePadded[DIM];
+
+ /* Grid pointers */
+ /*! \brief Real space grid. */
+ float *d_realGrid;
+ /*! \brief Complex grid - used in FFT/solve. If in-place cuFFT is used, then it is the same pointer as realGrid. */
+ float *d_fourierGrid;
+
+ /*! \brief Ewald solving factor = (M_PI / pme->ewaldcoeff_q)^2 */
+ float ewaldFactor;
+
+ /*! \brief Grid spline values as in pme->bsp_mod
+ * (laid out sequentially (XXX....XYYY......YZZZ.....Z))
+ */
+ float *d_splineModuli;
+ /*! \brief Offsets for X/Y/Z components of d_splineModuli */
+ int splineValuesOffset[DIM];
+
+ /*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */
+ float *d_fractShiftsTable;
+ /*! \brief Gridline indices lookup table
+ * (modulo lookup table as in pme->nnx/nny/nnz, laid out sequentially (XXX....XYYY......YZZZ.....Z)) */
+ int *d_gridlineIndicesTable;
+ /*! \brief Offsets for X/Y/Z components of d_fractShiftsTable and d_gridlineIndicesTable */
+ int tablesOffsets[DIM];
+};
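A short sketch of how the sequential (XXX....XYYY......YZZZ.....Z) layout and tablesOffsets fit together (all sizes hypothetical; the real lengths come from the copies in pme-gpu-internal.cpp):

#include <vector>

int main()
{
    const int cellCount = 5;            // illustrative; the change uses c_pmeNeighborUnitcellCount
    const int nk[3]     = {48, 48, 48}; // illustrative grid dimensions
    int       tablesOffsets[3];
    std::vector<float> table;           // plays the role of d_fractShiftsTable
    for (int dim = 0; dim < 3; dim++)
    {
        tablesOffsets[dim] = static_cast<int>(table.size());
        table.resize(table.size() + cellCount * nk[dim]); // values would be filled from pme->fshx etc.
    }
    // Element i of dimension dim is then table[tablesOffsets[dim] + i].
    return 0;
}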
+
+/*! \internal \brief
+ * A GPU data structure for storing the PME data of the atoms, local to this process' domain partition.
+ * This only has to be updated every DD step.
+ */
+struct pme_gpu_atom_params_t
+{
+ /*! \brief Number of local atoms */
+ int nAtoms;
+ /*! \brief Pointer to the global GPU memory with input rvec atom coordinates.
+ * The coordinates themselves change and need to be copied to the GPU every MD step,
+ * but reallocation happens only at DD.
+ */
+ float *d_coordinates;
+ /*! \brief Pointer to the global GPU memory with input atom charges.
+ * The charges only need to be reallocated and copied to the GPU at DD step.
+ */
+ float *d_coefficients;
+ /*! \brief Pointer to the global GPU memory with input/output rvec atom forces.
+ * The forces change and need to be copied from (and possibly to) the GPU every MD step,
+ * but reallocation happens only at DD.
+ */
+ float *d_forces;
+ /*! \brief Pointer to the global GPU memory with ivec atom gridline indices.
+ * Computed on GPU in the spline calculation part.
+ */
+ int *d_gridlineIndices;
+
+ /* B-spline parameters are computed entirely on GPU every MD step, not copied.
+ * Unless we want to try something like GPU spread + CPU gather?
+ */
+ /*! \brief Pointer to the global GPU memory with B-spline values */
+ float *d_theta;
+ /*! \brief Pointer to the global GPU memory with B-spline derivative values */
+ float *d_dtheta;
+};
+
+/*! \internal \brief
+ * A GPU data structure for storing the PME data which might change every MD step.
+ */
+struct pme_gpu_step_params_t
+{
+ /* The box parameters. The box only changes size each step with pressure coupling enabled. */
+ /*! \brief
+ * Reciprocal (inverted unit cell) box.
+ *
+ * The box is transposed as compared to the CPU pme->recipbox.
+ * Basically, spread uses matrix columns (while solve and gather use rows).
+ * This storage format might be not the most optimal since the box is always triangular so there are zeroes.
+ */
+ float recipBox[DIM][DIM];
+ /*! \brief The unit cell volume for solving. */
+ float boxVolume;
+};
+
+/*! \internal \brief
+ * A single structure encompassing almost all the PME data used in GPU kernels on device.
+ * This is inherited by the GPU framework-specific structure
+ * (pme_gpu_cuda_kernel_params_t in pme.cuh).
+ * This way, most code preparing the kernel parameters can be GPU-agnostic by casting
+ * the kernel parameter data pointer to pme_gpu_kernel_params_base_t.
+ */
+struct pme_gpu_kernel_params_base_t
+{
+ /*! \brief Constant data that is set once. */
+ pme_gpu_const_params_t constants;
+ /*! \brief Data dependent on the grid size/cutoff. */
+ pme_gpu_grid_params_t grid;
+ /*! \brief Data dependent on the DD and local atoms. */
+ pme_gpu_atom_params_t atoms;
+ /*! \brief Data that possibly changes on every MD step. */
+ pme_gpu_step_params_t step;
+};
+
+/* Here are the host-side structures */
+
+/*! \internal \brief
+ * The PME GPU settings structure, included in the main PME GPU structure by value.
+ */
+struct pme_gpu_settings_t
+{
+ /* Permanent settings set on initialization */
+ /*! \brief A boolean which tells if the solving is performed on GPU. Currently always true */
+ bool performGPUSolve;
+ /*! \brief A boolean which tells if the gathering is performed on GPU. Currently always true */
+ bool performGPUGather;
+ /*! \brief A boolean which tells if the FFT is performed on GPU. Currently true for a single MPI rank. */
+ bool performGPUFFT;
+ /*! \brief A convenience boolean which tells if PME decomposition is used. */
+ bool useDecomposition;
+ /*! \brief A boolean which tells if any PME GPU stage should copy all of its outputs to the host.
+ * Only intended to be used by the test framework.
+ */
+ bool copyAllOutputs;
+ /*! \brief Various computation flags for the current step, corresponding to the GMX_PME_ flags in pme.h. */
+ int stepFlags;
+};
+
+/*! \internal \brief
+ * The PME GPU intermediate buffers structure, included in the main PME GPU structure by value.
+ * Buffers are managed by the PME GPU module.
+ */
+struct pme_gpu_staging_t
+{
+ /*! \brief Virial and energy intermediate host-side buffer. Size is c_virialAndEnergyCount floats. */
+ float *h_virialAndEnergy;
+ /*! \brief B-spline values intermediate host-side buffer. */
+ float *h_splineModuli;
+
+ /*! \brief Pointer to the host memory with B-spline values. Only used for host-side gather, or unit tests */
+ float *h_theta;
+ /*! \brief Pointer to the host memory with B-spline derivative values. Only used for host-side gather, or unit tests */
+ float *h_dtheta;
+ /*! \brief Pointer to the host memory with ivec atom gridline indices. Only used for host-side gather, or unit tests */
+ int *h_gridlineIndices;
+};
+
+/*! \internal \brief
+ * The PME GPU structure for all the data copied directly from the CPU PME structure.
+ * The copying is done when the CPU PME structure is already (re-)initialized
+ * (pme_gpu_reinit is called at the end of gmx_pme_init).
+ * All the variables here are named almost the same way as in gmx_pme_t.
+ * The types are different: pointers are replaced by vectors.
+ * TODO: use the shared data with the PME CPU.
+ * Included in the main PME GPU structure by value.
+ */
+struct pme_shared_t
+{
+ /*! \brief Grid count - currently always 1 on GPU */
+ int ngrids;
+ /*! \brief Grid dimensions - nkx, nky, nkz */
+ int nk[DIM];
+ /*! \brief Padded grid dimensions - pmegrid_nx, pmegrid_ny, pmegrid_nz
+ * TODO: find out if these are really needed for the CPU FFT compatibility.
+ */
+ int pmegrid_n[DIM];
+ /*! \brief PME interpolation order */
+ int pme_order;
+ /*! \brief Ewald splitting coefficient for Coulomb */
+ real ewaldcoeff_q;
+ /*! \brief Electrostatics parameter */
+ real epsilon_r;
+ /*! \brief Gridline indices - nnx, nny, nnz */
+ std::vector<int> nn[DIM];
+ /*! \brief Fractional shifts - fshx, fshy, fshz */
+ std::vector<real> fsh[DIM];
+ /*! \brief Precomputed B-spline values */
+ std::vector<real> bsp_mod[DIM];
+ /*! \brief The PME codepath being taken */
+ PmeRunMode runMode;
+};
+
+/*! \internal \brief
+ * The main PME GPU host structure, included in the PME CPU structure by pointer.
+ */
+struct pme_gpu_t
+{
+ /*! \brief The information copied once per reinit from the CPU structure. */
+ std::shared_ptr<pme_shared_t> common; // TODO: make the CPU structure use the same type
+
+ /*! \brief The settings. */
+ pme_gpu_settings_t settings;
+
+ /*! \brief The host-side buffers.
+ * The device-side buffers are buried in kernelParams, but that will have to change.
+ */
+ pme_gpu_staging_t staging;
+
+ /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT.
+ * Used for kernel scheduling.
+ * kernelParams.atoms.nAtoms is the actual atom count to be used for data copying.
+ * TODO: this and the next member represent memory allocation/padding properties -
+ * ideally the job of a container type.
+ */
+ int nAtomsPadded;
+ /*! \brief Number of local atoms, padded to be divisible by PME_ATOM_DATA_ALIGNMENT
+ * if c_usePadding is true.
+ * Used only as a basic size for almost all the atom data allocations
+ * (spline parameter data is also aligned by PME_SPREADGATHER_PARTICLES_PER_WARP).
+ * This should be the same as (c_usePadding ? nAtomsPadded : kernelParams.atoms.nAtoms).
+ * kernelParams.atoms.nAtoms is the actual atom count to be used for most data copying.
+ */
+ int nAtomsAlloc;
+
+ /*! \brief A pointer to the device used during the execution. */
+ gmx_device_info_t *deviceInfo;
+
+ /*! \brief A single structure encompassing all the PME data used on GPU.
+ * Its value is the only argument to all the PME GPU kernels.
+ * \todo Test whether this should be copied to the constant GPU memory once per MD step
+ * (or even less often with no box updates) instead of being an argument.
+ */
+ std::shared_ptr<pme_gpu_kernel_params_t> kernelParams;
+
+ /*! \brief The pointer to GPU-framework specific host-side data, such as CUDA streams and events. */
+ std::shared_ptr<pme_gpu_specific_t> archSpecific; /* FIXME: make it a unique_ptr */
+};
+
+#endif
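[Illustrative aside, not part of the patch: a minimal sketch of the padding arithmetic documented for nAtomsPadded/nAtomsAlloc above; roundUpToAlignment is a hypothetical helper.]

    static int roundUpToAlignment(int count, int alignment)
    {
        // Smallest multiple of alignment that is >= count
        return ((count + alignment - 1) / alignment) * alignment;
    }

    // Under the documented invariants:
    //   nAtomsPadded = roundUpToAlignment(nAtoms, PME_ATOM_DATA_ALIGNMENT);
    //   nAtomsAlloc  = c_usePadding ? nAtomsPadded : nAtoms;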
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements high-level PME GPU functions which do not require GPU framework-specific code.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "gromacs/ewald/pme.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+
+#include "pme-gpu-internal.h"
+#include "pme-grid.h"
+#include "pme-internal.h"
+#include "pme-solve.h"
+
+bool pme_gpu_task_enabled(const gmx_pme_t *pme)
+{
+ return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU);
+}
+
+void pme_gpu_reset_timings(const gmx_pme_t *pme)
+{
+ if (pme_gpu_active(pme))
+ {
+ pme_gpu_reset_timings(pme->gpu);
+ }
+}
+
+void pme_gpu_get_timings(const gmx_pme_t *pme, gmx_wallclock_gpu_pme_t *timings)
+{
+ if (pme_gpu_active(pme))
+ {
+ pme_gpu_get_timings(pme->gpu, timings);
+ }
+}
+
+void pme_gpu_get_results(const gmx_pme_t *pme,
+ gmx_wallcycle_t wcycle,
+ matrix vir_q,
+ real *energy_q)
+{
+ GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+
+ const bool haveComputedEnergyAndVirial = pme->gpu->settings.stepFlags & GMX_PME_CALC_ENER_VIR;
+ const bool haveComputedForces = pme->gpu->settings.stepFlags & GMX_PME_CALC_F;
+
+ // The wallcycle hierarchy is messy - we pause ewcFORCE just to exclude the PME GPU sync time
+ wallcycle_stop(wcycle, ewcFORCE);
+
+ wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
+ pme_gpu_finish_step(pme->gpu, haveComputedForces, haveComputedEnergyAndVirial);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+
+ wallcycle_start_nocount(wcycle, ewcFORCE);
+
+ if (haveComputedEnergyAndVirial)
+ {
+ if (pme->doCoulomb)
+ {
+ if (pme_gpu_performs_solve(pme->gpu))
+ {
+ pme_gpu_get_energy_virial(pme->gpu, energy_q, vir_q);
+ }
+ else
+ {
+ get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy_q, vir_q);
+ }
+ }
+ else
+ {
+ *energy_q = 0;
+ }
+ }
+ /* No additional haveComputedForces code since forces are copied to the output host buffer with no transformation. */
+}
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
+#include "pme-internal.h"
+
#ifdef DEBUG_PME
#include "gromacs/fileio/pdbio.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/real.h"
-#include "pme-internal.h"
-
+struct gmx_pme_t;
/*! \brief
 * We allow coordinates to be outside the unit cell by up to 2 box lengths.
*/
constexpr int c_pmeNeighborUnitcellCount = 2*c_pmeMaxUnitcellShift + 1;
+struct pmegrid_t;
+struct pmegrids_t;
#if GMX_MPI
void
-gmx_sum_qgrid_dd(struct gmx_pme_t *pme, real *grid, int direction);
+gmx_sum_qgrid_dd(gmx_pme_t *pme, real *grid, int direction);
#endif
int
copy_pmegrid_to_fftgrid(const gmx_pme_t *pme, real *pmegrid, real *fftgrid, int grid_index);
int
-copy_fftgrid_to_pmegrid(struct gmx_pme_t *pme, const real *fftgrid, real *pmegrid, int grid_index,
+copy_fftgrid_to_pmegrid(gmx_pme_t *pme, const real *fftgrid, real *pmegrid, int grid_index,
int nthread, int thread);
void
wrap_periodic_pmegrid(const gmx_pme_t *pme, real *pmegrid);
void
-unwrap_periodic_pmegrid(struct gmx_pme_t *pme, real *pmegrid);
+unwrap_periodic_pmegrid(gmx_pme_t *pme, real *pmegrid);
void
pmegrid_init(pmegrid_t *grid,
/* This function is called from gmx_pme_do() only from debugging code
that is commented out. */
void
-dump_local_fftgrid(struct gmx_pme_t *pme, const real *fftgrid);
+dump_local_fftgrid(gmx_pme_t *pme, const real *fftgrid);
#endif
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/gmxmpi.h"
+#include "pme-gpu-types.h"
+
//! A repeat of typedef from parallel_3dfft.h
typedef struct gmx_parallel_3dfft *gmx_parallel_3dfft_t;
struct t_commrec;
struct t_inputrec;
+struct pme_gpu_t;
//@{
//! Grid indices for A state for charge and Lennard-Jones C6
} splinedata_t;
/*! \brief Data structure for coordinating transfer between PP and PME ranks*/
-typedef struct {
+struct pme_atomcomm_t {
int dimind; /* The index of the dimension, 0=x, 1=y */
int nslab;
int nodeid;
int *thread_idx; /* Which thread should spread which coefficient */
thread_plist_t *thread_plist;
splinedata_t *spline;
-} pme_atomcomm_t;
+};
/*! \brief Data structure for a single PME grid */
-typedef struct {
+struct pmegrid_t {
ivec ci; /* The spatial location of this grid */
ivec n; /* The used size of *grid, including order-1 */
ivec offset; /* The grid offset from the full node grid */
int order; /* PME spreading order */
ivec s; /* The allocated size of *grid, s >= n */
real *grid; /* The grid local thread, size n */
-} pmegrid_t;
+};
/*! \brief Data structures for PME grids */
-typedef struct {
+struct pmegrids_t {
pmegrid_t grid; /* The full node grid (non thread-local) */
int nthread; /* The number of threads operating on this grid */
ivec nc; /* The local spatial decomposition over the threads */
real *grid_all; /* Allocated array for the grids in *grid_th */
int *g2t[DIM]; /* The grid to thread index */
ivec nthread_comm; /* The number of threads to communicate with */
-} pmegrids_t;
+};
/*! \brief Data structure for spline-interpolation working buffers */
struct pme_spline_work;
struct pme_solve_work_t;
/*! \brief Master PME data structure */
-typedef struct gmx_pme_t {
+struct gmx_pme_t {
int ndecompdim; /* The number of decomposition dimensions */
int nodeid; /* Our nodeid in mpi->mpi_comm */
int nodeid_major;
real ewaldcoeff_lj; /* Ewald splitting coefficient for r^-6 */
real epsilon_r;
- class EwaldBoxZScaler *boxScaler; /*! The scaling data Ewald uses with walls (set at pme_init constant for the entire run) */
+
+ enum PmeRunMode runMode; /* Which codepath is the PME runner taking - CPU, GPU, mixed;
+ * TODO: this is the information that should be owned by the task scheduler,
+ * and ideally not be duplicated here.
+ */
+
+ pme_gpu_t *gpu; /* A pointer to the GPU data.
+ * TODO: this should be a unique or a shared pointer.
+ * Currently in practice there is a single gmx_pme_t instance, while the code
+ * is partially set up for many of them. The PME tuning calls gmx_pme_reinit(),
+ * which fully reinitializes the one and only PME structure anew, possibly
+ * keeping the old grid buffers if they were already large enough.
+ * This choice should be made explicit in the later refactoring:
+ * do we store many PME objects for different grid sizes,
+ * or a single PME object that handles different grid sizes gracefully?
+ */
+
+
+ class EwaldBoxZScaler *boxScaler; /**< The scaling data Ewald uses with walls (set at pme_init constant for the entire run) */
+
int ljpme_combination_rule; /* Type of combination rule in LJ-PME */
* This can probably be done in a better way
* but this simple hack works for now
*/
+
/* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
int pmegrid_nx, pmegrid_ny, pmegrid_nz;
/* pmegrid_nz might be larger than strictly necessary to ensure
/* Work data for sum_qgrid */
real * sum_qgrid_tmp;
real * sum_qgrid_dd_tmp;
-} t_gmx_pme_t;
+};
//! @endcond
+/*! \brief
+ * Finds out if PME is currently running on GPU.
+ * TODO: should this be removed eventually?
+ *
+ * \param[in] pme The PME structure.
+ * \returns True if PME runs on GPU currently, false otherwise.
+ */
+inline bool pme_gpu_active(const gmx_pme_t *pme)
+{
+ return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU);
+}
+
+/*! \brief Check restrictions on pme_order and the PME grid nkx,nky,nkz.
+ *
+ * With bFatal=TRUE, a fatal error is generated on violation and
+ * bValidSettings=NULL can be passed.
+ * With bFatal=FALSE, *bValidSettings reports the validity of the settings.
+ * bUseThreads tells if any MPI rank doing PME uses more than 1 thread.
+ * If bUseThreads is unknown at the time of calling, pass TRUE for
+ * conservative checking.
+ *
+ * TODO: the GPU restrictions are checked separately during pme_gpu_init().
+ */
+void gmx_pme_check_restrictions(int pme_order,
+ int nkx, int nky, int nkz,
+ int nnodes_major,
+ gmx_bool bUseThreads,
+ gmx_bool bFatal,
+ gmx_bool *bValidSettings);
+
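[Illustrative aside, not part of the patch: the two calling modes described above, assuming the input values are in scope.]

    gmx_bool settingsAreValid;
    // Query mode: report validity without aborting; pass TRUE for bUseThreads if unknown.
    gmx_pme_check_restrictions(pme_order, nkx, nky, nkz, nnodes_major,
                               TRUE, FALSE, &settingsAreValid);
    // Fatal mode: abort with a descriptive error on violation; bValidSettings may be NULL.
    gmx_pme_check_restrictions(pme_order, nkx, nky, nkz, nnodes_major,
                               TRUE, TRUE, nullptr);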
/*! \brief Initialize the PME-only side of the PME <-> PP communication */
gmx_pme_pp_t gmx_pme_pp_init(t_commrec *cr);
/*! \brief Called by PME-only ranks to receive coefficients and coordinates
*
- * The return value is used to control further processing, with meanings:
- * pmerecvqxX: all parameters set, chargeA and chargeB can be NULL
- * pmerecvqxFINISH: no parameters set
- * pmerecvqxSWITCHGRID: only grid_size and *ewaldcoeff are set
- * pmerecvqxRESETCOUNTERS: *step is set
+ * \param[in,out] pme_pp PME-PP communication structure.
+ * \param[out] natoms Number of received atoms.
+ * \param[out] chargeA State A charges, if received.
+ * \param[out] chargeB State B charges, if received.
+ * \param[out] sqrt_c6A State A coefficients, if received.
+ * \param[out] sqrt_c6B State B coefficients, if received.
+ * \param[out] sigmaA State A coefficients, if received.
+ * \param[out] sigmaB State B coefficients, if received.
+ * \param[out] box System box, if received.
+ * \param[out] x Atoms' coordinates, if received.
+ * \param[out] f Atoms' PME forces, if received.
+ * \param[out] maxshift_x Maximum shift in X direction, if received.
+ * \param[out] maxshift_y Maximum shift in Y direction, if received.
+ * \param[out] lambda_q Free-energy lambda for electrostatics, if received.
+ * \param[out] lambda_lj Free-energy lambda for Lennard-Jones, if received.
+ * \param[out] bEnerVir Set to true if this is an energy/virial calculation step, otherwise set to false.
+ * \param[out] step MD integration step number.
+ * \param[out] grid_size PME grid size, if received.
+ * \param[out] ewaldcoeff_q Ewald cut-off parameter for electrostatics, if received.
+ * \param[out] ewaldcoeff_lj Ewald cut-off parameter for Lennard-Jones, if received.
+ * \param[out] atomSetChanged Set to true only if the local domain atom data (charges/coefficients)
+ * has been received (after DD) and should be reinitialized. Otherwise not changed.
+ *
+ * \retval pmerecvqxX All parameters were set, chargeA and chargeB can be NULL.
+ * \retval pmerecvqxFINISH No parameters were set.
+ * \retval pmerecvqxSWITCHGRID Only grid_size and *ewaldcoeff were set.
+ * \retval pmerecvqxRESETCOUNTERS *step was set.
*/
int gmx_pme_recv_coeffs_coords(struct gmx_pme_pp *pme_pp,
int *natoms,
real *lambda_q, real *lambda_lj,
gmx_bool *bEnerVir,
gmx_int64_t *step,
- ivec grid_size, real *ewaldcoeff_q, real *ewaldcoeff_lj);
+ ivec grid_size,
+ real *ewaldcoeff_q,
+ real *ewaldcoeff_lj,
+ bool *atomSetChanged);
/*! \brief Send the PME mesh force, virial and energy to the PP-only nodes */
void gmx_pme_send_force_vir_ener(struct gmx_pme_pp *pme_pp,
if (!pme_lb->bSepPMERanks)
{
- if (pme_lb->setup[pme_lb->cur].pmedata == nullptr)
+ /* FIXME:
+ * CPU PME keeps a list of allocated pmedata structures, which is why pme_lb->setup[pme_lb->cur].pmedata is not always nullptr.
+ * GPU PME, however, currently needs gmx_pme_reinit to always be called on load balancing
+ * (pme_gpu_reinit might not be sufficiently decoupled from gmx_pme_init).
+ * This can lead to a lot of reallocations for PME GPU.
+ * It would be nicer if the allocated grid list were hidden within a single pmedata structure.
+ */
+ if ((pme_lb->setup[pme_lb->cur].pmedata == nullptr) || pme_gpu_task_enabled(pme_lb->setup[pme_lb->cur].pmedata))
{
/* Generate a new PME data structure,
* copying part of the old pointers.
int n_prev;
double cycles_prev;
- assert(pme_lb != NULL);
+ assert(pme_lb != nullptr);
if (!pme_lb->bActive)
{
pme->nky == grid_size[YY] &&
pme->nkz == grid_size[ZZ])
{
+ /* Here we have found an existing PME data structure that suits us.
+ * However, in the GPU case, we have to reinitialize it - there is only one GPU structure.
+ * This should not cause actual GPU reallocations (the allocated buffers are never shrunk);
+ * it only updates the grid sizes in the GPU kernel parameters.
+ */
+ gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size, ewaldcoeff_q, ewaldcoeff_lj);
*pme_ret = pme;
-
return;
}
do
{
/* Domain decomposition */
+ bool atomSetChanged = false;
ret = gmx_pme_recv_coeffs_coords(pme_pp,
&natoms,
&chargeA, &chargeB,
&lambda_q, &lambda_lj,
&bEnerVir,
&step,
- grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj);
+ grid_switch,
+ &ewaldcoeff_q,
+ &ewaldcoeff_lj,
+ &atomSetChanged);
if (ret == pmerecvqxSWITCHGRID)
{
gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, ewaldcoeff_q, ewaldcoeff_lj, cr, ir, &pme);
}
+ if (atomSetChanged)
+ {
+ gmx_pme_reinit_atoms(pme, natoms, chargeA);
+ }
+
if (ret == pmerecvqxRESETCOUNTERS)
{
/* Reset the cycle and flop counters */
gmx_int64_t *step,
ivec grid_size,
real *ewaldcoeff_q,
- real *ewaldcoeff_lj)
+ real *ewaldcoeff_lj,
+ bool *atomSetChanged)
{
int status = -1;
int nat = 0;
if (cnb.flags & (PP_PME_CHARGE | PP_PME_SQRTC6 | PP_PME_SIGMA))
{
+ *atomSetChanged = true;
+
/* Receive the send counts from the other PP nodes */
for (int sender = 0; sender < pme_pp->nnode; sender++)
{
GMX_UNUSED_VALUE(grid_size);
GMX_UNUSED_VALUE(ewaldcoeff_q);
GMX_UNUSED_VALUE(ewaldcoeff_lj);
+ GMX_UNUSED_VALUE(atomSetChanged);
status = pmerecvqxX;
#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements PME GPU timing events in CUDA.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#include "gmxpre.h"
+
+#include "pme-timings.cuh"
+
+#include "gromacs/utility/gmxassert.h"
+
+#include "pme.cuh"
+
+/*! \brief \internal
+ * Tells if CUDA-based performance tracking is enabled for PME.
+ *
+ * \param[in] pmeGPU The PME GPU data structure.
+ * \returns True if timings are enabled, false otherwise.
+ */
+gmx_inline bool pme_gpu_timings_enabled(const pme_gpu_t *pmeGPU)
+{
+ return pmeGPU->archSpecific->useTiming;
+}
+
+void pme_gpu_start_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId)
+{
+ if (pme_gpu_timings_enabled(pmeGPU))
+ {
+ GMX_ASSERT(PMEStageId < pmeGPU->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index");
+ pmeGPU->archSpecific->timingEvents[PMEStageId].openTimingRegion(pmeGPU->archSpecific->pmeStream);
+ }
+}
+
+void pme_gpu_stop_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId)
+{
+ if (pme_gpu_timings_enabled(pmeGPU))
+ {
+ GMX_ASSERT(PMEStageId < pmeGPU->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index");
+ pmeGPU->archSpecific->timingEvents[PMEStageId].closeTimingRegion(pmeGPU->archSpecific->pmeStream);
+ }
+}
+
+void pme_gpu_get_timings(const pme_gpu_t *pmeGPU, gmx_wallclock_gpu_pme_t *timings)
+{
+ if (pme_gpu_timings_enabled(pmeGPU))
+ {
+ GMX_RELEASE_ASSERT(timings, "Null GPU timing pointer");
+ for (size_t i = 0; i < pmeGPU->archSpecific->timingEvents.size(); i++)
+ {
+ timings->timing[i].t = pmeGPU->archSpecific->timingEvents[i].getTotalTime();
+ timings->timing[i].c = pmeGPU->archSpecific->timingEvents[i].getCallCount();
+ }
+ }
+}
+
+void pme_gpu_update_timings(const pme_gpu_t *pmeGPU)
+{
+ if (pme_gpu_timings_enabled(pmeGPU))
+ {
+ for (size_t i = 0; i < pmeGPU->archSpecific->timingEvents.size(); i++)
+ {
+ pmeGPU->archSpecific->timingEvents[i].getLastRangeTime();
+ }
+ }
+}
+
+void pme_gpu_reset_timings(const pme_gpu_t *pmeGPU)
+{
+ if (pme_gpu_timings_enabled(pmeGPU))
+ {
+ for (size_t i = 0; i < pmeGPU->archSpecific->timingEvents.size(); i++)
+ {
+ pmeGPU->archSpecific->timingEvents[i].reset();
+ }
+ }
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Defines PME GPU timing functions.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#ifndef GMX_EWALD_PME_TIMINGS_CUH
+#define GMX_EWALD_PME_TIMINGS_CUH
+
+#include "gromacs/gpu_utils/gpuregiontimer.cuh"
+#include "gromacs/timing/gpu_timing.h" // TODO: move include to the source files
+
+struct pme_gpu_t;
+
+/*! \libinternal \brief
+ * Starts timing a given PME GPU stage during a single step (if timings are enabled).
+ *
+ * \param[in] pmeGPU The PME GPU data structure.
+ * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h
+ */
+void pme_gpu_start_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId);
+
+/*! \libinternal \brief
+ * Stops timing a given PME GPU stage during a single step (if timings are enabled).
+ *
+ * \param[in] pmeGPU The PME GPU data structure.
+ * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h
+ */
+void pme_gpu_stop_timing(const pme_gpu_t *pmeGPU, size_t PMEStageId);
+
+#endif
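[Illustrative aside, not part of the patch: bracketing a PME GPU stage with the calls declared above; gtPME_SPREAD is assumed to be one of the gtPME_ stage indices from src/gromacs/timing/gpu_timing.h.]

    pme_gpu_start_timing(pmeGPU, gtPME_SPREAD);
    // ... enqueue the spreading kernel(s) into pmeGPU's stream here ...
    pme_gpu_stop_timing(pmeGPU, gtPME_SPREAD);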
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/gmxomp.h"
+#include "gromacs/utility/logger.h"
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"
#include "calculate-spline-moduli.h"
#include "pme-gather.h"
+#include "pme-gpu-internal.h"
#include "pme-grid.h"
#include "pme-internal.h"
#include "pme-redistribute.h"
return (enumerator + denominator - 1)/denominator;
}
-int gmx_pme_init(struct gmx_pme_t **pmedata,
- t_commrec * cr,
- int nnodes_major,
- int nnodes_minor,
- const t_inputrec * ir,
- int homenr,
- gmx_bool bFreeEnergy_q,
- gmx_bool bFreeEnergy_lj,
- gmx_bool bReproducible,
- real ewaldcoeff_q,
- real ewaldcoeff_lj,
- int nthread)
+int gmx_pme_init(struct gmx_pme_t **pmedata,
+ t_commrec *cr,
+ int nnodes_major,
+ int nnodes_minor,
+ const t_inputrec *ir,
+ int homenr,
+ gmx_bool bFreeEnergy_q,
+ gmx_bool bFreeEnergy_lj,
+ gmx_bool bReproducible,
+ real ewaldcoeff_q,
+ real ewaldcoeff_lj,
+ int nthread,
+ PmeRunMode runMode,
+ pme_gpu_t *pmeGPU,
+ gmx_device_info_t *gpuInfo,
+ const gmx::MDLogger &mdlog)
{
int use_threads, sum_use_threads, i;
ivec ndata;
snew(pme->bsp_mod[YY], pme->nky);
snew(pme->bsp_mod[ZZ], pme->nkz);
+ pme->gpu = pmeGPU; /* Carrying over the single GPU structure */
+ pme->runMode = runMode;
+
/* The required size of the interpolation grid, including overlap.
* The allocated size (pmegrid_n?) might be slightly larger.
*/
pme->pmegrid_nz_base = pme->nkz;
pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
-
pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
pme->pmegrid_start_iz = 0;
pme->lb_buf2 = nullptr;
pme->lb_buf_nalloc = 0;
+ pme_gpu_reinit(pme.get(), gpuInfo, mdlog, cr);
+
pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
// no exception was thrown during the init, so we hand over the PME structure handle
try
{
+ const gmx::MDLogger dummyLogger;
+ // This is a reinit, which currently only changes the grid size/coefficients,
+ // so we don't expect any actual logging here.
+ // TODO: when PME is an object, it should take reference to mdlog on construction and save it.
ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
- &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread);
+ &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, ewaldcoeff_q, ewaldcoeff_lj,
+ pme_src->nthread, pme_src->runMode, pme_src->gpu, nullptr, dummyLogger);
+ // TODO: this is mostly passing around the current values
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
real *dvdlambda_q, real *dvdlambda_lj,
int flags)
{
+ GMX_ASSERT(pme->runMode == PmeRunMode::CPU, "gmx_pme_do should not be called during a GPU PME run.");
+
int d, i, j, npme, grid_index, max_grid_index;
int n_d;
pme_atomcomm_t *atc = nullptr;
sfree(pme->sum_qgrid_tmp);
sfree(pme->sum_qgrid_dd_tmp);
+ if (pme_gpu_active(pme) && pme->gpu)
+ {
+ pme_gpu_destroy(pme->gpu);
+ }
+
sfree(pme);
}
+
+void gmx_pme_reinit_atoms(const gmx_pme_t *pme, const int nAtoms, const real *charges)
+{
+ if (pme_gpu_active(pme))
+ {
+ pme_gpu_reinit_atoms(pme->gpu, nAtoms, charges);
+ }
+ // TODO: handle the CPU case here; handle the whole t_mdatoms
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief This file contains internal CUDA function implementations
+ * for performing the PME calculations on GPU.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#include "gmxpre.h"
+
+#include <cmath>
+
+/* The rest */
+#include "pme.h"
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/pmalloc_cuda.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "pme.cuh"
+#include "pme-3dfft.cuh"
+#include "pme-grid.h"
+
+int pme_gpu_get_atom_data_alignment(const pme_gpu_t *pmeGPU)
+{
+ const int order = pmeGPU->common->pme_order;
+ GMX_RELEASE_ASSERT(order > 0, "Invalid PME order");
+ return PME_ATOM_DATA_ALIGNMENT;
+}
+
+int pme_gpu_get_atom_spline_data_alignment(const pme_gpu_t *pmeGPU)
+{
+ const int order = pmeGPU->common->pme_order;
+ GMX_RELEASE_ASSERT(order > 0, "Invalid PME order");
+ return PME_SPREADGATHER_ATOMS_PER_WARP;
+}
+
+void pme_gpu_synchronize(const pme_gpu_t *pmeGPU)
+{
+ cudaError_t stat = cudaStreamSynchronize(pmeGPU->archSpecific->pmeStream);
+ CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!");
+}
+
+void pme_gpu_alloc_energy_virial(const pme_gpu_t *pmeGPU)
+{
+ const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
+ cudaError_t stat = cudaMalloc((void **)&pmeGPU->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize);
+ CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial");
+ pmalloc((void **)&pmeGPU->staging.h_virialAndEnergy, energyAndVirialSize);
+}
+
+void pme_gpu_free_energy_virial(pme_gpu_t *pmeGPU)
+{
+ cudaError_t stat = cudaFree(pmeGPU->kernelParams->constants.d_virialAndEnergy);
+ CU_RET_ERR(stat, "cudaFree failed on PME energy and virial");
+ pmeGPU->kernelParams->constants.d_virialAndEnergy = nullptr;
+ pfree(pmeGPU->staging.h_virialAndEnergy);
+ pmeGPU->staging.h_virialAndEnergy = nullptr;
+}
+
+void pme_gpu_clear_energy_virial(const pme_gpu_t *pmeGPU)
+{
+ cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->constants.d_virialAndEnergy, 0,
+ c_virialAndEnergyCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
+ CU_RET_ERR(stat, "PME energy/virial cudaMemsetAsync error");
+}
+
+void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *pmeGPU)
+{
+ const int splineValuesOffset[DIM] = {
+ 0,
+ pmeGPU->kernelParams->grid.realGridSize[XX],
+ pmeGPU->kernelParams->grid.realGridSize[XX] + pmeGPU->kernelParams->grid.realGridSize[YY]
+ };
+ memcpy((void *)&pmeGPU->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));
+
+ const int newSplineValuesSize = pmeGPU->kernelParams->grid.realGridSize[XX] +
+ pmeGPU->kernelParams->grid.realGridSize[YY] +
+ pmeGPU->kernelParams->grid.realGridSize[ZZ];
+ const bool shouldRealloc = (newSplineValuesSize > pmeGPU->archSpecific->splineValuesSize);
+ cu_realloc_buffered((void **)&pmeGPU->kernelParams->grid.d_splineModuli, nullptr, sizeof(float),
+ &pmeGPU->archSpecific->splineValuesSize, &pmeGPU->archSpecific->splineValuesSizeAlloc, newSplineValuesSize, pmeGPU->archSpecific->pmeStream, true);
+ if (shouldRealloc)
+ {
+ /* Reallocate the host buffer */
+ pfree(pmeGPU->staging.h_splineModuli);
+ pmalloc((void **)&pmeGPU->staging.h_splineModuli, newSplineValuesSize * sizeof(float));
+ }
+ for (int i = 0; i < DIM; i++)
+ {
+ memcpy(pmeGPU->staging.h_splineModuli + splineValuesOffset[i], pmeGPU->common->bsp_mod[i].data(), pmeGPU->common->bsp_mod[i].size() * sizeof(float));
+ }
+ /* TODO: pin original buffer instead! */
+ cu_copy_H2D_async(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
+ newSplineValuesSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
+}
+
+void pme_gpu_free_bspline_values(const pme_gpu_t *pmeGPU)
+{
+ pfree(pmeGPU->staging.h_splineModuli);
+ cu_free_buffered(pmeGPU->kernelParams->grid.d_splineModuli, &pmeGPU->archSpecific->splineValuesSize,
+ &pmeGPU->archSpecific->splineValuesSizeAlloc);
+}
+
+void pme_gpu_realloc_forces(const pme_gpu_t *pmeGPU)
+{
+ const size_t newForcesSize = pmeGPU->nAtomsAlloc * DIM;
+ GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
+ cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_forces, nullptr, sizeof(float),
+ &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc, newForcesSize, pmeGPU->archSpecific->pmeStream, true);
+}
+
+void pme_gpu_free_forces(const pme_gpu_t *pmeGPU)
+{
+ cu_free_buffered(pmeGPU->kernelParams->atoms.d_forces, &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc);
+}
+
+void pme_gpu_copy_input_forces(const pme_gpu_t *pmeGPU, const float *h_forces)
+{
+ GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
+ const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
+ GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
+ cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->archSpecific->pmeStream);
+}
+
+void pme_gpu_copy_output_forces(const pme_gpu_t *pmeGPU, float *h_forces)
+{
+ GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
+ const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
+ GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
+ cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream);
+ cudaError_t stat = cudaEventRecord(pmeGPU->archSpecific->syncForcesD2H, pmeGPU->archSpecific->pmeStream);
+ CU_RET_ERR(stat, "PME gather forces synchronization failure");
+}
+
+void pme_gpu_sync_output_forces(const pme_gpu_t *pmeGPU)
+{
+ cudaStream_t s = pmeGPU->archSpecific->pmeStream;
+ cudaError_t stat = cudaStreamWaitEvent(s, pmeGPU->archSpecific->syncForcesD2H, 0);
+ CU_RET_ERR(stat, "Error while waiting for the PME GPU forces");
+}
+
+void pme_gpu_realloc_coordinates(const pme_gpu_t *pmeGPU)
+{
+ const size_t newCoordinatesSize = pmeGPU->nAtomsAlloc * DIM;
+ GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
+ cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coordinates, nullptr, sizeof(float),
+ &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc, newCoordinatesSize, pmeGPU->archSpecific->pmeStream, true);
+ if (c_usePadding)
+ {
+ const size_t paddingIndex = DIM * pmeGPU->kernelParams->atoms.nAtoms;
+ const size_t paddingCount = DIM * pmeGPU->nAtomsAlloc - paddingIndex;
+ if (paddingCount > 0)
+ {
+ cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coordinates + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
+ CU_RET_ERR(stat, "PME failed to clear the padded coordinates");
+ }
+ }
+}
+
+void pme_gpu_copy_input_coordinates(const pme_gpu_t *pmeGPU, const rvec *h_coordinates)
+{
+ GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU");
+#if GMX_DOUBLE
+ GMX_RELEASE_ASSERT(false, "Only single precision is supported");
+ GMX_UNUSED_VALUE(h_coordinates);
+#else
+ cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
+ pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->archSpecific->pmeStream);
+#endif
+}
+
+void pme_gpu_free_coordinates(const pme_gpu_t *pmeGPU)
+{
+ cu_free_buffered(pmeGPU->kernelParams->atoms.d_coordinates, &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc);
+}
+
+void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *pmeGPU, const float *h_coefficients)
+{
+ GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
+ const size_t newCoefficientsSize = pmeGPU->nAtomsAlloc;
+ GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
+ cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coefficients, nullptr, sizeof(float),
+ &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc,
+ newCoefficientsSize, pmeGPU->archSpecific->pmeStream, true);
+ cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
+ pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->archSpecific->pmeStream);
+ if (c_usePadding)
+ {
+ const size_t paddingIndex = pmeGPU->kernelParams->atoms.nAtoms;
+ const size_t paddingCount = pmeGPU->nAtomsAlloc - paddingIndex;
+ if (paddingCount > 0)
+ {
+ cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coefficients + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
+ CU_RET_ERR(stat, "PME failed to clear the padded charges");
+ }
+ }
+}
+
+void pme_gpu_free_coefficients(const pme_gpu_t *pmeGPU)
+{
+ cu_free_buffered(pmeGPU->kernelParams->atoms.d_coefficients, &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc);
+}
+
+void pme_gpu_realloc_spline_data(const pme_gpu_t *pmeGPU)
+{
+ const int order = pmeGPU->common->pme_order;
+ const int alignment = pme_gpu_get_atom_spline_data_alignment(pmeGPU);
+ const size_t nAtomsPadded = ((pmeGPU->nAtomsAlloc + alignment - 1) / alignment) * alignment;
+ const int newSplineDataSize = DIM * order * nAtomsPadded;
+ GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
+ /* Two arrays of the same size */
+ const bool shouldRealloc = (newSplineDataSize > pmeGPU->archSpecific->splineDataSize);
+ int currentSizeTemp = pmeGPU->archSpecific->splineDataSize;
+ int currentSizeTempAlloc = pmeGPU->archSpecific->splineDataSizeAlloc;
+ cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_theta, nullptr, sizeof(float),
+ &currentSizeTemp, &currentSizeTempAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true);
+ cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_dtheta, nullptr, sizeof(float),
+ &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true);
+ // the host side reallocation
+ if (shouldRealloc)
+ {
+ pfree(pmeGPU->staging.h_theta);
+ pmalloc((void **)&pmeGPU->staging.h_theta, newSplineDataSize * sizeof(float));
+ pfree(pmeGPU->staging.h_dtheta);
+ pmalloc((void **)&pmeGPU->staging.h_dtheta, newSplineDataSize * sizeof(float));
+ }
+}
+
+void pme_gpu_free_spline_data(const pme_gpu_t *pmeGPU)
+{
+ /* Two arrays of the same size */
+ cu_free_buffered(pmeGPU->kernelParams->atoms.d_theta);
+ cu_free_buffered(pmeGPU->kernelParams->atoms.d_dtheta, &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc);
+ pfree(pmeGPU->staging.h_theta);
+ pfree(pmeGPU->staging.h_dtheta);
+}
+
+void pme_gpu_realloc_grid_indices(const pme_gpu_t *pmeGPU)
+{
+ const size_t newIndicesSize = DIM * pmeGPU->nAtomsAlloc;
+ GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
+ cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_gridlineIndices, nullptr, sizeof(int),
+ &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc, newIndicesSize, pmeGPU->archSpecific->pmeStream, true);
+ pfree(pmeGPU->staging.h_gridlineIndices);
+ pmalloc((void **)&pmeGPU->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
+}
+
+void pme_gpu_free_grid_indices(const pme_gpu_t *pmeGPU)
+{
+ cu_free_buffered(pmeGPU->kernelParams->atoms.d_gridlineIndices, &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc);
+ pfree(pmeGPU->staging.h_gridlineIndices);
+}
+
+void pme_gpu_realloc_grids(pme_gpu_t *pmeGPU)
+{
+ auto *kernelParamsPtr = pmeGPU->kernelParams.get();
+ const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] *
+ kernelParamsPtr->grid.realGridSizePadded[YY] *
+ kernelParamsPtr->grid.realGridSizePadded[ZZ];
+ const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] *
+ kernelParamsPtr->grid.complexGridSizePadded[YY] *
+ kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
+ // Multiplied by 2 because the complex grid size is counted in complex numbers, while all allocations/pointers are float
+ if (pmeGPU->archSpecific->performOutOfPlaceFFT)
+ {
+ /* 2 separate grids */
+ cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fourierGrid, nullptr, sizeof(float),
+ &pmeGPU->archSpecific->complexGridSize, &pmeGPU->archSpecific->complexGridSizeAlloc,
+ newComplexGridSize, pmeGPU->archSpecific->pmeStream, true);
+ cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
+ &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc,
+ newRealGridSize, pmeGPU->archSpecific->pmeStream, true);
+ }
+ else
+ {
+ /* A single buffer so that any grid will fit */
+ const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
+ cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
+ &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc,
+ newGridsSize, pmeGPU->archSpecific->pmeStream, true);
+ kernelParamsPtr->grid.d_fourierGrid = kernelParamsPtr->grid.d_realGrid;
+ pmeGPU->archSpecific->complexGridSize = pmeGPU->archSpecific->realGridSize;
+ // the size might get used later for copying the grid
+ }
+}
+
+void pme_gpu_free_grids(const pme_gpu_t *pmeGPU)
+{
+ if (pmeGPU->archSpecific->performOutOfPlaceFFT)
+ {
+ cu_free_buffered(pmeGPU->kernelParams->grid.d_fourierGrid);
+ }
+ cu_free_buffered(pmeGPU->kernelParams->grid.d_realGrid,
+ &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc);
+}
+
+void pme_gpu_clear_grids(const pme_gpu_t *pmeGPU)
+{
+ cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->grid.d_realGrid, 0,
+ pmeGPU->archSpecific->realGridSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
+ /* Should the complex grid be cleared in some weird case? */
+ CU_RET_ERR(stat, "cudaMemsetAsync on the PME grid error");
+}
+
+void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU)
+{
+ cudaStream_t stream = pmeGPU->archSpecific->pmeStream;
+ auto *kernelParamsPtr = pmeGPU->kernelParams.get();
+
+ const int nx = kernelParamsPtr->grid.realGridSize[XX];
+ const int ny = kernelParamsPtr->grid.realGridSize[YY];
+ const int nz = kernelParamsPtr->grid.realGridSize[ZZ];
+ const int cellCount = c_pmeNeighborUnitcellCount;
+ const int gridDataOffset[DIM] = {0, cellCount * nx, cellCount * (nx + ny)};
+
+ memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));
+
+ const int newFractShiftsSize = cellCount * (nx + ny + nz);
+
+ /* Two arrays, same size */
+ int currentSizeTemp = pmeGPU->archSpecific->fractShiftsSize;
+ int currentSizeTempAlloc = pmeGPU->archSpecific->fractShiftsSizeAlloc;
+ cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fractShiftsTable, nullptr, sizeof(float),
+ &currentSizeTemp, &currentSizeTempAlloc,
+ newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true);
+ float *fractShiftsArray = kernelParamsPtr->grid.d_fractShiftsTable;
+ cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_gridlineIndicesTable, nullptr, sizeof(int),
+ &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc,
+ newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true);
+ int *gridlineIndicesArray = kernelParamsPtr->grid.d_gridlineIndicesTable;
+
+ /* TODO: pinning of the pmeGPU->common->fsh/nn host-side arrays */
+
+ for (int i = 0; i < DIM; i++)
+ {
+ kernelParamsPtr->grid.tablesOffsets[i] = gridDataOffset[i];
+ cu_copy_H2D_async(fractShiftsArray + gridDataOffset[i], pmeGPU->common->fsh[i].data(),
+ cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(float), stream);
+ cu_copy_H2D_async(gridlineIndicesArray + gridDataOffset[i], pmeGPU->common->nn[i].data(),
+ cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(int), stream);
+ }
+
+ pme_gpu_make_fract_shifts_textures(pmeGPU);
+}
+
+void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU)
+{
+ pme_gpu_free_fract_shifts_textures(pmeGPU);
+ /* Two arrays, same size */
+ cu_free_buffered(pmeGPU->kernelParams->grid.d_fractShiftsTable);
+ cu_free_buffered(pmeGPU->kernelParams->grid.d_gridlineIndicesTable, &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc);
+}
+
+void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU)
+{
+ cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncEnerVirD2H, 0);
+ CU_RET_ERR(stat, "Error while waiting for PME solve output");
+
+ for (int j = 0; j < c_virialAndEnergyCount; j++)
+ {
+ GMX_ASSERT(std::isfinite(pmeGPU->staging.h_virialAndEnergy[j]), "PME GPU produces incorrect energy/virial.");
+ }
+}
+
+void pme_gpu_copy_input_gather_grid(const pme_gpu_t *pmeGpu, float *h_grid)
+{
+ const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
+ cu_copy_H2D_async(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->archSpecific->pmeStream);
+}
+
+void pme_gpu_copy_output_spread_grid(const pme_gpu_t *pmeGpu, float *h_grid)
+{
+ const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
+ cu_copy_D2H_async(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->archSpecific->pmeStream);
+ cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
+ CU_RET_ERR(stat, "PME spread grid sync event record failure");
+}
+
+void pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU)
+{
+ cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSpreadGridD2H, 0);
+ CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
+}
+
+void pme_gpu_sync_spline_atom_data(const pme_gpu_t *pmeGPU)
+{
+ cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSplineAtomDataD2H, 0);
+ CU_RET_ERR(stat, "Error while waiting for the PME GPU atom data to be copied to the host");
+}
+
+void pme_gpu_sync_solve_grid(const pme_gpu_t *pmeGPU)
+{
+ cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSolveGridD2H, 0);
+ CU_RET_ERR(stat, "Error while waiting for the PME GPU solve grid to be copied to the host");
+ // TODO: should check for pme_gpu_performs_solve(pmeGPU)
+}
+
+void pme_gpu_init_internal(pme_gpu_t *pmeGPU)
+{
+ /* Allocate the target-specific structures */
+ pmeGPU->archSpecific.reset(new pme_gpu_specific_t());
+ pmeGPU->kernelParams.reset(new pme_gpu_kernel_params_t());
+
+ pmeGPU->archSpecific->performOutOfPlaceFFT = true;
+ /* This should give better performance, according to the cuFFT documentation.
+ * The performance seems to be the same though.
+ * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
+ */
+
+ pmeGPU->archSpecific->useTiming = (getenv("GMX_DISABLE_CUDA_TIMING") == nullptr) &&
+ (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
+ /* TODO: multiple CUDA streams on the same GPU cause nonsense cudaEvent_t timings.
+ * This should probably also check for gpuId exclusivity?
+ */
+
+ /* Creating a PME CUDA stream */
+ cudaError_t stat;
+ int highest_priority, lowest_priority;
+ stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
+ CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
+ stat = cudaStreamCreateWithPriority(&pmeGPU->archSpecific->pmeStream,
+ cudaStreamDefault, //cudaStreamNonBlocking,
+ highest_priority);
+ CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
+}
+
+void pme_gpu_destroy_specific(const pme_gpu_t *pmeGPU)
+{
+ /* Destroy the CUDA stream */
+ cudaError_t stat = cudaStreamDestroy(pmeGPU->archSpecific->pmeStream);
+ CU_RET_ERR(stat, "PME cudaStreamDestroy error");
+}
+
+void pme_gpu_init_sync_events(const pme_gpu_t *pmeGPU)
+{
+ cudaError_t stat;
+ stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, cudaEventDisableTiming);
+ CU_RET_ERR(stat, "cudaEventCreate on syncEnerVirD2H failed");
+ stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, cudaEventDisableTiming);
+ CU_RET_ERR(stat, "cudaEventCreate on syncForcesD2H failed");
+ stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, cudaEventDisableTiming);
+ CU_RET_ERR(stat, "cudaEventCreate on syncSpreadGridD2H failed");
+ stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, cudaEventDisableTiming);
+ CU_RET_ERR(stat, "cudaEventCreate on syncSplineAtomDataD2H failed");
+ stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, cudaEventDisableTiming);
+ CU_RET_ERR(stat, "cudaEventCreate on syncSolveGridD2H failed");
+}
+
+void pme_gpu_destroy_sync_events(const pme_gpu_t *pmeGPU)
+{
+ cudaError_t stat;
+ stat = cudaEventDestroy(pmeGPU->archSpecific->syncEnerVirD2H);
+ CU_RET_ERR(stat, "cudaEventDestroy failed on syncEnerVirD2H");
+ stat = cudaEventDestroy(pmeGPU->archSpecific->syncForcesD2H);
+ CU_RET_ERR(stat, "cudaEventDestroy failed on syncForcesD2H");
+ stat = cudaEventDestroy(pmeGPU->archSpecific->syncSpreadGridD2H);
+ CU_RET_ERR(stat, "cudaEventDestroy failed on syncSpreadGridD2H");
+ stat = cudaEventDestroy(pmeGPU->archSpecific->syncSplineAtomDataD2H);
+ CU_RET_ERR(stat, "cudaEventDestroy failed on syncSplineAtomDataD2H");
+ stat = cudaEventDestroy(pmeGPU->archSpecific->syncSolveGridD2H);
+ CU_RET_ERR(stat, "cudaEventDestroy failed on syncSolveGridD2H");
+}
+
+void pme_gpu_reinit_3dfft(const pme_gpu_t *pmeGPU)
+{
+ if (pme_gpu_performs_FFT(pmeGPU))
+ {
+ pmeGPU->archSpecific->fftSetup.resize(0);
+ for (int i = 0; i < pmeGPU->common->ngrids; i++)
+ {
+ pmeGPU->archSpecific->fftSetup.push_back(std::unique_ptr<GpuParallel3dFft>(new GpuParallel3dFft(pmeGPU)));
+ }
+ }
+}
+
+void pme_gpu_destroy_3dfft(const pme_gpu_t *pmeGPU)
+{
+ pmeGPU->archSpecific->fftSetup.resize(0);
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief This file defines the PME CUDA-specific data structure,
+ * various compile-time constants shared among the PME CUDA kernels,
+ * and also names some PME CUDA memory management routines.
+ * TODO: consider changing defines into variables where possible; have inline getters.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#ifndef GMX_EWALD_PME_CUH
+#define GMX_EWALD_PME_CUH
+
+#include <cassert>
+
+#include <array>
+
+#include "gromacs/gpu_utils/cuda_arch_utils.cuh" // for warp_size
+
+#include "pme-gpu-internal.h" // for the general PME GPU behaviour defines
+#include "pme-timings.cuh"
+
+class GpuParallel3dFft;
+
+/* Some defines for PME behaviour follow */
+
+/*
+ Here is a current memory layout for the theta/dtheta B-spline float parameter arrays.
+ This is the data in global memory used by both the spreading and gathering kernels (with the same scheduling).
+ This example has PME order 4 and 2 particles per warp/data chunk.
+ Each particle has 16 threads assigned to it, each thread works on 4 non-sequential global grid contributions.
+
+ ----------------------------------------------------------------------------
+ particles 0, 1 | particles 2, 3 | ...
+ ----------------------------------------------------------------------------
+ order index 0 | index 1 | index 2 | index 3 | order index 0 .....
+ ----------------------------------------------------------------------------
+ tx0 tx1 ty0 ty1 tz0 tz1 | ..........
+ ----------------------------------------------------------------------------
+
+ Each data chunk for a single warp is 24 floats. This goes both for theta and dtheta.
+ 24 = 2 particles per warp * order 4 * 3 dimensions. 48 floats (1.5 warp size) per warp in total.
+ I have also tried intertwining theta and dtheta in a single array (they are used in pairs in the gathering stage anyway)
+ and it did not seem to make a performance difference.
+
+ The corresponding defines follow.
+ */
+
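[Illustrative aside, not part of the patch: the index arithmetic implied by the diagram above, for order 4 and 2 atoms per warp; thetaIndex and its parameters are hypothetical names.]

    __device__ __forceinline__ int thetaIndex(int warpIndex, int orderIndex,
                                              int dimIndex, int atomInWarp)
    {
        const int atomsPerWarp = 2; // PME_SPREADGATHER_ATOMS_PER_WARP for order 4
        const int order        = 4;
        return warpIndex * (atomsPerWarp * order * DIM) // 24 floats per warp chunk
               + orderIndex * (DIM * atomsPerWarp)      // which of the 4 order indices
               + dimIndex * atomsPerWarp                // the tx../ty../tz.. group
               + atomInWarp;                            // particle 0 or 1 within the pair
    }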
+/* This is the distance between neighbouring theta elements - it would be 2 for the intertwined layout */
+#define PME_SPLINE_THETA_STRIDE 1
+
+/*! \brief
+ * The number of GPU threads used for computing spread/gather contributions of a single atom as a function of the PME order.
+ * The assumption is currently that any thread processes only a single atom's contributions.
+ */
+#define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
+
+//! The spread/gather integer constant which depends on the templated order parameter (2 atoms per warp for order == 4)
+#define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
+
+//! Atom data alignment - has to be divisible by both the spread and gather maximal atoms-per-block counts,
+//! which is asserted if atom data padding is used at all.
+#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP)
+
+/*! \brief \internal
+ * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
+ *
+ * \param[in] atomDataIndex The atom data index.
+ * \param[in] nAtomData The atom data array element count.
+ * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
+ *
+ * This is called from the spline_and_spread and gather PME kernels.
+ * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding enabled.
+ */
+int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
+{
+ return c_usePadding ? 1 : (atomDataIndex < nAtomData);
+}
+
+/*! \brief \internal
+ * An inline CUDA function for skipping the zero-charge atoms.
+ *
+ * \returns Non-0 if atom should be processed, 0 otherwise.
+ * \param[in] coefficient The atom charge.
+ *
+ * This is called from the spline_and_spread and gather PME kernels.
+ */
+int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
+{
+ assert(isfinite(coefficient));
+ return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+}
+
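[Illustrative aside, not part of the patch: how a spread/gather kernel body is expected to combine the two checks above; exampleSpreadBody is a hypothetical name, and the kernel is assumed to be launched over the padded atom range so the coefficient read stays in bounds (the padding is zero-filled).]

    __global__ void exampleSpreadBody(const float *coefficients, int nAtoms)
    {
        const int atomIndex = blockIdx.x * blockDim.x + threadIdx.x;
        if (pme_gpu_check_atom_data_index(atomIndex, nAtoms) &&
            pme_gpu_check_atom_charge(coefficients[atomIndex]))
        {
            // ... compute this atom's grid contributions ...
        }
    }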
+/*! \brief \internal
+ * The main PME CUDA-specific host data structure, included in the PME GPU structure by the archSpecific pointer.
+ */
+struct pme_gpu_cuda_t
+{
+ /*! \brief The CUDA stream where everything related to the PME happens. */
+ cudaStream_t pmeStream;
+
+ /* Synchronization events */
+ /*! \brief Triggered after the energy/virial have been copied to the host (after the solving stage). */
+ cudaEvent_t syncEnerVirD2H;
+ /*! \brief Triggered after the output forces have been copied to the host (after the gathering stage). */
+ cudaEvent_t syncForcesD2H;
+ /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
+ cudaEvent_t syncSpreadGridD2H;
+ /*! \brief Triggered after the atom spline data has been copied to the host (after the spline computation). */
+ cudaEvent_t syncSplineAtomDataD2H;
+ /*! \brief Triggered after the grid has been copied to the host (after the solving stage). */
+ cudaEvent_t syncSolveGridD2H;
+
+ // TODO: consider moving some things below into the non-CUDA struct.
+
+ /* Settings which are set at the start of the run */
+ /*! \brief A boolean which tells whether the complex and real grids for cuFFT are different or the same. Currently true. */
+ bool performOutOfPlaceFFT;
+ /*! \brief A boolean which tells if the CUDA timing events are enabled.
+ * True by default, disabled by setting the environment variable GMX_DISABLE_CUDA_TIMING.
+ * FIXME: this should also be disabled if any other GPU task is running concurrently on the same device,
+ * as CUDA events on multiple streams are untrustworthy.
+ */
+ bool useTiming;
+
+ /*! \brief The cuFFT wrapper objects for performing the 3D real-to-complex/complex-to-real FFTs. */
+ std::vector<std::unique_ptr<GpuParallel3dFft > > fftSetup;
+
+ /*! \brief The timing events for the PME GPU stages, indexed by the gtPME enum. */
+ std::array<GpuRegionTimer, gtPME_EVENT_COUNT> timingEvents;
+
+ /* GPU array element counts (not the array sizes in bytes!).
+ * They might be larger than the actual meaningful data sizes.
+ * These come in pairs: the actual element count + the maximum element count that can fit in the currently allocated memory.
+ * These integer pairs are mostly meaningful for the cu_realloc/free_buffered calls.
+ * As such, if cu_realloc/free_buffered is refactored, they can be freely changed, too.
+ * The only exceptions are realGridSize and complexGridSize, which are also used for grid clearing/copying.
+ * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well.
+ */
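+ /* For example, a (hypothetical) reallocation pattern using one such pair could be:
+ *
+ *    if (newSize > forcesSizeAlloc)
+ *    {
+ *        // free the old buffer and over-allocate to amortize future growth
+ *        forcesSizeAlloc = newSize * someOverallocationFactor;
+ *    }
+ *    forcesSize = newSize; // the meaningful data element count
+ */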
+ /*! \brief The kernelParams.atoms.coordinates float element count (actual)*/
+ int coordinatesSize;
+ /*! \brief The kernelParams.atoms.coordinates float element count (reserved) */
+ int coordinatesSizeAlloc;
+ /*! \brief The kernelParams.atoms.forces float element count (actual) */
+ int forcesSize;
+ /*! \brief The kernelParams.atoms.forces float element count (reserved) */
+ int forcesSizeAlloc;
+ /*! \brief The kernelParams.atoms.gridlineIndices int element count (actual) */
+ int gridlineIndicesSize;
+ /*! \brief The kernelParams.atoms.gridlineIndices int element count (reserved) */
+ int gridlineIndicesSizeAlloc;
+ /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (actual) */
+ int splineDataSize;
+ /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */
+ int splineDataSizeAlloc;
+ /*! \brief The kernelParams.atoms.coefficients float element count (actual) */
+ int coefficientsSize;
+ /*! \brief The kernelParams.atoms.coefficients float element count (reserved) */
+ int coefficientsSizeAlloc;
+ /*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */
+ int splineValuesSize;
+ /*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */
+ int splineValuesSizeAlloc;
+ /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (actual) */
+ int fractShiftsSize;
+ /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (reserved) */
+ int fractShiftsSizeAlloc;
+ /*! \brief The kernelParams.grid.realGrid float element count (actual) */
+ int realGridSize;
+ /*! \brief The kernelParams.grid.realGrid float element count (reserved) */
+ int realGridSizeAlloc;
+ /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */
+ int complexGridSize;
+ /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (reserved) */
+ int complexGridSizeAlloc;
+};
+
+
+/*! \brief \internal
+ * A single structure encompassing all the PME data used in CUDA kernels.
+ * This inherits from pme_gpu_kernel_params_base_t and adds a couple of cudaTextureObject_t handles,
+ * which we would like to keep out of the plain C++ code.
+ */
+struct pme_gpu_cuda_kernel_params_t : pme_gpu_kernel_params_base_t
+{
+ /* These are CUDA texture objects, related to the grid size. */
+ /*! \brief CUDA texture object for accessing grid.d_fractShiftsTable */
+ cudaTextureObject_t fractShiftsTableTexture;
+ /*! \brief CUDA texture object for accessing grid.d_gridlineIndicesTable */
+ cudaTextureObject_t gridlineIndicesTableTexture;
+};
+
+/* CUDA texture functions which will reside in the respective kernel files
+ * (because texture references have the scope of a single translation unit).
+ */
+
+/*! \brief \internal
+ * Creates/binds the 2 textures used in the spline parameter computation.
+ *
+ * \param[in,out] pmeGpu  The PME GPU structure.
+ */
+inline void pme_gpu_make_fract_shifts_textures(pme_gpu_t gmx_unused *pmeGpu){}
+
+/*! \brief \internal
+ * Frees/unbinds the 2 textures used in the spline parameter computation.
+ *
+ * \param[in] pmeGpu  The PME GPU structure.
+ */
+inline void pme_gpu_free_fract_shifts_textures(const pme_gpu_t gmx_unused *pmeGpu){}
+
+#endif
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/math/vectypes.h"
-#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/real.h"
+#include "pme-gpu-types.h"
+
struct t_commrec;
struct t_inputrec;
+struct pme_gpu_t;
+struct gmx_wallclock_gpu_pme_t;
+struct gmx_device_info_t;
+
+namespace gmx
+{
+class MDLogger;
+}
enum {
GMX_SUM_GRID_FORWARD, GMX_SUM_GRID_BACKWARD
gmx_bool bFreeEnergy_q, gmx_bool bFreeEnergy_lj,
gmx_bool bReproducible,
real ewaldcoeff_q, real ewaldcoeff_lj,
- int nthread);
+ int nthread,
+ PmeRunMode runMode,
+ pme_gpu_t *pmeGPU,
+ gmx_device_info_t *gpuInfo,
+ const gmx::MDLogger &mdlog);
/*! \brief Destroys the PME data structure.*/
void gmx_pme_destroy(gmx_pme_t *pme);
#define GMX_PME_DO_ALL_F (GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_F)
//@}
-/*! \brief Do a PME calculation for the long range electrostatics and/or LJ.
+/*! \brief Do a PME calculation on a CPU for the long range electrostatics and/or LJ.
*
* The meaning of \p flags is defined above, and determines which
* parts of the calculation are performed.
real *dvdlambda_q, real *dvdlambda_lj,
float *pme_cycles);
+/*! \brief
+ * This function updates the local atom data on the GPU after DD (charges, coordinates, etc.).
+ * TODO: it should update the PME CPU atom data as well.
+ * (Currently the PME CPU code gets passed the input pointers on each gmx_pme_do() call.)
+ *
+ * \param[in] pme The PME structure.
+ * \param[in] nAtoms The number of particles.
+ * \param[in] charges The pointer to the array of particle charges.
+ */
+void gmx_pme_reinit_atoms(const gmx_pme_t *pme, const int nAtoms, const real *charges);
+
+/* A block of PME GPU functions */
+
+/*! \brief
+ * Tells whether PME is enabled to run on a GPU (not necessarily active at the moment).
+ * For now, this decision is stored in the PME structure itself.
+ * FIXME: this is information that should be managed by the task scheduler.
+ * As soon as such functionality appears, this function should be removed from this module.
+ *
+ * \param[in] pme The PME data structure.
+ * \returns true if PME can run on GPU, false otherwise.
+ */
+bool pme_gpu_task_enabled(const gmx_pme_t *pme);
+
+/*! \brief
+ * Resets the PME GPU timings. To be called at the reset step.
+ *
+ * \param[in] pme The PME structure.
+ */
+void pme_gpu_reset_timings(const gmx_pme_t *pme);
+
+/*! \brief
+ * Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end.
+ *
+ * \param[in] pme The PME structure.
+ * \param[out] timings The gmx_wallclock_gpu_pme_t structure (the timings are copied into it).
+ */
+void pme_gpu_get_timings(const gmx_pme_t *pme,
+ gmx_wallclock_gpu_pme_t *timings);
+
+/* The main PME GPU functions */
+
+/*! \brief
+ * Gets the output forces and the virial/energy, if the corresponding flags were passed in.
+ *
+ * \param[in] pme The PME data structure.
+ * \param[in] wcycle The wallclock counter.
+ * \param[out] vir_q The output virial matrix.
+ * \param[out] energy_q The output energy.
+ */
+void pme_gpu_get_results(const gmx_pme_t *pme,
+ gmx_wallcycle_t wcycle,
+ matrix vir_q,
+ real *energy_q);
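+
+/* An illustrative (hypothetical) call pattern for the function above:
+ *
+ *    matrix vir_q;
+ *    real   energy_q = 0;
+ *    if (pme_gpu_task_enabled(pme))
+ *    {
+ *        pme_gpu_get_results(pme, wcycle, vir_q, &energy_q);
+ *    }
+ */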
+
+
#endif
#include "gromacs/ewald/pme-spread.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/math/invertmatrix.h"
+#include "gromacs/mdtypes/commrec.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
namespace gmx
real ewaldCoeff_lj = 1.0f
)
{
- gmx_pme_t *pmeDataRaw = nullptr;
- gmx_pme_init(&pmeDataRaw, nullptr, 1, 1, inputRec,
- atomCount, false, false, true, ewaldCoeff_q, ewaldCoeff_lj, 1);
+ gmx_pme_t *pmeDataRaw = nullptr;
+ const MDLogger dummyLogger;
+ const auto runMode = PmeRunMode::CPU;
+ t_commrec dummyCommrec = {0};
+ gmx_pme_init(&pmeDataRaw, &dummyCommrec, 1, 1, inputRec, atomCount, false, false, true,
+ ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, nullptr, dummyLogger);
PmeSafePointer pme(pmeDataRaw); // taking ownership
// TODO get rid of this with proper matrix type
/*! \libinternal \brief
* This is a GPU region timing implementation interface.
* It should provide methods for measuring the last timespan.
+ * Copying/assignment is disabled, since the underlying timing events are owned by this object.
*/
template <GpuFramework framework> class GpuRegionTimerImpl
{
using CommandEvent = typename GpuTraits<framework>::CommandEvent;
public:
+
GpuRegionTimerImpl() = default;
~GpuRegionTimerImpl() = default;
+ //! No copying
+ GpuRegionTimerImpl(const GpuRegionTimerImpl &) = delete;
+ //! No assignment
+ GpuRegionTimerImpl &operator=(const GpuRegionTimerImpl &) = delete;
+ //! Moving is disabled but can be considered in the future if needed
+ GpuRegionTimerImpl(GpuRegionTimerImpl &&) = delete;
/*! \brief Will be called before the region start. */
inline void openTimingRegion(CommandStream) = 0;
Idle,
Recording,
Stopped
- } debugState_;
+ } debugState_ = TimerState::Idle;
//! The number of times the timespan has been measured
- unsigned int callCount_;
+ unsigned int callCount_ = 0;
//! The accumulated duration of the timespans measured (milliseconds)
- double totalMilliseconds_;
+ double totalMilliseconds_ = 0.0;
//! The underlying region timer implementation
GpuRegionTimerImpl<framework> impl_;
public:
- GpuRegionTimerWrapper() : impl_()
- {
- reset();
- }
- ~GpuRegionTimerWrapper() = default;
+
/*! \brief
* To be called before the region start.
*
size_t currentEvent_ = 0;
public:
- GpuRegionTimerImpl() = default;
- ~GpuRegionTimerImpl() = default;
+
inline void openTimingRegion(CommandStream){}
inline void closeTimingRegion(CommandStream){}
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/ewald/pme.h"
#include "gromacs/listed-forces/manage-threading.h"
#include "gromacs/mdlib/mdatoms.h"
#include "gromacs/mdlib/shellfc.h"
}
setup_bonded_threading(fr, &top->idef);
+
+ /* This handles the PP+PME rank case where fr->pmedata is valid.
+ * For PME-only ranks, gmx_pmeonly() has its own call to gmx_pme_reinit_atoms().
+ * TODO: this only handles the GPU logic so far; it should handle the CPU as well.
+ * TODO: this also does not account for TPI.
+ */
+ gmx_pme_reinit_atoms(fr->pmedata, numHomeAtoms, mdatoms->chargeA);
}
* \param[inout] timings GPU task timing data
* \param[in] iloc interaction locality
*/
-static void countPruneKernelTime(cu_timers_t *timers,
- gmx_wallclock_gpu_t *timings,
- const int iloc)
+static void countPruneKernelTime(cu_timers_t *timers,
+ gmx_wallclock_gpu_nbnxn_t *timings,
+ const int iloc)
{
// We might have not done any pruning (e.g. if we skipped with empty domains).
if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
gmx_incons(stmp);
}
- cu_plist_t *plist = nb->plist[iloc];
- cu_timers_t *timers = nb->timers;
- struct gmx_wallclock_gpu_t *timings = nb->timings;
- nb_staging nbst = nb->nbst;
+ cu_plist_t *plist = nb->plist[iloc];
+ cu_timers_t *timers = nb->timers;
+ struct gmx_wallclock_gpu_nbnxn_t *timings = nb->timings;
+ nb_staging nbst = nb->nbst;
- bool bCalcEner = flags & GMX_FORCE_ENERGY;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_ENERGY;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
}
/*! Initializes the timings data structure. */
-static void init_timings(gmx_wallclock_gpu_t *t)
+static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
{
int i, j;
}
}
-gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
+//! This function is documented in the header file
+gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
{
- return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
+ return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}
void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
* concurrent streams, so we won't time if both l/nl work is done on GPUs.
* Timer init/uninit is still done even with timing off so only the condition
 * setting bDoTime needs to be changed if this CUDA "feature" gets fixed. */
- bool bDoTime; /**< True if event-based timing is enabled. */
- cu_timers_t *timers; /**< CUDA event-based timers. */
- gmx_wallclock_gpu_t *timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */
+ bool bDoTime; /**< True if event-based timing is enabled. */
+ cu_timers_t *timers; /**< CUDA event-based timers. */
+ gmx_wallclock_gpu_nbnxn_t *timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */
};
#ifdef __cplusplus
struct nbnxn_pairlist_t;
struct nbnxn_atomdata_t;
struct NbnxnListParameters;
-struct gmx_wallclock_gpu_t;
+struct gmx_wallclock_gpu_nbnxn_t;
struct gmx_gpu_info_t;
+struct gmx_device_info_t;
/** Initializes the data structures related to GPU nonbonded calculations. */
GPU_FUNC_QUALIFIER
/** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */
GPU_FUNC_QUALIFIER
-struct gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(NULL)
+struct gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(nullptr)
/** Resets nonbonded GPU timings. */
GPU_FUNC_QUALIFIER
* \param[inout] timings GPU task timing data
* \param[in] iloc interaction locality
*/
-static void countPruneKernelTime(cl_timers_t *timers,
- gmx_wallclock_gpu_t *timings,
- const int iloc)
+static void countPruneKernelTime(cl_timers_t *timers,
+ gmx_wallclock_gpu_nbnxn_t *timings,
+ const int iloc)
{
// We might have not done any pruning (e.g. if we skipped with empty domains).
if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
gmx_incons(stmp);
}
- cl_plist_t *plist = nb->plist[iloc];
- cl_timers_t *timers = nb->timers;
- struct gmx_wallclock_gpu_t *timings = nb->timings;
- cl_nb_staging nbst = nb->nbst;
+ cl_plist_t *plist = nb->plist[iloc];
+ cl_timers_t *timers = nb->timers;
+ struct gmx_wallclock_gpu_nbnxn_t *timings = nb->timings;
+ cl_nb_staging nbst = nb->nbst;
- bool bCalcEner = flags & GMX_FORCE_ENERGY;
- int bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_ENERGY;
+ int bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
/*! \brief Initializes the timings data structure.
*/
-static void init_timings(gmx_wallclock_gpu_t *t)
+static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
{
int i, j;
}
}
-
//! This function is documented in the header file
-gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
+gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
{
- return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
+ return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}
//! This function is documented in the header file
cl_command_queue stream[2]; /**< local and non-local GPU queues */
/** events used for synchronization */
- cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel
- is done (and the local transfer can proceed) */
- cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
- the local stream that need to precede the
- non-local force calculations are done
- (e.g. f buffer 0-ing, local x/q H2D) */
-
- cl_bool bDoTime; /**< True if event-based timing is enabled. */
- cl_timers_t *timers; /**< OpenCL event-based timers. */
- struct gmx_wallclock_gpu_t *timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */
+ cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel
+ is done (and the local transfer can proceed) */
+ cl_event misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
+ the local stream that need to precede the
+ non-local force calculations are done
+ (e.g. f buffer 0-ing, local x/q H2D) */
+
+ cl_bool bDoTime; /**< True if event-based timing is enabled. */
+ cl_timers_t *timers; /**< OpenCL event-based timers. */
+ struct gmx_wallclock_gpu_nbnxn_t *timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */
};
#ifdef __cplusplus
t_nrnb nrnb[], gmx_wallcycle_t wcycle,
gmx_walltime_accounting_t walltime_accounting,
nonbonded_verlet_t *nbv,
+ gmx_pme_t *pme,
gmx_bool bWriteStat)
{
t_nrnb *nrnb_tot = nullptr;
if (printReport)
{
- struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
-
+ auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
+ gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
+ if (pme_gpu_task_enabled(pme))
+ {
+ pme_gpu_get_timings(pme, &pme_gpu_timings);
+ }
wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
elapsed_time_over_all_ranks,
- wcycle, cycle_sum, gputimes);
+ wcycle, cycle_sum,
+ nbnxn_gpu_timings,
+ &pme_gpu_timings);
if (EI_DYNAMICS(inputrec->eI))
{
t_nrnb nrnb[], gmx_wallcycle_t wcycle,
gmx_walltime_accounting_t walltime_accounting,
nonbonded_verlet_t *nbv,
+ gmx_pme_t *pme,
gmx_bool bWriteStat);
void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr);
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2014,2015,2017, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
extern "C" {
#endif
-/*! \internal \brief Nonbonded kernel time and call count. */
-struct gmx_nbnxn_kernel_timing_data_t
+/*! \internal \brief GPU kernel time and call count. */
+struct gmx_kernel_timing_data_t
{
double t; /**< Accumulated lapsed time */
int c; /**< Number of calls corresponding to the elapsed time */
};
-/*! \internal \brief GPU timings for kernels and H2d/D2H transfers. */
-struct gmx_wallclock_gpu_t
+/*! \internal \brief
+ *
+ * PME GPU stage timing event indices, corresponding to the strings in PMEStageNames in wallcycle.cpp.
+ */
+enum
+{
+ gtPME_SPLINE = 0,
+ gtPME_SPREAD,
+ gtPME_SPLINEANDSPREAD,
+ gtPME_FFT_R2C,
+ gtPME_SOLVE,
+ gtPME_FFT_C2R,
+ gtPME_GATHER,
+ gtPME_EVENT_COUNT /* not a stage ID but a static array size */
+};
+
+/*! \internal \brief GPU timings for PME. */
+struct gmx_wallclock_gpu_pme_t
+{
+ /* This is a separate structure for PME, so that the NB code using gmx_wallclock_gpu_t does not have to be refactored right away.
+ * TODO: devise a better GPU timing data structuring.
+ */
+ /*! \brief Array of PME GPU timing data. */
+ gmx_kernel_timing_data_t timing[gtPME_EVENT_COUNT];
+};
+
+/*! \internal \brief GPU NB timings for kernels and H2D/D2H transfers. */
+struct gmx_wallclock_gpu_nbnxn_t
{
- gmx_nbnxn_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four
+ gmx_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four
versions of the nonbonded kernels: force-only,
force+energy, force+pruning, and force+energy+pruning */
- gmx_nbnxn_kernel_timing_data_t pruneTime; /**< table containing the timings of the 1st pass prune-only kernels */
- gmx_nbnxn_kernel_timing_data_t dynamicPruneTime; /**< table containing the timings of dynamic prune-only kernels */
- double nb_h2d_t; /**< host to device transfer time in nb calculation */
- double nb_d2h_t; /**< device to host transfer time in nb calculation */
- int nb_c; /**< total call count of the nonbonded gpu operations */
- double pl_h2d_t; /**< pair search step host to device transfer time */
- int pl_h2d_c; /**< pair search step host to device transfer call count */
+ gmx_kernel_timing_data_t pruneTime; /**< table containing the timings of the 1st pass prune-only kernels */
+ gmx_kernel_timing_data_t dynamicPruneTime; /**< table containing the timings of dynamic prune-only kernels */
+ double nb_h2d_t; /**< host to device transfer time in nb calculation */
+ double nb_d2h_t; /**< device to host transfer time in nb calculation */
+ int nb_c; /**< total call count of the nonbonded gpu operations */
+ double pl_h2d_t; /**< pair search step host to device transfer time */
+ int pl_h2d_c; /**< pair search step host to device transfer call count */
};
#ifdef __cplusplus
"DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
"Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
"PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
- "PME wait for PP", "Wait + Recv. PME F", "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
+ "PME wait for PP", "Wait + Recv. PME F", "Wait PME GPU spread", "Wait PME GPU gather", "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
"Vsite spread", "COM pull force",
"Write traj.", "Update", "Constraints", "Comm. energies",
"Enforced rotation", "Add rot. forces", "Position swapping", "IMD", "Test"
"NB F buffer ops.",
};
+/* PME GPU timing event names; these correspond to the enum in gpu_timing.h */
+static const char *PMEStageNames[] =
+{
+ "Spline",
+ "Spread",
+ "Spline/spread",
+ "FFT r2c",
+ "Solve",
+ "FFT c2r",
+ "Gather",
+};
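+
+/* A compile-time consistency check between PMEStageNames and the gtPME enum
+ * could look like this sketch:
+ *
+ *    static_assert(sizeof(PMEStageNames)/sizeof(PMEStageNames[0]) == gtPME_EVENT_COUNT,
+ *                  "PME GPU timing stage names out of sync with the gtPME enum");
+ */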
+
gmx_bool wallcycle_have_counter(void)
{
return gmx_cycles_have_counter();
void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int npme,
int nth_pp, int nth_pme, double realtime,
gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum,
- struct gmx_wallclock_gpu_t *gpu_t)
+ const gmx_wallclock_gpu_nbnxn_t *gpu_nbnxn_t,
+ const gmx_wallclock_gpu_pme_t *gpu_pme_t)
{
- double tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
+ double tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
double c2t, c2t_pp, c2t_pme = 0;
int i, j, npp, nth_tot;
char buf[STRLEN];
}
/* print GPU timing summary */
- if (gpu_t)
+ double tot_gpu = 0.0;
+ if (gpu_pme_t)
+ {
+ for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
+ {
+ tot_gpu += gpu_pme_t->timing[k].t;
+ }
+ }
+ if (gpu_nbnxn_t)
{
const char *k_log_str[2][2] = {
{"Nonbonded F kernel", "Nonbonded F+ene k."},
{"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
};
-
- tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;
+ tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
/* add up the kernel timings */
- tot_k = 0.0;
for (i = 0; i < 2; i++)
{
for (j = 0; j < 2; j++)
{
- tot_k += gpu_t->ktime[i][j].t;
+ tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
}
}
- tot_gpu += tot_k + gpu_t->pruneTime.t;
+ tot_gpu += gpu_nbnxn_t->pruneTime.t;
tot_cpu_overlap = wc->wcc[ewcFORCE].c;
if (wc->wcc[ewcPMEMESH].n > 0)
fprintf(fplog, " Computing: Count Wall t (s) ms/step %c\n", '%');
fprintf(fplog, "%s\n", hline);
print_gputimes(fplog, "Pair list H2D",
- gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
+ gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
print_gputimes(fplog, "X / q H2D",
- gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);
+ gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
for (i = 0; i < 2; i++)
{
for (j = 0; j < 2; j++)
{
- if (gpu_t->ktime[i][j].c)
+ if (gpu_nbnxn_t->ktime[i][j].c)
{
print_gputimes(fplog, k_log_str[i][j],
- gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
+ gpu_nbnxn_t->ktime[i][j].c, gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
}
}
}
- if (gpu_t->pruneTime.c)
+ if (gpu_pme_t)
{
- print_gputimes(fplog, "Pruning kernel", gpu_t->pruneTime.c, gpu_t->pruneTime.t, tot_gpu);
+ for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
+ {
+ if (gpu_pme_t->timing[k].c)
+ {
+ print_gputimes(fplog, PMEStageNames[k],
+ gpu_pme_t->timing[k].c,
+ gpu_pme_t->timing[k].t,
+ tot_gpu);
+ }
+ }
}
- print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
+ if (gpu_nbnxn_t->pruneTime.c)
+ {
+ print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
+ }
+ print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
fprintf(fplog, "%s\n", hline);
- print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
+ print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
fprintf(fplog, "%s\n", hline);
- if (gpu_t->dynamicPruneTime.c)
+ if (gpu_nbnxn_t->dynamicPruneTime.c)
{
/* We print the dynamic pruning kernel timings after a separator
* and avoid adding it to tot_gpu as this is not in the force
* overlap. We print the fraction as relative to the rest.
*/
- print_gputimes(fplog, "*Dynamic pruning", gpu_t->dynamicPruneTime.c, gpu_t->dynamicPruneTime.t, tot_gpu);
+ print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c, gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
fprintf(fplog, "%s\n", hline);
}
-
gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
- if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
+ if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
{
+ // FIXME the code below is not updated for PME on GPU
fprintf(fplog, "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = %.3f\n",
- tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
+ tot_gpu/gpu_nbnxn_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
gpu_cpu_ratio);
}
ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcNS, ewcLAUNCH_GPU,
ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH,
ewcPME_REDISTXF, ewcPME_SPREAD, ewcPME_GATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE,
- ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
+ ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_PME_SPREAD, ewcWAIT_GPU_PME_GATHER, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
ewcVSITESPREAD, ewcPULLPOT,
ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD,
ewcTEST, ewcNR
};
-
enum {
ewcsDD_REDIST, ewcsDD_GRID, ewcsDD_SETUPCOMM,
ewcsDD_MAKETOP, ewcsDD_MAKECONSTR, ewcsDD_TOPOTHER,
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
}
typedef struct gmx_wallcycle *gmx_wallcycle_t;
-typedef struct gmx_wallclock_gpu_t gmx_wallclock_gpu_t;
+struct gmx_wallclock_gpu_nbnxn_t;
+struct gmx_wallclock_gpu_pme_t;
typedef std::array<double, ewcNR+ewcsNR> WallcycleCounts;
/* Convenience typedef */
void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int npme,
int nth_pp, int nth_pme, double realtime,
gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum,
- struct gmx_wallclock_gpu_t *gpu_t);
+ const gmx_wallclock_gpu_nbnxn_t *gpu_nbnxn_t,
+ const gmx_wallclock_gpu_pme_t *gpu_pme_t);
/* Print the cycle and time accounting */
#endif
gmx_int64_t *step_rel, t_inputrec *ir,
gmx_wallcycle_t wcycle, t_nrnb *nrnb,
gmx_walltime_accounting_t walltime_accounting,
- struct nonbonded_verlet_t *nbv)
+ struct nonbonded_verlet_t *nbv,
+ struct gmx_pme_t *pme)
{
char sbuf[STEPSTRSIZE];
if (use_GPU(nbv))
{
nbnxn_gpu_reset_timings(nbv);
+ }
+
+ if (pme_gpu_task_enabled(pme))
+ {
+ pme_gpu_reset_timings(pme);
+ }
+
+ if (use_GPU(nbv) || pme_gpu_task_enabled(pme))
+ {
resetGpuProfiler();
}
"mdrun -resetstep.", step);
}
reset_all_counters(fplog, mdlog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting,
- use_GPU(fr->nbv) ? fr->nbv : nullptr);
+ use_GPU(fr->nbv) ? fr->nbv : nullptr, fr->pmedata);
wcycle_set_reset_counters(wcycle, -1);
if (!(cr->duty & DUTY_PME))
{
{
try
{
+ gmx_device_info_t *pmeGpuInfo = nullptr;
+ auto runMode = PmeRunMode::CPU;
status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
mdrunOptions.reproducible,
ewaldcoeff_q, ewaldcoeff_lj,
- nthreads_pme);
+ nthreads_pme,
+ runMode, nullptr, pmeGpuInfo, mdlog);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
if (status != 0)
finish_run(fplog, mdlog, cr,
inputrec, nrnb, wcycle, walltime_accounting,
fr ? fr->nbv : nullptr,
+ fr ? fr->pmedata : nullptr,
EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
// Free PME data