From 68e1dd0108740bae8f6cec521feb9ce9a26c4dbd Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Thu, 22 Apr 2021 16:21:28 +0000 Subject: [PATCH] Make LINCS setup code platform-agnostic --- src/gromacs/mdlib/CMakeLists.txt | 17 +- .../mdlib/{lincs_gpu.cu => lincs_gpu.cpp} | 433 +--------------- .../mdlib/{lincs_gpu.cuh => lincs_gpu.h} | 27 +- src/gromacs/mdlib/lincs_gpu_internal.cu | 466 ++++++++++++++++++ src/gromacs/mdlib/lincs_gpu_internal.h | 71 +++ src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp | 68 +++ src/gromacs/mdlib/tests/constrtestrunners.cu | 2 +- .../mdlib/update_constrain_gpu_impl.cu | 2 +- src/gromacs/mdlib/update_constrain_gpu_impl.h | 2 +- 9 files changed, 658 insertions(+), 430 deletions(-) rename src/gromacs/mdlib/{lincs_gpu.cu => lincs_gpu.cpp} (60%) rename src/gromacs/mdlib/{lincs_gpu.cuh => lincs_gpu.h} (93%) create mode 100644 src/gromacs/mdlib/lincs_gpu_internal.cu create mode 100644 src/gromacs/mdlib/lincs_gpu_internal.h create mode 100644 src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt index 96b6be720f..623fcb31df 100644 --- a/src/gromacs/mdlib/CMakeLists.txt +++ b/src/gromacs/mdlib/CMakeLists.txt @@ -36,26 +36,37 @@ add_library(mdlib INTERFACE) file(GLOB MDLIB_SOURCES *.cpp) -# To avoid listing all the necessary files manually, we will remove SYCL-specfific one here: -list(REMOVE_ITEM MDLIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_sycl.cpp) +# To avoid listing all the necessary files manually, we will remove SYCL-specfific files here: +list(REMOVE_ITEM MDLIB_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_sycl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp) set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE) if(GMX_GPU_CUDA) gmx_add_libgromacs_sources( leapfrog_gpu.cu - lincs_gpu.cu + lincs_gpu.cpp + lincs_gpu_internal.cu settle_gpu.cu update_constrain_gpu_impl.cu gpuforcereduction_impl.cu ) + _gmx_add_files_to_property(CUDA_SOURCES + lincs_gpu.cpp + ) endif() if(GMX_GPU_SYCL) gmx_add_libgromacs_sources( leapfrog_gpu_sycl.cpp + lincs_gpu.cpp + lincs_gpu_internal_sycl.cpp ) _gmx_add_files_to_property(SYCL_SOURCES leapfrog_gpu_sycl.cpp + lincs_gpu.cpp + lincs_gpu_internal_sycl.cpp ) endif() diff --git a/src/gromacs/mdlib/lincs_gpu.cu b/src/gromacs/mdlib/lincs_gpu.cpp similarity index 60% rename from src/gromacs/mdlib/lincs_gpu.cu rename to src/gromacs/mdlib/lincs_gpu.cpp index 466c250f4c..96c0b8fc32 100644 --- a/src/gromacs/mdlib/lincs_gpu.cu +++ b/src/gromacs/mdlib/lincs_gpu.cpp @@ -34,11 +34,9 @@ */ /*! \internal \file * - * \brief Implements LINCS using CUDA + * \brief Implementations of LINCS GPU class * - * This file contains implementation of LINCS constraints algorithm - * using CUDA, including class initialization, data-structures management - * and GPU kernel. + * This file contains back-end agnostic implementation of LINCS GPU class. 
* * \author Artem Zhmurov * \author Alan Gray @@ -47,7 +45,7 @@ */ #include "gmxpre.h" -#include "lincs_gpu.cuh" +#include "lincs_gpu.h" #include #include @@ -56,378 +54,19 @@ #include -#include "gromacs/gpu_utils/cuda_arch_utils.cuh" -#include "gromacs/gpu_utils/cudautils.cuh" -#include "gromacs/gpu_utils/devicebuffer.cuh" +#include "gromacs/gpu_utils/devicebuffer.h" #include "gromacs/gpu_utils/gputraits.h" -#include "gromacs/gpu_utils/typecasts.cuh" -#include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/math/functions.h" #include "gromacs/math/vec.h" #include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/lincs_gpu_internal.h" #include "gromacs/pbcutil/pbc.h" -#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh" -#include "gromacs/topology/forcefieldparameters.h" #include "gromacs/topology/ifunc.h" #include "gromacs/topology/topology.h" namespace gmx { -//! Number of CUDA threads in a block -constexpr static int c_threadsPerBlock = 256; -//! Maximum number of threads in a block (for __launch_bounds__) -constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock; - -/*! \brief Main kernel for LINCS constraints. - * - * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm. - * - * In CUDA version, one thread is responsible for all computations for one constraint. The blocks are - * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved - * by moving active threads to the next block, if the correspondent group of coupled constraints is to big - * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e. - * threads that are not required to do actual work. Since constraints from different blocks are not coupled, - * there is no need to synchronize across the device. However, extensive communication in a thread block - * are still needed. - * - * \todo Reduce synchronization overhead. Some ideas are: - * 1. Consider going to warp-level synchronization for the coupled constraints. - * 2. Move more data to local/shared memory and try to get rid of atomic operations (at least on - * the device level). - * 3. Use analytical solution for matrix A inversion. - * 4. Introduce mapping of thread id to both single constraint and single atom, thus designating - * Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints. - * See Issue #2885 for details (https://gitlab.com/gromacs/gromacs/-/issues/2885) - * \todo The use of __restrict__ for gm_xp and gm_v causes failure, probably because of the atomic - operations. Investigate this issue further. - * - * \param[in,out] kernelParams All parameters and pointers for the kernel condensed in single struct. - * \param[in] invdt Inverse timestep (needed to update velocities). 
- */ -template -__launch_bounds__(c_maxThreadsPerBlock) __global__ - void lincs_kernel(LincsGpuKernelParameters kernelParams, - const float3* __restrict__ gm_x, - float3* gm_xp, - float3* gm_v, - const float invdt) -{ - const PbcAiuc pbcAiuc = kernelParams.pbcAiuc; - const int numConstraintsThreads = kernelParams.numConstraintsThreads; - const int numIterations = kernelParams.numIterations; - const int expansionOrder = kernelParams.expansionOrder; - const int2* __restrict__ gm_constraints = kernelParams.d_constraints; - const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths; - const int* __restrict__ gm_coupledConstraintsCounts = kernelParams.d_coupledConstraintsCounts; - const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIndices; - const float* __restrict__ gm_massFactors = kernelParams.d_massFactors; - float* __restrict__ gm_matrixA = kernelParams.d_matrixA; - const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses; - float* __restrict__ gm_virialScaled = kernelParams.d_virialScaled; - - int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; - - // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize). - // This is to ensure proper synchronizations and reduction. All array are padded to the required size. - assert(threadIndex < numConstraintsThreads); - - // Vectors connecting constrained atoms before algorithm was applied. - // Needed to construct constrain matrix A - extern __shared__ float3 sm_r[]; - - int2 pair = gm_constraints[threadIndex]; - int i = pair.x; - int j = pair.y; - - // Mass-scaled Lagrange multiplier - float lagrangeScaled = 0.0f; - - float targetLength; - float inverseMassi; - float inverseMassj; - float sqrtReducedMass; - - float3 xi; - float3 xj; - float3 rc; - - // i == -1 indicates dummy constraint at the end of the thread block. 
- bool isDummyThread = (i == -1); - - // Everything computed for these dummies will be equal to zero - if (isDummyThread) - { - targetLength = 0.0f; - inverseMassi = 0.0f; - inverseMassj = 0.0f; - sqrtReducedMass = 0.0f; - - xi = make_float3(0.0f, 0.0f, 0.0f); - xj = make_float3(0.0f, 0.0f, 0.0f); - rc = make_float3(0.0f, 0.0f, 0.0f); - } - else - { - // Collecting data - targetLength = gm_constraintsTargetLengths[threadIndex]; - inverseMassi = gm_inverseMasses[i]; - inverseMassj = gm_inverseMasses[j]; - sqrtReducedMass = rsqrt(inverseMassi + inverseMassj); - - xi = gm_x[i]; - xj = gm_x[j]; - - float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); - - float rlen = rsqrtf(dx.x * dx.x + dx.y * dx.y + dx.z * dx.z); - rc = rlen * dx; - } - - sm_r[threadIdx.x] = rc; - // Make sure that all r's are saved into shared memory - // before they are accessed in the loop below - __syncthreads(); - - /* - * Constructing LINCS matrix (A) - */ - - // Only non-zero values are saved (for coupled constraints) - int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex]; - for (int n = 0; n < coupledConstraintsCount; n++) - { - int index = n * numConstraintsThreads + threadIndex; - int c1 = gm_coupledConstraintsIdxes[index]; - - float3 rc1 = sm_r[c1 - blockIdx.x * blockDim.x]; - gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z); - } - - // Skipping in dummy threads - if (!isDummyThread) - { - xi = gm_xp[i]; - xj = gm_xp[j]; - } - - float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); - - float sol = sqrtReducedMass * ((rc.x * dx.x + rc.y * dx.y + rc.z * dx.z) - targetLength); - - /* - * Inverse matrix using a set of expansionOrder matrix multiplications - */ - - // This will use the same memory space as sm_r, which is no longer needed. - extern __shared__ float sm_rhs[]; - // Save current right-hand-side vector in the shared memory - sm_rhs[threadIdx.x] = sol; - - for (int rec = 0; rec < expansionOrder; rec++) - { - // Making sure that all sm_rhs are saved before they are accessed in a loop below - __syncthreads(); - float mvb = 0.0f; - - for (int n = 0; n < coupledConstraintsCount; n++) - { - int index = n * numConstraintsThreads + threadIndex; - int c1 = gm_coupledConstraintsIdxes[index]; - // Convolute current right-hand-side with A - // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations - mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)]; - } - // 'Switch' rhs vectors, save current result - // These values will be accessed in the loop above during the next iteration. - sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb; - sol = sol + mvb; - } - - // Current mass-scaled Lagrange multipliers - lagrangeScaled = sqrtReducedMass * sol; - - // Save updated coordinates before correction for the rotational lengthening - float3 tmp = rc * lagrangeScaled; - - // Writing for all but dummy constraints - if (!isDummyThread) - { - atomicAdd(&gm_xp[i], -tmp * inverseMassi); - atomicAdd(&gm_xp[j], tmp * inverseMassj); - } - - /* - * Correction for centripetal effects - */ - for (int iter = 0; iter < numIterations; iter++) - { - // Make sure that all xp's are saved: atomic operation calls before are - // communicating current xp[..] values across thread block. 
- __syncthreads(); - - if (!isDummyThread) - { - xi = gm_xp[i]; - xj = gm_xp[j]; - } - - float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); - - float len2 = targetLength * targetLength; - float dlen2 = 2.0f * len2 - norm2(dx); - - // TODO A little bit more effective but slightly less readable version of the below would be: - // float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2)); - float proj; - if (dlen2 > 0.0f) - { - proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2)); - } - else - { - proj = sqrtReducedMass * targetLength; - } - - sm_rhs[threadIdx.x] = proj; - float sol = proj; - - /* - * Same matrix inversion as above is used for updated data - */ - for (int rec = 0; rec < expansionOrder; rec++) - { - // Make sure that all elements of rhs are saved into shared memory - __syncthreads(); - float mvb = 0; - - for (int n = 0; n < coupledConstraintsCount; n++) - { - int index = n * numConstraintsThreads + threadIndex; - int c1 = gm_coupledConstraintsIdxes[index]; - - mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)]; - } - sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb; - sol = sol + mvb; - } - - // Add corrections to Lagrange multipliers - float sqrtmu_sol = sqrtReducedMass * sol; - lagrangeScaled += sqrtmu_sol; - - // Save updated coordinates for the next iteration - // Dummy constraints are skipped - if (!isDummyThread) - { - float3 tmp = rc * sqrtmu_sol; - atomicAdd(&gm_xp[i], -tmp * inverseMassi); - atomicAdd(&gm_xp[j], tmp * inverseMassj); - } - } - - // Updating particle velocities for all but dummy threads - if (updateVelocities && !isDummyThread) - { - float3 tmp = rc * invdt * lagrangeScaled; - atomicAdd(&gm_v[i], -tmp * inverseMassi); - atomicAdd(&gm_v[j], tmp * inverseMassj); - } - - - if (computeVirial) - { - // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length - // (targetLength) and the normalized vector connecting constrained atoms before - // the algorithm was applied (rc). The evaluation of virial in each thread is - // followed by basic reduction for the values inside single thread block. - // Then, the values are reduced across grid by atomicAdd(...). - // - // TODO Shuffle reduction. - // TODO Should be unified and/or done once when virial is actually needed. - // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally, - // one that works for any datatype. - - // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only - // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength, - // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel. - // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter - // two are no longer in use. - extern __shared__ float sm_threadVirial[]; - float mult = targetLength * lagrangeScaled; - sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x; - sm_threadVirial[1 * blockDim.x + threadIdx.x] = mult * rc.x * rc.y; - sm_threadVirial[2 * blockDim.x + threadIdx.x] = mult * rc.x * rc.z; - sm_threadVirial[3 * blockDim.x + threadIdx.x] = mult * rc.y * rc.y; - sm_threadVirial[4 * blockDim.x + threadIdx.x] = mult * rc.y * rc.z; - sm_threadVirial[5 * blockDim.x + threadIdx.x] = mult * rc.z * rc.z; - - __syncthreads(); - - // Reduce up to one virial per thread block. All blocks are divided by half, the first - // half of threads sums two virials. 
Then the first half is divided by two and the first - // half of it sums two values. This procedure is repeated until only one thread is left. - // Only works if the threads per blocks is a power of two (hence static_assert - // in the beginning of the kernel). - for (int divideBy = 2; divideBy <= static_cast(blockDim.x); divideBy *= 2) - { - int dividedAt = blockDim.x / divideBy; - if (static_cast(threadIdx.x) < dividedAt) - { - for (int d = 0; d < 6; d++) - { - sm_threadVirial[d * blockDim.x + threadIdx.x] += - sm_threadVirial[d * blockDim.x + (threadIdx.x + dividedAt)]; - } - } - // Syncronize if not within one warp - if (dividedAt > warpSize / 2) - { - __syncthreads(); - } - } - // First 6 threads in the block add the results of 6 tensor components to the global memory address. - if (threadIdx.x < 6) - { - atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]); - } - } - - return; -} - -/*! \brief Select templated kernel. - * - * Returns pointer to a CUDA kernel based on provided booleans. - * - * \param[in] updateVelocities If the velocities should be constrained. - * \param[in] computeVirial If virial should be updated. - * - * \return Pointer to CUDA kernel - */ -inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVirial) -{ - - auto kernelPtr = lincs_kernel; - if (updateVelocities && computeVirial) - { - kernelPtr = lincs_kernel; - } - else if (updateVelocities && !computeVirial) - { - kernelPtr = lincs_kernel; - } - else if (!updateVelocities && computeVirial) - { - kernelPtr = lincs_kernel; - } - else if (!updateVelocities && !computeVirial) - { - kernelPtr = lincs_kernel; - } - return kernelPtr; -} - void LincsGpu::apply(const DeviceBuffer d_x, DeviceBuffer d_xp, const bool updateVelocities, @@ -437,7 +76,7 @@ void LincsGpu::apply(const DeviceBuffer d_x, tensor virialScaled, const PbcAiuc pbcAiuc) { - ensureNoPendingDeviceError("In CUDA version of LINCS"); + GMX_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA."); // Early exit if no constraints if (kernelParams_.numConstraintsThreads == 0) @@ -452,48 +91,10 @@ void LincsGpu::apply(const DeviceBuffer d_x, clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_); } - auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial); - - KernelLaunchConfig config; - config.blockSize[0] = c_threadsPerBlock; - config.blockSize[1] = 1; - config.blockSize[2] = 1; - config.gridSize[0] = (kernelParams_.numConstraintsThreads + c_threadsPerBlock - 1) / c_threadsPerBlock; - config.gridSize[1] = 1; - config.gridSize[2] = 1; - - // Shared memory is used to store: - // -- Current coordinates (3 floats per thread) - // -- Right-hand-sides for matrix inversion (2 floats per thread) - // -- Virial tensor components (6 floats per thread) - // Since none of these three are needed simultaneously, they can be saved at the same shared memory address - // (i.e. correspondent arrays are intentionally overlapped in address space). Consequently, only - // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not. 
- if (computeVirial) - { - config.sharedMemorySize = c_threadsPerBlock * 6 * sizeof(float); - } - else - { - config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float); - } - kernelParams_.pbcAiuc = pbcAiuc; - const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, - config, - &kernelParams_, - asFloat3Pointer(&d_x), - asFloat3Pointer(&d_xp), - asFloat3Pointer(&d_v), - &invdt); - - launchGpuKernel(kernelPtr, - config, - deviceStream_, - nullptr, - "lincs_kernel", - kernelArgs); + launchLincsGpuKernel( + kernelParams_, d_x, d_xp, updateVelocities, d_v, invdt, computeVirial, deviceStream_); if (computeVirial) { @@ -530,6 +131,7 @@ LincsGpu::LincsGpu(int numIterations, deviceContext_(deviceContext), deviceStream_(deviceStream) { + GMX_RELEASE_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA."); kernelParams_.numIterations = numIterations; kernelParams_.expansionOrder = expansionOrder; @@ -662,7 +264,7 @@ inline int countCoupled(int a, * \param[in] iatoms The list of constraints. * \param[in] stride Number of elements per constraint in \p iatoms. * \param[in] atomsAdjacencyList Information about connections between atoms. - * \param[our] splitMap Map of sequential constraint indexes to indexes to be on the device + * \param[out] splitMap Map of sequential constraint indexes to indexes to be on the device * \param[in] c Sequential index for constraint to consider adding. * \param[in,out] currentMapIndex The rolling index for the constraints mapping. */ @@ -744,8 +346,9 @@ bool LincsGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop) void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const real* invmass) { + GMX_RELEASE_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA."); // List of constrained atoms (CPU memory) - std::vector constraintsHost; + std::vector constraintsHost; // Equilibrium distances for the constraints (CPU) std::vector constraintsTargetLengthsHost; // Number of constraints, coupled with the current one (CPU) @@ -805,9 +408,9 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const "Number of threads should be a multiple of the block size"); // Initialize constraints and their target indexes taking into account the splits in the data arrays. 
- int2 pair; - pair.x = -1; - pair.y = -1; + AtomPair pair; + pair.i = -1; + pair.j = -1; constraintsHost.resize(kernelParams_.numConstraintsThreads, pair); std::fill(constraintsHost.begin(), constraintsHost.end(), pair); constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0); @@ -818,9 +421,9 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const int a2 = iatoms[stride * c + 2]; int type = iatoms[stride * c]; - int2 pair; - pair.x = a1; - pair.y = a2; + AtomPair pair; + pair.i = a1; + pair.j = a2; constraintsHost.at(splitMap.at(c)) = pair; constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA; } diff --git a/src/gromacs/mdlib/lincs_gpu.cuh b/src/gromacs/mdlib/lincs_gpu.h similarity index 93% rename from src/gromacs/mdlib/lincs_gpu.cuh rename to src/gromacs/mdlib/lincs_gpu.h index 6507892649..3ed6cd3e59 100644 --- a/src/gromacs/mdlib/lincs_gpu.cuh +++ b/src/gromacs/mdlib/lincs_gpu.h @@ -49,7 +49,7 @@ #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/device_stream.h" -#include "gromacs/gpu_utils/gputraits.cuh" +#include "gromacs/gpu_utils/gputraits.h" #include "gromacs/mdlib/constr.h" #include "gromacs/pbcutil/pbc_aiuc.h" @@ -58,6 +58,15 @@ class InteractionDefinitions; namespace gmx { +//! A pair of atoms indexes +struct AtomPair +{ + //! First atom + int i; + //! Second atom + int j; +}; + /* \brief LINCS parameters and GPU pointers * * This is used to accumulate all the parameters and pointers so they can be passed @@ -73,9 +82,9 @@ struct LincsGpuKernelParameters //! Number of iterations used to correct the projection int numIterations; //! 1/mass for all atoms (GPU) - float* d_inverseMasses; + DeviceBuffer d_inverseMasses; //! Scaled virial tensor (6 floats: [XX, XY, XZ, YY, YZ, ZZ], GPU) - float* d_virialScaled; + DeviceBuffer d_virialScaled; /*! \brief Total number of threads. * * This covers all constraints and gaps in the ends of the thread blocks @@ -84,17 +93,17 @@ struct LincsGpuKernelParameters */ int numConstraintsThreads; //! List of constrained atoms (GPU memory) - int2* d_constraints; + DeviceBuffer d_constraints; //! Equilibrium distances for the constraints (GPU) - float* d_constraintsTargetLengths; + DeviceBuffer d_constraintsTargetLengths; //! Number of constraints, coupled with the current one (GPU) - int* d_coupledConstraintsCounts; + DeviceBuffer d_coupledConstraintsCounts; //! List of coupled with the current one (GPU) - int* d_coupledConstraintsIndices; + DeviceBuffer d_coupledConstraintsIndices; //! Elements of the coupling matrix. - float* d_matrixA; + DeviceBuffer d_matrixA; //! Mass factors (GPU) - float* d_massFactors; + DeviceBuffer d_massFactors; }; /*! \internal \brief Class with interfaces and data for GPU version of LINCS. */ diff --git a/src/gromacs/mdlib/lincs_gpu_internal.cu b/src/gromacs/mdlib/lincs_gpu_internal.cu new file mode 100644 index 0000000000..55d4b48bad --- /dev/null +++ b/src/gromacs/mdlib/lincs_gpu_internal.cu @@ -0,0 +1,466 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. 
+ * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements LINCS kernels using CUDA + * + * This file contains CUDA kernels of LINCS constraints algorithm. + * + * \author Artem Zhmurov + * \author Alan Gray + * + * \ingroup module_mdlib + */ +#include "gromacs/gpu_utils/cuda_arch_utils.cuh" +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/devicebuffer.cuh" +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/gpu_utils/typecasts.cuh" +#include "gromacs/gpu_utils/vectype_ops.cuh" +#include "gromacs/mdlib/lincs_gpu.h" +#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh" + +#include "lincs_gpu_internal.h" + +namespace gmx +{ + +//! Maximum number of threads in a block (for __launch_bounds__) +constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock; + +/*! \brief Main kernel for LINCS constraints. + * + * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm. + * + * In CUDA version, one thread is responsible for all computations for one constraint. The blocks are + * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved + * by moving active threads to the next block, if the correspondent group of coupled constraints is to big + * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e. + * threads that are not required to do actual work. Since constraints from different blocks are not coupled, + * there is no need to synchronize across the device. However, extensive communication in a thread block + * are still needed. + * + * \todo Reduce synchronization overhead. Some ideas are: + * 1. Consider going to warp-level synchronization for the coupled constraints. + * 2. Move more data to local/shared memory and try to get rid of atomic operations (at least on + * the device level). + * 3. Use analytical solution for matrix A inversion. + * 4. Introduce mapping of thread id to both single constraint and single atom, thus designating + * Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints. 
+ * See Issue #2885 for details (https://gitlab.com/gromacs/gromacs/-/issues/2885) + * \todo The use of __restrict__ for gm_xp and gm_v causes failure, probably because of the atomic + operations. Investigate this issue further. + * + * \param[in,out] kernelParams All parameters and pointers for the kernel condensed in single struct. + * \param[in] invdt Inverse timestep (needed to update velocities). + */ +template +__launch_bounds__(c_maxThreadsPerBlock) __global__ + void lincs_kernel(LincsGpuKernelParameters kernelParams, + const float3* __restrict__ gm_x, + float3* gm_xp, + float3* gm_v, + const float invdt) +{ + const PbcAiuc pbcAiuc = kernelParams.pbcAiuc; + const int numConstraintsThreads = kernelParams.numConstraintsThreads; + const int numIterations = kernelParams.numIterations; + const int expansionOrder = kernelParams.expansionOrder; + const AtomPair* __restrict__ gm_constraints = kernelParams.d_constraints; + const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths; + const int* __restrict__ gm_coupledConstraintsCounts = kernelParams.d_coupledConstraintsCounts; + const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIndices; + const float* __restrict__ gm_massFactors = kernelParams.d_massFactors; + float* __restrict__ gm_matrixA = kernelParams.d_matrixA; + const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses; + float* __restrict__ gm_virialScaled = kernelParams.d_virialScaled; + + int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; + + // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize). + // This is to ensure proper synchronizations and reduction. All array are padded to the required size. + assert(threadIndex < numConstraintsThreads); + + // Vectors connecting constrained atoms before algorithm was applied. + // Needed to construct constrain matrix A + extern __shared__ float3 sm_r[]; + + AtomPair pair = gm_constraints[threadIndex]; + int i = pair.i; + int j = pair.j; + + // Mass-scaled Lagrange multiplier + float lagrangeScaled = 0.0f; + + float targetLength; + float inverseMassi; + float inverseMassj; + float sqrtReducedMass; + + float3 xi; + float3 xj; + float3 rc; + + // i == -1 indicates dummy constraint at the end of the thread block. 
+ bool isDummyThread = (i == -1); + + // Everything computed for these dummies will be equal to zero + if (isDummyThread) + { + targetLength = 0.0f; + inverseMassi = 0.0f; + inverseMassj = 0.0f; + sqrtReducedMass = 0.0f; + + xi = make_float3(0.0f, 0.0f, 0.0f); + xj = make_float3(0.0f, 0.0f, 0.0f); + rc = make_float3(0.0f, 0.0f, 0.0f); + } + else + { + // Collecting data + targetLength = gm_constraintsTargetLengths[threadIndex]; + inverseMassi = gm_inverseMasses[i]; + inverseMassj = gm_inverseMasses[j]; + sqrtReducedMass = rsqrt(inverseMassi + inverseMassj); + + xi = gm_x[i]; + xj = gm_x[j]; + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float rlen = rsqrtf(dx.x * dx.x + dx.y * dx.y + dx.z * dx.z); + rc = rlen * dx; + } + + sm_r[threadIdx.x] = rc; + // Make sure that all r's are saved into shared memory + // before they are accessed in the loop below + __syncthreads(); + + /* + * Constructing LINCS matrix (A) + */ + + // Only non-zero values are saved (for coupled constraints) + int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex]; + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n * numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + + float3 rc1 = sm_r[c1 - blockIdx.x * blockDim.x]; + gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z); + } + + // Skipping in dummy threads + if (!isDummyThread) + { + xi = gm_xp[i]; + xj = gm_xp[j]; + } + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float sol = sqrtReducedMass * ((rc.x * dx.x + rc.y * dx.y + rc.z * dx.z) - targetLength); + + /* + * Inverse matrix using a set of expansionOrder matrix multiplications + */ + + // This will use the same memory space as sm_r, which is no longer needed. + extern __shared__ float sm_rhs[]; + // Save current right-hand-side vector in the shared memory + sm_rhs[threadIdx.x] = sol; + + for (int rec = 0; rec < expansionOrder; rec++) + { + // Making sure that all sm_rhs are saved before they are accessed in a loop below + __syncthreads(); + float mvb = 0.0f; + + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n * numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + // Convolute current right-hand-side with A + // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations + mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)]; + } + // 'Switch' rhs vectors, save current result + // These values will be accessed in the loop above during the next iteration. + sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb; + sol = sol + mvb; + } + + // Current mass-scaled Lagrange multipliers + lagrangeScaled = sqrtReducedMass * sol; + + // Save updated coordinates before correction for the rotational lengthening + float3 tmp = rc * lagrangeScaled; + + // Writing for all but dummy constraints + if (!isDummyThread) + { + atomicAdd(&gm_xp[i], -tmp * inverseMassi); + atomicAdd(&gm_xp[j], tmp * inverseMassj); + } + + /* + * Correction for centripetal effects + */ + for (int iter = 0; iter < numIterations; iter++) + { + // Make sure that all xp's are saved: atomic operation calls before are + // communicating current xp[..] values across thread block. 
+ __syncthreads(); + + if (!isDummyThread) + { + xi = gm_xp[i]; + xj = gm_xp[j]; + } + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float len2 = targetLength * targetLength; + float dlen2 = 2.0f * len2 - norm2(dx); + + // TODO A little bit more effective but slightly less readable version of the below would be: + // float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2)); + float proj; + if (dlen2 > 0.0f) + { + proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2)); + } + else + { + proj = sqrtReducedMass * targetLength; + } + + sm_rhs[threadIdx.x] = proj; + float sol = proj; + + /* + * Same matrix inversion as above is used for updated data + */ + for (int rec = 0; rec < expansionOrder; rec++) + { + // Make sure that all elements of rhs are saved into shared memory + __syncthreads(); + float mvb = 0; + + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n * numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + + mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)]; + } + sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb; + sol = sol + mvb; + } + + // Add corrections to Lagrange multipliers + float sqrtmu_sol = sqrtReducedMass * sol; + lagrangeScaled += sqrtmu_sol; + + // Save updated coordinates for the next iteration + // Dummy constraints are skipped + if (!isDummyThread) + { + float3 tmp = rc * sqrtmu_sol; + atomicAdd(&gm_xp[i], -tmp * inverseMassi); + atomicAdd(&gm_xp[j], tmp * inverseMassj); + } + } + + // Updating particle velocities for all but dummy threads + if (updateVelocities && !isDummyThread) + { + float3 tmp = rc * invdt * lagrangeScaled; + atomicAdd(&gm_v[i], -tmp * inverseMassi); + atomicAdd(&gm_v[j], tmp * inverseMassj); + } + + + if (computeVirial) + { + // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length + // (targetLength) and the normalized vector connecting constrained atoms before + // the algorithm was applied (rc). The evaluation of virial in each thread is + // followed by basic reduction for the values inside single thread block. + // Then, the values are reduced across grid by atomicAdd(...). + // + // TODO Shuffle reduction. + // TODO Should be unified and/or done once when virial is actually needed. + // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally, + // one that works for any datatype. + + // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only + // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength, + // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel. + // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter + // two are no longer in use. + extern __shared__ float sm_threadVirial[]; + float mult = targetLength * lagrangeScaled; + sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x; + sm_threadVirial[1 * blockDim.x + threadIdx.x] = mult * rc.x * rc.y; + sm_threadVirial[2 * blockDim.x + threadIdx.x] = mult * rc.x * rc.z; + sm_threadVirial[3 * blockDim.x + threadIdx.x] = mult * rc.y * rc.y; + sm_threadVirial[4 * blockDim.x + threadIdx.x] = mult * rc.y * rc.z; + sm_threadVirial[5 * blockDim.x + threadIdx.x] = mult * rc.z * rc.z; + + __syncthreads(); + + // Reduce up to one virial per thread block. All blocks are divided by half, the first + // half of threads sums two virials. 
Then the first half is divided by two and the first
+        // half of it sums two values. This procedure is repeated until only one thread is left.
+        // Only works if the number of threads per block is a power of two (hence static_assert
+        // in the beginning of the kernel).
+        for (int divideBy = 2; divideBy <= static_cast<int>(blockDim.x); divideBy *= 2)
+        {
+            int dividedAt = blockDim.x / divideBy;
+            if (static_cast<int>(threadIdx.x) < dividedAt)
+            {
+                for (int d = 0; d < 6; d++)
+                {
+                    sm_threadVirial[d * blockDim.x + threadIdx.x] +=
+                            sm_threadVirial[d * blockDim.x + (threadIdx.x + dividedAt)];
+                }
+            }
+            // Synchronize if not within one warp
+            if (dividedAt > warpSize / 2)
+            {
+                __syncthreads();
+            }
+        }
+        // First 6 threads in the block add the results of 6 tensor components to the global memory address.
+        if (threadIdx.x < 6)
+        {
+            atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);
+        }
+    }
+
+    return;
+}
+
+/*! \brief Select templated kernel.
+ *
+ * Returns pointer to a CUDA kernel based on provided booleans.
+ *
+ * \param[in] updateVelocities If the velocities should be constrained.
+ * \param[in] computeVirial    If virial should be updated.
+ *
+ * \return Pointer to CUDA kernel
+ */
+inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVirial)
+{
+
+    auto kernelPtr = lincs_kernel<true, true>;
+    if (updateVelocities && computeVirial)
+    {
+        kernelPtr = lincs_kernel<true, true>;
+    }
+    else if (updateVelocities && !computeVirial)
+    {
+        kernelPtr = lincs_kernel<true, false>;
+    }
+    else if (!updateVelocities && computeVirial)
+    {
+        kernelPtr = lincs_kernel<false, true>;
+    }
+    else if (!updateVelocities && !computeVirial)
+    {
+        kernelPtr = lincs_kernel<false, false>;
+    }
+    return kernelPtr;
+}
+
+void launchLincsGpuKernel(LincsGpuKernelParameters&  kernelParams,
+                          const DeviceBuffer<Float3> d_x,
+                          DeviceBuffer<Float3>       d_xp,
+                          const bool                 updateVelocities,
+                          DeviceBuffer<Float3>       d_v,
+                          const real                 invdt,
+                          const bool                 computeVirial,
+                          const DeviceStream&        deviceStream)
+{
+
+    auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
+
+    KernelLaunchConfig config;
+    config.blockSize[0] = c_threadsPerBlock;
+    config.blockSize[1] = 1;
+    config.blockSize[2] = 1;
+    config.gridSize[0]  = (kernelParams.numConstraintsThreads + c_threadsPerBlock - 1) / c_threadsPerBlock;
+    config.gridSize[1]  = 1;
+    config.gridSize[2]  = 1;
+
+    // Shared memory is used to store:
+    // -- Current coordinates (3 floats per thread)
+    // -- Right-hand-sides for matrix inversion (2 floats per thread)
+    // -- Virial tensor components (6 floats per thread)
+    // Since none of these three are needed simultaneously, they can be saved at the same shared memory address
+    // (i.e. the corresponding arrays are intentionally overlapped in address space). Consequently, only
+    // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not.
+ if (computeVirial) + { + config.sharedMemorySize = c_threadsPerBlock * 6 * sizeof(float); + } + else + { + config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float); + } + + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, + config, + &kernelParams, + asFloat3Pointer(&d_x), + asFloat3Pointer(&d_xp), + asFloat3Pointer(&d_v), + &invdt); + + launchGpuKernel(kernelPtr, + config, + deviceStream, + nullptr, + "lincs_kernel", + kernelArgs); + + return; +} + +} // namespace gmx diff --git a/src/gromacs/mdlib/lincs_gpu_internal.h b/src/gromacs/mdlib/lincs_gpu_internal.h new file mode 100644 index 0000000000..5c42a784c6 --- /dev/null +++ b/src/gromacs/mdlib/lincs_gpu_internal.h @@ -0,0 +1,71 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declare backend-specific LINCS GPU functions + * + * \author Artem Zhmurov + * \author Alan Gray + * + * \ingroup module_mdlib + */ +#ifndef GMX_MDLIB_LINCS_GPU_INTERNAL_H +#define GMX_MDLIB_LINCS_GPU_INTERNAL_H + +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/gputraits.h" + +class DeviceStream; + +namespace gmx +{ + +struct LincsGpuKernelParameters; + +//! 
Number of threads in a GPU block
+constexpr static int c_threadsPerBlock = 256;
+
+void launchLincsGpuKernel(LincsGpuKernelParameters&  kernelParams,
+                          const DeviceBuffer<Float3> d_x,
+                          DeviceBuffer<Float3>       d_xp,
+                          const bool                 updateVelocities,
+                          DeviceBuffer<Float3>       d_v,
+                          const real                 invdt,
+                          const bool                 computeVirial,
+                          const DeviceStream&        deviceStream);
+
+} // namespace gmx
+
+#endif // GMX_MDLIB_LINCS_GPU_INTERNAL_H
diff --git a/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp b/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp
new file mode 100644
index 0000000000..b5594c1bc0
--- /dev/null
+++ b/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp
@@ -0,0 +1,68 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements LINCS kernels using SYCL
+ *
+ * This file contains SYCL kernels of the LINCS constraints algorithm.
+ * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + */ +#include "lincs_gpu_internal.h" + +#include "gromacs/utility/gmxassert.h" + +namespace gmx +{ + +void launchLincsGpuKernel(LincsGpuKernelParameters& /* kernelParams */, + const DeviceBuffer /* d_x */, + DeviceBuffer /* d_xp */, + const bool /* updateVelocities */, + DeviceBuffer /* d_v */, + const real /* invdt */, + const bool /* computeVirial */, + const DeviceStream& /* deviceStream */) +{ + + // SYCL_TODO + GMX_RELEASE_ASSERT(false, "LINCS is not yet implemented in SYCL."); + + return; +} + +} // namespace gmx diff --git a/src/gromacs/mdlib/tests/constrtestrunners.cu b/src/gromacs/mdlib/tests/constrtestrunners.cu index c22b91f351..1f45f0ccab 100644 --- a/src/gromacs/mdlib/tests/constrtestrunners.cu +++ b/src/gromacs/mdlib/tests/constrtestrunners.cu @@ -54,7 +54,7 @@ #include "gromacs/gpu_utils/devicebuffer.cuh" #include "gromacs/gpu_utils/gputraits.h" #include "gromacs/hardware/device_information.h" -#include "gromacs/mdlib/lincs_gpu.cuh" +#include "gromacs/mdlib/lincs_gpu.h" #include "gromacs/pbcutil/pbc.h" #include "gromacs/utility/unique_cptr.h" diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu index 5a950d34e5..ba3a1d12af 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu @@ -64,7 +64,7 @@ #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/mdlib/leapfrog_gpu.h" -#include "gromacs/mdlib/lincs_gpu.cuh" +#include "gromacs/mdlib/lincs_gpu.h" #include "gromacs/mdlib/settle_gpu.cuh" #include "gromacs/mdlib/update_constrain_gpu.h" #include "gromacs/mdtypes/mdatom.h" diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h index ece9859374..dcca5dc2a9 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.h +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h @@ -49,7 +49,7 @@ #include "gmxpre.h" #include "gromacs/mdlib/leapfrog_gpu.h" -#include "gromacs/mdlib/lincs_gpu.cuh" +#include "gromacs/mdlib/lincs_gpu.h" #include "gromacs/mdlib/settle_gpu.cuh" #include "gromacs/mdlib/update_constrain_gpu.h" #include "gromacs/mdtypes/inputrec.h" -- 2.22.0