--- /dev/null
+# Test goal: Clang-CUDA build
+# Test intents (should change rarely and conservatively):
+# OS: Ubuntu newest supported
+# GPU: Clang CUDA
+# HW: NVIDIA GPU
+# Scope: configure, build, unit tests, regression tests
+# Test implementation choices (free to change as needed):
+# OS: Ubuntu 20.04
+# Build type: RelWithDebInfo
+# Compiler: Clang 11
+# MPI: thread_MPI
+# GPU: Clang CUDA 11.2, CUDA 11.2
+# SIMD: AVX2_256, no kernels
+# FFT: FFTW3
+# Parallelism nt/ntomp: 4/2 (unit tests)
+# Parallelism nt/ntomp: 2/1 (regression tests)
+
+gromacs:clang-11-cuda-11.2:configure:
+ extends:
+ - .gromacs:base:configure
+ - .use-clang:base
+ - .use-cuda
+ - .rules:post-merge-acceptance
+ image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a
+ variables:
+ CMAKE: /usr/local/cmake-3.17.2/bin/cmake
+ CMAKE_SIMD_OPTIONS: "-DGMX_USE_SIMD_KERNELS=off"
+ CMAKE_EXTRA_OPTIONS: "-DGMX_CLANG_CUDA=ON"
+ CMAKE_BUILD_TYPE_OPTIONS: "-DCMAKE_BUILD_TYPE=RelWithDebInfo"
+ COMPILER_MAJOR_VERSION: 11
+
+gromacs:clang-11-cuda-11.2:build:
+ extends:
+ - .variables:default
+ - .gromacs:base:build
+ - .use-clang:base
+ - .use-ccache
+ - .rules:post-merge-acceptance
+ image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a
+ variables:
+ CMAKE: /usr/local/cmake-3.17.2/bin/cmake
+ needs:
+ - job: gromacs:clang-11-cuda-11.2:configure
+
+gromacs:clang-11-cuda-11.2:test:
+ extends:
+ - .gromacs:base:test
+ - .rules:post-merge-acceptance
+ image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a
+ variables:
+ CMAKE: /usr/local/cmake-3.17.2/bin/cmake
+ KUBERNETES_EXTENDED_RESOURCE_NAME: "nvidia.com/gpu"
+ KUBERNETES_EXTENDED_RESOURCE_LIMIT: 1
+ tags:
+ - k8s-scilifelab
+ needs:
+ - job: gromacs:clang-11-cuda-11.2:build
+
+gromacs:clang-11-cuda-11.2:regressiontest:
+ extends:
+ - .gromacs:base:regressiontest
+ - .rules:post-merge-acceptance
+ image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a
+ variables:
+ CMAKE: /usr/local/cmake-3.17.2/bin/cmake
+ KUBERNETES_EXTENDED_RESOURCE_NAME: "nvidia.com/gpu"
+ KUBERNETES_EXTENDED_RESOURCE_LIMIT: 1
+ REGRESSIONTEST_PME_RANK_NUMBER: 0
+ REGRESSIONTEST_TOTAL_RANK_NUMBER: 2
+ REGRESSIONTEST_OMP_RANK_NUMBER: 1
+ tags:
+ - k8s-scilifelab
+ needs:
+ - job: gromacs:clang-11-cuda-11.2:build
+ - job: regressiontests:prepare
\ No newline at end of file
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
//! PME CUDA kernels forward declarations. Kernels are documented in their respective files.
template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int mode, bool writeGlobal, ThreadsPerAtom threadsPerAtom>
-void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
+__global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
// clang-format off
-extern template void
+extern template void __global__
pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
-extern template void
+extern template void __global__
pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-extern template void
+extern template void __global__
pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
-extern template void
+extern template void __global__
pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
-extern template void
+extern template __global__ void
pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
template<GridOrdering gridOrdering, bool computeEnergyAndVirial, const int gridIndex> /* It is significantly slower to pass gridIndex as a kernel parameter */
-void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams);
+__global__ void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
// clang-format off
-extern template void pme_solve_kernel<GridOrdering::XYZ, false, c_stateA>(const PmeGpuCudaKernelParams);
-extern template void pme_solve_kernel<GridOrdering::XYZ, true, c_stateA>(const PmeGpuCudaKernelParams);
-extern template void pme_solve_kernel<GridOrdering::YZX, false, c_stateA>(const PmeGpuCudaKernelParams);
-extern template void pme_solve_kernel<GridOrdering::YZX, true, c_stateA>(const PmeGpuCudaKernelParams);
-extern template void pme_solve_kernel<GridOrdering::XYZ, false, c_stateB>(const PmeGpuCudaKernelParams);
-extern template void pme_solve_kernel<GridOrdering::XYZ, true, c_stateB>(const PmeGpuCudaKernelParams);
-extern template void pme_solve_kernel<GridOrdering::YZX, false, c_stateB>(const PmeGpuCudaKernelParams);
-extern template void pme_solve_kernel<GridOrdering::YZX, true, c_stateB>(const PmeGpuCudaKernelParams);
+extern template __global__ void pme_solve_kernel<GridOrdering::XYZ, false, c_stateA>(const PmeGpuCudaKernelParams);
+extern template __global__ void pme_solve_kernel<GridOrdering::XYZ, true, c_stateA>(const PmeGpuCudaKernelParams);
+extern template __global__ void pme_solve_kernel<GridOrdering::YZX, false, c_stateA>(const PmeGpuCudaKernelParams);
+extern template __global__ void pme_solve_kernel<GridOrdering::YZX, true, c_stateA>(const PmeGpuCudaKernelParams);
+extern template __global__ void pme_solve_kernel<GridOrdering::XYZ, false, c_stateB>(const PmeGpuCudaKernelParams);
+extern template __global__ void pme_solve_kernel<GridOrdering::XYZ, true, c_stateB>(const PmeGpuCudaKernelParams);
+extern template __global__ void pme_solve_kernel<GridOrdering::YZX, false, c_stateB>(const PmeGpuCudaKernelParams);
+extern template __global__ void pme_solve_kernel<GridOrdering::YZX, true, c_stateB>(const PmeGpuCudaKernelParams);
// clang-format on
template<int order, bool wrapX, bool wrapY, int nGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom>
-void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
+__global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
// clang-format off
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
+extern template __global__ void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+extern template __global__ void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+extern template __global__ void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
+extern template __global__ void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+extern template __global__ void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+extern template __global__ void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+extern template __global__ void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
+extern template __global__ void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
// clang-format on
PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :