From ebdf9991103f92cc0672255512156c571136f516 Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Tue, 16 Feb 2021 14:16:02 +0000 Subject: [PATCH] Add support for Clang-CUDA 11.x and a post-merge CI build Tested mostly with Clang-11 and CUDA-11.2 While Clang does not officially support CUDA after 10.1, it appears to work fine with newer versions. Caveat: Clang fails to properly detect CUDA versions past 10.2 This can be worked around by creating `/usr/local/cuda/version.txt` file with the single line `CUDA Version X.Y.Z`. Despite the workaround above, Clang: - does not recognize sm_86. - recognizes sm_80, but believes that the installed CUDA version is too old to support it. --- admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml | 1 + .../gromacs.clang-11-cuda-11.2.gitlab-ci.yml | 75 +++++++++++++++++++ cmake/gmxManageClangCudaConfig.cmake | 17 ++++- src/gromacs/ewald/pme_gpu_program_impl.cu | 72 +++++++++--------- 4 files changed, 125 insertions(+), 40 deletions(-) create mode 100644 admin/gitlab-ci/gromacs.matrix/gromacs.clang-11-cuda-11.2.gitlab-ci.yml diff --git a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml index c43740e875..44fcdc7be6 100644 --- a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml +++ b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml @@ -260,6 +260,7 @@ include: - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.clang-8.gitlab-ci.yml' - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.clang-9-mpi.gitlab-ci.yml' - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.clang-9-release.gitlab-ci.yml' + - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.clang-11-cuda-11.2.gitlab-ci.yml' - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.clang-ASAN.gitlab-ci.yml' - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.clang-static-analyzer.gitlab-ci.yml' - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.clang-TSAN.gitlab-ci.yml' diff --git a/admin/gitlab-ci/gromacs.matrix/gromacs.clang-11-cuda-11.2.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix/gromacs.clang-11-cuda-11.2.gitlab-ci.yml new file mode 100644 index 0000000000..e8a6dc069b --- /dev/null +++ b/admin/gitlab-ci/gromacs.matrix/gromacs.clang-11-cuda-11.2.gitlab-ci.yml @@ -0,0 +1,75 @@ +# Test goal: Clang-CUDA build +# Test intents (should change rarely and conservatively): +# OS: Ubuntu newest supported +# GPU: Clang CUDA +# HW: NVIDIA GPU +# Scope: configure, build, unit tests, regression tests +# Test implementation choices (free to change as needed): +# OS: Ubuntu 20.04 +# Build type: RelWithDebInfo +# Compiler: Clang 11 +# MPI: thread_MPI +# GPU: Clang CUDA 11.2, CUDA 11.2 +# SIMD: AVX2_256, no kernels +# FFT: FFTW3 +# Parallelism nt/ntomp: 4/2 (unit tests) +# Parallelism nt/ntomp: 2/1 (regression tests) + +gromacs:clang-11-cuda-11.2:configure: + extends: + - .gromacs:base:configure + - .use-clang:base + - .use-cuda + - .rules:post-merge-acceptance + image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a + variables: + CMAKE: /usr/local/cmake-3.17.2/bin/cmake + CMAKE_SIMD_OPTIONS: "-DGMX_USE_SIMD_KERNELS=off" + CMAKE_EXTRA_OPTIONS: "-DGMX_CLANG_CUDA=ON" + CMAKE_BUILD_TYPE_OPTIONS: "-DCMAKE_BUILD_TYPE=RelWithDebInfo" + COMPILER_MAJOR_VERSION: 11 + +gromacs:clang-11-cuda-11.2:build: + extends: + - .variables:default + - .gromacs:base:build + - .use-clang:base + - .use-ccache + - .rules:post-merge-acceptance + image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a + variables: + CMAKE: /usr/local/cmake-3.17.2/bin/cmake + needs: + - job: gromacs:clang-11-cuda-11.2:configure + +gromacs:clang-11-cuda-11.2:test: + extends: + - .gromacs:base:test + - .rules:post-merge-acceptance + image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a + variables: + CMAKE: /usr/local/cmake-3.17.2/bin/cmake + KUBERNETES_EXTENDED_RESOURCE_NAME: "nvidia.com/gpu" + KUBERNETES_EXTENDED_RESOURCE_LIMIT: 1 + tags: + - k8s-scilifelab + needs: + - job: gromacs:clang-11-cuda-11.2:build + +gromacs:clang-11-cuda-11.2:regressiontest: + extends: + - .gromacs:base:regressiontest + - .rules:post-merge-acceptance + image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a + variables: + CMAKE: /usr/local/cmake-3.17.2/bin/cmake + KUBERNETES_EXTENDED_RESOURCE_NAME: "nvidia.com/gpu" + KUBERNETES_EXTENDED_RESOURCE_LIMIT: 1 + REGRESSIONTEST_PME_RANK_NUMBER: 0 + REGRESSIONTEST_TOTAL_RANK_NUMBER: 2 + REGRESSIONTEST_OMP_RANK_NUMBER: 1 + tags: + - k8s-scilifelab + needs: + - job: gromacs:clang-11-cuda-11.2:build + - job: regressiontests:prepare \ No newline at end of file diff --git a/cmake/gmxManageClangCudaConfig.cmake b/cmake/gmxManageClangCudaConfig.cmake index 8a0a1ea212..2f25ceddb1 100644 --- a/cmake/gmxManageClangCudaConfig.cmake +++ b/cmake/gmxManageClangCudaConfig.cmake @@ -79,7 +79,9 @@ if (GMX_CUDA_TARGET_SM) list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_${_target}") endforeach() else() - list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_30") + if (CUDA_VERSION VERSION_LESS 11.0) + list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_30") + endif() list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_35") # clang 6.0 + CUDA 9.0 seems to have issues generating code for sm_37 if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0 OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0.999) @@ -90,9 +92,16 @@ else() list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_60") list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_61") list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_70") - # Enable this when clang (8.0 ?) introduces sm_75 support - #if (NOT CUDA_VERSION VERSION_LESS 10.0) - # list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_75") + if (NOT CUDA_VERSION VERSION_LESS 10.0) + list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_75") + endif() + # Enable this when clang (12.0 ?) properly recognizes CUDA 11.0 + #if(NOT CUDA_VERSION VERSION_LESS 11.0) + # list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_80") + #endif() + # Enable this when clang (12.0 ?) introduces sm_86 support + #if(NOT CUDA_VERSION VERSION_LESS 11.1) + # list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_86") #endif() endif() if (GMX_CUDA_TARGET_SM) diff --git a/src/gromacs/ewald/pme_gpu_program_impl.cu b/src/gromacs/ewald/pme_gpu_program_impl.cu index 1358793807..6a6196fd5c 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl.cu +++ b/src/gromacs/ewald/pme_gpu_program_impl.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -60,74 +60,74 @@ constexpr int c_stateB = 1; //! PME CUDA kernels forward declarations. Kernels are documented in their respective files. template -void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams); +__global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. // clang-format off -extern template void +extern template void __global__ pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template void __global__ pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template void __global__ pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template void __global__ pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -extern template void +extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); template /* It is significantly slower to pass gridIndex as a kernel parameter */ -void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams); +__global__ void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. // clang-format off -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); // clang-format on template -void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams); +__global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. // clang-format off -extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel(const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template __global__ void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template __global__ void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template __global__ void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams); +extern template __global__ void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template __global__ void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template __global__ void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template __global__ void pme_gather_kernel (const PmeGpuCudaKernelParams); // clang-format on PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) : -- 2.22.0