From 97fe7d2f1f7b101a55c3c7db4b32b4014b1ad0d2 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Tue, 9 Oct 2018 00:49:16 +0200
Subject: [PATCH] Fix more clang-6 warnings in CUDA code

Fixes #2681

Change-Id: Ic1e096328687e53ca814034c4c3eb8db40db46df
---
 src/gromacs/ewald/pme-gpu-program-impl.cu     | 50 ++++++++++++++-----
 src/gromacs/gpu_utils/cudautils.cu            | 14 +++---
 src/gromacs/gpu_utils/cudautils.cuh           | 18 +++++--
 src/gromacs/gpu_utils/gpu_utils.cu            | 23 ++++-----
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu  | 44 ++++++++--------
 .../nbnxn_cuda_kernel_pruneonly.cuh           | 13 ++++-
 6 files changed, 100 insertions(+), 62 deletions(-)

diff --git a/src/gromacs/ewald/pme-gpu-program-impl.cu b/src/gromacs/ewald/pme-gpu-program-impl.cu
index ac2c3a359e..db2f1b178d 100644
--- a/src/gromacs/ewald/pme-gpu-program-impl.cu
+++ b/src/gromacs/ewald/pme-gpu-program-impl.cu
@@ -49,6 +49,12 @@
 #include "pme-gpu-internal.h"                    // for GridOrdering enum
 #include "pme-gpu-types-host.h"
 
+// PME interpolation order
+constexpr int  c_pmeOrder = 4;
+// These hardcoded spread/gather parameters refer to not-implemented PME GPU 2D decomposition in X/Y
+constexpr bool c_wrapX = true;
+constexpr bool c_wrapY = true;
+
 //! PME CUDA kernels forward declarations. Kernels are documented in their respective files.
 template <
     const int order,
@@ -59,12 +65,32 @@ template <
     >
 void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
 
+// Add extern declarations to inform that there will be a definition
+// provided in another translation unit.
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+
 template<
     GridOrdering gridOrdering,
     bool computeEnergyAndVirial
     >
 void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams);
 
+// Add extern declarations to inform that there will be a definition
+// provided in another translation unit.
+extern template
+void pme_solve_kernel<GridOrdering::XYZ, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_solve_kernel<GridOrdering::XYZ, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_solve_kernel<GridOrdering::YZX, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_solve_kernel<GridOrdering::YZX, true>(const PmeGpuCudaKernelParams);
+
 template <
     const int order,
     const bool overwriteForces,
@@ -73,6 +99,12 @@ template <
     >
 void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
 
+// Add extern declarations to inform that there will be a definition
+// provided in another translation unit.
+extern template
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
 
 PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
 {
@@ -82,19 +114,11 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
     solveMaxWorkGroupSize = c_solveMaxThreadsPerBlock;
     gatherWorkGroupSize   = c_gatherMaxThreadsPerBlock;
 
-    // PME interpolation order
-    constexpr int  pmeOrder = 4;
-    GMX_UNUSED_VALUE(pmeOrder);
-    // These hardcoded spread/gather parameters refer to not-implemented PME GPU 2D decomposition in X/Y
-    constexpr bool wrapX = true;
-    constexpr bool wrapY = true;
-    GMX_UNUSED_VALUE(wrapX);
-    GMX_UNUSED_VALUE(wrapY);
-    splineAndSpreadKernel       = pme_spline_and_spread_kernel<pmeOrder, true, true, wrapX, wrapY>;
-    splineKernel                = pme_spline_and_spread_kernel<pmeOrder, true, false, wrapX, wrapY>;
-    spreadKernel                = pme_spline_and_spread_kernel<pmeOrder, false, true, wrapX, wrapY>;
-    gatherKernel                = pme_gather_kernel<pmeOrder, true, wrapX, wrapY>;
-    gatherReduceWithInputKernel = pme_gather_kernel<pmeOrder, false, wrapX, wrapY>;
+    splineAndSpreadKernel       = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY>;
+    splineKernel                = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY>;
+    spreadKernel                = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY>;
+    gatherKernel                = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY>;
+    gatherReduceWithInputKernel = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY>;
     solveXYZKernel              = pme_solve_kernel<GridOrdering::XYZ, false>;
     solveXYZEnergyKernel        = pme_solve_kernel<GridOrdering::XYZ, true>;
     solveYZXKernel              = pme_solve_kernel<GridOrdering::YZX, false>;
diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu
index b66be40f09..6731fef53d 100644
--- a/src/gromacs/gpu_utils/cudautils.cu
+++ b/src/gromacs/gpu_utils/cudautils.cu
@@ -48,11 +48,11 @@
 
 // TODO: template on transferKind to avoid runtime conditionals
 int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes,
-                GpuApiCallBehavior transferKind, cudaStream_t s = 0)
+                GpuApiCallBehavior transferKind, cudaStream_t s = nullptr)
 {
     cudaError_t stat;
 
-    if (h_dest == NULL || d_src == NULL || bytes == 0)
+    if (h_dest == nullptr || d_src == nullptr || bytes == 0)
     {
         return -1;
     }
@@ -85,18 +85,18 @@ int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
 /*!
  *  The copy is launched in stream s or if not specified, in stream 0.
  */
-int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
+int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = nullptr)
 {
     return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Async, s);
 }
 
 // TODO: template on transferKind to avoid runtime conditionals
 int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes,
-                GpuApiCallBehavior transferKind, cudaStream_t s = 0)
+                GpuApiCallBehavior transferKind, cudaStream_t s = nullptr)
 {
     cudaError_t stat;
 
-    if (d_dest == NULL || h_src == NULL || bytes == 0)
+    if (d_dest == nullptr || h_src == nullptr || bytes == 0)
     {
         return -1;
     }
@@ -129,7 +129,7 @@ int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
 /*!
  *  The copy is launched in stream s or if not specified, in stream 0.
  */
-int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
+int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = nullptr)
 {
     return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
 }
@@ -175,7 +175,7 @@ static void setup1DTexture(cudaTextureObject_t &texObj,
 
     memset(&td, 0, sizeof(td));
     td.readMode                 = cudaReadModeElementType;
-    stat = cudaCreateTextureObject(&texObj, &rd, &td, NULL);
+    stat = cudaCreateTextureObject(&texObj, &rd, &td, nullptr);
     CU_RET_ERR(stat, "cudaCreateTextureObject failed");
 }
 
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index 230f968e36..2fb14ba46a 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -145,25 +145,25 @@ struct gmx_device_info_t
  *
  *  The copy is launched in stream s or if not specified, in stream 0.
  */
-int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t s /*= 0*/);
+int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
 
 /*! Launches synchronous host to device memory copy in stream 0. */
 int cu_copy_D2H_sync(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
 
 /*! Launches asynchronous host to device memory copy in stream s. */
-int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
 
 /*! Launches synchronous or asynchronous host to device memory copy.
  *
  *  The copy is launched in stream s or if not specified, in stream 0.
  */
-int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = 0*/);
+int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
 
 /*! Launches synchronous host to device memory copy. */
 int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
 
 /*! Launches asynchronous host to device memory copy in stream s. */
-int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
 
 // TODO: the 2 functions below are pretty much a constructor/destructor of a simple
 // GPU table object. There is also almost self-contained fetchFromParamLookupTable()
@@ -188,6 +188,11 @@ void initParamLookupTable(T                        * &d_ptr,
                           int                         numElem,
                           const gmx_device_info_t    *devInfo);
 
+// Add extern declarations so each translation unit understands that
+// there will be a definition provided.
+extern template void initParamLookupTable<int>(int * &, cudaTextureObject_t &, const int *, int, const gmx_device_info_t *);
+extern template void initParamLookupTable<float>(float * &, cudaTextureObject_t &, const float *, int, const gmx_device_info_t *);
+
 /*! \brief Destroy parameter lookup table.
  *
  * Unbinds texture object, deallocates device memory.
@@ -202,6 +207,11 @@ void destroyParamLookupTable(T                       *d_ptr,
                              cudaTextureObject_t      texObj,
                              const gmx_device_info_t *devInfo);
 
+// Add extern declarations so each translation unit understands that
+// there will be a definition provided.
+extern template void destroyParamLookupTable<int>(int *, cudaTextureObject_t, const gmx_device_info_t *);
+extern template void destroyParamLookupTable<float>(float *, cudaTextureObject_t, const gmx_device_info_t *);
+
 /*! \brief Add a triplets stored in a float3 to an rvec variable.
  *
  * \param[out]  a Rvec to increment
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu
index 110c6fc0b0..04afee54db 100644
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -68,7 +68,7 @@
  */
 static int  cuda_max_device_count = 32;
 
-static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != NULL));
+static bool cudaProfilerRun      = ((getenv("NVPROF_ID") != nullptr));
 
 /** Dummy kernel used for sanity checking. */
 static __global__ void k_dummy_test(void)
@@ -421,19 +421,12 @@ bool canDetectGpus(std::string *errorMessage)
 
 void findGpus(gmx_gpu_info_t *gpu_info)
 {
-    int                i, ndev, checkres;
-    cudaError_t        stat;
-    cudaDeviceProp     prop;
-    gmx_device_info_t *devs;
-
     assert(gpu_info);
 
     gpu_info->n_dev_compatible = 0;
 
-    ndev    = 0;
-    devs    = NULL;
-
-    stat = cudaGetDeviceCount(&ndev);
+    int         ndev;
+    cudaError_t stat = cudaGetDeviceCount(&ndev);
     if (stat != cudaSuccess)
     {
         GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
@@ -443,10 +436,12 @@ void findGpus(gmx_gpu_info_t *gpu_info)
     // We expect to start device support/sanity checks with a clean runtime error state
     gmx::ensureNoPendingCudaError("");
 
+    gmx_device_info_t *devs;
     snew(devs, ndev);
-    for (i = 0; i < ndev; i++)
+    for (int i = 0; i < ndev; i++)
     {
-        checkres = is_gmx_supported_gpu_id(i, &prop);
+        cudaDeviceProp prop;
+        int            checkres = is_gmx_supported_gpu_id(i, &prop);
 
         devs[i].id   = i;
         devs[i].prop = prop;
@@ -540,8 +535,8 @@ void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
     }
     else
     {
-        *nb_alloc = NULL;
-        *nb_free  = NULL;
+        *nb_alloc = nullptr;
+        *nb_free  = nullptr;
     }
 }
 
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index 1c4f06e476..58e6d343e0 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -104,7 +104,7 @@ static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
                                            cu_nbparam_t              *nbp,
                                            const gmx_device_info_t   *dev_info)
 {
-    if (nbp->coulomb_tab != NULL)
+    if (nbp->coulomb_tab != nullptr)
     {
         nbnxn_cuda_free_nbparam_table(nbp, dev_info);
     }
@@ -134,10 +134,10 @@ static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
     stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
     CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
 
-    /* initialize to NULL poiters to data that is not allocated here and will
+    /* initialize to nullptr poiters to data that is not allocated here and will
        need reallocation in nbnxn_cuda_init_atomdata */
-    ad->xq = NULL;
-    ad->f  = NULL;
+    ad->xq = nullptr;
+    ad->f  = nullptr;
 
     /* size -1 indicates that the respective array hasn't been initialized yet */
     ad->natoms = -1;
@@ -154,8 +154,8 @@ static int pick_ewald_kernel_type(bool                     bTwinCut,
 
     /* Benchmarking/development environment variables to force the use of
        analytical or tabulated Ewald kernel. */
-    bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
-    bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
+    bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != nullptr);
+    bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != nullptr);
 
     if (bForceAnalyticalEwald && bForceTabulatedEwald)
     {
@@ -185,7 +185,7 @@ static int pick_ewald_kernel_type(bool                     bTwinCut,
 
     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
        forces it (use it for debugging/benchmarking only). */
-    if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
+    if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == nullptr))
     {
         kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
     }
@@ -263,7 +263,6 @@ static void init_nbparam(cu_nbparam_t              *nbp,
                         break;
                     default:
                         gmx_incons("The requested LJ combination rule is not implemented in the CUDA GPU accelerated kernels!");
-                        break;
                 }
                 break;
             case eintmodFORCESWITCH:
@@ -274,7 +273,6 @@ static void init_nbparam(cu_nbparam_t              *nbp,
                 break;
             default:
                 gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
-                break;
         }
     }
     else if (ic->vdwtype == evdwPME)
@@ -315,7 +313,7 @@ static void init_nbparam(cu_nbparam_t              *nbp,
     }
 
     /* generate table for PME */
-    nbp->coulomb_tab = NULL;
+    nbp->coulomb_tab = nullptr;
     if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
     {
         init_ewald_coulomb_force_table(ic, nbp, dev_info);
@@ -360,12 +358,12 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
 /*! Initializes the pair list data structure. */
 static void init_plist(cu_plist_t *pl)
 {
-    /* initialize to NULL pointers to data that is not allocated here and will
+    /* initialize to nullptr pointers to data that is not allocated here and will
        need reallocation in nbnxn_gpu_init_pairlist */
-    pl->sci      = NULL;
-    pl->cj4      = NULL;
-    pl->imask    = NULL;
-    pl->excl     = NULL;
+    pl->sci      = nullptr;
+    pl->cj4      = nullptr;
+    pl->imask    = nullptr;
+    pl->excl     = nullptr;
 
     /* size -1 indicates that the respective array hasn't been initialized yet */
     pl->na_c           = -1;
@@ -440,7 +438,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
     cudaError_t       stat;
     gmx_nbnxn_cuda_t *nb;
 
-    if (p_nb == NULL)
+    if (p_nb == nullptr)
     {
         return;
     }
@@ -481,7 +479,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
          * case will be a single value.
          */
         int highest_priority;
-        stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
+        stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority);
         CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
 
         stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
@@ -500,7 +498,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
      *          This is the main reason why they are disabled by default.
      */
     // TODO: Consider turning on by default when we can detect nr of streams.
-    nb->bDoTime = (getenv("GMX_ENABLE_GPU_TIMING") != NULL);
+    nb->bDoTime = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
 
     if (nb->bDoTime)
     {
@@ -731,7 +729,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
     cu_atomdata_t   *atdat;
     cu_nbparam_t    *nbparam;
 
-    if (nb == NULL)
+    if (nb == nullptr)
     {
         return;
     }
@@ -804,13 +802,13 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
 
     /* Free nbst */
     pfree(nb->nbst.e_lj);
-    nb->nbst.e_lj = NULL;
+    nb->nbst.e_lj = nullptr;
 
     pfree(nb->nbst.e_el);
-    nb->nbst.e_el = NULL;
+    nb->nbst.e_el = nullptr;
 
     pfree(nb->nbst.fshift);
-    nb->nbst.fshift = NULL;
+    nb->nbst.fshift = nullptr;
 
     sfree(atdat);
     sfree(nbparam);
@@ -839,7 +837,7 @@ void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
 
 int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
 {
-    return nb != NULL ?
+    return nb != nullptr ?
            gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
 
 }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cuh
index df6185bcd4..d2d1ebb808 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -110,6 +110,17 @@ __global__ void nbnxn_kernel_prune_cuda(const cu_atomdata_t atdat,
                                         int                 part)
 #ifdef FUNCTION_DECLARATION_ONLY
 ;     /* Only do function declaration, omit the function body. */
+
+// Add extern declarations so each translation unit understands that
+// there will be a definition provided.
+extern template
+__global__ void
+nbnxn_kernel_prune_cuda<true>(const cu_atomdata_t, const cu_nbparam_t,
+                              const cu_plist_t, int, int);
+extern template
+__global__ void
+nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const cu_nbparam_t,
+                               const cu_plist_t, int, int);
 #else
 {
 
-- 
2.22.0