#include "pme-gpu-internal.h" // for GridOrdering enum
#include "pme-gpu-types-host.h"
+//! PME interpolation order
+constexpr int c_pmeOrder = 4;
+//! These hardcoded spread/gather parameters refer to the not-implemented PME GPU 2D decomposition in X/Y
+constexpr bool c_wrapX = true;
+constexpr bool c_wrapY = true;
+
//! PME CUDA kernels forward declarations. Kernels are documented in their respective files.
template <
const int order,
const bool computeSplines,
const bool spreadCharges,
const bool wrapX,
const bool wrapY
>
void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
+// Add extern declarations to inform the compiler that the definitions
+// are provided in another translation unit.
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
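+// For each extern declaration above, a matching explicit instantiation
+// definition must exist in the translation unit that defines the kernel
+// template. A minimal sketch of what that instantiation looks like (the
+// defining file may also add launch bounds and the __global__ qualifier):
+//
+//   template void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);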
+
template<
GridOrdering gridOrdering,
bool computeEnergyAndVirial
>
void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams);
+// Add extern declarations to inform the compiler that the definitions
+// are provided in another translation unit.
+extern template
+void pme_solve_kernel<GridOrdering::XYZ, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_solve_kernel<GridOrdering::XYZ, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_solve_kernel<GridOrdering::YZX, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_solve_kernel<GridOrdering::YZX, true>(const PmeGpuCudaKernelParams);
+
template <
const int order,
const bool overwriteForces,
const bool wrapX,
const bool wrapY
>
void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
+// Add extern declarations to inform the compiler that the definitions
+// are provided in another translation unit.
+extern template
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
{
solveMaxWorkGroupSize = c_solveMaxThreadsPerBlock;
gatherWorkGroupSize = c_gatherMaxThreadsPerBlock;
- // PME interpolation order
- constexpr int pmeOrder = 4;
- GMX_UNUSED_VALUE(pmeOrder);
- // These hardcoded spread/gather parameters refer to not-implemented PME GPU 2D decomposition in X/Y
- constexpr bool wrapX = true;
- constexpr bool wrapY = true;
- GMX_UNUSED_VALUE(wrapX);
- GMX_UNUSED_VALUE(wrapY);
- splineAndSpreadKernel = pme_spline_and_spread_kernel<pmeOrder, true, true, wrapX, wrapY>;
- splineKernel = pme_spline_and_spread_kernel<pmeOrder, true, false, wrapX, wrapY>;
- spreadKernel = pme_spline_and_spread_kernel<pmeOrder, false, true, wrapX, wrapY>;
- gatherKernel = pme_gather_kernel<pmeOrder, true, wrapX, wrapY>;
- gatherReduceWithInputKernel = pme_gather_kernel<pmeOrder, false, wrapX, wrapY>;
+ splineAndSpreadKernel = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY>;
+ splineKernel = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY>;
+ spreadKernel = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY>;
+ gatherKernel = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY>;
+ gatherReduceWithInputKernel = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY>;
solveXYZKernel = pme_solve_kernel<GridOrdering::XYZ, false>;
solveXYZEnergyKernel = pme_solve_kernel<GridOrdering::XYZ, true>;
solveYZXKernel = pme_solve_kernel<GridOrdering::YZX, false>;
// TODO: template on transferKind to avoid runtime conditionals
int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes,
- GpuApiCallBehavior transferKind, cudaStream_t s = 0)
+ GpuApiCallBehavior transferKind, cudaStream_t s = nullptr)
{
cudaError_t stat;
- if (h_dest == NULL || d_src == NULL || bytes == 0)
+ if (h_dest == nullptr || d_src == nullptr || bytes == 0)
{
return -1;
}
/*!
* The copy is launched in stream s or, if not specified, in stream 0.
*/
-int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
+int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = nullptr)
{
return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Async, s);
}
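// A minimal sketch of the TODO above cu_copy_D2H() (templating on
// transferKind to avoid runtime conditionals); the helper name
// cu_copy_D2H_t is hypothetical, and the compiler can fold the branch on
// the compile-time template parameter:
//
//   template <GpuApiCallBehavior transferKind>
//   int cu_copy_D2H_t(void *h_dest, void *d_src, size_t bytes, cudaStream_t s = nullptr)
//   {
//       if (h_dest == nullptr || d_src == nullptr || bytes == 0)
//       {
//           return -1;
//       }
//       cudaError_t stat = (transferKind == GpuApiCallBehavior::Async) ?
//           cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s) :
//           cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
//       CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
//       return 0;
//   }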
// TODO: template on transferKind to avoid runtime conditionals
int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes,
- GpuApiCallBehavior transferKind, cudaStream_t s = 0)
+ GpuApiCallBehavior transferKind, cudaStream_t s = nullptr)
{
cudaError_t stat;
- if (d_dest == NULL || h_src == NULL || bytes == 0)
+ if (d_dest == nullptr || h_src == nullptr || bytes == 0)
{
return -1;
}
/*!
* The copy is launched in stream s or, if not specified, in stream 0.
*/
-int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
+int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = nullptr)
{
return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
}
memset(&td, 0, sizeof(td));
td.readMode = cudaReadModeElementType;
- stat = cudaCreateTextureObject(&texObj, &rd, &td, NULL);
+ stat = cudaCreateTextureObject(&texObj, &rd, &td, nullptr);
CU_RET_ERR(stat, "cudaCreateTextureObject failed");
}
*
* The copy is launched in stream s or, if not specified, in stream 0.
*/
-int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t s /*= 0*/);
+int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
/*! Launches synchronous device to host memory copy in stream 0. */
int cu_copy_D2H_sync(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
/*! Launches asynchronous device to host memory copy in stream s. */
-int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
/*! Launches synchronous or asynchronous host to device memory copy.
*
* The copy is launched in stream s or, if not specified, in stream 0.
*/
-int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = 0*/);
+int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
/*! Launches synchronous host to device memory copy. */
int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
/*! Launches asynchronous host to device memory copy in stream s. */
-int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
// TODO: the two functions below are pretty much a constructor/destructor of a simple
// GPU table object. There is also an almost self-contained fetchFromParamLookupTable()
int numElem,
const gmx_device_info_t *devInfo);
+// Add extern declarations so each translation unit knows that
+// a definition will be provided elsewhere.
+extern template void initParamLookupTable<int>(int * &, cudaTextureObject_t &, const int *, int, const gmx_device_info_t *);
+extern template void initParamLookupTable<float>(float * &, cudaTextureObject_t &, const float *, int, const gmx_device_info_t *);
+
/*! \brief Destroy parameter lookup table.
*
* Unbinds texture object, deallocates device memory.
cudaTextureObject_t texObj,
const gmx_device_info_t *devInfo);
+// Add extern declarations so each translation unit knows that
+// a definition will be provided elsewhere.
+extern template void destroyParamLookupTable<int>(int *, cudaTextureObject_t, const gmx_device_info_t *);
+extern template void destroyParamLookupTable<float>(float *, cudaTextureObject_t, const gmx_device_info_t *);
+
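+// A minimal sketch of the TODO above: wrapping initParamLookupTable()/
+// destroyParamLookupTable() in a small RAII table object (illustrative
+// only; the class name and members are hypothetical):
+//
+//   template <typename T>
+//   class GpuParamLookupTable
+//   {
+//       public:
+//           GpuParamLookupTable(const T *hostData, int numElem, const gmx_device_info_t *devInfo)
+//               : devInfo_(devInfo)
+//           {
+//               initParamLookupTable(deviceData_, texObj_, hostData, numElem, devInfo);
+//           }
+//           ~GpuParamLookupTable()
+//           {
+//               destroyParamLookupTable(deviceData_, texObj_, devInfo_);
+//           }
+//       private:
+//           T                       *deviceData_ = nullptr;
+//           cudaTextureObject_t      texObj_     = 0;
+//           const gmx_device_info_t *devInfo_;
+//   };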
/*! \brief Add a triplet stored in a float3 to an rvec variable.
*
* \param[out] a Rvec to increment
*/
static int cuda_max_device_count = 32;
-static bool cudaProfilerRun = ((getenv("NVPROF_ID") != NULL));
+static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr));
/** Dummy kernel used for sanity checking. */
static __global__ void k_dummy_test(void)
void findGpus(gmx_gpu_info_t *gpu_info)
{
- int i, ndev, checkres;
- cudaError_t stat;
- cudaDeviceProp prop;
- gmx_device_info_t *devs;
-
assert(gpu_info);
gpu_info->n_dev_compatible = 0;
- ndev = 0;
- devs = NULL;
-
- stat = cudaGetDeviceCount(&ndev);
+ int ndev;
+ cudaError_t stat = cudaGetDeviceCount(&ndev);
if (stat != cudaSuccess)
{
GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
// We expect to start device support/sanity checks with a clean runtime error state
gmx::ensureNoPendingCudaError("");
+ gmx_device_info_t *devs;
snew(devs, ndev);
- for (i = 0; i < ndev; i++)
+ for (int i = 0; i < ndev; i++)
{
- checkres = is_gmx_supported_gpu_id(i, &prop);
+ cudaDeviceProp prop;
+ int checkres = is_gmx_supported_gpu_id(i, &prop);
devs[i].id = i;
devs[i].prop = prop;
}
else
{
- *nb_alloc = NULL;
- *nb_free = NULL;
+ *nb_alloc = nullptr;
+ *nb_free = nullptr;
}
}
cu_nbparam_t *nbp,
const gmx_device_info_t *dev_info)
{
- if (nbp->coulomb_tab != NULL)
+ if (nbp->coulomb_tab != nullptr)
{
nbnxn_cuda_free_nbparam_table(nbp, dev_info);
}
stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
- /* initialize to NULL poiters to data that is not allocated here and will
+ /* initialize to nullptr pointers to data that is not allocated here and will
need reallocation in nbnxn_cuda_init_atomdata */
- ad->xq = NULL;
- ad->f = NULL;
+ ad->xq = nullptr;
+ ad->f = nullptr;
/* size -1 indicates that the respective array hasn't been initialized yet */
ad->natoms = -1;
/* Benchmarking/development environment variables to force the use of
analytical or tabulated Ewald kernel. */
- bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
- bForceTabulatedEwald = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
+ bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != nullptr);
+ bForceTabulatedEwald = (getenv("GMX_CUDA_NB_TAB_EWALD") != nullptr);
if (bForceAnalyticalEwald && bForceTabulatedEwald)
{
/* Use twin cut-off kernels if requested by bTwinCut or the env. var.
forces it (use it for debugging/benchmarking only). */
- if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
+ if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == nullptr))
{
kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
}
break;
default:
gmx_incons("The requested LJ combination rule is not implemented in the CUDA GPU accelerated kernels!");
- break;
}
break;
case eintmodFORCESWITCH:
break;
default:
gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
- break;
}
}
else if (ic->vdwtype == evdwPME)
}
/* generate table for PME */
- nbp->coulomb_tab = NULL;
+ nbp->coulomb_tab = nullptr;
if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
{
init_ewald_coulomb_force_table(ic, nbp, dev_info);
/*! Initializes the pair list data structure. */
static void init_plist(cu_plist_t *pl)
{
- /* initialize to NULL pointers to data that is not allocated here and will
+ /* initialize to nullptr pointers to data that is not allocated here and will
need reallocation in nbnxn_gpu_init_pairlist */
- pl->sci = NULL;
- pl->cj4 = NULL;
- pl->imask = NULL;
- pl->excl = NULL;
+ pl->sci = nullptr;
+ pl->cj4 = nullptr;
+ pl->imask = nullptr;
+ pl->excl = nullptr;
/* size -1 indicates that the respective array hasn't been initialized yet */
pl->na_c = -1;
cudaError_t stat;
gmx_nbnxn_cuda_t *nb;
- if (p_nb == NULL)
+ if (p_nb == nullptr)
{
return;
}
* case will be a single value.
*/
int highest_priority;
- stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
+ stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority);
CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
* This is the main reason why they are disabled by default.
*/
// TODO: Consider turning on by default when we can detect nr of streams.
- nb->bDoTime = (getenv("GMX_ENABLE_GPU_TIMING") != NULL);
+ nb->bDoTime = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
if (nb->bDoTime)
{
cu_atomdata_t *atdat;
cu_nbparam_t *nbparam;
- if (nb == NULL)
+ if (nb == nullptr)
{
return;
}
/* Free nbst */
pfree(nb->nbst.e_lj);
- nb->nbst.e_lj = NULL;
+ nb->nbst.e_lj = nullptr;
pfree(nb->nbst.e_el);
- nb->nbst.e_el = NULL;
+ nb->nbst.e_el = nullptr;
pfree(nb->nbst.fshift);
- nb->nbst.fshift = NULL;
+ nb->nbst.fshift = nullptr;
sfree(atdat);
sfree(nbparam);
int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
{
- return nb != NULL ?
+ return nb != nullptr ?
gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
int part)
#ifdef FUNCTION_DECLARATION_ONLY
; /* Only do function declaration, omit the function body. */
+
+// Add extern declarations so each translation unit knows that
+// a definition will be provided elsewhere.
+extern template
+__global__ void
+nbnxn_kernel_prune_cuda<true>(const cu_atomdata_t, const cu_nbparam_t,
+ const cu_plist_t, int, int);
+extern template
+__global__ void
+nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const cu_nbparam_t,
+ const cu_plist_t, int, int);
#else
{