/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <stdlib.h>
#if defined(_MSVC)
-#include <limits>
+# include <limits>
#endif
-#include "thread_mpi/atomic.h"
-
+#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/gputraits_ocl.h"
#include "gromacs/gpu_utils/oclutils.h"
+#include "gromacs/hardware/device_information.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/nbnxm_gpu.h"
#include "gromacs/nbnxm/pairlist.h"
-#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
-#include "nbnxm_ocl_internal.h"
#include "nbnxm_ocl_types.h"
namespace Nbnxm
/*! \brief Convenience constants */
//@{
-static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
-static const int c_clSize = c_nbnxnGpuClusterSize;
+static constexpr int c_clSize = c_nbnxnGpuClusterSize;
//@}
/*! \brief Validates the input global work size parameter.
*/
-static inline void validate_global_work_size(const KernelLaunchConfig &config, int work_dim, const gmx_device_info_t *dinfo)
+static inline void validate_global_work_size(const KernelLaunchConfig& config,
+ int work_dim,
+ const DeviceInformation* dinfo)
{
cl_uint device_size_t_size_bits;
cl_uint host_size_t_size_bits;
- assert(dinfo);
+ GMX_ASSERT(dinfo, "Need a valid device info object");
size_t global_work_size[3];
GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet");
{
if (global_work_size[i] > device_limit)
{
- gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
- "The number of nonbonded work units (=number of super-clusters) exceeds the"
- "device capabilities. Global work size limit exceeded (%zu > %zu)!",
- global_work_size[i], device_limit);
+ gmx_fatal(
+ FARGS,
+ "Watch out, the input system is too large to simulate!\n"
+ "The number of nonbonded work units (=number of super-clusters) exceeds the"
+ "device capabilities. Global work size limit exceeded (%zu > %zu)!",
+ global_work_size[i],
+ device_limit);
}
}
}
*/
/*! \brief Force-only kernel function names. */
-static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
-{
- { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
- { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
- { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
- { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
- { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
- { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
+static const char* nb_kfunc_noener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
+ { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
};
/*! \brief Force + energy kernel function names. */
-static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
-{
- { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
- { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
- { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
- { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
- { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
- { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
+static const char* nb_kfunc_ener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
+ { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
+ { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
+ { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
+ { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
+ { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
+ { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
};
/*! \brief Force + pruning kernel function names. */
-static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
-{
- { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
- { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
- { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
- { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
- { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
- { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
+static const char* nb_kfunc_noener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
+ { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
+ { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
+ { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
+ { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
+ { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
+ { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
};
/*! \brief Force + energy + pruning kernel function names. */
-static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
-{
- { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
- { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
- { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
- { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
- { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
- { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
+static const char* nb_kfunc_ener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
+ { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
+ { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
+ { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
+ { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
+ { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
+ { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",
+ "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
};
/*! \brief Return a pointer to the prune kernel version to be executed at the current invocation.
* \param[in] kernel_pruneonly array of prune kernel objects
* \param[in] firstPrunePass true if the first pruning pass is being executed
*/
-static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[],
- bool firstPrunePass)
+static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[], bool firstPrunePass)
{
- cl_kernel *kernelPtr;
+ cl_kernel* kernelPtr;
if (firstPrunePass)
{
* OpenCL kernel objects are cached in nb. If the requested kernel is not
* found in the cache, it will be created and the cache will be updated.
*/
-static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t *nb,
- int eeltype,
- int evdwtype,
- bool bDoEne,
- bool bDoPrune)
+static inline cl_kernel
+select_nbnxn_kernel(NbnxmGpu* nb, enum ElecType elecType, enum VdwType vdwType, bool bDoEne, bool bDoPrune)
{
const char* kernel_name_to_run;
- cl_kernel *kernel_ptr;
+ cl_kernel* kernel_ptr;
cl_int cl_error;
- assert(eeltype < eelOclNR);
- assert(evdwtype < evdwOclNR);
+ const int elecTypeIdx = static_cast<int>(elecType);
+ const int vdwTypeIdx = static_cast<int>(vdwType);
+
+ GMX_ASSERT(elecTypeIdx < c_numElecTypes,
+ "The electrostatics type requested is not implemented in the OpenCL kernels.");
+ GMX_ASSERT(vdwTypeIdx < c_numVdwTypes,
+ "The VdW type requested is not implemented in the OpenCL kernels.");
if (bDoEne)
{
if (bDoPrune)
{
- kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
- kernel_ptr = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
+ kernel_name_to_run = nb_kfunc_ener_prune_ptr[elecTypeIdx][vdwTypeIdx];
+ kernel_ptr = &(nb->kernel_ener_prune_ptr[elecTypeIdx][vdwTypeIdx]);
}
else
{
- kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
- kernel_ptr = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
+ kernel_name_to_run = nb_kfunc_ener_noprune_ptr[elecTypeIdx][vdwTypeIdx];
+ kernel_ptr = &(nb->kernel_ener_noprune_ptr[elecTypeIdx][vdwTypeIdx]);
}
}
else
{
if (bDoPrune)
{
- kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
- kernel_ptr = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
+ kernel_name_to_run = nb_kfunc_noener_prune_ptr[elecTypeIdx][vdwTypeIdx];
+ kernel_ptr = &(nb->kernel_noener_prune_ptr[elecTypeIdx][vdwTypeIdx]);
}
else
{
- kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
- kernel_ptr = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
+ kernel_name_to_run = nb_kfunc_noener_noprune_ptr[elecTypeIdx][vdwTypeIdx];
+ kernel_ptr = &(nb->kernel_noener_noprune_ptr[elecTypeIdx][vdwTypeIdx]);
}
}
if (nullptr == kernel_ptr[0])
{
*kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
- assert(cl_error == CL_SUCCESS);
+ GMX_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateKernel failed: " + ocl_get_error_string(cl_error)
+ + " for kernel named " + kernel_name_to_run)
+ .c_str());
}
- // TODO: handle errors
return *kernel_ptr;
}
/*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
*/
-static inline int calc_shmem_required_nonbonded(int vdwType,
- bool bPrefetchLjParam)
+static inline int calc_shmem_required_nonbonded(enum VdwType vdwType, bool bPrefetchLjParam)
{
int shmem;
/* size of shmem (force-buffers/xq/atom type preloading) */
/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
- shmem = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
+ shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4; /* xqib */
/* cj in shared memory, for both warps separately
* TODO: in the "nowarp" kernels we load cj only once, so the factor 2 is not needed.
*/
- shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int); /* cjs */
+ shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int); /* cjs */
if (bPrefetchLjParam)
{
if (useLjCombRule(vdwType))
{
/* i-atom LJ combination parameters in shared memory */
- shmem += c_numClPerSupercl * c_clSize * 2*sizeof(float); /* atib abused for ljcp, float2 */
+ shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * 2
+ * sizeof(float); /* atib abused for ljcp, float2 */
}
else
{
/* i-atom types in shared memory */
- shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
+ shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int); /* atib */
}
}
/* force reduction buffers in shared memory */
- shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
+ shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
/* Warp vote. In fact it must be * number of warps in block.. */
- shmem += sizeof(cl_uint) * 2; /* warp_any */
+ shmem += sizeof(cl_uint) * 2; /* warp_any */
return shmem;
}
*
* This function is called before the launch of both nbnxn and prune kernels.
*/
-static void fillin_ocl_structures(cl_nbparam_t *nbp,
- cl_nbparam_params_t *nbparams_params)
+static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams_params)
{
nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
nbparams_params->c_rf = nbp->c_rf;
nbparams_params->dispersion_shift = nbp->dispersion_shift;
- nbparams_params->eeltype = nbp->eeltype;
+ nbparams_params->elecType = nbp->elecType;
nbparams_params->epsfac = nbp->epsfac;
nbparams_params->ewaldcoeff_lj = nbp->ewaldcoeff_lj;
nbparams_params->ewald_beta = nbp->ewald_beta;
nbparams_params->sh_ewald = nbp->sh_ewald;
nbparams_params->sh_lj_ewald = nbp->sh_lj_ewald;
nbparams_params->two_k_rf = nbp->two_k_rf;
- nbparams_params->vdwtype = nbp->vdwtype;
+ nbparams_params->vdwType = nbp->vdwType;
nbparams_params->vdw_switch = nbp->vdw_switch;
}
-/*! \brief Enqueues a wait for event completion.
- *
- * Then it releases the event and sets it to 0.
- * Don't use this function when more than one wait will be issued for the event.
- * Equivalent to Cuda Stream Sync. */
-static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
-{
- cl_int gmx_unused cl_error;
-
- /* Enqueue wait */
- cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
- GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
-
- /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
- cl_error = clReleaseEvent(*ocl_event);
- assert(CL_SUCCESS == cl_error);
- *ocl_event = nullptr;
-}
-
-/*! \brief Launch asynchronously the xq buffer host to device copy. */
-void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb,
- const nbnxn_atomdata_t *nbatom,
- const AtomLocality atomLocality)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-
- /* local/nonlocal offset and length used for xq and f */
- int adat_begin, adat_len;
-
- cl_atomdata_t *adat = nb->atdat;
- cl_plist_t *plist = nb->plist[iloc];
- cl_timers_t *t = nb->timers;
- cl_command_queue stream = nb->stream[iloc];
-
- bool bDoTime = (nb->bDoTime) != 0;
-
- /* Don't launch the non-local H2D copy if there is no dependent
- work to do: neither non-local nor other (e.g. bonded) work
- to do that has as input the nbnxn coordinates.
- Doing the same for the local kernel is more complicated, since the
- local part of the force array also depends on the non-local kernel.
- So to avoid complicating the code and to reduce the risk of bugs,
- we always call the local local x+q copy (and the rest of the local
- work in nbnxn_gpu_launch_kernel().
- */
- if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
- {
- plist->haveFreshList = false;
-
- return;
- }
-
- /* calculate the atom data index range based on locality */
- if (atomLocality == AtomLocality::Local)
- {
- adat_begin = 0;
- adat_len = adat->natoms_local;
- }
- else
- {
- adat_begin = adat->natoms_local;
- adat_len = adat->natoms - adat->natoms_local;
- }
-
- /* beginning of timed HtoD section */
- if (bDoTime)
- {
- t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
- }
-
- /* HtoD x, q */
- ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin*sizeof(float)*4,
- adat_len * sizeof(float) * 4, stream, bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
-
- if (bDoTime)
- {
- t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
- }
-
- /* When we get here all misc operations issues in the local stream as well as
- the local xq H2D are done,
- so we record that in the local stream and wait for it in the nonlocal one. */
- if (nb->bUseTwoStreams)
- {
- if (iloc == InteractionLocality::Local)
- {
- cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
- assert(CL_SUCCESS == cl_error);
-
- /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
- * in the local stream in order to be able to sync with the above event
- * from the non-local stream.
- */
- cl_error = clFlush(stream);
- assert(CL_SUCCESS == cl_error);
- }
- else
- {
- sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
- }
- }
-}
-
-
/*! \brief Launch GPU kernel
As we execute nonbonded workload in separate queues, before launching
misc_ops_done event to record the point in time when the above operations
are finished and synchronize with this event in the non-local stream.
*/
-void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb,
- const gmx::StepWorkload &stepWork,
- const Nbnxm::InteractionLocality iloc)
+void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
{
- cl_atomdata_t *adat = nb->atdat;
- cl_nbparam_t *nbp = nb->nbparam;
- cl_plist_t *plist = nb->plist[iloc];
- cl_timers_t *t = nb->timers;
- cl_command_queue stream = nb->stream[iloc];
+ NBAtomDataGpu* adat = nb->atdat;
+ NBParamGpu* nbp = nb->nbparam;
+ gpu_plist* plist = nb->plist[iloc];
+ Nbnxm::GpuTimers* timers = nb->timers;
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
- bool bDoTime = (nb->bDoTime) != 0;
+ bool bDoTime = nb->bDoTime;
- cl_nbparam_params_t nbparams_params;
+ cl_nbparam_params_t nbparams_params;
/* Don't launch the non-local kernel if there is no work to do.
Doing the same for the local kernel is more complicated, since the
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
- t->interaction[iloc].nb_k.openTimingRegion(stream);
+ timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
}
/* kernel launch config */
KernelLaunchConfig config;
- config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
- config.stream = stream;
+ config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwType, nb->bPrefetchLjParam);
config.blockSize[0] = c_clSize;
config.blockSize[1] = c_clSize;
config.gridSize[0] = plist->nsci;
- validate_global_work_size(config, 3, nb->dev_info);
+ validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
if (debug)
{
- fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
+ fprintf(debug,
+ "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
"Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
- config.blockSize[0], config.blockSize[1], config.blockSize[2],
- config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl,
- c_numClPerSupercl, plist->na_c);
+ config.blockSize[0],
+ config.blockSize[1],
+ config.blockSize[2],
+ config.blockSize[0] * config.gridSize[0],
+ config.blockSize[1] * config.gridSize[1],
+ plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuNumClusterPerSupercluster,
+ plist->na_c);
}
fillin_ocl_structures(nbp, &nbparams_params);
- auto *timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+ auto* timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
constexpr char kernelName[] = "k_calc_nb";
- const auto kernel = select_nbnxn_kernel(nb,
- nbp->eeltype,
- nbp->vdwtype,
- stepWork.computeEnergy,
- (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
+ const auto kernel =
+ select_nbnxn_kernel(nb,
+ nbp->elecType,
+ nbp->vdwType,
+ stepWork.computeEnergy,
+ (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
// The OpenCL kernel takes int as second to last argument because bool is
// not supported as a kernel argument type (sizeof(bool) is implementation defined).
const int computeFshift = static_cast<int>(stepWork.computeVirial);
- if (useLjCombRule(nb->nbparam->vdwtype))
- {
- const auto kernelArgs = prepareGpuKernelArguments(kernel, config,
- &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
- &adat->lj_comb,
- &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
- &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
-
- launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+ if (useLjCombRule(nb->nbparam->vdwType))
+ {
+ const auto kernelArgs = prepareGpuKernelArguments(kernel,
+ config,
+ &nbparams_params,
+ &adat->xq,
+ &adat->f,
+ &adat->eLJ,
+ &adat->eElec,
+ &adat->fShift,
+ &adat->ljComb,
+ &adat->shiftVec,
+ &nbp->nbfp,
+ &nbp->nbfp_comb,
+ &nbp->coulomb_tab,
+ &plist->sci,
+ &plist->cj4,
+ &plist->excl,
+ &computeFshift);
+
+ launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
}
else
{
- const auto kernelArgs = prepareGpuKernelArguments(kernel, config,
- &adat->ntypes,
- &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
- &adat->atom_types,
- &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
- &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
- launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+ const auto kernelArgs = prepareGpuKernelArguments(kernel,
+ config,
+ &adat->numTypes,
+ &nbparams_params,
+ &adat->xq,
+ &adat->f,
+ &adat->eLJ,
+ &adat->eElec,
+ &adat->fShift,
+ &adat->atomTypes,
+ &adat->shiftVec,
+ &nbp->nbfp,
+ &nbp->nbfp_comb,
+ &nbp->coulomb_tab,
+ &plist->sci,
+ &plist->cj4,
+ &plist->excl,
+ &computeFshift);
+ launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
}
if (bDoTime)
{
- t->interaction[iloc].nb_k.closeTimingRegion(stream);
+ timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
}
}
* Note that for the sake of simplicity we use the CUDA terminology "shared memory"
* for OpenCL local memory.
*
- * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd dimension.
+ * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd
+ * dimension.
* \returns the amount of local memory in bytes required by the pruning kernel
*/
static inline int calc_shmem_required_prune(const int num_threads_z)
int shmem;
/* i-atom x in shared memory (for convenience we load all 4 components including q) */
/* NOTE(review): the sizeof() terms below have type size_t and are accumulated
 * into an int; this assumes the total always fits in int -- confirm.
 */
- shmem = c_numClPerSupercl * c_clSize * sizeof(float)*4;
+ shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4;
/* cj in shared memory, for each warp separately
* Note: only need to load once per wavefront, but to keep the code simple,
* for now we load twice on AMD.
*/
shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
/* Warp vote, requires one uint per warp/32 threads per block. */
/* NOTE(review): the line below reserves 2 * num_threads_z cl_uint values, while
 * the comment above speaks of one uint per warp; presumably this accounts for
 * two warps per z-slice -- confirm against the pruning kernel.
 */
- shmem += sizeof(cl_uint) * 2*num_threads_z;
+ shmem += sizeof(cl_uint) * 2 * num_threads_z;
return shmem;
}
-void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
- const InteractionLocality iloc,
- const int numParts)
+/*! \brief
+ * Launch the pairlist prune only kernel for the given locality.
+ * \p numParts tells in how many parts, i.e. calls the list will be pruned.
+ */
+void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
- cl_atomdata_t *adat = nb->atdat;
- cl_nbparam_t *nbp = nb->nbparam;
- cl_plist_t *plist = nb->plist[iloc];
- cl_timers_t *t = nb->timers;
- cl_command_queue stream = nb->stream[iloc];
- bool bDoTime = nb->bDoTime == CL_TRUE;
+ NBAtomDataGpu* adat = nb->atdat;
+ NBParamGpu* nbp = nb->nbparam;
+ gpu_plist* plist = nb->plist[iloc];
+ Nbnxm::GpuTimers* timers = nb->timers;
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+ bool bDoTime = nb->bDoTime;
if (plist->haveFreshList)
{
}
else
{
- GMX_ASSERT(numParts == plist->rollingPruningNumParts, "It is not allowed to change numParts in between list generation steps");
+ GMX_ASSERT(numParts == plist->rollingPruningNumParts,
+ "It is not allowed to change numParts in between list generation steps");
}
}
}
/* Compute the number of list entries to prune in this pass */
- int numSciInPart = (plist->nsci - part)/numParts;
+ int numSciInPart = (plist->nsci - part) / numParts;
/* Don't launch the kernel if there is no work to do. */
if (numSciInPart <= 0)
return;
}
- GpuRegionTimer *timer = nullptr;
+ GpuRegionTimer* timer = nullptr;
/* When timing is enabled, a fresh list is timed with prune_k and any later
 * (rolling) pruning pass with rollingPrune_k; otherwise timer stays nullptr.
 */
if (bDoTime)
{
- timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+ timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
+ : timers->interaction[iloc].rollingPrune_k);
}
/* beginning of timed prune calculation section */
if (bDoTime)
{
- timer->openTimingRegion(stream);
+ timer->openTimingRegion(deviceStream);
}
/* Kernel launch config:
* and j-cluster concurrency, in x, y, and z, respectively.
* - The 1D block-grid contains as many blocks as super-clusters.
*/
- int num_threads_z = getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e);
-
+ int num_threads_z = c_pruneKernelJ4Concurrency;
/* kernel launch config */
KernelLaunchConfig config;
config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
- config.stream = stream;
config.blockSize[0] = c_clSize;
config.blockSize[1] = c_clSize;
config.blockSize[2] = num_threads_z;
config.gridSize[0] = numSciInPart;
- validate_global_work_size(config, 3, nb->dev_info);
+ validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
/* NOTE(review): gridSize[1] is printed below but is not assigned in this
 * function's launch configuration; presumably it keeps the default value of
 * KernelLaunchConfig -- confirm. The three %d arguments are
 * nsci * clusters-per-supercluster, clusters-per-supercluster and na_c;
 * check that this matches the "#Super-clusters/clusters" label.
 */
if (debug)
{
- fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
+ fprintf(debug,
+ "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
"\tGlobal work size: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
"\tShMem: %zu\n",
- config.blockSize[0], config.blockSize[1], config.blockSize[2],
- config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl,
- c_numClPerSupercl, plist->na_c, config.sharedMemorySize);
- }
-
- cl_nbparam_params_t nbparams_params;
+ config.blockSize[0],
+ config.blockSize[1],
+ config.blockSize[2],
+ config.blockSize[0] * config.gridSize[0],
+ config.blockSize[1] * config.gridSize[1],
+ plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuNumClusterPerSupercluster,
+ plist->na_c,
+ config.sharedMemorySize);
+ }
+
+ cl_nbparam_params_t nbparams_params;
fillin_ocl_structures(nbp, &nbparams_params);
/* A timing event is only fetched when timing is enabled; otherwise nullptr is
 * handed to the kernel launch.
 */
- auto *timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr;
+ auto* timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr;
constexpr char kernelName[] = "k_pruneonly";
const auto pruneKernel = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
- const auto kernelArgs = prepareGpuKernelArguments(pruneKernel, config,
- &nbparams_params, &adat->xq, &adat->shift_vec,
- &plist->sci, &plist->cj4, &plist->imask, &numParts, &part);
- launchGpuKernel(pruneKernel, config, timingEvent, kernelName, kernelArgs);
+ const auto kernelArgs = prepareGpuKernelArguments(pruneKernel,
+ config,
+ &nbparams_params,
+ &adat->xq,
+ &adat->shiftVec,
+ &plist->sci,
+ &plist->cj4,
+ &plist->imask,
+ &numParts,
+ &part);
+ launchGpuKernel(pruneKernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
/* After the first prune of a fresh list the list is no longer treated as
 * fresh, and didPrune is set; the nonbonded kernel selection elsewhere in this
 * file reads both haveFreshList and didPrune.
 */
if (plist->haveFreshList)
{
- plist->haveFreshList = false;
+ plist->haveFreshList = false;
/* Mark that pruning has been done */
nb->timers->interaction[iloc].didPrune = true;
}
/* end of timed prune calculation section */
if (bDoTime)
{
- timer->closeTimingRegion(stream);
+ timer->closeTimingRegion(deviceStream);
}
}
-/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
- * (and energies/shift forces if required).
- */
-void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
- struct nbnxn_atomdata_t *nbatom,
- const gmx::StepWorkload &stepWork,
- const AtomLocality aloc,
- const bool gmx_unused copyBackNbForce)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- cl_int gmx_unused cl_error;
- int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
-
- /* determine interaction locality from atom locality */
- const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
-
- cl_atomdata_t *adat = nb->atdat;
- cl_timers_t *t = nb->timers;
- bool bDoTime = nb->bDoTime == CL_TRUE;
- cl_command_queue stream = nb->stream[iloc];
-
- /* don't launch non-local copy-back if there was no non-local work to do */
- if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
- {
- /* TODO An alternative way to signal that non-local work is
- complete is to use a clEnqueueMarker+clEnqueueBarrier
- pair. However, the use of bNonLocalStreamActive has the
- advantage of being local to the host, so probably minimizes
- overhead. Curiously, for NVIDIA OpenCL with an empty-domain
- test case, overall simulation performance was higher with
- the API calls, but this has not been tested on AMD OpenCL,
- so could be worth considering in future. */
- nb->bNonLocalStreamActive = CL_FALSE;
- return;
- }
-
- getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
-
- /* beginning of timed D2H section */
- if (bDoTime)
- {
- t->xf[aloc].nb_d2h.openTimingRegion(stream);
- }
-
- /* With DD the local D2H transfer can only start after the non-local
- has been launched. */
- if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
- {
- sync_ocl_event(stream, &(nb->nonlocal_done));
- }
-
- /* DtoH f */
- ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
- (adat_len)* adat->f_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-
- /* kick off work */
- cl_error = clFlush(stream);
- assert(CL_SUCCESS == cl_error);
-
- /* After the non-local D2H is launched the nonlocal_done event can be
- recorded which signals that the local D2H can proceed. This event is not
- placed after the non-local kernel because we first need the non-local
- data back first. */
- if (iloc == InteractionLocality::NonLocal)
- {
- cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
- assert(CL_SUCCESS == cl_error);
- nb->bNonLocalStreamActive = CL_TRUE;
- }
-
- /* only transfer energies in the local stream */
- if (iloc == InteractionLocality::Local)
- {
- /* DtoH fshift when virial is needed */
- if (stepWork.computeVirial)
- {
- ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
- SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
- }
-
- /* DtoH energies */
- if (stepWork.computeEnergy)
- {
- ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
- sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-
- ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
- sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
- }
- }
-
- if (bDoTime)
- {
- t->xf[aloc].nb_d2h.closeTimingRegion(stream);
- }
-}
-
-
-/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
-int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t &ic)
-{
- bool bTwinCut = (ic.rcoulomb != ic.rvdw);
- bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
- int kernel_type;
-
- /* Benchmarking/development environment variables to force the use of
- analytical or tabulated Ewald kernel. */
- bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != nullptr);
- bForceTabulatedEwald = (getenv("GMX_OCL_NB_TAB_EWALD") != nullptr);
-
- if (bForceAnalyticalEwald && bForceTabulatedEwald)
- {
- gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
- "requested through environment variables.");
- }
-
- /* OpenCL: By default, use analytical Ewald
- * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
- *
- * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
- *
- */
- /* By default use analytical Ewald. */
- bUseAnalyticalEwald = true;
- if (bForceAnalyticalEwald)
- {
- if (debug)
- {
- fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
- }
- }
- else if (bForceTabulatedEwald)
- {
- bUseAnalyticalEwald = false;
-
- if (debug)
- {
- fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
- }
- }
-
- /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
- forces it (use it for debugging/benchmarking only). */
- if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == nullptr))
- {
- kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
- }
- else
- {
- kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
- }
-
- return kernel_type;
-}
-
} // namespace Nbnxm