{
cl_int cl_error;
std::string kernelRootPath = getSourceRootPath(kernelRelativePath);
- std::string includeRootPath = getSourceRootPath("src/gromacs/gpu_utils");
+ std::string includeRootPath = getSourceRootPath("src");
GMX_RELEASE_ASSERT(fplog != nullptr, "Need a valid log file for building OpenCL programs");
#ifndef _nbnxn_consts_h
#define _nbnxn_consts_h
-
/* With CPU kernels the i-cluster size is always 4 atoms.
* With x86 SIMD the j-cluster size can be 2, 4 or 8, otherwise 4.
*/
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
+/* The number of clusters in a super-cluster, used for GPU */
+#define c_nbnxnGpuNumClusterPerSupercluster 8
+
+/* With GPU kernels we group cluster pairs in 4 to optimize memory usage
+ * of integers containing 32 bits.
+ */
+#define c_nbnxnGpuJgroupSize (32/c_nbnxnGpuNumClusterPerSupercluster)
#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef NBNXN_OPENCL_CONSTS_H
+#define NBNXN_OPENCL_CONSTS_H
+
+/*! \brief Macros defining platform-dependent defaults for the prune kernel's j4 processing concurrency.
+ *
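+ * GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY is defined unconditionally below as the
+ * default value; there is no #ifndef guard here, so a value passed on the
+ * compiler command line is not picked up as an override.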
+ */
+/*! @{ */
+#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT 4
+// The value defined below has to match the value returned by getOclPruneKernelJ4Concurrency()
+#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT
+/*! @} */
+#endif
*/
extraDefines += gmx::formatString(
- " -DCENTRAL=%d "
- "-DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=%d -DNBNXN_GPU_CLUSTER_SIZE=%d -DNBNXN_GPU_JGROUP_SIZE=%d "
- "-DGMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY=%d "
- "-DNBNXN_MIN_RSQ=%s %s",
- CENTRAL, /* Defined in ishift.h */
- c_nbnxnGpuNumClusterPerSupercluster, /* Defined in nbnxn_pairlist.h */
+ " -DNBNXN_GPU_CLUSTER_SIZE=%d "
+ "%s",
c_nbnxnGpuClusterSize, /* Defined in nbnxn_pairlist.h */
- c_nbnxnGpuJgroupSize, /* Defined in nbnxn_pairlist.h */
- getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e), /* In nbnxn_ocl_types.h */
- STRINGIFY_MACRO(NBNXN_MIN_RSQ) /* Defined in nbnxn_consts.h */
- /* NBNXN_MIN_RSQ passed as string to avoid
- floating point representation problems with sprintf */
- , (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : ""
+ (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : ""
);
try
/* shmem buffer for cj, for both warps separately */
cjs = (__local int *)(LOCAL_OFFSET);
#undef LOCAL_OFFSET
- #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+ #define LOCAL_OFFSET cjs + 2 * c_nbnxnGpuJgroupSize
#endif //USE_CJ_PREFETCH
#ifdef IATYPE_SHMEM
#if !defined PRUNE_NBL && !defined _NVIDIA_SOURCE_
#pragma unroll 4
#endif
- for (int jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+ for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL)))
{
// TODO move these consts to utils and unify their use with the nonbonded kernels
const int c_numClPerSupercl = NCL_PER_SUPERCL;
const int c_clSize = CL_SIZE;
- const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE;
// TODO pass this value at compile-time as a macro
const int c_nbnxnGpuClusterpairSplit = 2;
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#include "device_utils.clh"
-#include "vectype_ops.clh"
+#include "gromacs/gpu_utils/vectype_ops.clh"
+#include "gromacs/gpu_utils/device_utils.clh"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/pbcutil/ishift.h"
+
+#include "nbnxn_ocl_consts.h"
#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
-#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+#define NCL_PER_SUPERCL c_nbnxnGpuNumClusterPerSupercluster
#define WARP_SIZE (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
{
/* Pre-load cj into shared memory */
#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
- if (tidxj == 0 & tidxi < NBNXN_GPU_JGROUP_SIZE)
+ if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
{
sm_cjPreload[tidxi] = gm_cj[tidxi];
}
#else
const int c_clSize = CL_SIZE;
- const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE;
const int c_nbnxnGpuClusterpairSplit = 2;
const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
int warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
#else
const int c_clSize = CL_SIZE;
- const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE;
const int c_nbnxnGpuClusterpairSplit = 2;
const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/mdlib/nbnxn_gpu_types_common.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_consts.h"
#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/real.h"
/* kernel does #include "gromacs/math/utilities.h" */
//! Define 1/sqrt(pi)
#define M_FLOAT_1_SQRTPI 0.564189583547756f
-/*! \brief Macros defining platform-dependent defaults for the prune kernel's j4 processing concurrency.
- *
- * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
- */
-/*! @{ */
-#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD 4
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA 4
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT 4
-#else
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
-#endif
-/*! @} */
/*! \brief Constants for platform-dependent defaults for the prune kernel's j4 processing concurrency.
*
* Initialized from the GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT macro.
*/
/*! @{ */
-const int c_oclPruneKernelJ4ConcurrencyAMD = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_AMD;
-const int c_oclPruneKernelJ4ConcurrencyNVIDIA = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_NVIDIA;
-const int c_oclPruneKernelJ4ConcurrencyDefault = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT;
+const int c_oclPruneKernelJ4ConcurrencyDEFAULT = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY_DEFAULT;
/*! @} */
/*! \brief Returns the j4 processing concurrency parameter for the vendor \p vendorId
 *
 * After this change the switch only has a default branch, so the same value,
 * c_oclPruneKernelJ4ConcurrencyDEFAULT, is returned for every \p vendorId.
 * The vendor-specific cases and the range assertion on \p vendorId are removed.
 *
 * \param[in] vendorId  Vendor identifier; it no longer influences the result.
 * \returns   c_oclPruneKernelJ4ConcurrencyDEFAULT
*/
static inline int getOclPruneKernelJ4Concurrency(int vendorId)
{
- assert(vendorId < OCL_VENDOR_UNKNOWN);
switch (vendorId)
{
- case OCL_VENDOR_AMD: return c_oclPruneKernelJ4ConcurrencyAMD; break;
- case OCL_VENDOR_NVIDIA: return c_oclPruneKernelJ4ConcurrencyNVIDIA; break;
- default: return c_oclPruneKernelJ4ConcurrencyDefault; break;
+ default: return c_oclPruneKernelJ4ConcurrencyDEFAULT;
}
}
#include <cstddef>
#include "gromacs/math/vectypes.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdtypes/nblist.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/bitmask.h"
static constexpr int c_nbnxnGpuClusterSize = 8;
#endif
-/* The number of clusters in a super-cluster, used for GPU */
-static constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
-
-/* With GPU kernels we group cluster pairs in 4 to optimize memory usage
- * of integers containing 32 bits.
- */
-static constexpr int c_nbnxnGpuJgroupSize = 32/c_nbnxnGpuNumClusterPerSupercluster;
/* In CUDA the number of threads in a warp is 32 and we have cluster pairs
* of 8*8=64 atoms, so it's convenient to store data for cluster pair halves.
{
fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
b, c[b],
- 100.0*c[b]/static_cast<double>(nbl->ncj4*c_nbnxnGpuJgroupSize));
+ 100.0*c[b]/int{nbl->ncj4*c_nbnxnGpuJgroupSize});
}
}
}