function properly with this option on, it is solely for the
simplicity of stepping through a kernel and seeing what is happening.
+``GMX_OCL_DISABLE_I_PREFETCH``
+ Disables i-atom data (type or LJ parameter) prefetch, allowing
+ testing.
+
+``GMX_OCL_ENABLE_I_PREFETCH``
+ Enables i-atom data (type or LJ parameter) prefetch, allowing
+ testing on platforms where this behavior is not default.
+
``GMX_OCL_NB_ANA_EWALD``
Forces the use of analytical Ewald kernels. Equivalent of
CUDA environment variable ``GMX_CUDA_NB_ANA_EWALD``
/*! \brief Calculates the amount of shared memory required by the OpenCL kernel in use.
*/
-static inline int calc_shmem_required()
+static inline int calc_shmem_required(bool bPrefetchLjParam)
{
int shmem;
shmem = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
/* cj in shared memory, for both warps separately */
shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int); /* cjs */
-#ifdef IATYPE_SHMEM
- /* FIXME: this should not be compile-time decided but rather at runtime.
- * This issue propagated from the CUDA code where due to the source to source
- * compilation there was confusion the way to set up arch-dependent launch parameters.
- * Here too this should be converted to a hardware/arch/generation dependent
- * conditional when re-evaluating the need for i atom type preloading.
- */
- /* i-atom types in shared memory */
- #pragma error "Should not be defined"
- shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
-#endif
+ if (bPrefetchLjParam)
+ {
+ /* i-atom types in shared memory */
+ shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
+ }
/* force reduction buffers in shared memory */
shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
/* Warp vote. In fact it must be * number of warps in block.. */
validate_global_work_size(global_work_size, 3, nb->dev_info);
- shmem = calc_shmem_required();
+ shmem = calc_shmem_required(nb->bPrefetchLjParam);
#ifdef DEBUG_OCL
{
nbnxn_ocl_init_const(nb, ic, nbv_grp);
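+ /* Enable manual LJ parameter prefetch on AMD devices, or on any device when the
+ * GMX_OCL_ENABLE_I_PREFETCH env. var. is set; setting GMX_OCL_DISABLE_I_PREFETCH
+ * turns it off in all cases.
+ * TODO: decide about NVIDIA
+ */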
+ nb->bPrefetchLjParam =
+ (getenv("GMX_OCL_DISABLE_I_PREFETCH") == NULL) &&
+ ((nb->dev_info->vendor_e == OCL_VENDOR_AMD) || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != NULL));
+
/* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
* but sadly this is not supported in OpenCL (yet?). Consider adding it if
* it becomes supported.
* in the JIT compilation that happens at runtime.
*/
sprintf(runtime_consts,
- "-DCENTRAL=%d -DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=%d -DNBNXN_GPU_CLUSTER_SIZE=%d -DNBNXN_GPU_JGROUP_SIZE=%d -DNBNXN_AVOID_SING_R2_INC=%s",
+ "-DCENTRAL=%d -DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=%d -DNBNXN_GPU_CLUSTER_SIZE=%d -DNBNXN_GPU_JGROUP_SIZE=%d -DNBNXN_AVOID_SING_R2_INC=%s %s",
CENTRAL, /* Defined in ishift.h */
c_nbnxnGpuNumClusterPerSupercluster, /* Defined in nbnxn_pairlist.h */
c_nbnxnGpuClusterSize, /* Defined in nbnxn_pairlist.h */
STRINGIFY_MACRO(NBNXN_AVOID_SING_R2_INC) /* Defined in nbnxn_consts.h */
/* NBNXN_AVOID_SING_R2_INC passed as string to avoid
floating point representation problems with sprintf */
+ , (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : ""
);
/* Need to catch std::bad_alloc here and during compilation string
__local int *cjs = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
#define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
/* shmem buffer for i atom-type pre-loading */
__local int *atib = (__local int *)(LOCAL_OFFSET);
#undef LOCAL_OFFSET
xqbuf.w *= nbparam->epsfac;
xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
-#ifdef IATYPE_SHMEM //NOTE: Should not be defined. Re-evaluate the effect of preloading at a suitable time.
+#ifdef IATYPE_SHMEM
/* Pre-load the i-atom types into shared memory */
atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
#endif
{
/* load the rest of the i-atom parameters */
qi = xqbuf.w;
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
typei = atib[i * CL_SIZE + tidxi];
#else
typei = atom_types[ai];
__local int *cjs = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
#define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
/* shmem buffer for i atom-type pre-loading */
__local int *atib = (__local int *)(LOCAL_OFFSET);
#undef LOCAL_OFFSET
xqbuf.w *= nbparam->epsfac;
xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
-#ifdef IATYPE_SHMEM //NOTE: Should not be defined. Re-evaluate the effect of preloading at a suitable time.
+#ifdef IATYPE_SHMEM
/* Pre-load the i-atom types into shared memory */
atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
#endif
{
/* load the rest of the i-atom parameters */
qi = xqbuf.w;
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
typei = atib[i * CL_SIZE + tidxi];
#else
typei = atom_types[ai];
__local int *cjs = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
#define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
/* shmem buffer for i atom-type pre-loading */
__local int *atib = (__local int *)(LOCAL_OFFSET);
#undef LOCAL_OFFSET
xqbuf.w *= nbparam->epsfac;
xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
-#ifdef IATYPE_SHMEM //NOTE: Should not be defined. Used with CUDA > 3.0 Re-evaluate the effect of preloading at a suitable time.
+#ifdef IATYPE_SHMEM
/* Pre-load the i-atom types into shared memory */
atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
#endif
{
/* load the rest of the i-atom parameters */
qi = xqbuf.w;
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
typei = atib[i * CL_SIZE + tidxi];
#else
typei = atom_types[ai];
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
cl_kernel kernel_ener_prune_ptr[eelOclNR][evdwOclNR];
///@}
+ bool bPrefetchLjParam; /**< true if prefetching fg i-atom LJ parameters should be used in the kernels */
+
/**< auxiliary kernels implementing memset-like functions */
///@{
cl_kernel kernel_memset_f;