Re-enable i-atom type local mem prefetch in OpenCL

author Szilárd Páll <pall.szilard@gmail.com>

Fri, 11 Mar 2016 18:05:47 +0000 (19:05 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 30 Mar 2016 17:02:03 +0000 (19:02 +0200)
author Szilárd Páll <pall.szilard@gmail.com>
Fri, 11 Mar 2016 18:05:47 +0000 (19:05 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 30 Mar 2016 17:02:03 +0000 (19:02 +0200)
diff --git a/docs/user-guide/environment-variables.rst b/docs/user-guide/environment-variables.rst

index f1bf98fe306e43d62338320a6b1c4a313322be5b..01a62e55936f0321311c69205299f5fbfa99508b 100644 (file)
--- a/docs/user-guide/environment-variables.rst
+++ b/docs/user-guide/environment-variables.rst
@@ -416,6 +416,14 @@ compilation of OpenCL kernels, but they are also used in device selection.
          function properly with this option on, it is solely for the
          simplicity of stepping in a kernel and see what is happening.
  
+``GMX_OCL_DISABLE_I_PREFETCH``
+        Disables i-atom data (type or LJ parameter) prefetch allowig
+        testing.
+
+``GMX_OCL_ENABLE_I_PREFETCH``
+        Enables i-atom data (type or LJ parameter) prefetch allowig
+        testing on platforms where this behavior is not default.
+
  ``GMX_OCL_NB_ANA_EWALD``
          Forces the use of analytical Ewald kernels. Equivalent of
          CUDA environment variable ``GMX_CUDA_NB_ANA_EWALD``
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp

index 6cfd65796807ed271408ec910db146568d249a82..0fa46ed5b55c66ee31f76ecc8a0ec62975d559e8 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
@@ -245,7 +245,7 @@ static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t   *nb,
  
  /*! \brief Calculates the amount of shared memory required by the OpenCL kernel in use.
   */
-static inline int calc_shmem_required()
+static inline int calc_shmem_required(bool bPrefetchLjParam)
  {
      int shmem;
  
@@ -255,17 +255,11 @@ static inline int calc_shmem_required()
      shmem  = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
      /* cj in shared memory, for both warps separately */
      shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int);           /* cjs  */
-#ifdef IATYPE_SHMEM
-    /* FIXME: this should not be compile-time decided but rather at runtime.
-     * This issue propagated from the CUDA code where due to the source to source
-     * compilation there was confusion the way to set up arch-dependent launch parameters.
-     * Here too this should be converted to a hardware/arch/generation dependent
-     * conditional when re-evaluating the need for i atom type preloading.
-     */
-    /* i-atom types in shared memory */
-    #pragma error "Should not be defined"
-    shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
-#endif
+    if (bPrefetchLjParam)
+    {
+        /* i-atom types in shared memory */
+        shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
+    }
      /* force reduction buffers in shared memory */
      shmem += c_clSize * c_clSize * 3 * sizeof(float);    /* f_buf */
      /* Warp vote. In fact it must be * number of warps in block.. */
@@ -517,7 +511,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
  
      validate_global_work_size(global_work_size, 3, nb->dev_info);
  
-    shmem     = calc_shmem_required();
+    shmem     = calc_shmem_required(nb->bPrefetchLjParam);
  
  #ifdef DEBUG_OCL
      {
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp

index 6011046e7d8640011eaab704f56ff4f1316636b2..f74d574aa59c0a0c7fc936e28d8068d235c51a1b 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
@@ -753,6 +753,13 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
  
      nbnxn_ocl_init_const(nb, ic, nbv_grp);
  
+    /* Enable LJ param manual prefetch for AMD or if we request through env. var.
+     * TODO: decide about NVIDIA
+     */
+    nb->bPrefetchLjParam =
+        (getenv("GMX_OCL_DISABLE_I_PREFETCH") == NULL) &&
+        ((nb->dev_info->vendor_e == OCL_VENDOR_AMD) || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != NULL));
+
      /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
       * but sadly this is not supported in OpenCL (yet?). Consider adding it if
       * it becomes supported.
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp

index 79daac0d7d963daa4a68ce34f0166471a03d4b5c..2e095ea778b07887263808f0f8e3c7525716d043 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp
@@ -197,7 +197,7 @@ nbnxn_gpu_compile_kernels(gmx_nbnxn_ocl_t *nb)
       * in the JIT compilation that happens at runtime.
       */
      sprintf(runtime_consts,
-            "-DCENTRAL=%d -DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=%d -DNBNXN_GPU_CLUSTER_SIZE=%d -DNBNXN_GPU_JGROUP_SIZE=%d -DNBNXN_AVOID_SING_R2_INC=%s",
+            "-DCENTRAL=%d -DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=%d -DNBNXN_GPU_CLUSTER_SIZE=%d -DNBNXN_GPU_JGROUP_SIZE=%d -DNBNXN_AVOID_SING_R2_INC=%s %s",
              CENTRAL,                                    /* Defined in ishift.h */
              c_nbnxnGpuNumClusterPerSupercluster,        /* Defined in nbnxn_pairlist.h */
              c_nbnxnGpuClusterSize,                      /* Defined in nbnxn_pairlist.h */
@@ -205,6 +205,7 @@ nbnxn_gpu_compile_kernels(gmx_nbnxn_ocl_t *nb)
              STRINGIFY_MACRO(NBNXN_AVOID_SING_R2_INC)    /* Defined in nbnxn_consts.h */
                                                          /* NBNXN_AVOID_SING_R2_INC passed as string to avoid
                                                             floating point representation problems with sprintf */
+            , (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : ""
              );
  
      /* Need to catch std::bad_alloc here and during compilation string
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh

index 20f77b0dc90b7c11b542906eebc2fa923c896246..1037ce5ecfb16f3d463992907cb62107b59af835 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh
@@ -182,7 +182,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
      __local int *cjs     = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
      #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
  
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
      /* shmem buffer for i atom-type pre-loading */
      __local int *atib = (__local int *)(LOCAL_OFFSET);
      #undef LOCAL_OFFSET
@@ -213,7 +213,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
      xqbuf.w *= nbparam->epsfac;
      xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
  
-#ifdef IATYPE_SHMEM //NOTE: Should not be defined. Re-evaluate the effect of preloading at a suitable time.
+#ifdef IATYPE_SHMEM
      /* Pre-load the i-atom types into shared memory */
      atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
  #endif
@@ -364,7 +364,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
                              {
                                  /* load the rest of the i-atom parameters */
                                  qi      = xqbuf.w;
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
                                  typei   = atib[i * CL_SIZE + tidxi];
  #else
                                  typei   = atom_types[ai];
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh

index 3fdf56c55d506d33000bb67ab888699c9dabbee2..aeeae84dbeb5b971cb7ce4d35823181feee727c0 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh
@@ -182,7 +182,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
      __local int *cjs     = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
      #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
  
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
      /* shmem buffer for i atom-type pre-loading */
      __local int *atib = (__local int *)(LOCAL_OFFSET);
      #undef LOCAL_OFFSET
@@ -213,7 +213,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
      xqbuf.w *= nbparam->epsfac;
      xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
  
-#ifdef IATYPE_SHMEM //NOTE: Should not be defined. Re-evaluate the effect of preloading at a suitable time.
+#ifdef IATYPE_SHMEM
      /* Pre-load the i-atom types into shared memory */
      atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
  #endif
@@ -364,7 +364,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
                              {
                                  /* load the rest of the i-atom parameters */
                                  qi      = xqbuf.w;
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
                                  typei   = atib[i * CL_SIZE + tidxi];
  #else
                                  typei   = atom_types[ai];
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh

index 40d867badebced4aca6c71046cabcc6d244c756e..a75c684b88968611f7fd9eae9dbc06ca5407affa 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh
@@ -179,7 +179,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
      __local int *cjs     = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
      #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
  
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
      /* shmem buffer for i atom-type pre-loading */
      __local int *atib = (__local int *)(LOCAL_OFFSET);
      #undef LOCAL_OFFSET
@@ -210,7 +210,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
      xqbuf.w *= nbparam->epsfac;
      xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
  
-#ifdef IATYPE_SHMEM //NOTE: Should not be defined. Used with CUDA > 3.0 Re-evaluate the effect of preloading at a suitable time.
+#ifdef IATYPE_SHMEM
      /* Pre-load the i-atom types into shared memory */
      atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
  #endif
@@ -357,7 +357,7 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
                              {
                                  /* load the rest of the i-atom parameters */
                                  qi      = xqbuf.w;
-#ifdef IATYPE_SHMEM //Should not be defined!
+#ifdef IATYPE_SHMEM
                                  typei   = atib[i * CL_SIZE + tidxi];
  #else
                                  typei   = atom_types[ai];
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h

index 67529febe0bf496dcc3a3beedd8d6771f7038497..7fa7cf44c723e2aef76ff4f5cebf9a85266ddf59 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -281,6 +281,8 @@ struct gmx_nbnxn_ocl_t
      cl_kernel           kernel_ener_prune_ptr[eelOclNR][evdwOclNR];
      ///@}
  
+    bool bPrefetchLjParam; /**< true if prefetching fg i-atom LJ parameters should be used in the kernels */
+
      /**< auxiliary kernels implementing memset-like functions */
      ///@{
      cl_kernel           kernel_memset_f;
author	Szilárd Páll <pall.szilard@gmail.com>
	Fri, 11 Mar 2016 18:05:47 +0000 (19:05 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 30 Mar 2016 17:02:03 +0000 (19:02 +0200)
docs/user-guide/environment-variables.rst		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h		patch \| blob \| history