Support 4 sized cluster for GPU NBL search

author Roland Schulz <roland.schulz@intel.com>

Sat, 28 Apr 2018 04:52:23 +0000 (21:52 -0700)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 2 May 2018 13:21:17 +0000 (15:21 +0200)
author Roland Schulz <roland.schulz@intel.com>
Sat, 28 Apr 2018 04:52:23 +0000 (21:52 -0700)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 2 May 2018 13:21:17 +0000 (15:21 +0200)
diff --git a/src/gromacs/mdlib/nbnxn_pairlist.h b/src/gromacs/mdlib/nbnxn_pairlist.h

index 8ccb2224cedd69185442f936577c55464cf05417..4e064ec910a917aff85339cead883dfc71637da8 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_pairlist.h
+++ b/src/gromacs/mdlib/nbnxn_pairlist.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -75,23 +75,23 @@ struct NbnxnListParameters
  /*! \endcond */
  
  /* With GPU kernels the i and j cluster size is 8 atoms */
-static const int c_nbnxnGpuClusterSize = 8;
+static constexpr int c_nbnxnGpuClusterSize = 8;
  
  /* The number of clusters in a super-cluster, used for GPU */
-static const int c_nbnxnGpuNumClusterPerSupercluster = 8;
+static constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
  
  /* With GPU kernels we group cluster pairs in groups of 4 to optimize
   * memory usage of 32-bit integers.
   */
-static const int c_nbnxnGpuJgroupSize = 32/c_nbnxnGpuNumClusterPerSupercluster;
+static constexpr int c_nbnxnGpuJgroupSize = 32/c_nbnxnGpuNumClusterPerSupercluster;
  
  /* In CUDA the number of threads in a warp is 32 and we have cluster pairs
   * of 8*8=64 atoms, so it's convenient to store data for cluster pair halves.
   */
-static const int c_nbnxnGpuClusterpairSplit = 2;
+static constexpr int c_nbnxnGpuClusterpairSplit = 2;
  
  /* The fixed size of the exclusion mask array for a half cluster pair */
-static const int c_nbnxnGpuExclSize = c_nbnxnGpuClusterSize*c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
+static constexpr int c_nbnxnGpuExclSize = c_nbnxnGpuClusterSize*c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
  
  /* A buffer data structure of 64 bytes
   * to be placed at the beginning and end of structs
diff --git a/src/gromacs/mdlib/nbnxn_search.cpp b/src/gromacs/mdlib/nbnxn_search.cpp

index dddb8ba89e205b5f3db5949482844b0304950381..917319f9108a8c1e75ac01f5943824c7df3c4439 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search.cpp
+++ b/src/gromacs/mdlib/nbnxn_search.cpp
@@ -588,11 +588,11 @@ clusterpair_in_range(const nbnxn_list_work_t *work,
  #else /* !GMX_SIMD4_HAVE_REAL */
  
      /* 4-wide SIMD version.
-     * A cluster is hard-coded to 8 atoms.
       * The coordinates x_i are stored as xxxxyyyy..., x_j is stored xyzxyz...
       * Using 8-wide AVX(2) is not faster on Intel Sandy Bridge and Haswell.
       */
-    assert(c_nbnxnGpuClusterSize == 8);
+    static_assert(c_nbnxnGpuClusterSize == 8 || c_nbnxnGpuClusterSize == 4,
+                  "A cluster is hard-coded to 4/8 atoms.");
  
      Simd4Real   rc2_S      = Simd4Real(rlist2);
  
@@ -602,10 +602,14 @@ clusterpair_in_range(const nbnxn_list_work_t *work,
      Simd4Real   ix_S0      = load4(x_i + si*dim_stride + 0*GMX_SIMD4_WIDTH);
      Simd4Real   iy_S0      = load4(x_i + si*dim_stride + 1*GMX_SIMD4_WIDTH);
      Simd4Real   iz_S0      = load4(x_i + si*dim_stride + 2*GMX_SIMD4_WIDTH);
-    Simd4Real   ix_S1      = load4(x_i + si*dim_stride + 3*GMX_SIMD4_WIDTH);
-    Simd4Real   iy_S1      = load4(x_i + si*dim_stride + 4*GMX_SIMD4_WIDTH);
-    Simd4Real   iz_S1      = load4(x_i + si*dim_stride + 5*GMX_SIMD4_WIDTH);
  
+    Simd4Real   ix_S1, iy_S1, iz_S1;
+    if (c_nbnxnGpuClusterSize == 8)
+    {
+        ix_S1      = load4(x_i + si*dim_stride + 3*GMX_SIMD4_WIDTH);
+        iy_S1      = load4(x_i + si*dim_stride + 4*GMX_SIMD4_WIDTH);
+        iz_S1      = load4(x_i + si*dim_stride + 5*GMX_SIMD4_WIDTH);
+    }
      /* We loop from the outer to the inner particles to maximize
       * the chance that we find a pair in range quickly and return.
       */
@@ -644,30 +648,45 @@ clusterpair_in_range(const nbnxn_list_work_t *work,
          dx_S0            = ix_S0 - jx0_S;
          dy_S0            = iy_S0 - jy0_S;
          dz_S0            = iz_S0 - jz0_S;
-        dx_S1            = ix_S1 - jx0_S;
-        dy_S1            = iy_S1 - jy0_S;
-        dz_S1            = iz_S1 - jz0_S;
          dx_S2            = ix_S0 - jx1_S;
          dy_S2            = iy_S0 - jy1_S;
          dz_S2            = iz_S0 - jz1_S;
-        dx_S3            = ix_S1 - jx1_S;
-        dy_S3            = iy_S1 - jy1_S;
-        dz_S3            = iz_S1 - jz1_S;
+        if (c_nbnxnGpuClusterSize == 8)
+        {
+            dx_S1            = ix_S1 - jx0_S;
+            dy_S1            = iy_S1 - jy0_S;
+            dz_S1            = iz_S1 - jz0_S;
+            dx_S3            = ix_S1 - jx1_S;
+            dy_S3            = iy_S1 - jy1_S;
+            dz_S3            = iz_S1 - jz1_S;
+        }
  
          /* rsq = dx*dx+dy*dy+dz*dz */
          rsq_S0           = norm2(dx_S0, dy_S0, dz_S0);
-        rsq_S1           = norm2(dx_S1, dy_S1, dz_S1);
          rsq_S2           = norm2(dx_S2, dy_S2, dz_S2);
-        rsq_S3           = norm2(dx_S3, dy_S3, dz_S3);
+        if (c_nbnxnGpuClusterSize == 8)
+        {
+            rsq_S1           = norm2(dx_S1, dy_S1, dz_S1);
+            rsq_S3           = norm2(dx_S3, dy_S3, dz_S3);
+        }
  
          wco_S0           = (rsq_S0 < rc2_S);
-        wco_S1           = (rsq_S1 < rc2_S);
          wco_S2           = (rsq_S2 < rc2_S);
-        wco_S3           = (rsq_S3 < rc2_S);
-
-        wco_any_S01      = wco_S0 || wco_S1;
-        wco_any_S23      = wco_S2 || wco_S3;
-        wco_any_S        = wco_any_S01 || wco_any_S23;
+        if (c_nbnxnGpuClusterSize == 8)
+        {
+            wco_S1           = (rsq_S1 < rc2_S);
+            wco_S3           = (rsq_S3 < rc2_S);
+        }
+        if (c_nbnxnGpuClusterSize == 8)
+        {
+            wco_any_S01      = wco_S0 || wco_S1;
+            wco_any_S23      = wco_S2 || wco_S3;
+            wco_any_S        = wco_any_S01 || wco_any_S23;
+        }
+        else
+        {
+            wco_any_S = wco_S0 || wco_S2;
+        }
  
          if (anyTrue(wco_any_S))
          {
@@ -1103,7 +1122,7 @@ static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
  
      /* Here we only set the self and double pair exclusions */
  
-    assert(c_nbnxnGpuClusterpairSplit == 2);
+    static_assert(c_nbnxnGpuClusterpairSplit == 2, "");
  
      get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
  
diff --git a/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h b/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h

index 3d2ec0fbe6274997996aff097e5574fffba48ca2..1a06376f074e0a60692a9996e7b6f3ef71ea408c 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h
+++ b/src/gromacs/mdlib/nbnxn_search_simd_2xnn.h
@@ -119,7 +119,7 @@ makeClusterListSimd2xnn(const nbnxn_grid_t *      gridj,
      InRange = FALSE;
      while (!InRange && jclusterFirst <= jclusterLast)
      {
-#ifdef NBNXN_SEARCH_BB_SIMD4
+#if NBNXN_SEARCH_BB_SIMD4
          d2 = subc_bb_dist2_simd4(0, bb_ci, jclusterFirst, gridj->bbj);
  #else
          d2 = subc_bb_dist2(0, bb_ci, jclusterFirst, gridj->bbj);
@@ -177,7 +177,7 @@ makeClusterListSimd2xnn(const nbnxn_grid_t *      gridj,
      InRange = FALSE;
      while (!InRange && jclusterLast > jclusterFirst)
      {
-#ifdef NBNXN_SEARCH_BB_SIMD4
+#if NBNXN_SEARCH_BB_SIMD4
          d2 = subc_bb_dist2_simd4(0, bb_ci, jclusterLast, gridj->bbj);
  #else
          d2 = subc_bb_dist2(0, bb_ci, jclusterLast, gridj->bbj);
diff --git a/src/gromacs/mdlib/nbnxn_search_simd_4xn.h b/src/gromacs/mdlib/nbnxn_search_simd_4xn.h

index 36598d33980cdf557c1ed0f21f90114a951602ff..c226d86021ec42a182a25162b5e09d13a67d9568 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search_simd_4xn.h
+++ b/src/gromacs/mdlib/nbnxn_search_simd_4xn.h
@@ -138,7 +138,7 @@ makeClusterListSimd4xn(const nbnxn_grid_t *      gridj,
      InRange = FALSE;
      while (!InRange && jclusterFirst <= jclusterLast)
      {
-#ifdef NBNXN_SEARCH_BB_SIMD4
+#if NBNXN_SEARCH_BB_SIMD4
          d2 = subc_bb_dist2_simd4(0, bb_ci, jclusterFirst, gridj->bbj);
  #else
          d2 = subc_bb_dist2(0, bb_ci, jclusterFirst, gridj->bbj);
@@ -209,7 +209,7 @@ makeClusterListSimd4xn(const nbnxn_grid_t *      gridj,
      InRange = FALSE;
      while (!InRange && jclusterLast > jclusterFirst)
      {
-#ifdef NBNXN_SEARCH_BB_SIMD4
+#if NBNXN_SEARCH_BB_SIMD4
          d2 = subc_bb_dist2_simd4(0, bb_ci, jclusterLast, gridj->bbj);
  #else
          d2 = subc_bb_dist2(0, bb_ci, jclusterLast, gridj->bbj);
author	Roland Schulz <roland.schulz@intel.com>
	Sat, 28 Apr 2018 04:52:23 +0000 (21:52 -0700)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 2 May 2018 13:21:17 +0000 (15:21 +0200)
src/gromacs/mdlib/nbnxn_pairlist.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_search.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_search_simd_2xnn.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_search_simd_4xn.h		patch \| blob \| history