Merge release-5-0 into master
author     Roland Schulz <roland@utk.edu>
           Thu, 12 Feb 2015 18:38:31 +0000 (13:38 -0500)
committer  Roland Schulz <roland@utk.edu>
           Thu, 12 Feb 2015 18:38:51 +0000 (13:38 -0500)
Change-Id: I23579c289b8a057a94cf64b142759934b9402ee5

src/gromacs/ewald/pme.c
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

diff --combined src/gromacs/ewald/pme.c
index 965111e4666f1fbda7cf766e3571988b058b2311,82b597d5f0c3eeea5fc1a180b9ed4497480f833c..3a5c710a957c3eb69330a6c192234c5371fc59d8
@@@ -3,7 -3,7 +3,7 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
-  * Copyright (c) 2013,2014, by the GROMACS development team, led by
+  * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
   * /Erik 001109
   */
  
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
  
 -#include <stdio.h>
 -#include <string.h>
 -#include <math.h>
 -#include <assert.h>
 -#include "typedefs.h"
 -#include "txtdump.h"
 -#include "vec.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "coulomb.h"
 -#include "gmx_fatal.h"
  #include "pme.h"
 -#include "network.h"
 -#include "physics.h"
 -#include "nrnb.h"
 -#include "macros.h"
  
 +#include "config.h"
 +
 +#include <assert.h>
 +#include <math.h>
 +#include <stdio.h>
 +#include <stdlib.h>
 +
 +#include "gromacs/ewald/pme-internal.h"
 +#include "gromacs/fft/fft.h"
  #include "gromacs/fft/parallel_3dfft.h"
 -#include "gromacs/fileio/futil.h"
 -#include "gromacs/fileio/pdbio.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/nrnb.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
 +#include "gromacs/legacyheaders/types/enums.h"
 +#include "gromacs/legacyheaders/types/forcerec.h"
 +#include "gromacs/legacyheaders/types/inputrec.h"
 +#include "gromacs/legacyheaders/types/nrnb.h"
  #include "gromacs/math/gmxcomplex.h"
 +#include "gromacs/math/units.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/math/vectypes.h"
 +/* Include the SIMD macro file and then check for support */
 +#include "gromacs/simd/simd.h"
 +#include "gromacs/simd/simd_math.h"
  #include "gromacs/timing/cyclecounter.h"
  #include "gromacs/timing/wallcycle.h"
 +#include "gromacs/timing/walltime_accounting.h"
 +#include "gromacs/utility/basedefinitions.h"
 +#include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxmpi.h"
  #include "gromacs/utility/gmxomp.h"
 +#include "gromacs/utility/real.h"
 +#include "gromacs/utility/smalloc.h"
 +
 +#ifdef DEBUG_PME
 +#include "gromacs/fileio/pdbio.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/futil.h"
 +#endif
  
 -/* Include the SIMD macro file and then check for support */
 -#include "gromacs/simd/simd.h"
 -#include "gromacs/simd/simd_math.h"
  #ifdef GMX_SIMD_HAVE_REAL
  /* Turn on arbitrary width SIMD intrinsics for PME solve */
  #    define PME_SIMD_SOLVE
@@@ -177,8 -165,6 +177,8 @@@ typedef struct 
      int recv_size;   /* Receive buffer width, used with OpenMP */
  } pme_grid_comm_t;
  
 +typedef real *splinevec[DIM];
 +
  typedef struct {
  #ifdef GMX_MPI
      MPI_Comm         mpi_comm;
@@@ -1447,7 -1433,7 +1447,7 @@@ static void spread_coefficients_bspline
  #define PME_SPREAD_SIMD4_ALIGNED
  #define PME_ORDER 4
  #endif
 -#include "pme_simd4.h"
 +#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
  #else
                      DO_BSPLINE(4);
  #endif
  #ifdef PME_SIMD4_SPREAD_GATHER
  #define PME_SPREAD_SIMD4_ALIGNED
  #define PME_ORDER 5
 -#include "pme_simd4.h"
 +#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
  #else
                      DO_BSPLINE(5);
  #endif
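
The spread loops above (and the analogous gather loops further down) are generated once per spline order: with SIMD4 support, gromacs/ewald/pme-simd4.h is included repeatedly with PME_ORDER (and PME_SPREAD_SIMD4_ALIGNED) pre-defined so each inclusion expands into order-specific code, and the "IWYU pragma: keep" comment tells include-what-you-use that the apparently redundant include is intentional; without SIMD, the scalar DO_BSPLINE(order) macro is expanded instead. A minimal, self-contained sketch of this expand-the-same-body-per-order pattern (DO_SUM and the weights are hypothetical, not the real DO_BSPLINE):

    /* order_sum.c - hypothetical illustration of one-expansion-per-order */
    #include <stdio.h>

    #define DO_SUM(order)                         \
        {                                         \
            int    k;                             \
            double s = 0;                         \
            for (k = 0; k < (order); k++)         \
            {                                     \
                s += w[k];                        \
            }                                     \
            printf("order %d: %g\n", (order), s); \
        }

    int main(void)
    {
        double w[5] = { 0.1, 0.2, 0.3, 0.2, 0.1 };

        /* mirrors DO_BSPLINE(4)/DO_BSPLINE(5): one expansion per order, so
         * each variant has a compile-time trip count the compiler can unroll */
        DO_SUM(4);
        DO_SUM(5);
        return 0;
    }
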
@@@ -1756,7 -1742,7 +1756,7 @@@ static void pmegrids_destroy(pmegrids_
  
  static void realloc_work(pme_work_t *work, int nkx)
  {
 -    int simd_width;
 +    int simd_width, i;
  
      if (nkx > work->nalloc)
      {
          snew_aligned(work->tmp2,  work->nalloc+simd_width, simd_width*sizeof(real));
          snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
          srenew(work->m2inv, work->nalloc);
 +#ifndef NDEBUG
 +        for (i = 0; i < work->nalloc+simd_width; i++)
 +        {
 +            work->denom[i] = 1; /* init to 1 to avoid 1/0 exceptions for the SIMD-padded elements */
 +        }
 +#endif
      }
  }
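
The new initialisation loop above exists because the solve work arrays are allocated with simd_width extra elements and the SIMD loops later run over the padded length, so untouched padding in work->denom would be divided as 1/0 and raise floating-point exceptions in debug builds. A minimal, self-contained sketch of the idea, with hypothetical names and a hard-coded SIMD width:

    /* padded_divide.c - hypothetical; shows why SIMD padding is set to 1:
     * the loop over the padded length divides the padding elements too. */
    #include <stdio.h>

    #define SIMD_WIDTH 4

    int main(void)
    {
        int    n       = 6;                                   /* logical length */
        int    npadded = ((n + SIMD_WIDTH - 1)/SIMD_WIDTH)*SIMD_WIDTH;
        double denom[8], eterm[8];
        int    i;

        for (i = 0; i < npadded; i++)
        {
            denom[i] = 1;                   /* safe value for padding elements */
        }
        for (i = 0; i < n; i++)
        {
            denom[i] = (double)(i + 1);     /* real data                       */
        }
        for (i = 0; i < npadded; i++)       /* stands in for the SIMD loop     */
        {
            eterm[i] = 1.0/denom[i];        /* no 1/0 on the padding           */
        }
        for (i = 0; i < n; i++)
        {
            printf("%g ", eterm[i]);
        }
        printf("\n");
        return 0;
    }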
  
@@@ -2616,7 -2596,7 +2616,7 @@@ static void gather_f_bsplines(gmx_pme_
  #define PME_GATHER_F_SIMD4_ALIGNED
  #define PME_ORDER 4
  #endif
 -#include "pme_simd4.h"
 +#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
  #else
                      DO_FSPLINE(4);
  #endif
  #ifdef PME_SIMD4_SPREAD_GATHER
  #define PME_GATHER_F_SIMD4_ALIGNED
  #define PME_ORDER 5
 -#include "pme_simd4.h"
 +#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
  #else
                      DO_FSPLINE(5);
  #endif
@@@ -3854,7 -3834,7 +3854,7 @@@ reduce_threadgrid_overlap(gmx_pme_t pme
      int  fft_my, fft_mz;
      int  buf_my = -1;
      int  nsx, nsy, nsz;
-     ivec localcopy_end;
+     ivec localcopy_end, commcopy_end;
      int  offx, offy, offz, x, y, z, i0, i0t;
      int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
      gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
           * not beyond the local FFT grid.
           */
          localcopy_end[d] =
-             min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+             min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
                  local_fft_ndata[d]);
+         /* Determine up to where our thread needs to copy from the
+          * thread-local charge spreading grid to the communication buffer.
+          * Note: only relevant with communication, ignored otherwise.
+          */
+         commcopy_end[d]  = localcopy_end[d];
+         if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
+         {
+             /* The last thread should copy up to the last pme grid line.
+              * When the rank-local FFT grid is narrower than pme-order,
+              * we need the max below to ensure copying of all data.
+              */
+             commcopy_end[d] = max(commcopy_end[d], pme->pme_order);
+         }
      }
  
      offx = pmegrid->offset[XX];
          }
          pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
          ox       += pmegrid_g->offset[XX];
-         /* Determine the end of our part of the source grid */
-         if (!bCommX)
-         {
-             /* Use our thread local source grid and target grid part */
-             tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
-         }
-         else
-         {
-             /* Use our thread local source grid and the spreading range */
-             tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
-         }
+         /* Determine the end of our part of the source grid.
+          * Use our thread-local source grid and target grid part.
+          */
+         tx1 = min(ox + pmegrid_g->n[XX],
+                   !bCommX ? localcopy_end[XX] : commcopy_end[XX]);
  
          for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
          {
              }
              pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
              oy       += pmegrid_g->offset[YY];
-             /* Determine the end of our part of the source grid */
-             if (!bCommY)
-             {
-                 /* Use our thread local source grid and target grid part */
-                 ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
-             }
-             else
-             {
-                 /* Use our thread local source grid and the spreading range */
-                 ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
-             }
+             /* Determine the end of our part of the source grid.
+              * Use our thread-local source grid and target grid part.
+              */
+             ty1 = min(oy + pmegrid_g->n[YY],
+                       !bCommY ? localcopy_end[YY] : commcopy_end[YY]);
  
              for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
              {
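
To summarise the reduce_threadgrid_overlap() change in this hunk: each dimension now carries two copy limits, localcopy_end (stop at the rank-local FFT grid edge) and commcopy_end (used when the overlap is sent to another rank, where the last thread must reach at least pme_order grid lines), and the per-dimension end tx1/ty1 simply picks between them with the bCommX/bCommY ternary. A small, self-contained model of that selection, using made-up sizes that show the narrow-grid case the comment warns about:

    /* copy_end.c - hypothetical, simplified model of the copy-range logic in
     * reduce_threadgrid_overlap(): without communication we stop at the local
     * FFT grid edge; with communication the last thread must cover at least
     * pme_order grid lines. */
    #include <stdio.h>

    static int imin(int a, int b) { return a < b ? a : b; }
    static int imax(int a, int b) { return a > b ? a : b; }

    int main(void)
    {
        int offset        = 0;  /* thread-grid offset in this dimension      */
        int n             = 4;  /* thread-grid size                          */
        int order         = 4;  /* pme_order                                 */
        int local_fft_end = 3;  /* rank-local FFT grid size (narrower!)      */
        int is_last       = 1;  /* this thread owns the last cell: ci==nc-1  */
        int bComm         = 1;  /* the overlap is communicated to a neighbor */
        int localcopy_end, commcopy_end, copy_end;

        localcopy_end = imin(offset + n - (order - 1), local_fft_end);
        commcopy_end  = localcopy_end;
        if (is_last)
        {
            /* without the max, a narrow local grid would truncate the copy */
            commcopy_end = imax(commcopy_end, order);
        }
        copy_end = bComm ? commcopy_end : localcopy_end;

        printf("local end %d, copy end %d\n", localcopy_end, copy_end); /* 1, 4 */
        return 0;
    }
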
index 793f13fc7c8e1bf8dbf6ad8d28a26e2990d9f61c,29af9eb52f937c73df36f5b037f7bee99a8cab72..9ee57c656da43a53753988d19d5df8a34fab331a
@@@ -1,7 -1,7 +1,7 @@@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
-  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "nbnxn_cuda.h"
 +
 +#include "config.h"
  
 -#include <stdlib.h>
  #include <assert.h>
 +#include <stdlib.h>
  
  #if defined(_MSVC)
  #include <limits>
  
  #include <cuda.h>
  
 -#include "types/simple.h"
 -#include "types/nbnxn_pairlist.h"
 -#include "types/nb_verlet.h"
 -#include "types/ishift.h"
 -#include "types/force_flags.h"
 -#include "../nbnxn_consts.h"
 -
  #ifdef TMPI_ATOMICS
  #include "thread_mpi/atomic.h"
  #endif
  
 +#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
 +#include "gromacs/legacyheaders/types/force_flags.h"
 +#include "gromacs/legacyheaders/types/simple.h"
 +#include "gromacs/mdlib/nb_verlet.h"
 +#include "gromacs/mdlib/nbnxn_consts.h"
 +#include "gromacs/mdlib/nbnxn_pairlist.h"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 +#include "gromacs/pbcutil/ishift.h"
 +#include "gromacs/utility/cstringutil.h"
 +
  #include "nbnxn_cuda_types.h"
 -#include "../../gmxlib/cuda_tools/cudautils.cuh"
 -#include "nbnxn_cuda.h"
 -#include "nbnxn_cuda_data_mgmt.h"
  
  #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  #define USE_TEXOBJ
@@@ -80,38 -78,8 +80,38 @@@ texture<float, 1, cudaReadModeElementTy
  #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  
 +/* NTHREAD_Z controls the number of j-clusters processed concurrently: each
 + * thread block runs NTHREAD_Z warp-pairs.
 + *
 + * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 threads/block
 + * with 16 blocks/multiprocessor, is the fastest even though this setup gives low
 + * occupancy. NTHREAD_Z > 1 results in excessive register spilling unless the
 + * minimum blocks per multiprocessor is reduced proportionally to keep the
 + * original maximum number of threads in flight (and gives slightly lower
 + * performance).
 + * - On CC 3.7 there are enough registers to double the number of threads;
 + * NTHREAD_Z == 2 is fastest with 16 blocks (TODO: test with RF and other
 + * kernels with low register use).
 + *
 + * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
 + * shuffle-based reduction, hence CC >= 3.0.
 + */
 +
 +/* Kernel launch bounds as a function of NTHREAD_Z.
 + * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
 + * - CC 3.7:     NTHREAD_Z=2, (128, 16) bounds
 + */
 +#if __CUDA_ARCH__ == 370
 +#define NTHREAD_Z           (2)
 +#define MIN_BLOCKS_PER_MP   (16)
 +#else
 +#define NTHREAD_Z           (1)
 +#define MIN_BLOCKS_PER_MP   (16)
 +#endif
 +#define THREADS_PER_BLOCK   (CL_SIZE*CL_SIZE*NTHREAD_Z)
 +
 +
  /***** The kernels come here *****/
 -#include "nbnxn_cuda_kernel_utils.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
  
  /* Top-level kernel generation: will generate through multiple inclusion the
   * following flavors for all kernels:
   * - force and energy output with pair list pruning.
   */
  /** Force only **/
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  /** Force & energy **/
  #define CALC_ENERGIES
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  #undef CALC_ENERGIES
  
  /*** Pair-list pruning kernels ***/
  /** Force only **/
  #define PRUNE_NBL
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  /** Force & energy **/
  #define CALC_ENERGIES
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  #undef CALC_ENERGIES
  #undef PRUNE_NBL
  
@@@ -271,7 -239,7 +271,7 @@@ static inline nbnxn_cu_kfunc_ptr_t sele
  }
  
  /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
 -static inline int calc_shmem_required()
 +static inline int calc_shmem_required(const int num_threads_z)
  {
      int shmem;
  
      /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
      /* i-atom x+q in shared memory */
      shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 -    /* cj in shared memory, for both warps separately */
 -    shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 +    /* cj in shared memory, for each warp separately */
 +    shmem += num_threads_z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
  #ifdef IATYPE_SHMEM
      /* i-atom types in shared memory */
      shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
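
As a rough worked example of the updated calc_shmem_required(num_threads_z): assuming the usual cluster sizes CL_SIZE = 8, NCL_PER_SUPERCL = 8, and NBNXN_GPU_JGROUP_SIZE = 4 (these constants are not shown in this diff, so treat them as assumptions), the x+q preload costs 8*8*16 = 1024 bytes and the cj buffer costs num_threads_z*2*4*4 bytes, i.e. 32 bytes for NTHREAD_Z == 1 and 64 bytes for NTHREAD_Z == 2, before the optional IATYPE_SHMEM term. A small sketch of that arithmetic:

    /* shmem_estimate.c - hypothetical back-of-the-envelope version of
     * calc_shmem_required(); the constants below are assumed, and float4 is
     * modelled as 4*sizeof(float) = 16 bytes. */
    #include <stdio.h>

    #define NCL_PER_SUPERCL       8
    #define CL_SIZE               8
    #define NBNXN_GPU_JGROUP_SIZE 4

    static int calc_shmem_estimate(int num_threads_z)
    {
        int shmem;

        /* i-atom x+q: 8*8 float4 = 1024 bytes */
        shmem  = NCL_PER_SUPERCL*CL_SIZE*4*(int)sizeof(float);
        /* cj for each of the 2*num_threads_z warps */
        shmem += num_threads_z*2*NBNXN_GPU_JGROUP_SIZE*(int)sizeof(int);
        return shmem;
    }

    int main(void)
    {
        printf("NTHREAD_Z=1: %d bytes\n", calc_shmem_estimate(1)); /* 1056 */
        printf("NTHREAD_Z=2: %d bytes\n", calc_shmem_estimate(2)); /* 1088 */
        return 0;
    }
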
@@@ -327,7 -295,7 +327,7 @@@ void nbnxn_cuda_launch_kernel(nbnxn_cud
      cu_timers_t         *t       = cu_nb->timers;
      cudaStream_t         stream  = cu_nb->stream[iloc];
  
-     bool                 bCalcEner   = flags & GMX_FORCE_VIRIAL;
+     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
      bool                 bCalcFshift = flags & GMX_FORCE_VIRIAL;
      bool                 bDoTime     = cu_nb->bDoTime;
  
                                      bCalcEner,
                                      plist->bDoPrune || always_prune);
  
 -    /* kernel launch config */
 +    /* Kernel launch config:
 +     * - The thread block dimensions in x, y, and z match the i-cluster size,
 +     *   the j-cluster size, and the j-cluster concurrency (NTHREAD_Z),
 +     *   respectively.
 +     * - The 1D block grid contains as many blocks as there are super-clusters.
 +     */
 +    int num_threads_z = 1;
 +    if (cu_nb->dev_info->prop.major == 3 && cu_nb->dev_info->prop.minor == 7)
 +    {
 +        num_threads_z = 2;
 +    }
      nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 -    dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 +    dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
      dim_grid  = dim3(nblock, 1, 1);
 -    shmem     = calc_shmem_required();
 +    shmem     = calc_shmem_required(num_threads_z);
  
      if (debug)
      {
          fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 -                "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 +                "\tGrid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
 +                "\tShMem: %d\n",
                  dim_block.x, dim_block.y, dim_block.z,
                  dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 -                NCL_PER_SUPERCL, plist->na_c);
 +                NCL_PER_SUPERCL, plist->na_c,
 +                shmem);
      }
  
      nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
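
All three bCalcEner fixes in this file change the tested bit from GMX_FORCE_VIRIAL to GMX_FORCE_ENERGY, so the energy-computing kernel flavors are selected exactly when energies are requested rather than only when the virial is; bCalcFshift correctly keeps testing GMX_FORCE_VIRIAL. A tiny sketch of the bit-flag test, with stand-in flag values because the real ones (defined in force_flags.h) are not shown here:

    /* force_flags_sketch.c - the flag values below are assumed for the sake
     * of the example; the real definitions live in force_flags.h. */
    #include <stdbool.h>
    #include <stdio.h>

    #define GMX_FORCE_VIRIAL_EX (1 << 6) /* stand-in for GMX_FORCE_VIRIAL */
    #define GMX_FORCE_ENERGY_EX (1 << 7) /* stand-in for GMX_FORCE_ENERGY */

    int main(void)
    {
        int  flags        = GMX_FORCE_ENERGY_EX; /* energy requested, no virial */
        bool bCalcEnerOld = (flags & GMX_FORCE_VIRIAL_EX) != 0; /* old test: 0  */
        bool bCalcEner    = (flags & GMX_FORCE_ENERGY_EX) != 0; /* fixed:    1  */
        bool bCalcFshift  = (flags & GMX_FORCE_VIRIAL_EX) != 0;

        printf("bCalcEner=%d (was %d), bCalcFshift=%d\n",
               bCalcEner, bCalcEnerOld, bCalcFshift);
        return 0;
    }
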
@@@ -465,7 -422,7 +465,7 @@@ void nbnxn_cuda_launch_cpyback(nbnxn_cu
      bool             bDoTime = cu_nb->bDoTime;
      cudaStream_t     stream  = cu_nb->stream[iloc];
  
-     bool             bCalcEner   = flags & GMX_FORCE_VIRIAL;
+     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
      bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
      /* don't launch copy-back if there was no work to do */
@@@ -621,7 -578,7 +621,7 @@@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr
      wallclock_gpu_t *timings  = cu_nb->timings;
      nb_staging       nbst     = cu_nb->nbst;
  
-     bool             bCalcEner   = flags & GMX_FORCE_VIRIAL;
+     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
      bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
      /* turn energy calculation always on/off (for debugging/testing only) */
@@@ -757,18 -714,18 +757,18 @@@ void nbnxn_cuda_set_cacheconfig(cuda_de
              if (devinfo->prop.major >= 3)
              {
                  /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 +                cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
                  stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
              }
              else
              {
                  /* On Fermi, preferring L1 gives 2% higher performance */
                  /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 +                cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
                  stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
              }
              CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
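
For reference, cudaFuncSetCacheConfig() sets a per-kernel preference for the split between shared memory and L1 cache, which is why the code above asks for the 48/16 kB shared/L1 split on sm_3.x (the default kernel pre-loads into shared memory) but prefers L1 on Fermi. A minimal, self-contained CUDA sketch of the same pattern with a hypothetical kernel, checking the returned status:

    /* cache_config_sketch.cu - hypothetical kernel; demonstrates the
     * per-architecture cache preference used above. */
    #include <cstdio>

    __global__ void demo_kernel(float *out) { out[threadIdx.x] = 1.0f; }

    int main()
    {
        cudaDeviceProp prop;
        cudaError_t    stat;

        cudaGetDeviceProperties(&prop, 0);
        if (prop.major >= 3)
        {
            /* plenty of registers; give the kernel the large shared-memory split */
            stat = cudaFuncSetCacheConfig(demo_kernel, cudaFuncCachePreferShared);
        }
        else
        {
            /* Fermi: prefer L1 instead */
            stat = cudaFuncSetCacheConfig(demo_kernel, cudaFuncCachePreferL1);
        }
        if (stat != cudaSuccess)
        {
            fprintf(stderr, "cudaFuncSetCacheConfig failed: %s\n",
                    cudaGetErrorString(stat));
            return 1;
        }
        return 0;
    }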