Merge branch release-2016
authorMark Abraham <mark.j.abraham@gmail.com>
Mon, 21 Aug 2017 23:51:09 +0000 (01:51 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Tue, 22 Aug 2017 11:19:22 +0000 (13:19 +0200)
Made matching change for ARM NEON SIMD to newly refactored SIMD code

Trivial resolutions of adjacent changes in CUDA kernel code.

Adjacent resolutions for changes for dynamic pruning and disabling
of PME tuning for the group scheme need checking.

Change-Id: I024878fa50ba815960d00ad6e811af181323b4db

18 files changed:
1  2 
cmake/gmxDetectSimd.cmake
cmake/gmxManageNvccConfig.cmake
cmake/gmxManageSimd.cmake
docs/CMakeLists.txt
docs/manual/algorithms.tex
docs/manual/special.tex
src/gromacs/ewald/pme-load-balancing.cpp
src/gromacs/gmxana/gmx_msd.cpp
src/gromacs/gpu_utils/cuda_arch_utils.cuh
src/gromacs/gpu_utils/cudautils.cu
src/gromacs/gpu_utils/cudautils.cuh
src/gromacs/mdlib/forcerec.cpp
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
src/gromacs/mdrunutility/handlerestart.cpp
src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h
src/programs/mdrun/md.cpp
src/programs/mdrun/mdrun.cpp

index e83c3abd790d0e6c65d6a22b2a0a76eb321af918,c0cf2f56ac17db11670d50fc40101c7a3c223233..50088be99f190879bf4d71a9804e27ddba2a99d5
@@@ -61,64 -72,89 +61,64 @@@ function(gmx_suggest_simd _suggested_si
      # Prepare a default suggestion
      set(OUTPUT_SIMD "None")
  
 -    # We need to execute the binary, so this only works if not cross-compiling.
 -    # However, note that we are NOT limited to x86.
 -    if(NOT CMAKE_CROSSCOMPILING)
 -        # TODO Extract this try_compile to a helper function, because
 -        # it duplicates code in gmxSetBuildInformation.cmake
 -        set(GMX_DETECTSIMD_BINARY "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/GmxDetectSimd${CMAKE_EXECUTABLE_SUFFIX}")
 -        set(LINK_LIBRARIES "${GMX_STDLIB_LIBRARIES}")
 -        try_compile(GMX_DETECTSIMD_COMPILED
 -            "${CMAKE_CURRENT_BINARY_DIR}"
 -            "${CMAKE_CURRENT_SOURCE_DIR}/src/gromacs/hardware/cpuinfo.cpp"
 -            COMPILE_DEFINITIONS "${_compile_definitions}"
 -            CMAKE_FLAGS "-DLINK_LIBRARIES=${LINK_LIBRARIES}"
 -            OUTPUT_VARIABLE GMX_DETECTSIMD_COMPILED_OUTPUT
 -            COPY_FILE ${GMX_DETECTSIMD_BINARY})
 -        unset(_compile_definitions)
 +    # Detect CPU features and place the string in CPU_DETECTION_FEATURES
 +    # Note that we are NOT limited to x86.
 +    gmx_run_cpu_detection(features)
 +
 +    if (DEFINED CPU_DETECTION_FEATURES)
 +        # Make a concrete suggestion of SIMD level if a feature flag
 +        # matches. Make sure that the match strings below work even if
 +        # the feature is first or last.
 +        set(CPU_DETECTION_FEATURES " ${CPU_DETECTION_FEATURES} ")
  
 -        if(GMX_DETECTSIMD_COMPILED)
 -            if(NOT DEFINED GMX_DETECTSIMD_RUN)
 -                execute_process(COMMAND ${GMX_DETECTSIMD_BINARY} "-features"
 -                    RESULT_VARIABLE GMX_DETECTSIMD_RUN
 -                    OUTPUT_VARIABLE OUTPUT_TMP
 -                    ERROR_QUIET)
 -                set(GMX_DETECTSIMD_RUN "${GMX_DETECTSIMD_RUN}" CACHE INTERNAL "Result of running cpuinfo code to detect SIMD support")
 -                if(GMX_DETECTSIMD_RUN EQUAL 0)
 -                    # Make a concrete suggestion of SIMD level
 -                    if(GMX_TARGET_X86)
 -                        if(OUTPUT_TMP MATCHES " avx512er ")
 -                            set(OUTPUT_SIMD "AVX_512_KNL")
 -                        elseif(OUTPUT_TMP MATCHES " avx512f ")
 -                            set(OUTPUT_SIMD "AVX_512")
 -                        elseif(OUTPUT_TMP MATCHES " avx2 ")
 -                            set(OUTPUT_SIMD "AVX2_256")
 -                        elseif(OUTPUT_TMP MATCHES " avx ")
 -                            if(OUTPUT_TMP MATCHES " fma4 ")
 -                                # AMD that works better with avx-128-fma
 -                                set(OUTPUT_SIMD "AVX_128_FMA")
 -                            else()
 -                                # Intel
 -                                set(OUTPUT_SIMD "AVX_256")
 -                            endif()
 -                        elseif(OUTPUT_TMP MATCHES " sse4.1 ")
 -                            set(OUTPUT_SIMD "SSE4.1")
 -                        elseif(OUTPUT_TMP MATCHES " sse2 ")
 -                            set(OUTPUT_SIMD "SSE2")
 -                        endif()
 -                    else()
 -                        if(OUTPUT_TMP MATCHES " vsx ")
 -                            set(OUTPUT_SIMD "IBM_VSX")
 -                        elseif(OUTPUT_TMP MATCHES " vmx ")
 -                            set(OUTPUT_SIMD "IBM_VMX")
 -                        elseif(OUTPUT_TMP MATCHES " qpx ")
 -                            set(OUTPUT_SIMD "IBM_QPX")
 -                        elseif(OUTPUT_TMP MATCHES " neon_asimd ")
 -                            set(OUTPUT_SIMD "ARM_NEON_ASIMD")
 -                        elseif(OUTPUT_TMP MATCHES " neon " AND NOT GMX_DOUBLE)
 -                            set(OUTPUT_SIMD "ARM_NEON")
 -                        endif()
 -                    endif()
 -                    message(STATUS "Detected best SIMD instructions for this CPU - ${OUTPUT_SIMD}")
 +        if(GMX_TARGET_X86)
 +            if(CPU_DETECTION_FEATURES MATCHES " avx512er ")
 +                set(OUTPUT_SIMD "AVX_512_KNL")
 +            elseif(CPU_DETECTION_FEATURES MATCHES " avx512f ")
 +                set(OUTPUT_SIMD "AVX_512")
 +            elseif(CPU_DETECTION_FEATURES MATCHES " avx2 ")
 +                if(CPU_DETECTION_FEATURES MATCHES " amd ")
 +                    set(OUTPUT_SIMD "AVX2_128")
 +                else()
 +                    set(OUTPUT_SIMD "AVX2_256")
 +                endif()
 +            elseif(CPU_DETECTION_FEATURES MATCHES " avx ")
 +                if(CPU_DETECTION_FEATURES MATCHES " fma4 ")
 +                    # AMD that works better with avx-128-fma
 +                    set(OUTPUT_SIMD "AVX_128_FMA")
                  else()
 -                    message(WARNING "Cannot run cpuinfo code, which means no SIMD suggestion can be made.")
 -                    message(STATUS "Run output: ${OUTPUT_TMP}")
 +                    # Intel
 +                    set(OUTPUT_SIMD "AVX_256")
                  endif()
 +            elseif(CPU_DETECTION_FEATURES MATCHES " sse4.1 ")
 +                set(OUTPUT_SIMD "SSE4.1")
 +            elseif(CPU_DETECTION_FEATURES MATCHES " sse2 ")
 +                set(OUTPUT_SIMD "SSE2")
              endif()
          else()
 -            message(WARNING "Cannot compile cpuinfo code, which means no SIMD instructions.")
 -            message(STATUS "Compile output: ${GMX_DETECTSIMD_COMPILED_OUTPUT}")
 +            if(CPU_DETECTION_FEATURES MATCHES " vsx ")
 +                set(OUTPUT_SIMD "IBM_VSX")
 +            elseif(CPU_DETECTION_FEATURES MATCHES " vmx ")
 +                set(OUTPUT_SIMD "IBM_VMX")
 +            elseif(CPU_DETECTION_FEATURES MATCHES " qpx ")
 +                set(OUTPUT_SIMD "IBM_QPX")
 +            elseif(CPU_DETECTION_FEATURES MATCHES " neon_asimd ")
 +                set(OUTPUT_SIMD "ARM_NEON_ASIMD")
-             elseif(CPU_DETECTION_FEATURES MATCHES " neon ")
++            elseif(CPU_DETECTION_FEATURES MATCHES " neon " AND NOT GMX_DOUBLE)
 +                set(OUTPUT_SIMD "ARM_NEON")
 +            endif()
 +        endif()
 +        if (NOT SUGGEST_SIMD_QUIETLY)
 +            message(STATUS "Detected best SIMD instructions for this CPU - ${OUTPUT_SIMD}")
          endif()
      else()
 -        message(WARNING "Cannot detect SIMD architecture for this cross-compile; you should check it manually.")
 +        if (NOT SUGGEST_SIMD_QUIETLY)
 +            message(STATUS "Detection for best SIMD instructions failed, using SIMD - ${OUTPUT_SIMD}")
 +        endif()
      endif()
  
 -    set(${_suggested_simd} "${OUTPUT_SIMD}" CACHE INTERNAL "Suggested SIMD")
 +    set(${_suggested_simd} "${OUTPUT_SIMD}" PARENT_SCOPE)
 +    set(SUGGEST_SIMD_QUIETLY TRUE CACHE INTERNAL "Be quiet during future construction of SIMD suggestions")
  endfunction()
  
  function(gmx_detect_simd _suggested_simd)
Simple merge
index 64661de6f9e07e8eb1e49fb36e109a49e28891cb,b1b76a51d3844d420948c455d4640fbff960481e..84045516d6607e5a1f08920553b4066cfaf1b1e5
@@@ -348,11 -341,15 +348,15 @@@ elseif(GMX_SIMD_ACTIVE STREQUAL "AVX_51
  
      set(SIMD_C_FLAGS "${TOOLCHAIN_C_FLAGS}")
      set(SIMD_CXX_FLAGS "${TOOLCHAIN_CXX_FLAGS}")
 -    set(GMX_SIMD_X86_${GMX_SIMD} 1)
 +    set(GMX_SIMD_X86_${GMX_SIMD_ACTIVE} 1)
      set(SIMD_STATUS_MESSAGE "Enabling 512-bit AVX-512-KNL SIMD instructions")
  
 -elseif(GMX_SIMD STREQUAL "ARM_NEON")
 +elseif(GMX_SIMD_ACTIVE STREQUAL "ARM_NEON")
  
+     if (GMX_DOUBLE)
+         message(FATAL_ERROR "ARM_NEON SIMD support is not available for a double precision build because the architecture lacks double-precision support")
+     endif()
      gmx_find_flags(
          "#include<arm_neon.h>
           int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}"
Simple merge
Simple merge
Simple merge
index 384ed8daaafa23be4723d3acaa7e9b5de7d4b6ae,292624fb4d783b2c356701c8b72837d9e267be6d..6e2bea5d8739446eb0e9efc1996d0936fa447f5f
@@@ -372,13 -361,17 +374,18 @@@ static gmx_bool pme_loadbal_increase_cu
      }
      else
      {
 -        tmpr_coulomb          = set->rcut_coulomb + pme_lb->rbuf_coulomb;
 -        tmpr_vdw              = pme_lb->rcut_vdw + pme_lb->rbuf_vdw;
+         /* TODO Remove these lines and pme_lb->cutoff_scheme */
 -        set->rlist            = std::min(tmpr_coulomb, tmpr_vdw);
 +        tmpr_coulomb     = set->rcut_coulomb + pme_lb->rbufOuter_coulomb;
 +        tmpr_vdw         = pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw;
+         /* Two (known) bugs with cutoff-scheme=group here:
+          * - This modification of rlist results in incorrect DD comunication.
+          * - We should set fr->bTwinRange = (fr->rlistlong > fr->rlist).
+          */
 +        set->rlistOuter  = std::min(tmpr_coulomb, tmpr_vdw);
 +        set->rlistInner  = set->rlistOuter;
      }
  
 -    set->spacing      = sp;
 +    set->spacing         = sp;
      /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
      set->grid_efficiency = 1;
      for (d = 0; d < DIM; d++)
Simple merge
index efd59644a965d10783ec420bd497d2384129d02f,b6a23047267fd4b9428b8f006ecaebc829c7e9b8..a60bf9ebb80537871ba7f5b52100458ae62c6509
   */
  static const int warp_size      = 32;
  static const int warp_size_log2 = 5;
+ /*! \brief Bitmask corresponding to all threads active in a warp.
+  *  NOTE that here too we assume 32-wide warps.
+  */
+ static const unsigned int c_fullWarpMask = 0xffffffff;
+ /* Below are backward-compatibility wrappers for CUDA 9 warp-wide intrinsics. */
+ /*! \brief Compatibility wrapper around the CUDA __syncwarp() instrinsic.  */
+ static __forceinline__ __device__
+ void gmx_syncwarp(const unsigned int activeMask = c_fullWarpMask)
+ {
+ #if GMX_CUDA_VERSION < 9000
+     /* no sync needed on pre-Volta. */
+     GMX_UNUSED_VALUE(activeMask);
+ #else
+     __syncwarp(activeMask);
+ #endif
+ }
+ /*! \brief Compatibility wrapper around the CUDA __ballot()/__ballot_sync() instrinsic.  */
+ static __forceinline__ __device__
+ unsigned int gmx_ballot_sync(const unsigned int activeMask,
+                              const int          pred)
+ {
+ #if GMX_CUDA_VERSION < 9000
+     GMX_UNUSED_VALUE(activeMask);
+     return __ballot(pred);
+ #else
+     return __ballot_sync(activeMask, pred);
+ #endif
+ }
+ /*! \brief Compatibility wrapper around the CUDA __any()/__any_sync() instrinsic.  */
+ static __forceinline__ __device__
+ int gmx_any_sync(const unsigned int activeMask,
+                  const int          pred)
+ {
+ #if GMX_CUDA_VERSION < 9000
+     GMX_UNUSED_VALUE(activeMask);
+     return __any(pred);
+ #else
+     return __any_sync(activeMask, pred);
+ #endif
+ }
+ /*! \brief Compatibility wrapper around the CUDA __shfl_up()/__shfl_up_sync() instrinsic.  */
+ template <typename T>
+ static __forceinline__ __device__
+ T gmx_shfl_up_sync(const unsigned int activeMask,
+                    const T            var,
+                    unsigned int       offset)
+ {
+ #if GMX_CUDA_VERSION < 9000
+     GMX_UNUSED_VALUE(activeMask);
+     return __shfl_up(var, offset);
+ #else
+     return __shfl_up_sync(activeMask, var, offset);
+ #endif
+ }
+ /*! \brief Compatibility wrapper around the CUDA __shfl_down()/__shfl_down_sync() instrinsic.  */
+ template <typename T>
+ static __forceinline__ __device__
+ T gmx_shfl_down_sync(const unsigned int activeMask,
+                      const T            var,
+                      unsigned int       offset)
+ {
+ #if GMX_CUDA_VERSION < 9000
+     GMX_UNUSED_VALUE(activeMask);
+     return __shfl_down(var, offset);
+ #else
+     return __shfl_down_sync(activeMask, var, offset);
+ #endif
+ }
  
 +/*! \brief Allow disabling CUDA textures using the GMX_DISABLE_CUDA_TEXTURES macro.
 + *
 + *  This option will not influence functionality. All features using textures ought
 + *  to have fallback for texture-less reads (direct/LDG loads), all new code needs
 + *  to provide fallback code.
 + */
 +#if defined GMX_DISABLE_CUDA_TEXTURES
 +#define DISABLE_CUDA_TEXTURES 1
 +#else
 +#define DISABLE_CUDA_TEXTURES 0
 +#endif
 +
 +/* CUDA architecture technical characteristics. Needs macros because it is used
 + * in the __launch_bounds__ function qualifiers and might need it in preprocessor
 + * conditionals.
 + *
 + */
 +#if GMX_PTX_ARCH > 0
 +    #if   GMX_PTX_ARCH <= 210  // CC 2.x
 +        #define GMX_CUDA_MAX_BLOCKS_PER_MP   8
 +        #define GMX_CUDA_MAX_THREADS_PER_MP  1536
 +    #elif GMX_PTX_ARCH <= 370  // CC 3.x
 +        #define GMX_CUDA_MAX_BLOCKS_PER_MP   16
 +        #define GMX_CUDA_MAX_THREADS_PER_MP  2048
 +    #else // CC 5.x, 6.x
 +          /* Note that this final branch covers all future architectures (current gen
 +           * is 6.x as of writing), hence assuming that these *currently defined* upper
 +           * limits will not be lowered.
 +           */
 +        #define GMX_CUDA_MAX_BLOCKS_PER_MP   32
 +        #define GMX_CUDA_MAX_THREADS_PER_MP  2048
 +    #endif
 +#else
 +        #define GMX_CUDA_MAX_BLOCKS_PER_MP   0
 +        #define GMX_CUDA_MAX_THREADS_PER_MP  0
 +#endif
 +
  #endif /* CUDA_ARCH_UTILS_CUH_ */
index 8ac8938075045fe3aa3ebd1baae8c545eebf36b0,dde0bb3669e6150f87ea0c3d703ab8bc0e34b956..6021331ffb4740684113555b319579d683fda89a
@@@ -1,7 -1,7 +1,7 @@@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
-  * Copyright (c) 2012,2014,2015,2016, by the GROMACS development team, led by
 - * Copyright (c) 2012,2014,2015,2017, by the GROMACS development team, led by
++ * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
Simple merge
Simple merge
index 3077bb54bff9a977ba4b52550b909fc55bbc490f,a9411e42ec8aef631f8e62afe8aa2a705da51c1a..c4ec038d2c4504a46c70d8d66ce3c1b475bf1377
@@@ -345,12 -351,10 +354,14 @@@ __global__ void NB_KERNEL_FUNC_NAME(nbn
  
  #endif                                  /* CALC_ENERGIES */
  
-     const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
 +#ifdef EXCLUSION_FORCES
++    const int nonSelfInteraction  = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
 +#endif
 +
+     int          j4LoopStart      = cij4_start + tidxz;
+     unsigned int j4LoopThreadMask = gmx_ballot_sync(c_fullWarpMask, j4LoopStart < cij4_end);
      /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
-     for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
+     for (j4 = j4LoopStart; j4 < cij4_end; j4 += NTHREAD_Z)
      {
          wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
          imask       = pl_cj4[j4].imei[widx].imask;
index 3c0f016515a9842d921f359fee20c778bd0e4cf7,82077bb96a2655d8559a46fe4bd9ab1f709e1168..71f1901434c914cb5ef53782aca3070d42f0bac2
  
  /*! \brief Log of the i and j cluster size.
   *  change this together with c_clSize !*/
- static const int      c_clSizeLog2  = 3;
+ static const int          c_clSizeLog2  = 3;
  /*! \brief Square of cluster size. */
- static const int      c_clSizeSq    = c_clSize*c_clSize;
+ static const int          c_clSizeSq    = c_clSize*c_clSize;
  /*! \brief j-cluster size after split (4 in the current implementation). */
- static const int      c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+ static const int          c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
  /*! \brief Stride in the force accumualation buffer */
- static const int      c_fbufStride  = c_clSizeSq;
+ static const int          c_fbufStride  = c_clSizeSq;
 +/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
- static const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
++static const unsigned     superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
  
- static const float    c_oneSixth    = 0.16666667f;
- static const float    c_oneTwelveth = 0.08333333f;
+ static const float        c_oneSixth    = 0.16666667f;
+ static const float        c_oneTwelveth = 0.08333333f;
  
 -/* With multiple compilation units this ensures that texture refs are available
 -   in the the kernels' compilation units. */
 -#if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
 -/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
 -extern texture<float, 1, cudaReadModeElementType> nbfp_texref;
 -
 -/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
 -extern texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
 -
 -/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
 -extern texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
 -#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
  
  /*! Convert LJ sigma,epsilon parameters to C6,C12. */
  static __forceinline__ __device__
@@@ -534,14 -443,15 +534,15 @@@ void reduce_force_j_generic(float *f_bu
  /*! Final j-force reduction; this implementation only with power of two
   *  array sizes and with sm >= 3.0
   */
 -#if GMX_PTX_ARCH >= 300
 +#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
  static __forceinline__ __device__
  void reduce_force_j_warp_shfl(float3 f, float3 *fout,
-                               int tidxi, int aidx)
+                               int tidxi, int aidx,
+                               const unsigned int activemask)
  {
-     f.x += __shfl_down(f.x, 1);
-     f.y += __shfl_up  (f.y, 1);
-     f.z += __shfl_down(f.z, 1);
+     f.x += gmx_shfl_down_sync(activemask, f.x, 1);
+     f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
+     f.z += gmx_shfl_down_sync(activemask, f.z, 1);
  
      if (tidxi & 1)
      {
index 5e43352fc8dd4de744cad27fe48f33993c3f4cf4,0787d52af08f687c9283788ecdca7cc1fe2443b2..77cf7559a8e99c896d725d0d70fbf171d2679d35
@@@ -535,7 -535,7 +535,7 @@@ static inline SimdDInt32 gmx_simdcal
  operator<<(SimdDInt32 a, int n)
  {
      return {
-                vshl_n_s32(a.simdInternal_, n)
 -        vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? 32 : n))
++               vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? 32 : n))
      };
  }
  
@@@ -543,7 -543,7 +543,7 @@@ static inline SimdDInt32 gmx_simdcal
  operator>>(SimdDInt32 a, int n)
  {
      return {
-                vshr_n_s32(a.simdInternal_, n)
 -        vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? -32 : -n))
++               vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? -32 : -n))
      };
  }
  
index 902f7bfb84ad0c192d77a6a8b45a6ae9445e7758,bbb8e59754577e05b1f2fe1c3b6c5f253d1eaeaa..58a53ef7a813ec4ecb921828de553261a913e4d8
@@@ -505,22 -498,21 +505,22 @@@ double gmx::do_md(FILE *fplog, t_commre
          set_constraints(constr, top, ir, mdatoms, cr);
      }
  
 -    if (repl_ex_nst > 0 && MASTER(cr))
 +    const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
 +    if (useReplicaExchange && MASTER(cr))
      {
          repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
 -                                        repl_ex_nst, repl_ex_nex, repl_ex_seed);
 +                                        replExParams);
      }
  
-     /* PME tuning is only supported with PME for Coulomb. Is is not supported
-      * with only LJ PME, or for reruns.
-      */
+     /* PME tuning is only supported in the Verlet scheme, with PME for
+      * Coulomb. It is not supported with only LJ PME, or for
+      * reruns. */
      bPMETune = ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && !bRerunMD &&
-                 !(Flags & MD_REPRODUCIBLE));
+                 !(Flags & MD_REPRODUCIBLE) && ir->cutoff_scheme != ecutsGROUP);
      if (bPMETune)
      {
 -        pme_loadbal_init(&pme_loadbal, cr, fplog, ir, state->box,
 -                         fr->ic, fr->pmedata, use_GPU(fr->nbv),
 +        pme_loadbal_init(&pme_loadbal, cr, mdlog, ir, state->box,
 +                         fr->ic, fr->nbv->listParams.get(), fr->pmedata, use_GPU(fr->nbv),
                           &bPMETunePrinting);
      }
  
index dd70f417dec683a56cb4bdac58678cfd69dc2cce,798f7682742304554848fcc85b505101ffa6cad9..148b4a0c197771a3e3b24b6defeb4be353b888b9
@@@ -317,12 -365,12 +317,12 @@@ int Mdrunner::mainFunction(int argc, ch
            "load balancing." },
          { "-gcom",    FALSE, etINT, {&nstglobalcomm},
            "Global communication frequency" },
 -        { "-nb",      FALSE, etENUM, {&nbpu_opt},
 +        { "-nb",      FALSE, etENUM, {&nbpu_opt_choices},
            "Calculate non-bonded interactions on" },
 -        { "-nstlist", FALSE, etINT, {&nstlist},
 +        { "-nstlist", FALSE, etINT, {&nstlist_cmdline},
            "Set nstlist when using a Verlet buffer tolerance (0 is guess)" },
          { "-tunepme", FALSE, etBOOL, {&bTunePME},
-           "Optimize PME load between PP/PME ranks or GPU/CPU" },
+           "Optimize PME load between PP/PME ranks or GPU/CPU (only with the Verlet cut-off scheme)" },
          { "-v",       FALSE, etBOOL, {&bVerbose},
            "Be loud and noisy" },
          { "-pforce",  FALSE, etREAL, {&pforce},