Merge branch release-2016

author Mark Abraham <mark.j.abraham@gmail.com>

Mon, 21 Aug 2017 23:51:09 +0000 (01:51 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Tue, 22 Aug 2017 11:19:22 +0000 (13:19 +0200)
author Mark Abraham <mark.j.abraham@gmail.com>
Mon, 21 Aug 2017 23:51:09 +0000 (01:51 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Tue, 22 Aug 2017 11:19:22 +0000 (13:19 +0200)
diff --cc cmake/gmxDetectSimd.cmake

index e83c3abd790d0e6c65d6a22b2a0a76eb321af918,c0cf2f56ac17db11670d50fc40101c7a3c223233..50088be99f190879bf4d71a9804e27ddba2a99d5
--- 1/cmake/gmxDetectSimd.cmake
--- 2/cmake/gmxDetectSimd.cmake
+++ b/cmake/gmxDetectSimd.cmake
@@@ -61,64 -72,89 +61,64 @@@ function(gmx_suggest_simd _suggested_si
       # Prepare a default suggestion
       set(OUTPUT_SIMD "None")
   
- -    # We need to execute the binary, so this only works if not cross-compiling.
- -    # However, note that we are NOT limited to x86.
- -    if(NOT CMAKE_CROSSCOMPILING)
- -        # TODO Extract this try_compile to a helper function, because
- -        # it duplicates code in gmxSetBuildInformation.cmake
- -        set(GMX_DETECTSIMD_BINARY "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/GmxDetectSimd${CMAKE_EXECUTABLE_SUFFIX}")
- -        set(LINK_LIBRARIES "${GMX_STDLIB_LIBRARIES}")
- -        try_compile(GMX_DETECTSIMD_COMPILED
- -            "${CMAKE_CURRENT_BINARY_DIR}"
- -            "${CMAKE_CURRENT_SOURCE_DIR}/src/gromacs/hardware/cpuinfo.cpp"
- -            COMPILE_DEFINITIONS "${_compile_definitions}"
- -            CMAKE_FLAGS "-DLINK_LIBRARIES=${LINK_LIBRARIES}"
- -            OUTPUT_VARIABLE GMX_DETECTSIMD_COMPILED_OUTPUT
- -            COPY_FILE ${GMX_DETECTSIMD_BINARY})
- -        unset(_compile_definitions)
+ +    # Detect CPU features and place the string in CPU_DETECTION_FEATURES
+ +    # Note that we are NOT limited to x86.
+ +    gmx_run_cpu_detection(features)
+ +
+ +    if (DEFINED CPU_DETECTION_FEATURES)
+ +        # Make a concrete suggestion of SIMD level if a feature flag
+ +        # matches. Make sure that the match strings below work even if
+ +        # the feature is first or last.
+ +        set(CPU_DETECTION_FEATURES " ${CPU_DETECTION_FEATURES} ")
   
- -        if(GMX_DETECTSIMD_COMPILED)
- -            if(NOT DEFINED GMX_DETECTSIMD_RUN)
- -                execute_process(COMMAND ${GMX_DETECTSIMD_BINARY} "-features"
- -                    RESULT_VARIABLE GMX_DETECTSIMD_RUN
- -                    OUTPUT_VARIABLE OUTPUT_TMP
- -                    ERROR_QUIET)
- -                set(GMX_DETECTSIMD_RUN "${GMX_DETECTSIMD_RUN}" CACHE INTERNAL "Result of running cpuinfo code to detect SIMD support")
- -                if(GMX_DETECTSIMD_RUN EQUAL 0)
- -                    # Make a concrete suggestion of SIMD level
- -                    if(GMX_TARGET_X86)
- -                        if(OUTPUT_TMP MATCHES " avx512er ")
- -                            set(OUTPUT_SIMD "AVX_512_KNL")
- -                        elseif(OUTPUT_TMP MATCHES " avx512f ")
- -                            set(OUTPUT_SIMD "AVX_512")
- -                        elseif(OUTPUT_TMP MATCHES " avx2 ")
- -                            set(OUTPUT_SIMD "AVX2_256")
- -                        elseif(OUTPUT_TMP MATCHES " avx ")
- -                            if(OUTPUT_TMP MATCHES " fma4 ")
- -                                # AMD that works better with avx-128-fma
- -                                set(OUTPUT_SIMD "AVX_128_FMA")
- -                            else()
- -                                # Intel
- -                                set(OUTPUT_SIMD "AVX_256")
- -                            endif()
- -                        elseif(OUTPUT_TMP MATCHES " sse4.1 ")
- -                            set(OUTPUT_SIMD "SSE4.1")
- -                        elseif(OUTPUT_TMP MATCHES " sse2 ")
- -                            set(OUTPUT_SIMD "SSE2")
- -                        endif()
- -                    else()
- -                        if(OUTPUT_TMP MATCHES " vsx ")
- -                            set(OUTPUT_SIMD "IBM_VSX")
- -                        elseif(OUTPUT_TMP MATCHES " vmx ")
- -                            set(OUTPUT_SIMD "IBM_VMX")
- -                        elseif(OUTPUT_TMP MATCHES " qpx ")
- -                            set(OUTPUT_SIMD "IBM_QPX")
- -                        elseif(OUTPUT_TMP MATCHES " neon_asimd ")
- -                            set(OUTPUT_SIMD "ARM_NEON_ASIMD")
- -                        elseif(OUTPUT_TMP MATCHES " neon " AND NOT GMX_DOUBLE)
- -                            set(OUTPUT_SIMD "ARM_NEON")
- -                        endif()
- -                    endif()
- -                    message(STATUS "Detected best SIMD instructions for this CPU - ${OUTPUT_SIMD}")
+ +        if(GMX_TARGET_X86)
+ +            if(CPU_DETECTION_FEATURES MATCHES " avx512er ")
+ +                set(OUTPUT_SIMD "AVX_512_KNL")
+ +            elseif(CPU_DETECTION_FEATURES MATCHES " avx512f ")
+ +                set(OUTPUT_SIMD "AVX_512")
+ +            elseif(CPU_DETECTION_FEATURES MATCHES " avx2 ")
+ +                if(CPU_DETECTION_FEATURES MATCHES " amd ")
+ +                    set(OUTPUT_SIMD "AVX2_128")
+ +                else()
+ +                    set(OUTPUT_SIMD "AVX2_256")
+ +                endif()
+ +            elseif(CPU_DETECTION_FEATURES MATCHES " avx ")
+ +                if(CPU_DETECTION_FEATURES MATCHES " fma4 ")
+ +                    # AMD that works better with avx-128-fma
+ +                    set(OUTPUT_SIMD "AVX_128_FMA")
                   else()
- -                    message(WARNING "Cannot run cpuinfo code, which means no SIMD suggestion can be made.")
- -                    message(STATUS "Run output: ${OUTPUT_TMP}")
+ +                    # Intel
+ +                    set(OUTPUT_SIMD "AVX_256")
                   endif()
+ +            elseif(CPU_DETECTION_FEATURES MATCHES " sse4.1 ")
+ +                set(OUTPUT_SIMD "SSE4.1")
+ +            elseif(CPU_DETECTION_FEATURES MATCHES " sse2 ")
+ +                set(OUTPUT_SIMD "SSE2")
               endif()
           else()
- -            message(WARNING "Cannot compile cpuinfo code, which means no SIMD instructions.")
- -            message(STATUS "Compile output: ${GMX_DETECTSIMD_COMPILED_OUTPUT}")
+ +            if(CPU_DETECTION_FEATURES MATCHES " vsx ")
+ +                set(OUTPUT_SIMD "IBM_VSX")
+ +            elseif(CPU_DETECTION_FEATURES MATCHES " vmx ")
+ +                set(OUTPUT_SIMD "IBM_VMX")
+ +            elseif(CPU_DETECTION_FEATURES MATCHES " qpx ")
+ +                set(OUTPUT_SIMD "IBM_QPX")
+ +            elseif(CPU_DETECTION_FEATURES MATCHES " neon_asimd ")
+ +                set(OUTPUT_SIMD "ARM_NEON_ASIMD")
-             elseif(CPU_DETECTION_FEATURES MATCHES " neon ")
++            elseif(CPU_DETECTION_FEATURES MATCHES " neon " AND NOT GMX_DOUBLE)
+ +                set(OUTPUT_SIMD "ARM_NEON")
+ +            endif()
+ +        endif()
+ +        if (NOT SUGGEST_SIMD_QUIETLY)
+ +            message(STATUS "Detected best SIMD instructions for this CPU - ${OUTPUT_SIMD}")
           endif()
       else()
- -        message(WARNING "Cannot detect SIMD architecture for this cross-compile; you should check it manually.")
+ +        if (NOT SUGGEST_SIMD_QUIETLY)
+ +            message(STATUS "Detection for best SIMD instructions failed, using SIMD - ${OUTPUT_SIMD}")
+ +        endif()
       endif()
   
- -    set(${_suggested_simd} "${OUTPUT_SIMD}" CACHE INTERNAL "Suggested SIMD")
+ +    set(${_suggested_simd} "${OUTPUT_SIMD}" PARENT_SCOPE)
+ +    set(SUGGEST_SIMD_QUIETLY TRUE CACHE INTERNAL "Be quiet during future construction of SIMD suggestions")
   endfunction()
   
   function(gmx_detect_simd _suggested_simd)
diff --cc cmake/gmxManageNvccConfig.cmake
Simple merge
diff --cc cmake/gmxManageSimd.cmake

index 64661de6f9e07e8eb1e49fb36e109a49e28891cb,b1b76a51d3844d420948c455d4640fbff960481e..84045516d6607e5a1f08920553b4066cfaf1b1e5
--- 1/cmake/gmxManageSimd.cmake
--- 2/cmake/gmxManageSimd.cmake
+++ b/cmake/gmxManageSimd.cmake
@@@ -348,11 -341,15 +348,15 @@@ elseif(GMX_SIMD_ACTIVE STREQUAL "AVX_51
   
       set(SIMD_C_FLAGS "${TOOLCHAIN_C_FLAGS}")
       set(SIMD_CXX_FLAGS "${TOOLCHAIN_CXX_FLAGS}")
- -    set(GMX_SIMD_X86_${GMX_SIMD} 1)
+ +    set(GMX_SIMD_X86_${GMX_SIMD_ACTIVE} 1)
       set(SIMD_STATUS_MESSAGE "Enabling 512-bit AVX-512-KNL SIMD instructions")
   
- -elseif(GMX_SIMD STREQUAL "ARM_NEON")
+ +elseif(GMX_SIMD_ACTIVE STREQUAL "ARM_NEON")
   
+     if (GMX_DOUBLE)
+         message(FATAL_ERROR "ARM_NEON SIMD support is not available for a double precision build because the architecture lacks double-precision support")
+     endif()
+ 
       gmx_find_flags(
           "#include<arm_neon.h>
            int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}"
diff --cc docs/CMakeLists.txt
Simple merge
diff --cc docs/manual/algorithms.tex
Simple merge
diff --cc docs/manual/special.tex
Simple merge
diff --cc src/gromacs/ewald/pme-load-balancing.cpp

index 384ed8daaafa23be4723d3acaa7e9b5de7d4b6ae,292624fb4d783b2c356701c8b72837d9e267be6d..6e2bea5d8739446eb0e9efc1996d0936fa447f5f
--- 1/src/gromacs/ewald/pme-load-balancing.cpp
--- 2/src/gromacs/ewald/pme-load-balancing.cpp
+++ b/src/gromacs/ewald/pme-load-balancing.cpp
@@@ -372,13 -361,17 +374,18 @@@ static gmx_bool pme_loadbal_increase_cu
       }
       else
       {
- -        tmpr_coulomb          = set->rcut_coulomb + pme_lb->rbuf_coulomb;
- -        tmpr_vdw              = pme_lb->rcut_vdw + pme_lb->rbuf_vdw;
+         /* TODO Remove these lines and pme_lb->cutoff_scheme */
- -        set->rlist            = std::min(tmpr_coulomb, tmpr_vdw);
+ +        tmpr_coulomb     = set->rcut_coulomb + pme_lb->rbufOuter_coulomb;
+ +        tmpr_vdw         = pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw;
+         /* Two (known) bugs with cutoff-scheme=group here:
+          * - This modification of rlist results in incorrect DD comunication.
+          * - We should set fr->bTwinRange = (fr->rlistlong > fr->rlist).
+          */
+ +        set->rlistOuter  = std::min(tmpr_coulomb, tmpr_vdw);
+ +        set->rlistInner  = set->rlistOuter;
       }
   
- -    set->spacing      = sp;
+ +    set->spacing         = sp;
       /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
       set->grid_efficiency = 1;
       for (d = 0; d < DIM; d++)
diff --cc src/gromacs/gmxana/gmx_msd.cpp
Simple merge
diff --cc src/gromacs/gpu_utils/cuda_arch_utils.cuh

index efd59644a965d10783ec420bd497d2384129d02f,b6a23047267fd4b9428b8f006ecaebc829c7e9b8..a60bf9ebb80537871ba7f5b52100458ae62c6509
--- 1/src/gromacs/gpu_utils/cuda_arch_utils.cuh
--- 2/src/gromacs/gpu_utils/cuda_arch_utils.cuh
+++ b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
@@@ -57,42 -59,79 +59,116 @@@
    */
   static const int warp_size      = 32;
   static const int warp_size_log2 = 5;
+ /*! \brief Bitmask corresponding to all threads active in a warp.
+  *  NOTE that here too we assume 32-wide warps.
+  */
+ static const unsigned int c_fullWarpMask = 0xffffffff;
+ 
+ /* Below are backward-compatibility wrappers for CUDA 9 warp-wide intrinsics. */
+ 
+ /*! \brief Compatibility wrapper around the CUDA __syncwarp() instrinsic.  */
+ static __forceinline__ __device__
+ void gmx_syncwarp(const unsigned int activeMask = c_fullWarpMask)
+ {
+ #if GMX_CUDA_VERSION < 9000
+     /* no sync needed on pre-Volta. */
+     GMX_UNUSED_VALUE(activeMask);
+ #else
+     __syncwarp(activeMask);
+ #endif
+ }
+ 
+ /*! \brief Compatibility wrapper around the CUDA __ballot()/__ballot_sync() instrinsic.  */
+ static __forceinline__ __device__
+ unsigned int gmx_ballot_sync(const unsigned int activeMask,
+                              const int          pred)
+ {
+ #if GMX_CUDA_VERSION < 9000
+     GMX_UNUSED_VALUE(activeMask);
+     return __ballot(pred);
+ #else
+     return __ballot_sync(activeMask, pred);
+ #endif
+ }
+ 
+ /*! \brief Compatibility wrapper around the CUDA __any()/__any_sync() instrinsic.  */
+ static __forceinline__ __device__
+ int gmx_any_sync(const unsigned int activeMask,
+                  const int          pred)
+ {
+ #if GMX_CUDA_VERSION < 9000
+     GMX_UNUSED_VALUE(activeMask);
+     return __any(pred);
+ #else
+     return __any_sync(activeMask, pred);
+ #endif
+ }
+ 
+ /*! \brief Compatibility wrapper around the CUDA __shfl_up()/__shfl_up_sync() instrinsic.  */
+ template <typename T>
+ static __forceinline__ __device__
+ T gmx_shfl_up_sync(const unsigned int activeMask,
+                    const T            var,
+                    unsigned int       offset)
+ {
+ #if GMX_CUDA_VERSION < 9000
+     GMX_UNUSED_VALUE(activeMask);
+     return __shfl_up(var, offset);
+ #else
+     return __shfl_up_sync(activeMask, var, offset);
+ #endif
+ }
+ 
+ /*! \brief Compatibility wrapper around the CUDA __shfl_down()/__shfl_down_sync() instrinsic.  */
+ template <typename T>
+ static __forceinline__ __device__
+ T gmx_shfl_down_sync(const unsigned int activeMask,
+                      const T            var,
+                      unsigned int       offset)
+ {
+ #if GMX_CUDA_VERSION < 9000
+     GMX_UNUSED_VALUE(activeMask);
+     return __shfl_down(var, offset);
+ #else
+     return __shfl_down_sync(activeMask, var, offset);
+ #endif
+ }
   
+ +/*! \brief Allow disabling CUDA textures using the GMX_DISABLE_CUDA_TEXTURES macro.
+ + *
+ + *  This option will not influence functionality. All features using textures ought
+ + *  to have fallback for texture-less reads (direct/LDG loads), all new code needs
+ + *  to provide fallback code.
+ + */
+ +#if defined GMX_DISABLE_CUDA_TEXTURES
+ +#define DISABLE_CUDA_TEXTURES 1
+ +#else
+ +#define DISABLE_CUDA_TEXTURES 0
+ +#endif
+ +
+ +/* CUDA architecture technical characteristics. Needs macros because it is used
+ + * in the __launch_bounds__ function qualifiers and might need it in preprocessor
+ + * conditionals.
+ + *
+ + */
+ +#if GMX_PTX_ARCH > 0
+ +    #if   GMX_PTX_ARCH <= 210  // CC 2.x
+ +        #define GMX_CUDA_MAX_BLOCKS_PER_MP   8
+ +        #define GMX_CUDA_MAX_THREADS_PER_MP  1536
+ +    #elif GMX_PTX_ARCH <= 370  // CC 3.x
+ +        #define GMX_CUDA_MAX_BLOCKS_PER_MP   16
+ +        #define GMX_CUDA_MAX_THREADS_PER_MP  2048
+ +    #else // CC 5.x, 6.x
+ +          /* Note that this final branch covers all future architectures (current gen
+ +           * is 6.x as of writing), hence assuming that these *currently defined* upper
+ +           * limits will not be lowered.
+ +           */
+ +        #define GMX_CUDA_MAX_BLOCKS_PER_MP   32
+ +        #define GMX_CUDA_MAX_THREADS_PER_MP  2048
+ +    #endif
+ +#else
+ +        #define GMX_CUDA_MAX_BLOCKS_PER_MP   0
+ +        #define GMX_CUDA_MAX_THREADS_PER_MP  0
+ +#endif
+ +
   #endif /* CUDA_ARCH_UTILS_CUH_ */
diff --cc src/gromacs/gpu_utils/cudautils.cu

index 8ac8938075045fe3aa3ebd1baae8c545eebf36b0,dde0bb3669e6150f87ea0c3d703ab8bc0e34b956..6021331ffb4740684113555b319579d683fda89a
--- 1/src/gromacs/gpu_utils/cudautils.cu
--- 2/src/gromacs/gpu_utils/cudautils.cu
+++ b/src/gromacs/gpu_utils/cudautils.cu
@@@ -1,7 -1,7 +1,7 @@@
   /*
    * This file is part of the GROMACS molecular simulation package.
    *
-  * Copyright (c) 2012,2014,2015,2016, by the GROMACS development team, led by
- - * Copyright (c) 2012,2014,2015,2017, by the GROMACS development team, led by
++ * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
    * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
    * and including many others, as listed in the AUTHORS file in the
    * top-level source directory and at http://www.gromacs.org.
diff --cc src/gromacs/gpu_utils/cudautils.cuh
Simple merge
diff --cc src/gromacs/mdlib/forcerec.cpp
Simple merge
diff --cc src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 3077bb54bff9a977ba4b52550b909fc55bbc490f,a9411e42ec8aef631f8e62afe8aa2a705da51c1a..c4ec038d2c4504a46c70d8d66ce3c1b475bf1377
--- 1/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
--- 2/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@@ -345,12 -351,10 +354,14 @@@ __global__ void NB_KERNEL_FUNC_NAME(nbn
   
   #endif                                  /* CALC_ENERGIES */
   
-     const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+ +#ifdef EXCLUSION_FORCES
++    const int nonSelfInteraction  = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+ +#endif
+ +
+     int          j4LoopStart      = cij4_start + tidxz;
+     unsigned int j4LoopThreadMask = gmx_ballot_sync(c_fullWarpMask, j4LoopStart < cij4_end);
       /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
-     for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
+     for (j4 = j4LoopStart; j4 < cij4_end; j4 += NTHREAD_Z)
       {
           wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
           imask       = pl_cj4[j4].imei[widx].imask;
diff --cc src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

index 3c0f016515a9842d921f359fee20c778bd0e4cf7,82077bb96a2655d8559a46fe4bd9ab1f709e1168..71f1901434c914cb5ef53782aca3070d42f0bac2
--- 1/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
--- 2/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@@ -66,19 -68,29 +66,19 @@@
   
   /*! \brief Log of the i and j cluster size.
    *  change this together with c_clSize !*/
- static const int      c_clSizeLog2  = 3;
+ static const int          c_clSizeLog2  = 3;
   /*! \brief Square of cluster size. */
- static const int      c_clSizeSq    = c_clSize*c_clSize;
+ static const int          c_clSizeSq    = c_clSize*c_clSize;
   /*! \brief j-cluster size after split (4 in the current implementation). */
- static const int      c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+ static const int          c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
   /*! \brief Stride in the force accumualation buffer */
- static const int      c_fbufStride  = c_clSizeSq;
+ static const int          c_fbufStride  = c_clSizeSq;
+ +/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
- static const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
++static const unsigned     superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
   
- static const float    c_oneSixth    = 0.16666667f;
- static const float    c_oneTwelveth = 0.08333333f;
+ static const float        c_oneSixth    = 0.16666667f;
+ static const float        c_oneTwelveth = 0.08333333f;
   
- -/* With multiple compilation units this ensures that texture refs are available
- -   in the the kernels' compilation units. */
- -#if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
- -/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
- -extern texture<float, 1, cudaReadModeElementType> nbfp_texref;
- -
- -/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
- -extern texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
- -
- -/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
- -extern texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
- -#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
   
   /*! Convert LJ sigma,epsilon parameters to C6,C12. */
   static __forceinline__ __device__
@@@ -534,14 -443,15 +534,15 @@@ void reduce_force_j_generic(float *f_bu
   /*! Final j-force reduction; this implementation only with power of two
    *  array sizes and with sm >= 3.0
    */
- -#if GMX_PTX_ARCH >= 300
+ +#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
   static __forceinline__ __device__
   void reduce_force_j_warp_shfl(float3 f, float3 *fout,
-                               int tidxi, int aidx)
+                               int tidxi, int aidx,
+                               const unsigned int activemask)
   {
-     f.x += __shfl_down(f.x, 1);
-     f.y += __shfl_up  (f.y, 1);
-     f.z += __shfl_down(f.z, 1);
+     f.x += gmx_shfl_down_sync(activemask, f.x, 1);
+     f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
+     f.z += gmx_shfl_down_sync(activemask, f.z, 1);
   
       if (tidxi & 1)
       {
diff --cc src/gromacs/mdrunutility/handlerestart.cpp
Simple merge
diff --cc src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h

index 5e43352fc8dd4de744cad27fe48f33993c3f4cf4,0787d52af08f687c9283788ecdca7cc1fe2443b2..77cf7559a8e99c896d725d0d70fbf171d2679d35
--- 1/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h
--- 2/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h
+++ b/src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h
@@@ -535,7 -535,7 +535,7 @@@ static inline SimdDInt32 gmx_simdcal
   operator<<(SimdDInt32 a, int n)
   {
       return {
-                vshl_n_s32(a.simdInternal_, n)
- -        vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? 32 : n))
++               vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? 32 : n))
       };
   }
   
@@@ -543,7 -543,7 +543,7 @@@ static inline SimdDInt32 gmx_simdcal
   operator>>(SimdDInt32 a, int n)
   {
       return {
-                vshr_n_s32(a.simdInternal_, n)
- -        vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? -32 : -n))
++               vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? -32 : -n))
       };
   }
   
diff --cc src/programs/mdrun/md.cpp

index 902f7bfb84ad0c192d77a6a8b45a6ae9445e7758,bbb8e59754577e05b1f2fe1c3b6c5f253d1eaeaa..58a53ef7a813ec4ecb921828de553261a913e4d8
--- 1/src/programs/mdrun/md.cpp
--- 2/src/programs/mdrun/md.cpp
+++ b/src/programs/mdrun/md.cpp
@@@ -505,22 -498,21 +505,22 @@@ double gmx::do_md(FILE *fplog, t_commre
           set_constraints(constr, top, ir, mdatoms, cr);
       }
   
- -    if (repl_ex_nst > 0 && MASTER(cr))
+ +    const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
+ +    if (useReplicaExchange && MASTER(cr))
       {
           repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
- -                                        repl_ex_nst, repl_ex_nex, repl_ex_seed);
+ +                                        replExParams);
       }
   
-     /* PME tuning is only supported with PME for Coulomb. Is is not supported
-      * with only LJ PME, or for reruns.
-      */
+     /* PME tuning is only supported in the Verlet scheme, with PME for
+      * Coulomb. It is not supported with only LJ PME, or for
+      * reruns. */
       bPMETune = ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && !bRerunMD &&
-                 !(Flags & MD_REPRODUCIBLE));
+                 !(Flags & MD_REPRODUCIBLE) && ir->cutoff_scheme != ecutsGROUP);
       if (bPMETune)
       {
- -        pme_loadbal_init(&pme_loadbal, cr, fplog, ir, state->box,
- -                         fr->ic, fr->pmedata, use_GPU(fr->nbv),
+ +        pme_loadbal_init(&pme_loadbal, cr, mdlog, ir, state->box,
+ +                         fr->ic, fr->nbv->listParams.get(), fr->pmedata, use_GPU(fr->nbv),
                            &bPMETunePrinting);
       }
   
diff --cc src/programs/mdrun/mdrun.cpp

index dd70f417dec683a56cb4bdac58678cfd69dc2cce,798f7682742304554848fcc85b505101ffa6cad9..148b4a0c197771a3e3b24b6defeb4be353b888b9
--- 1/src/programs/mdrun/mdrun.cpp
--- 2/src/programs/mdrun/mdrun.cpp
+++ b/src/programs/mdrun/mdrun.cpp
@@@ -317,12 -365,12 +317,12 @@@ int Mdrunner::mainFunction(int argc, ch
             "load balancing." },
           { "-gcom",    FALSE, etINT, {&nstglobalcomm},
             "Global communication frequency" },
- -        { "-nb",      FALSE, etENUM, {&nbpu_opt},
+ +        { "-nb",      FALSE, etENUM, {&nbpu_opt_choices},
             "Calculate non-bonded interactions on" },
- -        { "-nstlist", FALSE, etINT, {&nstlist},
+ +        { "-nstlist", FALSE, etINT, {&nstlist_cmdline},
             "Set nstlist when using a Verlet buffer tolerance (0 is guess)" },
           { "-tunepme", FALSE, etBOOL, {&bTunePME},
-           "Optimize PME load between PP/PME ranks or GPU/CPU" },
+           "Optimize PME load between PP/PME ranks or GPU/CPU (only with the Verlet cut-off scheme)" },
           { "-v",       FALSE, etBOOL, {&bVerbose},
             "Be loud and noisy" },
           { "-pforce",  FALSE, etREAL, {&pforce},
author	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 21 Aug 2017 23:51:09 +0000 (01:51 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Tue, 22 Aug 2017 11:19:22 +0000 (13:19 +0200)
		1	2
cmake/gmxDetectSimd.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
cmake/gmxManageNvccConfig.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
cmake/gmxManageSimd.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
docs/CMakeLists.txt	patch \|	diff1 \|	diff2 \|	blob \| history
docs/manual/algorithms.tex	patch \|	diff1 \|	diff2 \|	blob \| history
docs/manual/special.tex	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/ewald/pme-load-balancing.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxana/gmx_msd.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gpu_utils/cuda_arch_utils.cuh	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gpu_utils/cudautils.cu	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gpu_utils/cudautils.cuh	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/forcerec.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdrunutility/handlerestart.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd_simd_double.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/programs/mdrun/md.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/programs/mdrun/mdrun.cpp	patch \|	diff1 \|	diff2 \|	blob \| history