# Prepare a default suggestion
set(OUTPUT_SIMD "None")
- # We need to execute the binary, so this only works if not cross-compiling.
- # However, note that we are NOT limited to x86.
- if(NOT CMAKE_CROSSCOMPILING)
- # TODO Extract this try_compile to a helper function, because
- # it duplicates code in gmxSetBuildInformation.cmake
- set(GMX_DETECTSIMD_BINARY "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/GmxDetectSimd${CMAKE_EXECUTABLE_SUFFIX}")
- set(LINK_LIBRARIES "${GMX_STDLIB_LIBRARIES}")
- try_compile(GMX_DETECTSIMD_COMPILED
- "${CMAKE_CURRENT_BINARY_DIR}"
- "${CMAKE_CURRENT_SOURCE_DIR}/src/gromacs/hardware/cpuinfo.cpp"
- COMPILE_DEFINITIONS "${_compile_definitions}"
- CMAKE_FLAGS "-DLINK_LIBRARIES=${LINK_LIBRARIES}"
- OUTPUT_VARIABLE GMX_DETECTSIMD_COMPILED_OUTPUT
- COPY_FILE ${GMX_DETECTSIMD_BINARY})
- unset(_compile_definitions)
+ # Detect CPU features and place the string in CPU_DETECTION_FEATURES
+ # Note that we are NOT limited to x86.
+ gmx_run_cpu_detection(features)
+
+ if (DEFINED CPU_DETECTION_FEATURES)
+ # Make a concrete suggestion of SIMD level if a feature flag
+ # matches. Make sure that the match strings below work even if
+ # the feature is first or last.
+ set(CPU_DETECTION_FEATURES " ${CPU_DETECTION_FEATURES} ")
- if(GMX_DETECTSIMD_COMPILED)
- if(NOT DEFINED GMX_DETECTSIMD_RUN)
- execute_process(COMMAND ${GMX_DETECTSIMD_BINARY} "-features"
- RESULT_VARIABLE GMX_DETECTSIMD_RUN
- OUTPUT_VARIABLE OUTPUT_TMP
- ERROR_QUIET)
- set(GMX_DETECTSIMD_RUN "${GMX_DETECTSIMD_RUN}" CACHE INTERNAL "Result of running cpuinfo code to detect SIMD support")
- if(GMX_DETECTSIMD_RUN EQUAL 0)
- # Make a concrete suggestion of SIMD level
- if(GMX_TARGET_X86)
- if(OUTPUT_TMP MATCHES " avx512er ")
- set(OUTPUT_SIMD "AVX_512_KNL")
- elseif(OUTPUT_TMP MATCHES " avx512f ")
- set(OUTPUT_SIMD "AVX_512")
- elseif(OUTPUT_TMP MATCHES " avx2 ")
- set(OUTPUT_SIMD "AVX2_256")
- elseif(OUTPUT_TMP MATCHES " avx ")
- if(OUTPUT_TMP MATCHES " fma4 ")
- # AMD that works better with avx-128-fma
- set(OUTPUT_SIMD "AVX_128_FMA")
- else()
- # Intel
- set(OUTPUT_SIMD "AVX_256")
- endif()
- elseif(OUTPUT_TMP MATCHES " sse4.1 ")
- set(OUTPUT_SIMD "SSE4.1")
- elseif(OUTPUT_TMP MATCHES " sse2 ")
- set(OUTPUT_SIMD "SSE2")
- endif()
- else()
- if(OUTPUT_TMP MATCHES " vsx ")
- set(OUTPUT_SIMD "IBM_VSX")
- elseif(OUTPUT_TMP MATCHES " vmx ")
- set(OUTPUT_SIMD "IBM_VMX")
- elseif(OUTPUT_TMP MATCHES " qpx ")
- set(OUTPUT_SIMD "IBM_QPX")
- elseif(OUTPUT_TMP MATCHES " neon_asimd ")
- set(OUTPUT_SIMD "ARM_NEON_ASIMD")
- elseif(OUTPUT_TMP MATCHES " neon " AND NOT GMX_DOUBLE)
- set(OUTPUT_SIMD "ARM_NEON")
- endif()
- endif()
- message(STATUS "Detected best SIMD instructions for this CPU - ${OUTPUT_SIMD}")
+ if(GMX_TARGET_X86)
+ if(CPU_DETECTION_FEATURES MATCHES " avx512er ")
+ set(OUTPUT_SIMD "AVX_512_KNL")
+ elseif(CPU_DETECTION_FEATURES MATCHES " avx512f ")
+ set(OUTPUT_SIMD "AVX_512")
+ elseif(CPU_DETECTION_FEATURES MATCHES " avx2 ")
+ if(CPU_DETECTION_FEATURES MATCHES " amd ")
+ set(OUTPUT_SIMD "AVX2_128")
+ else()
+ set(OUTPUT_SIMD "AVX2_256")
+ endif()
+ elseif(CPU_DETECTION_FEATURES MATCHES " avx ")
+ if(CPU_DETECTION_FEATURES MATCHES " fma4 ")
+ # AMD that works better with avx-128-fma
+ set(OUTPUT_SIMD "AVX_128_FMA")
else()
- message(WARNING "Cannot run cpuinfo code, which means no SIMD suggestion can be made.")
- message(STATUS "Run output: ${OUTPUT_TMP}")
+ # Intel
+ set(OUTPUT_SIMD "AVX_256")
endif()
+ elseif(CPU_DETECTION_FEATURES MATCHES " sse4.1 ")
+ set(OUTPUT_SIMD "SSE4.1")
+ elseif(CPU_DETECTION_FEATURES MATCHES " sse2 ")
+ set(OUTPUT_SIMD "SSE2")
endif()
else()
- message(WARNING "Cannot compile cpuinfo code, which means no SIMD instructions.")
- message(STATUS "Compile output: ${GMX_DETECTSIMD_COMPILED_OUTPUT}")
+ if(CPU_DETECTION_FEATURES MATCHES " vsx ")
+ set(OUTPUT_SIMD "IBM_VSX")
+ elseif(CPU_DETECTION_FEATURES MATCHES " vmx ")
+ set(OUTPUT_SIMD "IBM_VMX")
+ elseif(CPU_DETECTION_FEATURES MATCHES " qpx ")
+ set(OUTPUT_SIMD "IBM_QPX")
+ elseif(CPU_DETECTION_FEATURES MATCHES " neon_asimd ")
+ set(OUTPUT_SIMD "ARM_NEON_ASIMD")
- elseif(CPU_DETECTION_FEATURES MATCHES " neon ")
++ elseif(CPU_DETECTION_FEATURES MATCHES " neon " AND NOT GMX_DOUBLE)
+ set(OUTPUT_SIMD "ARM_NEON")
+ endif()
+ endif()
+ if (NOT SUGGEST_SIMD_QUIETLY)
+ message(STATUS "Detected best SIMD instructions for this CPU - ${OUTPUT_SIMD}")
endif()
else()
- message(WARNING "Cannot detect SIMD architecture for this cross-compile; you should check it manually.")
+ if (NOT SUGGEST_SIMD_QUIETLY)
+ message(STATUS "Detection for best SIMD instructions failed, using SIMD - ${OUTPUT_SIMD}")
+ endif()
endif()
- set(${_suggested_simd} "${OUTPUT_SIMD}" CACHE INTERNAL "Suggested SIMD")
+ set(${_suggested_simd} "${OUTPUT_SIMD}" PARENT_SCOPE)
+ set(SUGGEST_SIMD_QUIETLY TRUE CACHE INTERNAL "Be quiet during future construction of SIMD suggestions")
endfunction()
function(gmx_detect_simd _suggested_simd)
set(SIMD_C_FLAGS "${TOOLCHAIN_C_FLAGS}")
set(SIMD_CXX_FLAGS "${TOOLCHAIN_CXX_FLAGS}")
- set(GMX_SIMD_X86_${GMX_SIMD} 1)
+ set(GMX_SIMD_X86_${GMX_SIMD_ACTIVE} 1)
set(SIMD_STATUS_MESSAGE "Enabling 512-bit AVX-512-KNL SIMD instructions")
-elseif(GMX_SIMD STREQUAL "ARM_NEON")
+elseif(GMX_SIMD_ACTIVE STREQUAL "ARM_NEON")
+ if (GMX_DOUBLE)
+ message(FATAL_ERROR "ARM_NEON SIMD support is not available for a double precision build because the architecture lacks double-precision support")
+ endif()
+
gmx_find_flags(
"#include<arm_neon.h>
int main(){float32x4_t x=vdupq_n_f32(0.5);x=vmlaq_f32(x,x,x);return vgetq_lane_f32(x,0)>0;}"
}
else
{
- tmpr_coulomb = set->rcut_coulomb + pme_lb->rbuf_coulomb;
- tmpr_vdw = pme_lb->rcut_vdw + pme_lb->rbuf_vdw;
+ /* TODO Remove these lines and pme_lb->cutoff_scheme */
- set->rlist = std::min(tmpr_coulomb, tmpr_vdw);
+ tmpr_coulomb = set->rcut_coulomb + pme_lb->rbufOuter_coulomb;
+ tmpr_vdw = pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw;
+ /* Two (known) bugs with cutoff-scheme=group here:
+ * - This modification of rlist results in incorrect DD comunication.
+ * - We should set fr->bTwinRange = (fr->rlistlong > fr->rlist).
+ */
+ set->rlistOuter = std::min(tmpr_coulomb, tmpr_vdw);
+ set->rlistInner = set->rlistOuter;
}
- set->spacing = sp;
+ set->spacing = sp;
/* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
set->grid_efficiency = 1;
for (d = 0; d < DIM; d++)
*/
static const int warp_size = 32;
static const int warp_size_log2 = 5;
+ /*! \brief Bitmask corresponding to all threads active in a warp.
+ * NOTE that here too we assume 32-wide warps.
+ */
+ static const unsigned int c_fullWarpMask = 0xffffffff;
+
+ /* Below are backward-compatibility wrappers for CUDA 9 warp-wide intrinsics. */
+
+ /*! \brief Compatibility wrapper around the CUDA __syncwarp() instrinsic. */
+ static __forceinline__ __device__
+ void gmx_syncwarp(const unsigned int activeMask = c_fullWarpMask)
+ {
+ #if GMX_CUDA_VERSION < 9000
+ /* no sync needed on pre-Volta. */
+ GMX_UNUSED_VALUE(activeMask);
+ #else
+ __syncwarp(activeMask);
+ #endif
+ }
+
+ /*! \brief Compatibility wrapper around the CUDA __ballot()/__ballot_sync() instrinsic. */
+ static __forceinline__ __device__
+ unsigned int gmx_ballot_sync(const unsigned int activeMask,
+ const int pred)
+ {
+ #if GMX_CUDA_VERSION < 9000
+ GMX_UNUSED_VALUE(activeMask);
+ return __ballot(pred);
+ #else
+ return __ballot_sync(activeMask, pred);
+ #endif
+ }
+
+ /*! \brief Compatibility wrapper around the CUDA __any()/__any_sync() instrinsic. */
+ static __forceinline__ __device__
+ int gmx_any_sync(const unsigned int activeMask,
+ const int pred)
+ {
+ #if GMX_CUDA_VERSION < 9000
+ GMX_UNUSED_VALUE(activeMask);
+ return __any(pred);
+ #else
+ return __any_sync(activeMask, pred);
+ #endif
+ }
+
+ /*! \brief Compatibility wrapper around the CUDA __shfl_up()/__shfl_up_sync() instrinsic. */
+ template <typename T>
+ static __forceinline__ __device__
+ T gmx_shfl_up_sync(const unsigned int activeMask,
+ const T var,
+ unsigned int offset)
+ {
+ #if GMX_CUDA_VERSION < 9000
+ GMX_UNUSED_VALUE(activeMask);
+ return __shfl_up(var, offset);
+ #else
+ return __shfl_up_sync(activeMask, var, offset);
+ #endif
+ }
+
+ /*! \brief Compatibility wrapper around the CUDA __shfl_down()/__shfl_down_sync() instrinsic. */
+ template <typename T>
+ static __forceinline__ __device__
+ T gmx_shfl_down_sync(const unsigned int activeMask,
+ const T var,
+ unsigned int offset)
+ {
+ #if GMX_CUDA_VERSION < 9000
+ GMX_UNUSED_VALUE(activeMask);
+ return __shfl_down(var, offset);
+ #else
+ return __shfl_down_sync(activeMask, var, offset);
+ #endif
+ }
+/*! \brief Allow disabling CUDA textures using the GMX_DISABLE_CUDA_TEXTURES macro.
+ *
+ * This option will not influence functionality. All features using textures ought
+ * to have fallback for texture-less reads (direct/LDG loads), all new code needs
+ * to provide fallback code.
+ */
+#if defined GMX_DISABLE_CUDA_TEXTURES
+#define DISABLE_CUDA_TEXTURES 1
+#else
+#define DISABLE_CUDA_TEXTURES 0
+#endif
+
+/* CUDA architecture technical characteristics. Needs macros because it is used
+ * in the __launch_bounds__ function qualifiers and might need it in preprocessor
+ * conditionals.
+ *
+ */
+#if GMX_PTX_ARCH > 0
+ #if GMX_PTX_ARCH <= 210 // CC 2.x
+ #define GMX_CUDA_MAX_BLOCKS_PER_MP 8
+ #define GMX_CUDA_MAX_THREADS_PER_MP 1536
+ #elif GMX_PTX_ARCH <= 370 // CC 3.x
+ #define GMX_CUDA_MAX_BLOCKS_PER_MP 16
+ #define GMX_CUDA_MAX_THREADS_PER_MP 2048
+ #else // CC 5.x, 6.x
+ /* Note that this final branch covers all future architectures (current gen
+ * is 6.x as of writing), hence assuming that these *currently defined* upper
+ * limits will not be lowered.
+ */
+ #define GMX_CUDA_MAX_BLOCKS_PER_MP 32
+ #define GMX_CUDA_MAX_THREADS_PER_MP 2048
+ #endif
+#else
+ #define GMX_CUDA_MAX_BLOCKS_PER_MP 0
+ #define GMX_CUDA_MAX_THREADS_PER_MP 0
+#endif
+
#endif /* CUDA_ARCH_UTILS_CUH_ */
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2014,2015,2016, by the GROMACS development team, led by
- * Copyright (c) 2012,2014,2015,2017, by the GROMACS development team, led by
++ * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#endif /* CALC_ENERGIES */
- const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+#ifdef EXCLUSION_FORCES
++ const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+#endif
+
+ int j4LoopStart = cij4_start + tidxz;
+ unsigned int j4LoopThreadMask = gmx_ballot_sync(c_fullWarpMask, j4LoopStart < cij4_end);
/* loop over the j clusters = seen by any of the atoms in the current super-cluster */
- for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
+ for (j4 = j4LoopStart; j4 < cij4_end; j4 += NTHREAD_Z)
{
wexcl_idx = pl_cj4[j4].imei[widx].excl_ind;
imask = pl_cj4[j4].imei[widx].imask;
/*! \brief Log of the i and j cluster size.
* change this together with c_clSize !*/
- static const int c_clSizeLog2 = 3;
+ static const int c_clSizeLog2 = 3;
/*! \brief Square of cluster size. */
- static const int c_clSizeSq = c_clSize*c_clSize;
+ static const int c_clSizeSq = c_clSize*c_clSize;
/*! \brief j-cluster size after split (4 in the current implementation). */
- static const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+ static const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
/*! \brief Stride in the force accumualation buffer */
- static const int c_fbufStride = c_clSizeSq;
+ static const int c_fbufStride = c_clSizeSq;
+/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
- static const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
++static const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
- static const float c_oneSixth = 0.16666667f;
- static const float c_oneTwelveth = 0.08333333f;
+ static const float c_oneSixth = 0.16666667f;
+ static const float c_oneTwelveth = 0.08333333f;
-/* With multiple compilation units this ensures that texture refs are available
- in the the kernels' compilation units. */
-#if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
-/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
-extern texture<float, 1, cudaReadModeElementType> nbfp_texref;
-
-/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
-extern texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
-
-/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
-extern texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
-#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
static __forceinline__ __device__
/*! Final j-force reduction; this implementation only with power of two
* array sizes and with sm >= 3.0
*/
-#if GMX_PTX_ARCH >= 300
+#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
static __forceinline__ __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
- int tidxi, int aidx)
+ int tidxi, int aidx,
+ const unsigned int activemask)
{
- f.x += __shfl_down(f.x, 1);
- f.y += __shfl_up (f.y, 1);
- f.z += __shfl_down(f.z, 1);
+ f.x += gmx_shfl_down_sync(activemask, f.x, 1);
+ f.y += gmx_shfl_up_sync (activemask, f.y, 1);
+ f.z += gmx_shfl_down_sync(activemask, f.z, 1);
if (tidxi & 1)
{
operator<<(SimdDInt32 a, int n)
{
return {
- vshl_n_s32(a.simdInternal_, n)
- vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? 32 : n))
++ vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? 32 : n))
};
}
operator>>(SimdDInt32 a, int n)
{
return {
- vshr_n_s32(a.simdInternal_, n)
- vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? -32 : -n))
++ vshl_s32(a.simdInternal_, vdup_n_s32(n >= 32 ? -32 : -n))
};
}
set_constraints(constr, top, ir, mdatoms, cr);
}
- if (repl_ex_nst > 0 && MASTER(cr))
+ const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
+ if (useReplicaExchange && MASTER(cr))
{
repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
- repl_ex_nst, repl_ex_nex, repl_ex_seed);
+ replExParams);
}
- /* PME tuning is only supported with PME for Coulomb. Is is not supported
- * with only LJ PME, or for reruns.
- */
+ /* PME tuning is only supported in the Verlet scheme, with PME for
+ * Coulomb. It is not supported with only LJ PME, or for
+ * reruns. */
bPMETune = ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && !bRerunMD &&
- !(Flags & MD_REPRODUCIBLE));
+ !(Flags & MD_REPRODUCIBLE) && ir->cutoff_scheme != ecutsGROUP);
if (bPMETune)
{
- pme_loadbal_init(&pme_loadbal, cr, fplog, ir, state->box,
- fr->ic, fr->pmedata, use_GPU(fr->nbv),
+ pme_loadbal_init(&pme_loadbal, cr, mdlog, ir, state->box,
+ fr->ic, fr->nbv->listParams.get(), fr->pmedata, use_GPU(fr->nbv),
&bPMETunePrinting);
}
"load balancing." },
{ "-gcom", FALSE, etINT, {&nstglobalcomm},
"Global communication frequency" },
- { "-nb", FALSE, etENUM, {&nbpu_opt},
+ { "-nb", FALSE, etENUM, {&nbpu_opt_choices},
"Calculate non-bonded interactions on" },
- { "-nstlist", FALSE, etINT, {&nstlist},
+ { "-nstlist", FALSE, etINT, {&nstlist_cmdline},
"Set nstlist when using a Verlet buffer tolerance (0 is guess)" },
{ "-tunepme", FALSE, etBOOL, {&bTunePME},
- "Optimize PME load between PP/PME ranks or GPU/CPU" },
+ "Optimize PME load between PP/PME ranks or GPU/CPU (only with the Verlet cut-off scheme)" },
{ "-v", FALSE, etBOOL, {&bVerbose},
"Be loud and noisy" },
{ "-pforce", FALSE, etREAL, {&pforce},