include(gmxSetBuildInformation)
gmx_set_build_information()
-if(BUILD_CPU_FEATURES MATCHES "rdtscp")
+if(BUILD_CPU_FEATURES MATCHES "rdtscp" AND NOT GMX_DISTRIBUTABLE_BUILD)
# The timestep counter headers do not include config.h
add_definitions(-DHAVE_RDTSCP)
-endif(BUILD_CPU_FEATURES MATCHES "rdtscp")
+endif(BUILD_CPU_FEATURES MATCHES "rdtscp" AND NOT GMX_DISTRIBUTABLE_BUILD)
include(gmxTestFloatFormat)
gmx_test_float_format(GMX_FLOAT_FORMAT_IEEE754
message(WARNING "No C++ AVX flag found. Consider a newer compiler, or disable AVX for much lower performance.")
endif (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
+ # Set the FMA4 flags (MSVC doesn't require any)
+ if(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
+ GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" GROMACS_C_FLAGS)
+ if (NOT GNU_FMA_CFLAG)
+ message(WARNING "No C FMA4 flag found. Consider a newer compiler, or disable AVX_128_FMA for much lower performance.")
+ endif(NOT GNU_FMA_CFLAG)
+ if (CMAKE_CXX_COMPILER_LOADED)
+ GMX_TEST_CXXFLAG(GNU_FMA_CXXFLAG "-mfma4" GROMACS_CXX_FLAGS)
+ if (NOT GNU_FMA_CXXFLAG)
+ message(WARNING "No C++ FMA flag found. Consider a newer compiler, or disable AVX_128_FMA for much lower performance.")
+ endif (NOT GNU_FMA_CXXFLAG)
+ endif()
+ endif()
+
# Only test the header after we have tried to add the flag for AVX support
check_include_file(immintrin.h HAVE_IMMINTRIN_H ${GROMACS_C_FLAGS})
message(FATAL_ERROR "Cannot find immintrin.h, which is required for AVX intrinsics support. Consider switching compiler.")
endif(NOT HAVE_IMMINTRIN_H)
- # AMD says we should include x86intrin.h for FMA support, but MSVC seems to do fine without it, so don't require it.
+ # GCC requires x86intrin.h for FMA support. MSVC 2010 requires intrin.h for FMA support.
check_include_file(x86intrin.h HAVE_X86INTRIN_H ${GROMACS_C_FLAGS})
+ check_include_file(intrin.h HAVE_INTRIN_H ${GROMACS_C_FLAGS})
# The user should not be able to set this orthogonally to the acceleration
set(GMX_X86_SSE4_1 1)
/* Define to 1 if you have the <x86intrin.h> header file */
#cmakedefine HAVE_X86INTRIN_H
+/* Define to 1 if you have the <intrin.h> header file */
+#cmakedefine HAVE_INTRIN_H
+
/* Define to 1 if you have the <sched.h> header */
#cmakedefine HAVE_SCHED_H
#ifdef HAVE_X86INTRIN_H
#include <x86intrin.h> /* FMA */
#endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
#include <math.h>
}
}
+static void
+nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
+ nbnxn_atomdata_output_t * gmx_restrict src,
+ int nsrc,
+ int i0, int i1)
+{
+ int i,s;
+
+ for(i=i0; i<i1; i++)
+ {
+ for(s=0; s<nsrc; s++)
+ {
+ dest[i] += src[s].f[i];
+ }
+ }
+}
+
+static void
+nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
+ nbnxn_atomdata_output_t * gmx_restrict src,
+ int nsrc,
+ int i0, int i1)
+{
+#ifdef NBNXN_SEARCH_SSE
+#ifdef GMX_X86_AVX_256
+#define GMX_MM256_HERE
+#else
+#define GMX_MM128_HERE
+#endif
+#include "gmx_x86_simd_macros.h"
+
+ int i,s;
+ gmx_mm_pr dest_SSE,src_SSE;
+
+ if ((i0 & (GMX_X86_SIMD_WIDTH_HERE-1)) ||
+ (i1 & (GMX_X86_SIMD_WIDTH_HERE-1)))
+ {
+ gmx_incons("bounds not a multiple of GMX_X86_SIMD_WIDTH_HERE in nbnxn_atomdata_reduce_reals_x86_simd");
+ }
+
+ for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+ {
+ dest_SSE = gmx_load_pr(dest+i);
+ for(s=0; s<nsrc; s++)
+ {
+ src_SSE = gmx_load_pr(src[s].f+i);
+ dest_SSE = gmx_add_pr(dest_SSE,src_SSE);
+ }
+ gmx_store_pr(dest+i,dest_SSE);
+ }
+
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
+#endif
+}
+
/* Add part of the force array(s) from nbnxn_atomdata_t to f */
static void
nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
{
int a0=0,na=0;
int nth,th;
+ gmx_bool bStreamingReduce;
nbs_cycle_start(&nbs->cc[enbsCCreducef]);
}
nth = gmx_omp_nthreads_get(emntNonbonded);
+
+ /* Using the two-step streaming reduction is probably always faster */
+ bStreamingReduce = (nbat->nout > 1);
+
+ if (bStreamingReduce)
+ {
+ /* Reduce the force thread output buffers into buffer 0, before adding
+ * them to the, differently ordered, "real" force buffer.
+ */
+#pragma omp parallel for num_threads(nth) schedule(static)
+ for(th=0; th<nth; th++)
+ {
+ int g0,g1;
+ int b0,b1,nb;
+ int blocksize,i0,i1;
+
+ /* For which grids should we reduce the force output? */
+ g0 = ((locality==eatLocal || locality==eatAll) ? 0 : 1);
+ g1 = (locality==eatLocal ? 1 : nbs->ngrid);
+
+ /* Get the grid cell bounds */
+ b0 = nbs->grid[g0].cell0;
+ b1 = nbs->grid[g1-1].cell0 + nbs->grid[g1-1].nc;
+ blocksize = nbs->grid[g0].na_sc*nbat->fstride;
+ /* The simple grid size in atoms is a multiple of na_cj.
+ * With float-AVX256 we use this and make blocksize a multiple of 8.
+ */
+ if (nbs->grid[0].bSimple && nbs->grid[0].na_cj > nbs->grid[0].na_c)
+ {
+ blocksize *= 2;
+ b0 /= 2;
+ b1 /= 2;
+ }
+ nb = b1 - b0;
+
+ /* Calculate the index range for our thread */
+ i0 = (b0 + (nb* th )/nth)*blocksize;
+ i1 = (b0 + (nb*(th+1))/nth)*blocksize;
+
+#ifdef NBNXN_SEARCH_SSE
+ nbnxn_atomdata_reduce_reals_x86_simd(
+#else
+ nbnxn_atomdata_reduce_reals(
+#endif
+ nbat->out[0].f,
+ nbat->out+1,nbat->nout - 1,
+ i0,i1);
+ }
+ }
+
#pragma omp parallel for num_threads(nth) schedule(static)
for(th=0; th<nth; th++)
{
nbnxn_atomdata_add_nbat_f_to_f_part(nbs,nbat,
- nbat->out,
- nbat->nout,
- a0+((th+0)*na)/nth,
- a0+((th+1)*na)/nth,
- f);
+ nbat->out,
+ bStreamingReduce ? 1 : nbat->nout,
+ a0+((th+0)*na)/nth,
+ a0+((th+1)*na)/nth,
+ f);
}
nbs_cycle_stop(&nbs->cc[enbsCCreducef]);
{ "-nlevels", FALSE, etINT, {&nlevels},"number of levels for matrix colors"},
{ "-max",FALSE, etREAL, {&cutmax},"max value for energies"},
{ "-min",FALSE, etREAL, {&cutmin},"min value for energies"},
- { "-coul", FALSE, etBOOL, {&bCoulSR},"extract Coulomb SR energies"},
- { "-coulr", FALSE, etBOOL, {&bCoulLR},"extract Coulomb LR energies"},
+ { "-coulsr", FALSE, etBOOL, {&bCoulSR},"extract Coulomb SR energies"},
+ { "-coullr", FALSE, etBOOL, {&bCoulLR},"extract Coulomb LR energies"},
{ "-coul14",FALSE, etBOOL, {&bCoul14},"extract Coulomb 1-4 energies"},
- { "-lj", FALSE, etBOOL, {&bLJSR},"extract Lennard-Jones SR energies"},
- { "-lj", FALSE, etBOOL, {&bLJLR},"extract Lennard-Jones LR energies"},
+ { "-ljsr", FALSE, etBOOL, {&bLJSR},"extract Lennard-Jones SR energies"},
+ { "-ljlr", FALSE, etBOOL, {&bLJLR},"extract Lennard-Jones LR energies"},
{ "-lj14",FALSE, etBOOL, {&bLJ14},"extract Lennard-Jones 1-4 energies"},
{ "-bhamsr",FALSE, etBOOL, {&bBhamSR},"extract Buckingham SR energies"},
{ "-bhamlr",FALSE, etBOOL, {&bBhamLR},"extract Buckingham LR energies"},