Merge release-4-6 into master

author Roland Schulz <roland@utk.edu>

Mon, 12 Nov 2012 23:45:33 +0000 (18:45 -0500)

committer Roland Schulz <roland@utk.edu>

Mon, 12 Nov 2012 23:45:33 +0000 (18:45 -0500)
author Roland Schulz <roland@utk.edu>
Mon, 12 Nov 2012 23:45:33 +0000 (18:45 -0500)
committer Roland Schulz <roland@utk.edu>
Mon, 12 Nov 2012 23:45:33 +0000 (18:45 -0500)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 356f6ce19f70816f61e40e399730c66f4fb2c943..46a2e31c6dfbbf8231e3855fa8707f7177165110 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -650,10 +650,10 @@ gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM)
  
  include(gmxSetBuildInformation)
  gmx_set_build_information()
-if(BUILD_CPU_FEATURES MATCHES "rdtscp")
+if(BUILD_CPU_FEATURES MATCHES "rdtscp" AND NOT GMX_DISTRIBUTABLE_BUILD)
      # The timestep counter headers do not include config.h
      add_definitions(-DHAVE_RDTSCP)
-endif(BUILD_CPU_FEATURES MATCHES "rdtscp")
+endif(BUILD_CPU_FEATURES MATCHES "rdtscp" AND NOT GMX_DISTRIBUTABLE_BUILD)
  
  include(gmxTestFloatFormat)
  gmx_test_float_format(GMX_FLOAT_FORMAT_IEEE754 
@@ -785,6 +785,20 @@ elseif(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_ACCELERATION} STREQUA
         message(WARNING "No C++ AVX flag found. Consider a newer compiler, or disable AVX for much lower performance.")
      endif (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
  
+    # Set the FMA4 flags (MSVC doesn't require any)
+    if(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
+        GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" GROMACS_C_FLAGS)
+        if (NOT GNU_FMA_CFLAG)
+            message(WARNING "No C FMA4 flag found. Consider a newer compiler, or disable AVX_128_FMA for much lower performance.")
+        endif(NOT GNU_FMA_CFLAG)
+        if (CMAKE_CXX_COMPILER_LOADED)
+            GMX_TEST_CXXFLAG(GNU_FMA_CXXFLAG "-mfma4" GROMACS_CXX_FLAGS)
+            if (NOT GNU_FMA_CXXFLAG)
+                message(WARNING "No C++ FMA flag found. Consider a newer compiler, or disable AVX_128_FMA for much lower performance.")
+            endif (NOT GNU_FMA_CXXFLAG)
+        endif()
+    endif()
+
      # Only test the header after we have tried to add the flag for AVX support
      check_include_file(immintrin.h  HAVE_IMMINTRIN_H ${GROMACS_C_FLAGS})
  
@@ -792,8 +806,9 @@ elseif(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_ACCELERATION} STREQUA
          message(FATAL_ERROR "Cannot find immintrin.h, which is required for AVX intrinsics support. Consider switching compiler.")
      endif(NOT HAVE_IMMINTRIN_H)
  
-    # AMD says we should include x86intrin.h for FMA support, but MSVC seems to do fine without it, so don't require it.
+    # GCC requires x86intrin.h for FMA support. MSVC 2010 requires intrin.h for FMA support.
      check_include_file(x86intrin.h HAVE_X86INTRIN_H ${GROMACS_C_FLAGS})
+    check_include_file(intrin.h HAVE_INTRIN_H ${GROMACS_C_FLAGS})
  
      # The user should not be able to set this orthogonally to the acceleration
      set(GMX_X86_SSE4_1 1)
diff --git a/src/config.h.cmakein b/src/config.h.cmakein

index ade12fd5464b381d176cbf3cb067764574f5101e..3e9448c5729d67625c029fe3ed2056844d3a0fca 100644 (file)
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -299,6 +299,9 @@
  /* Define to 1 if you have the <x86intrin.h> header file */
  #cmakedefine HAVE_X86INTRIN_H
  
+/* Define to 1 if you have the <intrin.h> header file */
+#cmakedefine HAVE_INTRIN_H
+
  /* Define to 1 if you have the <sched.h> header */
  #cmakedefine HAVE_SCHED_H
  
diff --git a/src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_single.h b/src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_single.h

index 8d48f2cbca1d5ffd3c5008ef07728b2a77d819e3..0adb058a9d152ac5671184b22de97829ad82b3d0 100644 (file)
--- a/src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_single.h
+++ b/src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_single.h
@@ -25,6 +25,9 @@
  #ifdef HAVE_X86INTRIN_H
  #include <x86intrin.h> /* FMA */
  #endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
  
  #include <math.h>
  
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c

index fb3614274ca99f3921a390d054cc80bedb71c193..54e327e9f1272769a32f4fe0deb38e8c7f940f05 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_atomdata.c
+++ b/src/gromacs/mdlib/nbnxn_atomdata.c
@@ -962,6 +962,62 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
      }
  }
  
+static void
+nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
+                            nbnxn_atomdata_output_t * gmx_restrict src,
+                            int nsrc,
+                            int i0, int i1)
+{
+    int i,s;
+
+    for(i=i0; i<i1; i++)
+    {
+        for(s=0; s<nsrc; s++)
+        {
+            dest[i] += src[s].f[i];
+        }
+    }
+}
+
+static void
+nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
+                                     nbnxn_atomdata_output_t * gmx_restrict src,
+                                     int nsrc,
+                                     int i0, int i1)
+{
+#ifdef NBNXN_SEARCH_SSE
+#ifdef GMX_X86_AVX_256
+#define GMX_MM256_HERE
+#else
+#define GMX_MM128_HERE
+#endif
+#include "gmx_x86_simd_macros.h"
+
+    int       i,s;
+    gmx_mm_pr dest_SSE,src_SSE;
+
+    if ((i0 & (GMX_X86_SIMD_WIDTH_HERE-1)) ||
+        (i1 & (GMX_X86_SIMD_WIDTH_HERE-1)))
+    {
+        gmx_incons("bounds not a multiple of GMX_X86_SIMD_WIDTH_HERE in nbnxn_atomdata_reduce_reals_x86_simd");
+    }
+
+    for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+    {
+        dest_SSE = gmx_load_pr(dest+i);
+        for(s=0; s<nsrc; s++)
+        {
+            src_SSE  = gmx_load_pr(src[s].f+i);
+            dest_SSE = gmx_add_pr(dest_SSE,src_SSE);
+        }
+        gmx_store_pr(dest+i,dest_SSE);
+    }
+
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
+#endif
+}
+
  /* Add part of the force array(s) from nbnxn_atomdata_t to f */
  static void
  nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
@@ -1079,6 +1135,7 @@ void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
  {
      int a0=0,na=0;
      int nth,th;
+    gmx_bool bStreamingReduce;
  
      nbs_cycle_start(&nbs->cc[enbsCCreducef]);
  
@@ -1099,15 +1156,65 @@ void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
      }
  
      nth = gmx_omp_nthreads_get(emntNonbonded);
+
+    /* Using the two-step streaming reduction is probably always faster */
+    bStreamingReduce = (nbat->nout > 1);
+
+    if (bStreamingReduce)
+    {
+        /* Reduce the force thread output buffers into buffer 0, before adding
+         * them to the, differently ordered, "real" force buffer.
+         */
+#pragma omp parallel for num_threads(nth) schedule(static)
+        for(th=0; th<nth; th++)
+        {
+            int g0,g1;
+            int b0,b1,nb;
+            int blocksize,i0,i1;
+
+            /* For which grids should we reduce the force output? */
+            g0 = ((locality==eatLocal || locality==eatAll) ? 0 : 1);
+            g1 = (locality==eatLocal ? 1 : nbs->ngrid);
+
+            /* Get the grid cell bounds */
+            b0 = nbs->grid[g0].cell0;
+            b1 = nbs->grid[g1-1].cell0 + nbs->grid[g1-1].nc;
+            blocksize = nbs->grid[g0].na_sc*nbat->fstride;
+            /* The simple grid size in atoms is a multiple of na_cj.
+             * With float-AVX256 we use this and make blocksize a multiple of 8.
+             */
+            if (nbs->grid[0].bSimple && nbs->grid[0].na_cj > nbs->grid[0].na_c)
+            {
+                blocksize *= 2;
+                b0 /= 2;
+                b1 /= 2;
+            }
+            nb = b1 - b0;
+
+            /* Calculate the index range for our thread */
+            i0 = (b0 + (nb* th   )/nth)*blocksize;
+            i1 = (b0 + (nb*(th+1))/nth)*blocksize;
+
+#ifdef NBNXN_SEARCH_SSE
+            nbnxn_atomdata_reduce_reals_x86_simd(
+#else
+            nbnxn_atomdata_reduce_reals(    
+#endif
+                                        nbat->out[0].f,
+                                        nbat->out+1,nbat->nout - 1,
+                                        i0,i1);
+        }
+    }
+
  #pragma omp parallel for num_threads(nth) schedule(static)
      for(th=0; th<nth; th++)
      {
          nbnxn_atomdata_add_nbat_f_to_f_part(nbs,nbat,
-                                             nbat->out,
-                                             nbat->nout,
-                                             a0+((th+0)*na)/nth,
-                                             a0+((th+1)*na)/nth,
-                                             f);
+                                            nbat->out,
+                                            bStreamingReduce ? 1 : nbat->nout,
+                                            a0+((th+0)*na)/nth,
+                                            a0+((th+1)*na)/nth,
+                                            f);
      }
  
      nbs_cycle_stop(&nbs->cc[enbsCCreducef]);
diff --git a/src/tools/gmx_enemat.c b/src/tools/gmx_enemat.c

index 18cb971e604363ddbede01da26cff91018dcaa52..077b885ce6b20680dacbdd814b0689338ad811ec 100644 (file)
--- a/src/tools/gmx_enemat.c
+++ b/src/tools/gmx_enemat.c
@@ -122,11 +122,11 @@ int gmx_enemat(int argc,char *argv[])
      { "-nlevels", FALSE, etINT, {&nlevels},"number of levels for matrix colors"},
      { "-max",FALSE, etREAL, {&cutmax},"max value for energies"},
      { "-min",FALSE, etREAL, {&cutmin},"min value for energies"},
-    { "-coul", FALSE, etBOOL, {&bCoulSR},"extract Coulomb SR energies"},
-    { "-coulr", FALSE, etBOOL, {&bCoulLR},"extract Coulomb LR energies"},
+    { "-coulsr", FALSE, etBOOL, {&bCoulSR},"extract Coulomb SR energies"},
+    { "-coullr", FALSE, etBOOL, {&bCoulLR},"extract Coulomb LR energies"},
      { "-coul14",FALSE, etBOOL, {&bCoul14},"extract Coulomb 1-4 energies"},
-    { "-lj", FALSE, etBOOL, {&bLJSR},"extract Lennard-Jones SR energies"},
-    { "-lj", FALSE, etBOOL, {&bLJLR},"extract Lennard-Jones LR energies"},
+    { "-ljsr", FALSE, etBOOL, {&bLJSR},"extract Lennard-Jones SR energies"},
+    { "-ljlr", FALSE, etBOOL, {&bLJLR},"extract Lennard-Jones LR energies"},
      { "-lj14",FALSE, etBOOL, {&bLJ14},"extract Lennard-Jones 1-4 energies"},
      { "-bhamsr",FALSE, etBOOL, {&bBhamSR},"extract Buckingham SR energies"},
      { "-bhamlr",FALSE, etBOOL, {&bBhamLR},"extract Buckingham LR energies"},
author	Roland Schulz <roland@utk.edu>
	Mon, 12 Nov 2012 23:45:33 +0000 (18:45 -0500)
committer	Roland Schulz <roland@utk.edu>
	Mon, 12 Nov 2012 23:45:33 +0000 (18:45 -0500)
CMakeLists.txt		patch \| blob \| history
src/config.h.cmakein		patch \| blob \| history
src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_single.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_atomdata.c		patch \| blob \| history
src/tools/gmx_enemat.c		patch \| blob \| history