Merge release-4-6 into master
authorRoland Schulz <roland@utk.edu>
Mon, 12 Nov 2012 23:45:33 +0000 (18:45 -0500)
committerRoland Schulz <roland@utk.edu>
Mon, 12 Nov 2012 23:45:33 +0000 (18:45 -0500)
Conflicts:
CMakeLists.txt

Change-Id: I42951043675af21763778a3022403166f8cc70e0

CMakeLists.txt
src/config.h.cmakein
src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_single.h
src/gromacs/mdlib/nbnxn_atomdata.c
src/tools/gmx_enemat.c

index 356f6ce19f70816f61e40e399730c66f4fb2c943..46a2e31c6dfbbf8231e3855fa8707f7177165110 100644 (file)
@@ -650,10 +650,10 @@ gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM)
 
 include(gmxSetBuildInformation)
 gmx_set_build_information()
-if(BUILD_CPU_FEATURES MATCHES "rdtscp")
+if(BUILD_CPU_FEATURES MATCHES "rdtscp" AND NOT GMX_DISTRIBUTABLE_BUILD)
     # The timestep counter headers do not include config.h
     add_definitions(-DHAVE_RDTSCP)
-endif(BUILD_CPU_FEATURES MATCHES "rdtscp")
+endif(BUILD_CPU_FEATURES MATCHES "rdtscp" AND NOT GMX_DISTRIBUTABLE_BUILD)
 
 include(gmxTestFloatFormat)
 gmx_test_float_format(GMX_FLOAT_FORMAT_IEEE754 
@@ -785,6 +785,20 @@ elseif(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_ACCELERATION} STREQUA
        message(WARNING "No C++ AVX flag found. Consider a newer compiler, or disable AVX for much lower performance.")
     endif (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
 
+    # Set the FMA4 flags (MSVC doesn't require any)
+    if(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
+        GMX_TEST_CFLAG(GNU_FMA_CFLAG "-mfma4" GROMACS_C_FLAGS)
+        if (NOT GNU_FMA_CFLAG)
+            message(WARNING "No C FMA4 flag found. Consider a newer compiler, or disable AVX_128_FMA for much lower performance.")
+        endif(NOT GNU_FMA_CFLAG)
+        if (CMAKE_CXX_COMPILER_LOADED)
+            GMX_TEST_CXXFLAG(GNU_FMA_CXXFLAG "-mfma4" GROMACS_CXX_FLAGS)
+            if (NOT GNU_FMA_CXXFLAG)
+                message(WARNING "No C++ FMA flag found. Consider a newer compiler, or disable AVX_128_FMA for much lower performance.")
+            endif (NOT GNU_FMA_CXXFLAG)
+        endif()
+    endif()
+
     # Only test the header after we have tried to add the flag for AVX support
     check_include_file(immintrin.h  HAVE_IMMINTRIN_H ${GROMACS_C_FLAGS})
 
@@ -792,8 +806,9 @@ elseif(${GMX_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_ACCELERATION} STREQUA
         message(FATAL_ERROR "Cannot find immintrin.h, which is required for AVX intrinsics support. Consider switching compiler.")
     endif(NOT HAVE_IMMINTRIN_H)
 
-    # AMD says we should include x86intrin.h for FMA support, but MSVC seems to do fine without it, so don't require it.
+    # GCC requires x86intrin.h for FMA support. MSVC 2010 requires intrin.h for FMA support.
     check_include_file(x86intrin.h HAVE_X86INTRIN_H ${GROMACS_C_FLAGS})
+    check_include_file(intrin.h HAVE_INTRIN_H ${GROMACS_C_FLAGS})
 
     # The user should not be able to set this orthogonally to the acceleration
     set(GMX_X86_SSE4_1 1)
index ade12fd5464b381d176cbf3cb067764574f5101e..3e9448c5729d67625c029fe3ed2056844d3a0fca 100644 (file)
 /* Define to 1 if you have the <x86intrin.h> header file */
 #cmakedefine HAVE_X86INTRIN_H
 
+/* Define to 1 if you have the <intrin.h> header file */
+#cmakedefine HAVE_INTRIN_H
+
 /* Define to 1 if you have the <sched.h> header */
 #cmakedefine HAVE_SCHED_H
 
index 8d48f2cbca1d5ffd3c5008ef07728b2a77d819e3..0adb058a9d152ac5671184b22de97829ad82b3d0 100644 (file)
@@ -25,6 +25,9 @@
 #ifdef HAVE_X86INTRIN_H
 #include <x86intrin.h> /* FMA */
 #endif
+#ifdef HAVE_INTRIN_H
+#include <intrin.h> /* FMA MSVC */
+#endif
 
 #include <math.h>
 
index fb3614274ca99f3921a390d054cc80bedb71c193..54e327e9f1272769a32f4fe0deb38e8c7f940f05 100644 (file)
@@ -962,6 +962,62 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
     }
 }
 
+static void
+nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
+                            nbnxn_atomdata_output_t * gmx_restrict src,
+                            int nsrc,
+                            int i0, int i1)
+{
+    int i,s;
+
+    for(i=i0; i<i1; i++)
+    {
+        for(s=0; s<nsrc; s++)
+        {
+            dest[i] += src[s].f[i];
+        }
+    }
+}
+
+static void
+nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
+                                     nbnxn_atomdata_output_t * gmx_restrict src,
+                                     int nsrc,
+                                     int i0, int i1)
+{
+#ifdef NBNXN_SEARCH_SSE
+#ifdef GMX_X86_AVX_256
+#define GMX_MM256_HERE
+#else
+#define GMX_MM128_HERE
+#endif
+#include "gmx_x86_simd_macros.h"
+
+    int       i,s;
+    gmx_mm_pr dest_SSE,src_SSE;
+
+    if ((i0 & (GMX_X86_SIMD_WIDTH_HERE-1)) ||
+        (i1 & (GMX_X86_SIMD_WIDTH_HERE-1)))
+    {
+        gmx_incons("bounds not a multiple of GMX_X86_SIMD_WIDTH_HERE in nbnxn_atomdata_reduce_reals_x86_simd");
+    }
+
+    for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+    {
+        dest_SSE = gmx_load_pr(dest+i);
+        for(s=0; s<nsrc; s++)
+        {
+            src_SSE  = gmx_load_pr(src[s].f+i);
+            dest_SSE = gmx_add_pr(dest_SSE,src_SSE);
+        }
+        gmx_store_pr(dest+i,dest_SSE);
+    }
+
+#undef GMX_MM128_HERE
+#undef GMX_MM256_HERE
+#endif
+}
+
 /* Add part of the force array(s) from nbnxn_atomdata_t to f */
 static void
 nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
@@ -1079,6 +1135,7 @@ void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
 {
     int a0=0,na=0;
     int nth,th;
+    gmx_bool bStreamingReduce;
 
     nbs_cycle_start(&nbs->cc[enbsCCreducef]);
 
@@ -1099,15 +1156,65 @@ void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
     }
 
     nth = gmx_omp_nthreads_get(emntNonbonded);
+
+    /* Using the two-step streaming reduction is probably always faster */
+    bStreamingReduce = (nbat->nout > 1);
+
+    if (bStreamingReduce)
+    {
+        /* Reduce the force thread output buffers into buffer 0, before adding
+         * them to the, differently ordered, "real" force buffer.
+         */
+#pragma omp parallel for num_threads(nth) schedule(static)
+        for(th=0; th<nth; th++)
+        {
+            int g0,g1;
+            int b0,b1,nb;
+            int blocksize,i0,i1;
+
+            /* For which grids should we reduce the force output? */
+            g0 = ((locality==eatLocal || locality==eatAll) ? 0 : 1);
+            g1 = (locality==eatLocal ? 1 : nbs->ngrid);
+
+            /* Get the grid cell bounds */
+            b0 = nbs->grid[g0].cell0;
+            b1 = nbs->grid[g1-1].cell0 + nbs->grid[g1-1].nc;
+            blocksize = nbs->grid[g0].na_sc*nbat->fstride;
+            /* The simple grid size in atoms is a multiple of na_cj.
+             * With float-AVX256 we use this and make blocksize a multiple of 8.
+             */
+            if (nbs->grid[0].bSimple && nbs->grid[0].na_cj > nbs->grid[0].na_c)
+            {
+                blocksize *= 2;
+                b0 /= 2;
+                b1 /= 2;
+            }
+            nb = b1 - b0;
+
+            /* Calculate the index range for our thread */
+            i0 = (b0 + (nb* th   )/nth)*blocksize;
+            i1 = (b0 + (nb*(th+1))/nth)*blocksize;
+
+#ifdef NBNXN_SEARCH_SSE
+            nbnxn_atomdata_reduce_reals_x86_simd(
+#else
+            nbnxn_atomdata_reduce_reals(    
+#endif
+                                        nbat->out[0].f,
+                                        nbat->out+1,nbat->nout - 1,
+                                        i0,i1);
+        }
+    }
+
 #pragma omp parallel for num_threads(nth) schedule(static)
     for(th=0; th<nth; th++)
     {
         nbnxn_atomdata_add_nbat_f_to_f_part(nbs,nbat,
-                                             nbat->out,
-                                             nbat->nout,
-                                             a0+((th+0)*na)/nth,
-                                             a0+((th+1)*na)/nth,
-                                             f);
+                                            nbat->out,
+                                            bStreamingReduce ? 1 : nbat->nout,
+                                            a0+((th+0)*na)/nth,
+                                            a0+((th+1)*na)/nth,
+                                            f);
     }
 
     nbs_cycle_stop(&nbs->cc[enbsCCreducef]);
index 18cb971e604363ddbede01da26cff91018dcaa52..077b885ce6b20680dacbdd814b0689338ad811ec 100644 (file)
@@ -122,11 +122,11 @@ int gmx_enemat(int argc,char *argv[])
     { "-nlevels", FALSE, etINT, {&nlevels},"number of levels for matrix colors"},
     { "-max",FALSE, etREAL, {&cutmax},"max value for energies"},
     { "-min",FALSE, etREAL, {&cutmin},"min value for energies"},
-    { "-coul", FALSE, etBOOL, {&bCoulSR},"extract Coulomb SR energies"},
-    { "-coulr", FALSE, etBOOL, {&bCoulLR},"extract Coulomb LR energies"},
+    { "-coulsr", FALSE, etBOOL, {&bCoulSR},"extract Coulomb SR energies"},
+    { "-coullr", FALSE, etBOOL, {&bCoulLR},"extract Coulomb LR energies"},
     { "-coul14",FALSE, etBOOL, {&bCoul14},"extract Coulomb 1-4 energies"},
-    { "-lj", FALSE, etBOOL, {&bLJSR},"extract Lennard-Jones SR energies"},
-    { "-lj", FALSE, etBOOL, {&bLJLR},"extract Lennard-Jones LR energies"},
+    { "-ljsr", FALSE, etBOOL, {&bLJSR},"extract Lennard-Jones SR energies"},
+    { "-ljlr", FALSE, etBOOL, {&bLJLR},"extract Lennard-Jones LR energies"},
     { "-lj14",FALSE, etBOOL, {&bLJ14},"extract Lennard-Jones 1-4 energies"},
     { "-bhamsr",FALSE, etBOOL, {&bBhamSR},"extract Buckingham SR energies"},
     { "-bhamlr",FALSE, etBOOL, {&bBhamLR},"extract Buckingham LR energies"},