Merge "Improved the accuracy of the sd1 integrator" into release-4-6
author David van der Spoel <davidvanderspoel@gmail.com>
Fri, 3 Feb 2012 07:54:53 +0000 (08:54 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Fri, 3 Feb 2012 07:54:53 +0000 (08:54 +0100)
18 files changed:
CMakeLists.txt
cmake/FindFFTW3.cmake
cmake/FindFFTW3F.cmake
cmake/gmxCFlags.cmake
include/gmx_parallel_3dfft.h
include/gmx_wallcycle.h
include/network.h
include/pme.h
src/gmxlib/calcgrid.c
src/gmxlib/network.c
src/kernel/runner.c
src/mdlib/fft5d.c
src/mdlib/fft5d.h
src/mdlib/gmx_parallel_3dfft.c
src/mdlib/gmx_wallcycle.c
src/mdlib/pme.c
src/mdlib/pme_sse_single.h [new file with mode: 0644]
src/tools/gmx_membed.c

index 8c23fd975407f86190b4dee582050089b68f0855..6872b533d03d9f336861673816275b5542c388b6 100644 (file)
@@ -133,6 +133,8 @@ mark_as_advanced(GMX_IA32_ASM)
 option(GMX_X86_64_ASM "Add SSE assembly files for X86_64" OFF)
 mark_as_advanced(GMX_X86_64_ASM)
 
+option(GMX_OPENMP "Enable OpenMP-based multithreading" ON)
+
 option(USE_VERSION_H "Generate development version string/information" ON)
 mark_as_advanced(USE_VERSION_H)
 
@@ -214,6 +216,11 @@ if(GMX_OPENMM)
         set(GMX_THREADS OFF CACHE BOOL 
                "Threads are not compatible with OpenMM build, disabled!" FORCE)
     endif(GMX_THREADS)
+    if(GMX_OPENMP)
+        message(STATUS "OpenMP multithreading is not compatible with OpenMM, disabled")
+        set(GMX_OPENMP OFF CACHE BOOL
+            "OpenMP multithreading is not compatible with OpenMM, disabled!" FORCE)
+    endif()
     if(GMX_SOFTWARE_INVSQRT)
         set(GMX_SOFTWARE_INVSQRT OFF CACHE STRING 
                 "The OpenMM build does not need GROMACS software 1/sqrt!" FORCE)
@@ -406,6 +413,13 @@ if(GMX_OPENMM)
     find_package(OpenMM) 
 endif(GMX_OPENMM)
 
+if(GMX_OPENMP)
+    find_package(OpenMP REQUIRED)
+    list(APPEND GROMACS_C_FLAGS ${OpenMP_C_FLAGS})
+    list(APPEND GROMACS_CXX_FLAGS ${OpenMP_CXX_FLAGS})
+    add_definitions(-DGMX_OPENMP)
+endif()
+
 if(APPLE)
    find_library(ACCELERATE_FRAMEWORK Accelerate)
    list(APPEND GMX_EXTRA_LIBRARIES ${ACCELERATE_FRAMEWORK})
index e9a25dc7cd0caff606ff629ca0973f63245ad2d3..480790d7368d069b421dd5541a7076245bcabeb9 100644 (file)
@@ -1,34 +1,28 @@
 # - Find FFTW3
 # Find the native FFTW3 includes and library, double precision
 #
-#  FFTW3_INCLUDE_DIR    - where to find fftw3.h
-#  FFTW3_LIBRARIES   - List of libraries when using FFTW.
-#  FFTW3_FOUND       - True if FFTW found.
-#
-# The FFTW3 root installation directory can be provided in the FFTW3_ROOT_DIR
+#  FFTW3_INCLUDE_DIR    where to find fftw3.h
+#  FFTW3_LIBRARIES      List of libraries when using FFTW.
+#  FFTW3_FOUND          True if FFTW found.
 
 if (FFTW3_INCLUDE_DIR AND FFTW3_LIBRARIES)
   # Already in cache, be silent
   set (FFTW3_FIND_QUIETLY TRUE)
 endif (FFTW3_INCLUDE_DIR AND FFTW3_LIBRARIES)
 
-file(TO_CMAKE_PATH "$ENV{FFTW3_ROOT_DIR}" _env_FFTW3_ROOT_DIR)
-
 find_path (FFTW3_INCLUDE_DIR fftw3.h
-                PATHS "${_env_FFTW3_ROOT_DIR}/include"
-               CACHE STRING "Path to double precision FFTW3 headers")
+                CACHE STRING "Path to headers for double precision FFTW3")
 
 find_library (FFTW3_LIBRARIES 
-               NAMES fftw3
-                PATHS "${_env_FFTW3_ROOT_DIR}/lib"
-                      "${FFTW3_INCLUDE_DIR}/../lib" 
-               CACHE STRING "Double precision FFTW3 libraries")
+                NAMES fftw3
+                PATHS "${FFTW3_INCLUDE_DIR}/../lib"
+                CACHE STRING "Double precision FFTW3 libraries")
+
 
 # handle the QUIETLY and REQUIRED arguments and set FFTW_FOUND to TRUE if
 # all listed variables are TRUE
 include (FindPackageHandleStandardArgs)
-set(__MSG "Could not find FFTW3. Provide the fftw3 install directory in the FFTW3_ROOT_DIR environment variable.")
-find_package_handle_standard_args (FFTW3 ${__MSG} FFTW3_LIBRARIES FFTW3_INCLUDE_DIR)
+find_package_handle_standard_args (FFTW3 DEFAULT_MSG FFTW3_LIBRARIES FFTW3_INCLUDE_DIR)
 
 mark_as_advanced (FFTW3_LIBRARIES FFTW3_INCLUDE_DIR)
 
index 44824dade2a35d921e3991dbd3f2293708212b3b..d7ab48f43719936a2a9a8b2c49af3b0339d72672 100644 (file)
@@ -1,33 +1,26 @@
 # - Find FFTW3F
 # Find the native FFTW3 includes and library, single precision
 #
-#  FFTW3F_INCLUDE_DIR    - where to find fftw3.h
-#  FFTW3F_LIBRARIES   - List of libraries when using FFTW.
-#  FFTW3F_FOUND       - True if FFTW found.
-#
-# The FFTW3F root installation directory can be provided in the FFTW3F_ROOT_DIR
+#  FFTW3F_INCLUDE_DIR   where to find fftw3.h
+#  FFTW3F_LIBRARIES     List of libraries when using FFTW.
+#  FFTW3F_FOUND         True if FFTW found.
 
 if (FFTW3F_INCLUDE_DIR AND FFTW3F_LIBRARIES)
   # Already in cache, be silent
   set (FFTW3F_FIND_QUIETLY TRUE)
 endif (FFTW3F_INCLUDE_DIR AND FFTW3F_LIBRARIES)
 
-file(TO_CMAKE_PATH "$ENV{FFTW3F_ROOT_DIR}" _env_FFTW3F_ROOT_DIR)
-
 find_path (FFTW3F_INCLUDE_DIR fftw3.h 
-            PATHS "${_env_FFTW3F_ROOT_DIR}/include"
-           CACHE STRING "Path to single precision FFTW3 headers")
+                CACHE STRING "Path to headers for single precision FFTW3")
 
 find_library (FFTW3F_LIBRARIES 
-               NAMES fftw3f
-                PATHS "${_env_FFTW3F_ROOT_DIR}/lib"
-                      "${FFTW3F_INCLUDE_DIR}/../lib" 
-               CACHE STRING "Single precision FFTW3 libraries")
+                NAMES fftw3f
+                PATHS "${FFTW3F_INCLUDE_DIR}/../lib"
+                CACHE STRING "Single precision FFTW3 libraries")
 
 # handle the QUIETLY and REQUIRED arguments and set FFTW_FOUND to TRUE if
 # all listed variables are TRUE
 include (FindPackageHandleStandardArgs)
-set(__MSG "Could not find FFTW3F. Provide the fftw3 install directory in the FFTW3F_ROOT_DIR environment variable.")
-find_package_handle_standard_args (FFTW3F ${__MSG} FFTW3F_LIBRARIES FFTW3F_INCLUDE_DIR)
+find_package_handle_standard_args (FFTW3F DEFAULT_MSG FFTW3F_LIBRARIES FFTW3F_INCLUDE_DIR)
 
 mark_as_advanced (FFTW3F_LIBRARIES FFTW3F_INCLUDE_DIR)
index 21c795aa6bc8789581bb42991cc524eff1995af2..855c19a0b1e6be5718c044d9abd40826bdc7e916 100644 (file)
@@ -30,6 +30,10 @@ MACRO(gmx_c_flags)
 
     # gcc
     if(CMAKE_COMPILER_IS_GNUCC)
+        #flags are added in reverse order and -Wno* flags need to appear after -Wall
+        if(NOT GMX_OPENMP)
+            GMX_TEST_CFLAG(CFLAGS_PRAGMA "-Wno-unknown-pragmas" GMXC_CFLAGS)
+        endif()
         GMX_TEST_CFLAG(CFLAGS_WARN "-Wall -Wno-unused" GMXC_CFLAGS)
         # new in gcc 4.5
         GMX_TEST_CFLAG(CFLAGS_EXCESS_PREC "-fexcess-precision=fast" GMXC_CFLAGS)
@@ -39,6 +43,9 @@ MACRO(gmx_c_flags)
     endif()
     # g++
     if(CMAKE_COMPILER_IS_GNUCXX)
+        if(NOT GMX_OPENMP)
+            GMX_TEST_CFLAG(CXXFLAGS_PRAGMA "-Wno-unknown-pragmas" GMXC_CXXFLAGS)
+        endif()
         GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-Wall -Wno-unused" GMXC_CXXFLAGS)
       # new in gcc 4.5
         GMX_TEST_CXXFLAG(CXXFLAGS_EXCESS_PREC "-fexcess-precision=fast" 
@@ -56,6 +63,9 @@ MACRO(gmx_c_flags)
             GMX_TEST_CFLAG(CFLAGS_SSE2 "-msse2" GMXC_CFLAGS)
             GMX_TEST_CFLAG(CFLAGS_X86 "-mtune=core2" GMXC_CFLAGS_RELEASE)
             GMX_TEST_CFLAG(CFLAGS_IA64 "-mtune=itanium2" GMXC_CFLAGS_RELEASE)
+            if(NOT GMX_OPENMP)
+                GMX_TEST_CFLAG(CFLAGS_PRAGMA "-Wno-unknown-pragmas" GMXC_CFLAGS)
+            endif()
         else()
             GMX_TEST_CFLAG(CFLAGS_SSE2 "/arch:SSE2" GMXC_CFLAGS)
             GMX_TEST_CFLAG(CFLAGS_X86 "/Qip" GMXC_CFLAGS_RELEASE)
@@ -71,6 +81,9 @@ MACRO(gmx_c_flags)
             GMX_TEST_CXXFLAG(CXXFLAGS_X86 "-mtune=core2" GMXC_CXXFLAGS_RELEASE)
             GMX_TEST_CXXFLAG(CXXFLAGS_IA64 "-mtune=itanium2" 
                               GMXC_CXXFLAGS_RELEASE)
+            if(NOT GMX_OPENMP)
+                GMX_TEST_CFLAG(CXXFLAGS_PRAGMA "-Wno-unknown-pragmas" GMXC_CXXFLAGS)
+            endif()
         else()
             GMX_TEST_CXXFLAG(CXXFLAGS_SSE2 "/arch:SSE2" GMXC_CXXFLAGS)
             GMX_TEST_CXXFLAG(CXXFLAGS_X86 "/Qip" GMXC_CXXFLAGS_RELEASE)
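For context on the -Wno-unknown-pragmas additions above: when GMX_OPENMP is off, the OpenMP pragmas that remain in the sources are unknown to the compiler, and -Wall then warns on every one of them. A minimal standalone sketch (a hypothetical file, not part of this change) showing the warning the flag suppresses:

/* pragma_demo.c - minimal sketch; compile with
 *   gcc -Wall -fopenmp pragma_demo.c              (pragma is understood)
 *   gcc -Wall pragma_demo.c                       (warns: unknown pragma)
 *   gcc -Wall -Wno-unknown-pragmas pragma_demo.c  (warning suppressed)
 */
#include <stdio.h>

int main(void)
{
    int i, sum = 0;

    /* With OpenMP disabled this pragma is simply ignored, but -Wall
     * reports it as an unknown pragma unless -Wno-unknown-pragmas is
     * added, which is what the gmxCFlags.cmake change above does.
     */
#pragma omp parallel for reduction(+:sum)
    for (i = 0; i < 100; i++)
    {
        sum += i;
    }
    printf("sum = %d\n", sum);
    return 0;
}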
index 85aabb5771d9cae006ee93c594175fcb82da36f7..55582be42c108abb776dd4ad48ca0bf6fd438480 100644 (file)
@@ -52,18 +52,20 @@ gmx_parallel_3dfft_t;
  *  \param bReproducible  Try to avoid FFT timing optimizations and other stuff
  *                        that could make results differ for two runs with
  *                        identical input (reproducibility for debugging).
+ *  \param nthreads       Run in parallel using n threads
  *    
  *  \return 0 or a standard error code.
  */
 int
 gmx_parallel_3dfft_init   (gmx_parallel_3dfft_t *    pfft_setup,
                            ivec                      ndata,
-                                                  real **                   real_data,
-                                                  t_complex **              complex_data,
+                           real **                   real_data,
+                           t_complex **              complex_data,
                            MPI_Comm                  comm[2],
                            int *                     slab2index_major,
                            int *                     slab2index_minor,
-                           gmx_bool                      bReproducible);
+                           gmx_bool                  bReproducible,
+                           int                       nthreads);
 
 
 
@@ -73,9 +75,9 @@ gmx_parallel_3dfft_init   (gmx_parallel_3dfft_t *    pfft_setup,
  */
 int
 gmx_parallel_3dfft_real_limits(gmx_parallel_3dfft_t      pfft_setup,
-                                                          ivec                      local_ndata,
-                                                          ivec                      local_offset,
-                                                          ivec                      local_size);
+                               ivec                      local_ndata,
+                               ivec                      local_offset,
+                               ivec                      local_size);
 
 
 /*! \brief Get reciprocal space grid index limits
@@ -83,16 +85,18 @@ gmx_parallel_3dfft_real_limits(gmx_parallel_3dfft_t      pfft_setup,
 int
 gmx_parallel_3dfft_complex_limits(gmx_parallel_3dfft_t      pfft_setup,
                                   ivec                      complex_order,
-                                                                 ivec                      local_ndata,
-                                                                 ivec                      local_offset,
-                                                                 ivec                      local_size);
+                                  ivec                      local_ndata,
+                                  ivec                      local_offset,
+                                  ivec                      local_size);
 
 
 int
 gmx_parallel_3dfft_execute(gmx_parallel_3dfft_t    pfft_setup,
-                                                  enum gmx_fft_direction  dir,
-                                                  void *                  in_data,
-                                                  void *                  out_data);
+                           enum gmx_fft_direction  dir,
+                           void *                  in_data,
+                           void *                  out_data,
+                           int                     thread,
+                           gmx_wallcycle_t         wcycle);
 
 
 /*! \brief Release all data in parallel fft setup
@@ -101,6 +105,12 @@ gmx_parallel_3dfft_execute(gmx_parallel_3dfft_t    pfft_setup,
  *  is not released, but the contents is invalid after this call.
  *
  *  \param pfft_setup Parallel 3dfft setup.
+ *  \param in_data    Input data.
+ *  \param out_data   Output data.
+ *  \param thread     Thread index of the calling thread, i.e. index to the part
+ *                    of the data operated on last by the calling thread. This
+ *                    is needed to start the FFT without an OpenMP barrier.
+ *  \param wcycle     Wall cycle counters.
  *
  *  \return 0 or a standard error code.
  */
index afaeb0d6985869ef41d519faf864895bbf71656b..1bcd6b79c713c2caf54362b166a135566da4baa2 100644 (file)
 extern "C" {
 #endif
 
-  enum { ewcRUN, ewcSTEP, ewcPPDURINGPME, ewcDOMDEC, ewcDDCOMMLOAD, ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcMOVEX, ewcNS, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH, ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_SOLVE, ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcVSITESPREAD, ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcTEST, ewcNR };
+  enum { ewcRUN, ewcSTEP, ewcPPDURINGPME, ewcDOMDEC, ewcDDCOMMLOAD, ewcDDCOMMBOUND, ewcVSITECONSTR, ewcPP_PMESENDX, ewcMOVEX, ewcNS, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH, ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcPME_SOLVE, ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcVSITESPREAD, ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcTEST, ewcNR };
 
 gmx_bool wallcycle_have_counter(void);
 /* Returns if cycle counting is supported */
 
-gmx_wallcycle_t wallcycle_init(FILE *fplog,int resetstep,t_commrec *cr);
+gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec *cr, int omp_nthreads);
 /* Returns the wall cycle structure.
  * Returns NULL when cycle counting is not supported.
  */
index 5480c1d5f0a4f8800523e3b1617ee54cd5f6c337..15db2187decd2c309975eb137f747fd946e67b45 100644 (file)
@@ -62,6 +62,11 @@ int gmx_node_num(void);
 int gmx_node_rank(void);
 /* return the rank of the node */
 
+int gmx_hostname_num(void);
+/* If the first part of the hostname (up to the first dot) ends with a number, returns this number.
+   If the first part of the hostname does not end in a number (0-9 characters), returns 0.
+*/
+
 void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr);
 /* Sets up fast global communication for clusters with multi-core nodes */
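To make the convention of gmx_hostname_num() above concrete, here is a minimal standalone sketch of the parsing rule (the helper name and plain-string input are illustrative; the real implementation in src/gmxlib/network.c below obtains the name via MPI_Get_processor_name):

/* hostnum_demo.c - sketch of the gmx_hostname_num() parsing rule on a
 * plain string instead of an MPI processor name.
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

static int hostname_num_from_string(const char *name)
{
    char digits[256];
    int  i = 0, j = 0;

    /* Collect the digits from the part of the name before the first dot */
    while (name[i] != '\0' && name[i] != '.' && j < (int)sizeof(digits) - 1)
    {
        if (isdigit((unsigned char)name[i]))
        {
            digits[j++] = name[i];
        }
        i++;
    }
    digits[j] = '\0';

    if (j == 0)
    {
        return 0;
    }
    /* Use only the last 9 decimals, so we don't overflow an int */
    return (int)strtol(digits + (j > 9 ? j - 9 : 0), NULL, 10);
}

int main(void)
{
    printf("%d\n", hostname_num_from_string("node123.cluster.org")); /* -> 123 */
    printf("%d\n", hostname_num_from_string("login.cluster.org"));   /* -> 0   */
    printf("%d\n", hostname_num_from_string("c0-42"));               /* digits "042" -> 42 */
    return 0;
}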
 
index 9c6ee1215ea9fa7f18eab9f70ebf454eb2312d60..b2765b37a280d54a72d670e179cfccb0c44b575b 100644 (file)
@@ -50,10 +50,10 @@ typedef real *splinevec[DIM];
 enum { GMX_SUM_QGRID_FORWARD, GMX_SUM_QGRID_BACKWARD };
 
 int gmx_pme_init(gmx_pme_t *pmedata,t_commrec *cr,
-                       int nnodes_major,int nnodes_minor,
-                       t_inputrec *ir,int homenr,
-                       gmx_bool bFreeEnergy, gmx_bool bReproducible);
-                       
+                 int nnodes_major,int nnodes_minor,
+                 t_inputrec *ir,int homenr,
+                 gmx_bool bFreeEnergy, gmx_bool bReproducible, int nthread);
+
 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata);
 /* Initialize and destroy the pme data structures, respectively.
  * Return value 0 indicates all is well, non-zero is an error code.
index 63aff62cba55605976aa2aaba6a0f358826952f3..7d77c4a352b8063d1377d2320fe8df3decdc4a61 100644 (file)
 #include "gmx_fatal.h"
 #include "calcgrid.h"
 
-#define facNR 4
-const int factor[facNR] = {2,3,5,7};
+/* The grid sizes below are based on timing of a 3D cubic grid in fftw
+ * compiled with SSE using 4 threads in fft5d.c.
+ * A grid size is removed when a larger grid is faster.
+ */
 
-static void make_list(int start_fac,int *ng,int ng_max,int *n_list,int **list)
-{
-    int i,fac;
-  
-    if (*ng < ng_max)
-    {
-        if (*n_list % 100 == 0)
-        {
-            srenew(*list,*n_list+100);
-        }
-        (*list)[*n_list] = *ng;
-        (*n_list)++;
-        
-        for(i=start_fac; i<facNR; i++)
-        {
-            fac = factor[i];
-            /* The choice of grid size is based on benchmarks of fftw
-             * and the need for a lot of factors for nice DD decomposition.
-             * The base criterion is that a grid size is not included
-             * when there is a larger grid size that produces a faster 3D FFT.
-             * Allow any power for 2, two for 3 and 5, but only one for 7.
-             * Three for 3 are ok when there is also a factor of 2.
-             * Two factors of 5 are not allowed with a factor of 3 or 7.
-             * A factor of 7 does not go with a factor of 5, 7 or 9.
-             */
-            if ((fac == 2) ||
-                (fac == 3 && (*ng % 9 != 0 ||
-                              (*ng % 2 == 0 && *ng % 27 != 0))) ||
-                (fac == 5 && *ng % 15 != 0 && *ng % 25 != 0) ||
-                (fac == 7 && *ng % 5 != 0 && *ng % 7 != 0 && *ng % 9 != 0))
-            {
-                *ng *= fac;
-                make_list(i,ng,ng_max,n_list,list);
-                *ng /= fac;
-            }
-        }
-    }
-}
+/* Small grid size array */
+#define g_initNR 15
+const int grid_init[g_initNR] = { 6,8,10,12,14,16,20,24,25,28,32,36,40,42,44 };
 
-static int list_comp(const void *a,const void *b)
-{
-  return (*((int *)a) - *((int *)b));
-}
+/* For larger grid sizes, a prefactor with any power of 2 can be added.
+ * Only sizes divisible by 4 should be used, 90 is allowed, 140 not.
+ */
+#define g_baseNR 14
+const int grid_base[g_baseNR] = { 45,48,50,52,54,56,60,64,70,72,75,80,81,84 };
 
 real calc_grid(FILE *fp,matrix box,real gr_sp,
                int *nx,int *ny,int *nz)
 {
     int  d,n[DIM];
-    int  i,j,nmin[DIM];
-    rvec box_size,spacing;
+    int  i;
+    rvec box_size;
+    int  nmin,fac2,try;
+    rvec spacing;
     real max_spacing;
-    int  ng_max,ng;
-    int  n_list,*list;
 
     if (gr_sp <= 0)
     {
         gmx_fatal(FARGS,"invalid fourier grid spacing: %g",gr_sp);
     }
 
+    if (grid_base[g_baseNR-1] % 4 != 0)
+    {
+        gmx_incons("the last entry in grid_base is not a multiple of 4");
+    }
+
     /* New grid calculation setup:
      *
      * To maintain similar accuracy for triclinic PME grids as for rectangular
@@ -131,21 +104,7 @@ real calc_grid(FILE *fp,matrix box,real gr_sp,
     n[XX] = *nx;
     n[YY] = *ny;
     n[ZZ] = *nz;
-    
-    ng = 1;
-    ng_max = 1;
-    for(d=0; d<DIM; d++)
-    {
-        nmin[d] = (int)(box_size[d]/gr_sp + 0.999);
-        if (2*nmin[d] > ng_max)
-        {
-            ng_max = 2*nmin[d];
-        }
-    }
-    n_list=0;
-    list=NULL;
-    make_list(0,&ng,ng_max,&n_list,&list);
-    
+
     if ((*nx<=0) || (*ny<=0) || (*nz<=0))
     {
         if (NULL != fp)
@@ -155,29 +114,48 @@ real calc_grid(FILE *fp,matrix box,real gr_sp,
         }
     }
     
-    qsort(list,n_list,sizeof(list[0]),list_comp);
-    if (debug)
-    {
-        for(i=0; i<n_list; i++)
-            fprintf(debug,"grid: %d\n",list[i]);
-    }
-        
+    max_spacing = 0;
     for(d=0; d<DIM; d++)
     {
-        for(i=0; (i<n_list) && (n[d]<=0); i++)
+        if (n[d] <= 0)
         {
-            if (list[i] >= nmin[d])
+            nmin = (int)(box_size[d]/gr_sp + 0.999);
+
+            i = g_initNR - 1;
+            if (grid_init[i] >= nmin)
             {
-                n[d] = list[i];
+                /* Take the smallest possible grid in the list */
+                while (i > 0 && grid_init[i-1] >= nmin)
+                {
+                    i--;
+                }
+                n[d] = grid_init[i];
+            }
+            else
+            {
+                /* Determine how many pre-factors of 2 we need */
+                fac2 = 1;
+                i = g_baseNR - 1;
+                while (fac2*grid_base[i-1] < nmin)
+                {
+                    fac2 *= 2;
+                }
+                /* Find the smallest grid that is >= nmin */
+                do
+                {
+                    try = fac2*grid_base[i];
+                    /* We demand a factor of 4, avoid 140, allow 90 */
+                    if (((try % 4 == 0 && try != 140) || try == 90) &&
+                        try >= nmin)
+                    {
+                        n[d] = try;
+                    }
+                    i--;
+                }
+                while (i > 0);
             }
         }
-    }
-    
-    sfree(list);
-    
-    max_spacing = 0;
-    for(d=0; d<DIM; d++)
-    {
+
         spacing[d] = box_size[d]/n[d];
         if (spacing[d] > max_spacing)
         {
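As a worked example of the new selection logic above, the sketch below reproduces it outside GROMACS (the arrays are copied from the change; the driver around them is illustrative). For a minimum size of 100 the prefactor is doubled until 2*81 >= 100, the base sizes are then scanned downwards, and the smallest allowed grid 2*50 = 100 is kept:

/* gridpick_demo.c - standalone sketch of the new grid-size selection */
#include <stdio.h>

#define g_initNR 15
static const int grid_init[g_initNR] = { 6,8,10,12,14,16,20,24,25,28,32,36,40,42,44 };
#define g_baseNR 14
static const int grid_base[g_baseNR] = { 45,48,50,52,54,56,60,64,70,72,75,80,81,84 };

static int pick_grid(int nmin)
{
    int i, fac2, try_n, n = 0;

    i = g_initNR - 1;
    if (grid_init[i] >= nmin)
    {
        /* Take the smallest possible grid in the small-size list */
        while (i > 0 && grid_init[i-1] >= nmin)
        {
            i--;
        }
        return grid_init[i];
    }
    /* Determine how many prefactors of 2 we need */
    fac2 = 1;
    i    = g_baseNR - 1;
    while (fac2*grid_base[i-1] < nmin)
    {
        fac2 *= 2;
    }
    /* Find the smallest allowed grid that is >= nmin */
    do
    {
        try_n = fac2*grid_base[i];
        /* Demand a factor of 4, avoid 140, allow 90 */
        if (((try_n % 4 == 0 && try_n != 140) || try_n == 90) && try_n >= nmin)
        {
            n = try_n;
        }
        i--;
    }
    while (i > 0);
    return n;
}

int main(void)
{
    printf("nmin  44 -> %d\n", pick_grid(44));   /* 44  */
    printf("nmin 100 -> %d\n", pick_grid(100));  /* 100 */
    printf("nmin 130 -> %d\n", pick_grid(130));  /* 144 (140 is excluded) */
    return 0;
}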
index 693302103cfd062e697bb4047c9aefd8fe057bf0..580474e7df53679b15683fc174b7354fa904aca4 100644 (file)
@@ -254,13 +254,48 @@ int gmx_node_rank(void)
 #endif
 }
 
+
+int gmx_hostname_num()
+{
+#ifndef GMX_MPI
+  return 0;
+#else
+  int  resultlen,hostnum,i,j;
+  char mpi_hostname[MPI_MAX_PROCESSOR_NAME],hostnum_str[MPI_MAX_PROCESSOR_NAME];
+
+  MPI_Get_processor_name(mpi_hostname,&resultlen);
+  /* This procedure can only differentiate nodes with host names
+   * that end on unique numbers.
+   */
+  i = 0;
+  j = 0;
+  /* Only parse the host name up to the first dot */
+  while(i < resultlen && mpi_hostname[i] != '.') {
+    if (isdigit(mpi_hostname[i])) {
+      hostnum_str[j++] = mpi_hostname[i];
+    }
+    i++;
+  }
+  hostnum_str[j] = '\0';
+  if (j == 0) {
+    hostnum = 0;
+  } else {
+    /* Use only the last 9 decimals, so we don't overflow an int */
+    hostnum = strtol(hostnum_str + max(0,j-9), NULL, 10);
+  }
+
+  if (debug) {
+    fprintf(debug,"In gmx_setup_nodecomm: hostname '%s', hostnum %d\n",
+        mpi_hostname,hostnum);
+  }
+  return hostnum;
+#endif
+}
+
 void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr)
 {
   gmx_nodecomm_t *nc;
-  int  n,rank,resultlen,hostnum,i,j,ng,ni;
-#ifdef GMX_MPI
-  char mpi_hostname[MPI_MAX_PROCESSOR_NAME],num[MPI_MAX_PROCESSOR_NAME];
-#endif
+  int  n,rank,hostnum,ng,ni;
 
   /* Many MPI implementations do not optimize MPI_Allreduce
    * (and probably also other global communication calls)
@@ -281,35 +316,16 @@ void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr)
 #ifdef GMX_MPI
     MPI_Comm_size(cr->mpi_comm_mygroup,&n);
     MPI_Comm_rank(cr->mpi_comm_mygroup,&rank);
-    MPI_Get_processor_name(mpi_hostname,&resultlen);
-    /* This procedure can only differentiate nodes with host names
-     * that end on unique numbers.
-     */
-    i = 0;
-    j = 0;
-    /* Only parse the host name up to the first dot */
-    while(i < resultlen && mpi_hostname[i] != '.') {
-      if (isdigit(mpi_hostname[i])) {
-       num[j++] = mpi_hostname[i];
-      }
-      i++;
-    }
-    num[j] = '\0';
-    if (j == 0) {
-      hostnum = 0;
-    } else {
-      /* Use only the last 9 decimals, so we don't overflow an int */
-      hostnum = strtol(num + max(0,j-9), NULL, 10); 
-    }
+
+    hostnum = gmx_hostname_num();
 
     if (debug) {
       fprintf(debug,
-             "In gmx_setup_nodecomm: splitting communicator of size %d\n",
-             n);
-      fprintf(debug,"In gmx_setup_nodecomm: hostname '%s', hostnum %d\n",
-             mpi_hostname,hostnum);
+              "In gmx_setup_nodecomm: splitting communicator of size %d\n",
+              n);
     }
 
+
     /* The intra-node communicator, split on node number */
     MPI_Comm_split(cr->mpi_comm_mygroup,hostnum,rank,&nc->comm_intra);
     MPI_Comm_rank(nc->comm_intra,&nc->rank_intra);
index 2ce6b28e321186a567f815eb4e175cf4bb2cf733..43cca09078beb96b96b5e348f8f06e058b053373 100644 (file)
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
-
+#ifdef __linux
+#define _GNU_SOURCE
+#include <sched.h>
+#include <sys/syscall.h>
+#endif
 #include <signal.h>
 #include <stdlib.h>
 
 #include "md_openmm.h"
 #endif
 
+#ifdef GMX_OPENMP
+#include <omp.h>
+#endif
+
 
 typedef struct { 
     gmx_integrator_t *func;
@@ -248,10 +256,12 @@ static t_commrec *mdrunner_start_threads(int nthreads,
 }
 
 
-/* get the number of threads based on how many there were requested, 
-   which algorithms we're using, and how many particles there are. */
-static int get_nthreads(int nthreads_requested, t_inputrec *inputrec,
-                        gmx_mtop_t *mtop)
+/* Get the number of threads to use for thread-MPI based on how many
+ * were requested, which algorithms we're using,
+ * and how many particles there are.
+ */
+static int get_nthreads_mpi(int nthreads_requested, t_inputrec *inputrec,
+                            gmx_mtop_t *mtop)
 {
     int nthreads,nthreads_new;
     int min_atoms_per_thread;
@@ -365,7 +375,8 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile,
     gmx_large_int_t reset_counters;
     gmx_edsam_t ed=NULL;
     t_commrec   *cr_old=cr; 
-    int         nthreads=1;
+    int         nthreads_mpi=1;
+    int         nthreads_pme=1;
 
     /* CAUTION: threads may be started later on in this function, so
        cr doesn't reflect the final parallel state right now */
@@ -390,12 +401,12 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile,
 
         /* NOW the threads will be started: */
 #ifdef GMX_THREADS
-        nthreads = get_nthreads(nthreads_requested, inputrec, mtop);
+        nthreads_mpi = get_nthreads_mpi(nthreads_requested, inputrec, mtop);
 
-        if (nthreads > 1)
+        if (nthreads_mpi > 1)
         {
             /* now start the threads. */
-            cr=mdrunner_start_threads(nthreads, fplog, cr_old, nfile, fnm, 
+            cr=mdrunner_start_threads(nthreads_mpi, fplog, cr_old, nfile, fnm,
                                       oenv, bVerbose, bCompact, nstglobalcomm, 
                                       ddxyz, dd_node_order, rdd, rconstr, 
                                       dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
@@ -622,7 +633,31 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile,
         gmx_setup_nodecomm(fplog,cr);
     }
 
-    wcycle = wallcycle_init(fplog,resetstep,cr);
+    /* Get the number of OpenMP/PME threads.
+     * The env variable should be read on only one node to make sure it is identical everywhere. */
+#ifdef GMX_OPENMP
+    if (EEL_PME(inputrec->coulombtype))
+    {
+        if (MASTER(cr))
+        {
+            char *ptr;
+            if ((ptr=getenv("GMX_PME_NTHREADS")) != NULL)
+            {
+                sscanf(ptr,"%d",&nthreads_pme);
+            }
+            if (fplog != NULL && nthreads_pme > 1)
+            {
+                fprintf(fplog,"Using %d threads for PME\n",nthreads_pme);
+            }
+        }
+        if (PAR(cr))
+        {
+            gmx_bcast_sim(sizeof(nthreads_pme),&nthreads_pme,cr);
+        }
+    }
+#endif
+
+    wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pme);
     if (PAR(cr))
     {
         /* Master synchronizes its value of reset_counters with all nodes 
@@ -759,11 +794,46 @@ int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile,
             /* The PME only nodes need to know nChargePerturbed */
             gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
         }
+
+
+        /* Set CPU affinity. Can be important for performance.
+           On some systems (e.g. Cray) CPU Affinity is set by default.
+           But default assigning doesn't work (well) with only some ranks
+           having threads. This causes very low performance.
+           External tools have cumbersome syntax for setting affinity
+           in the case that only some ranks have threads.
+           Thus it is important that GROMACS sets the affinity internally at
+           if only PME is using threads.
+        */
+
+#ifdef GMX_OPENMP
+#ifdef __linux
+#ifdef GMX_LIB_MPI
+        {
+            int core;
+            MPI_Comm comm_intra; /* intra communicator (different from nc.comm_intra: this one includes the PME nodes) */
+            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),gmx_node_rank(),&comm_intra);
+            int local_omp_nthreads = (cr->duty & DUTY_PME) ? nthreads_pme : 1; /* OpenMP threads used by this rank */
+            MPI_Scan(&local_omp_nthreads,&core, 1, MPI_INT, MPI_SUM, comm_intra);
+            core-=local_omp_nthreads; /* make exclusive scan */
+#pragma omp parallel firstprivate(core) num_threads(local_omp_nthreads)
+            {
+                cpu_set_t mask;
+                CPU_ZERO(&mask);
+                core+=omp_get_thread_num();
+                CPU_SET(core,&mask);
+                sched_setaffinity((pid_t) syscall (SYS_gettid),sizeof(cpu_set_t),&mask);
+            }
+        }
+#endif /*GMX_LIB_MPI*/
+#endif /*__linux*/
+#endif /*GMX_OPENMP*/
+
         if (cr->duty & DUTY_PME)
         {
             status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
                                   mtop ? mtop->natoms : 0,nChargePerturbed,
-                                  (Flags & MD_REPRODUCIBLE));
+                                  (Flags & MD_REPRODUCIBLE),nthreads_pme);
             if (status != 0) 
             {
                 gmx_fatal(FARGS,"Error %d initializing PME",status);
index 776e3ebd12593f6a8a9bd41e1c9b43c29828d134..5d2acdfe3b07202cba3df879dbc3d215c844c607 100644 (file)
@@ -6,10 +6,9 @@
  * 
  *          GROningen MAchine for Chemical Simulations
  * 
- *                        VERSION 4.5
  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2008, The GROMACS development team,
+ * Copyright (c) 2001-2012, The GROMACS development team,
  * check out http://www.gromacs.org for more information.
  
  * This program is free software; you can redistribute it and/or
 #include "tmpi.h"
 #endif
 
+#ifdef GMX_OPENMP
+#define FFT5D_THREADS
+#endif
 #ifdef FFT5D_THREADS
 #include <omp.h>
 /* requires fftw compiled with openmp */
-#define FFT5D_FFTW_THREADS
+/* #define FFT5D_FFTW_THREADS (now set by cmake) */
 #endif
 
 #include "fft5d.h"
@@ -177,23 +179,25 @@ gmx_calloc_aligned(size_t size)
 /* NxMxK the size of the data
  * comm communicator to use for fft5d
  * P0 number of processor in 1st axes (can be null for automatic)
- * lin is allocated by fft5d because size of array is only known after planning phase */
-fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout)
+ * lin is allocated by fft5d because size of array is only known after planning phase 
+ * rlout2 is only used as an intermediate buffer - it is returned after allocation only so it can be reused for the back transform - it should not be used by the caller
+*/
+fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads)
 {
 
-    int P[2],bMaster,prank[2],i;
+    int P[2],bMaster,prank[2],i,t;
     int rNG,rMG,rKG;
     int *N0=0, *N1=0, *M0=0, *M1=0, *K0=0, *K1=0, *oN0=0, *oN1=0, *oM0=0, *oM1=0, *oK0=0, *oK1=0;
     int N[3],M[3],K[3],pN[3],pM[3],pK[3],oM[3],oK[3],*iNin[3]={0},*oNin[3]={0},*iNout[3]={0},*oNout[3]={0};
     int C[3],rC[3],nP[2];
     int lsize;
-    t_complex *lin=0,*lout=0;
+    t_complex *lin=0,*lout=0,*lout2=0,*lout3=0;
     fft5d_plan plan;
     int s;
 
     /* comm, prank and P are in the order of the decomposition (plan->cart is in the order of transposes) */
 #ifdef GMX_MPI
-    if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != 0)
+    if (GMX_PARALLEL_ENV_INITIALIZED && comm[0] != MPI_COMM_NULL)
     {
         MPI_Comm_size(comm[0],&P[0]);
         MPI_Comm_rank(comm[0],&prank[0]);
@@ -205,7 +209,7 @@ fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_
         prank[0] = 0;
     }
 #ifdef GMX_MPI
-    if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != 0)
+    if (GMX_PARALLEL_ENV_INITIALIZED && comm[1] != MPI_COMM_NULL)
     {
         MPI_Comm_size(comm[1],&P[1]);
         MPI_Comm_rank(comm[1],&prank[1]);
@@ -402,35 +406,30 @@ fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_
     if (!(flags&FFT5D_NOMALLOC)) { 
         lin = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);   
         lout = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize); 
+        lout2 = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
+        lout3 = (t_complex*)gmx_calloc_aligned(sizeof(t_complex) * lsize);
     } else {
         lin = *rlin;
         lout = *rlout;
+        lout2 = *rlout2;
+        lout3 = *rlout3;
     }
 
     plan = (fft5d_plan)calloc(1,sizeof(struct fft5d_plan_t));
 
     
-#ifdef FFT5D_THREADS
-#ifdef FFT5D_FFTW_THREADS
-    FFTW(init_threads)();
-    int nthreads;
-    #pragma omp parallel
-    {
-        #pragma omp master
-        {
-            nthreads = omp_get_num_threads();
-        }
-    }
-    if (prank[0] == 0 && prank[1] == 0)
+    if (debug)
     {
-        printf("Running fftw on %d threads\n",nthreads);        
+        fprintf(debug, "Running on %d threads\n",nthreads);        
     }
-    FFTW(plan_with_nthreads)(nthreads);
-#endif
-#endif    
 
-#ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but insead only 1D plans */
-    if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1))) {  /*don't do 3d plan in parallel or if in_place requested */  
+#ifdef GMX_FFT_FFTW3  /*if not FFTW - then we don't do a 3d plan but instead use only 1D plans */
+    /* It is possible to use the 3d plan with OMP threads - but in that case it is not allowed to be called from
+     * within a parallel region. For now this is deactivated. If it is to be supported, it has to be made sure
+     * that the execution of the 3d plan happens in a master/serial block (since it contains its own parallel region)
+     * and that the 3d plan is faster than the 1d plans.
+     */
+    if ((!(flags&FFT5D_INPLACE)) && (!(P[0]>1 || P[1]>1)) && nthreads==1) {  /*don't do 3d plan in parallel or if in_place requested */
             int fftwflags=FFTW_DESTROY_INPUT;
             FFTW(iodim) dims[3];
             int inNG=NG,outMG=MG,outKG=KG;
@@ -490,6 +489,11 @@ fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_
                     dims[2].os = 1;                  
                 }           
             }
+#ifdef FFT5D_THREADS
+#ifdef FFT5D_FFTW_THREADS
+            FFTW(plan_with_nthreads)(nthreads);
+#endif
+#endif
             if ((flags&FFT5D_REALCOMPLEX) && !(flags&FFT5D_BACKWARD)) {
                 plan->p3d = FFTW(plan_guru_dft_r2c)(/*rank*/ 3, dims,
                                      /*howmany*/ 0, /*howmany_dims*/0 ,
@@ -506,6 +510,11 @@ fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_
                                      (FFTW(complex) *)lin, (FFTW(complex) *)lout,
                                      /*sign*/ (flags&FFT5D_BACKWARD)?1:-1, /*flags*/ fftwflags);
             }
+#ifdef FFT5D_THREADS
+#ifdef FFT5D_FFTW_THREADS
+            FFTW(plan_with_nthreads)(1);
+#endif
+#endif
             FFTW_UNLOCK;
     }
     if (!plan->p3d) {  /* for decomposition and if 3d plan did not work */
@@ -516,12 +525,24 @@ fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_
                 fprintf(debug,"FFT5D: Plan s %d rC %d M %d pK %d C %d lsize %d\n",
                         s,rC[s],M[s],pK[s],C[s],lsize);
             }
-            if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
-                gmx_fft_init_many_1d_real( &plan->p1d[s], rC[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
-            } else {
-                gmx_fft_init_many_1d     ( &plan->p1d[s],  C[s], pM[s]*pK[s], (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
+            plan->p1d[s] = (gmx_fft_t*)malloc(sizeof(gmx_fft_t)*nthreads);
+
+            /* Make sure that the init routines are only called by one thread at a time and in order
+               (the latter is only important to not confuse valgrind)
+             */
+#pragma omp parallel for num_threads(nthreads) schedule(static) ordered
+            for(t=0; t<nthreads; t++)
+            {
+                int tsize = ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads);
+
+                if ((flags&FFT5D_REALCOMPLEX) && ((!(flags&FFT5D_BACKWARD) && s==0) || ((flags&FFT5D_BACKWARD) && s==2))) {
+                    gmx_fft_init_many_1d_real( &plan->p1d[s][t], rC[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
+                } else {
+                    gmx_fft_init_many_1d     ( &plan->p1d[s][t],  C[s], tsize, (flags&FFT5D_NOMEASURE)?GMX_FFT_FLAG_CONSERVATIVE:0 );
+                }
             }
         }
+
 #ifdef GMX_FFT_FFTW3 
     }
 #endif
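The per-thread batch count above, ((t+1)*pM[s]*pK[s]/nthreads)-(t*pM[s]*pK[s]/nthreads), is the usual integer block partition: thread t handles indices [t*n/T, (t+1)*n/T), which tiles all n items exactly once even when n is not divisible by T. A minimal standalone sketch (sizes are made-up):

/* block_partition_demo.c - sketch of the t*n/T block partitioning */
#include <stdio.h>

int main(void)
{
    int n = 10;       /* e.g. pM[s]*pK[s] 1D-FFT batches */
    int T = 4;        /* number of OpenMP threads        */
    int t, total = 0;

    for (t = 0; t < T; t++)
    {
        int start = t*n/T;          /* first batch of thread t    */
        int end   = (t+1)*n/T;      /* one past the last batch    */
        int tsize = end - start;    /* same formula as in fft5d.c */

        printf("thread %d: batches [%d,%d), %d of them\n", t, start, end, tsize);
        total += tsize;
    }
    printf("total = %d (== n)\n", total);  /* the ranges tile [0,n) exactly */
    return 0;
}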
@@ -534,9 +555,9 @@ fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_
     FFTW_LOCK;
     for (s=0;s<2;s++) {
         if ((s==0 && !(flags&FFT5D_ORDER_YZ)) || (s==1 && (flags&FFT5D_ORDER_YZ))) 
-            plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
+            plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*K[s]*pM[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
         else
-            plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lin, (real*)lout, plan->cart[s], FFTW_PATIENT);
+            plan->mpip[s] = FFTW(mpi_plan_many_transpose)(nP[s], nP[s], N[s]*pK[s]*M[s]*2, 1, 1, (real*)lout2, (real*)lout3, plan->cart[s], FFTW_PATIENT);
     }
     FFTW_UNLOCK;
 #endif 
@@ -544,6 +565,8 @@ fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_
     
     plan->lin=lin;
     plan->lout=lout;
+    plan->lout2=lout2;
+    plan->lout3=lout3;
     
     plan->NG=NG;plan->MG=MG;plan->KG=KG;
     
@@ -562,8 +585,11 @@ fft5d_plan fft5d_plan_3d(int NG, int MG, int KG, MPI_Comm comm[2], int flags, t_
     plan->realcomplex=realcomplex;
 */
     plan->flags=flags;
+    plan->nthreads=nthreads;
     *rlin=lin;
     *rlout=lout;
+    *rlout2=lout2;
+    *rlout3=lout3;
     return plan;
 }
 
@@ -586,40 +612,36 @@ enum order {
   NG, MG, KG is size of global data*/
 static void splitaxes(t_complex* lout,const t_complex* lin,
                       int maxN,int maxM,int maxK, int pN, int pM, int pK,
-                      int P,int NG,int *N, int* oN)
+                      int P,int NG,int *N, int* oN,int starty,int startz,int endy, int endz)
 {
     int x,y,z,i;
     int in_i,out_i,in_z,out_z,in_y,out_y;
+    int s_y,e_y;
 
-#ifdef FFT5D_THREADS
-    int zi;
-
-    /* In the thread parallel case we want to loop over z and i
-     * in a single for loop to allow for better load balancing.
-     */
-#pragma omp parallel for private(z,in_z,out_z,i,in_i,out_i,y,in_y,out_y,x) schedule(static)
-    for (zi=0; zi<pK*P; zi++)
-    {
-        z = zi/P;
-        i = zi - z*P;
-#else
-    for (z=0; z<pK; z++) /*3. z l*/ 
+    for (z=startz; z<endz+1; z++) /*3. z l*/
     {
-#endif
-        in_z  = z*maxN*maxM;
-        out_z = z*NG*pM;
+        if (z==startz) {
+            s_y=starty;
+        } else {
+            s_y=0;
+        }
+        if (z==endz) {
+            e_y=endy;
+        } else {
+            e_y=pM;
+        }
+        out_z  = z*maxN*maxM;
+        in_z = z*NG*pM;
 
-#ifndef FFT5D_THREADS
         for (i=0; i<P; i++) /*index cube along long axis*/
-#endif
         {
-            in_i  = in_z  + i*maxN*maxM*maxK;
-            out_i = out_z + oN[i];
-            for (y=0;y<pM;y++) { /*2. y k*/
-                in_y  = in_i  + y*maxN;
-                out_y = out_i + y*NG;
+            out_i  = out_z  + i*maxN*maxM*maxK;
+            in_i = in_z + oN[i];
+            for (y=s_y;y<e_y;y++) { /*2. y k*/
+                out_y  = out_i  + y*maxN;
+                in_y = in_i + y*NG;
                 for (x=0;x<N[i];x++) { /*1. x j*/
-                    lout[in_y+x] = lin[out_y+x];
+                    lout[out_y+x] = lin[in_y+x];    /*in=z*NG*pM+oN[i]+y*NG+x*/
                     /*after split important that each processor chunk i has size maxN*maxM*maxK and thus being the same size*/
                     /*before split data contiguos - thus if different processor get different amount oN is different*/
                 }
@@ -634,43 +656,47 @@ static void splitaxes(t_complex* lout,const t_complex* lin,
   the major, middle, minor order is only correct for x,y,z (N,M,K) for the input
   N,M,K local dimensions
   KG global size*/
-static void joinAxesTrans13(t_complex* lin,const t_complex* lout,
+static void joinAxesTrans13(t_complex* lout,const t_complex* lin,
                             int maxN,int maxM,int maxK,int pN, int pM, int pK, 
-                            int P,int KG, int* K, int* oK)
+                            int P,int KG, int* K, int* oK,int starty, int startx, int endy, int endx)
 {
     int i,x,y,z;
-    int in_i,out_i,in_x,out_x,in_z,out_z;
-
-#ifdef FFT5D_THREADS
-    int xi;
+    int out_i,in_i,out_x,in_x,out_z,in_z;
+    int s_y,e_y;
 
-    /* In the thread parallel case we want to loop over x and i
-     * in a single for loop to allow for better load balancing.
-     */
-#pragma omp parallel for private(x,in_x,out_x,i,in_i,out_i,z,in_z,out_z,y) schedule(static)
-    for (xi=0; xi<pN*P; xi++)
-    {
-        x = xi/P;
-        i = xi - x*P;
-#else
-    for (x=0;x<pN;x++) /*1.j*/
+    for (x=startx;x<endx+1;x++) /*1.j*/
     {
-#endif
-        in_x  = x*KG*pM;
-        out_x = x;
+        if (x==startx)
+        {
+            s_y=starty;
+        }
+        else
+        {
+            s_y=0;
+        }
+        if (x==endx)
+        {
+            e_y=endy;
+        }
+        else
+        {
+            e_y=pM;
+        }
+
+        out_x  = x*KG*pM;
+        in_x = x;
 
-#ifndef FFT5D_THREADS
         for (i=0;i<P;i++) /*index cube along long axis*/
-#endif
         {
-            in_i  = in_x  + oK[i];
-            out_i = out_x + i*maxM*maxN*maxK;
+            out_i  = out_x  + oK[i];
+            in_i = in_x + i*maxM*maxN*maxK;
             for (z=0;z<K[i];z++) /*3.l*/
             {
-                in_z  = in_i  + z;
-                out_z = out_i + z*maxM*maxN;
-                for (y=0;y<pM;y++) { /*2.k*/
-                    lin[in_z+y*KG] = lout[out_z+y*maxN];
+                out_z  = out_i  + z;
+                in_z = in_i + z*maxM*maxN;
+                for (y=s_y;y<e_y;y++) /*2.k*/
+                {
+                    lout[out_z+y*KG] = lin[in_z+y*maxN]; /*out=x*KG*pM+oK[i]+z+y*KG*/
                 }
             }
         }
@@ -683,40 +709,44 @@ static void joinAxesTrans13(t_complex* lin,const t_complex* lout,
   the minor, middle, major order is only correct for x,y,z (N,M,K) for the input
   N,M,K local size
   MG, global size*/
-static void joinAxesTrans12(t_complex* lin,const t_complex* lout,int maxN,int maxM,int maxK,int pN, int pM, int pK,
-                int P,int MG, int* M, int* oM) {
+static void joinAxesTrans12(t_complex* lout,const t_complex* lin,int maxN,int maxM,int maxK,int pN, int pM, int pK,
+                            int P,int MG, int* M, int* oM, int startx, int startz, int endx, int endz) {
     int i,z,y,x;
-    int in_i,out_i,in_z,out_z,in_x,out_x;
+    int out_i,in_i,out_z,in_z,out_x,in_x;
+    int s_x,e_x;
 
-#ifdef FFT5D_THREADS
-    int zi;
-
-    /* In the thread parallel case we want to loop over z and i
-     * in a single for loop to allow for better load balancing.
-     */
-#pragma omp parallel for private(i,in_i,out_i,z,in_z,out_z,in_x,out_x,x,y) schedule(static)
-    for (zi=0; zi<pK*P; zi++)
-    {
-        z = zi/P;
-        i = zi - z*P;
-#else
-    for (z=0; z<pK; z++)
+    for (z=startz; z<endz+1; z++)
     {
-#endif
-        in_z  = z*MG*pN;
-        out_z = z*maxM*maxN;
+        if (z==startz)
+        {
+            s_x=startx;
+        }
+        else
+        {
+            s_x=0;
+        }
+        if (z==endz)
+        {
+            e_x=endx;
+        }
+        else
+        {
+            e_x=pN;
+        }
+        out_z  = z*MG*pN;
+        in_z = z*maxM*maxN;
 
-#ifndef FFT5D_THREADS
         for (i=0; i<P; i++) /*index cube along long axis*/
-#endif
         {
-            in_i  = in_z  + oM[i];
-            out_i = out_z + i*maxM*maxN*maxK;
-            for (x=0;x<pN;x++) {
-                in_x  = in_i  + x*MG;
-                out_x = out_i + x;
-                for (y=0;y<M[i];y++) {
-                    lin[in_x+y] = lout[out_x+y*maxN];
+            out_i  = out_z  + oM[i];
+            in_i = in_z + i*maxM*maxN*maxK;
+            for (x=s_x;x<e_x;x++)
+            {
+                out_x  = out_i  + x*MG;
+                in_x = in_i + x;
+                for (y=0;y<M[i];y++)
+                {
+                    lout[out_x+y] = lin[in_x+y*maxN]; /*out=z*MG*pN+oM[i]+x*MG+y*/
                 }
             }
         }
@@ -803,7 +833,7 @@ static void compute_offsets(fft5d_plan plan, int xs[], int xl[], int xc[], int N
 }
 
 static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_plan plan) {
-    int x,y,z,l;
+    int x,y,z,l,t;
     int *coor = plan->coor;
     int xs[3],xl[3],xc[3],NG[3];        
     int ll=(plan->flags&FFT5D_REALCOMPLEX)?1:2;
@@ -824,11 +854,14 @@ static void print_localdata(const t_complex* lin, const char* txt, int s, fft5d_
     }
 }
 
-void fft5d_execute(fft5d_plan plan,fft5d_time times) {
+void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times) {
     t_complex *lin = plan->lin;
     t_complex *lout = plan->lout;
+    t_complex *lout2 = plan->lout2;
+    t_complex *lout3 = plan->lout3;
+    t_complex *fftout,*joinin;
 
-    gmx_fft_t *p1d=plan->p1d;
+    gmx_fft_t **p1d=plan->p1d;
 #ifdef FFT5D_MPI_TRANSPOSE
     FFTW(plan) *mpip=plan->mpip;
 #endif
@@ -839,124 +872,275 @@ void fft5d_execute(fft5d_plan plan,fft5d_time times) {
     double time_fft=0,time_local=0,time_mpi[2]={0},time=0;    
     int *N=plan->N,*M=plan->M,*K=plan->K,*pN=plan->pN,*pM=plan->pM,*pK=plan->pK,
         *C=plan->C,*P=plan->P,**iNin=plan->iNin,**oNin=plan->oNin,**iNout=plan->iNout,**oNout=plan->oNout;
-    int s=0;
+    int s=0,tstart,tend,bParallelDim;
     
     
 #ifdef GMX_FFT_FFTW3 
-    if (plan->p3d) {
-        if (times!=0)
-            time=MPI_Wtime();
-        FFTW(execute)(plan->p3d); 
-        if (times!=0)
-            times->fft+=MPI_Wtime()-time;
+    if (plan->p3d)
+    {
+        if (thread == 0)
+        {
+#ifdef NOGMX
+            if (times!=0)
+            {
+                time=MPI_Wtime();
+            }
+#endif
+            FFTW(execute)(plan->p3d);
+#ifdef NOGMX
+            if (times!=0)
+            {
+                times->fft+=MPI_Wtime()-time;
+            }
+#endif
+        }
         return;
     }
 #endif
 
+        s=0;
+
     /*lin: x,y,z*/
-    if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: copy in lin\n", s, plan);
-    for (s=0;s<2;s++) {
-        if (times!=0)
-            time=MPI_Wtime();
-        
-        if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0) {
-            gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
-        } else {
-            gmx_fft_many_1d(     p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin,lout);
+        if (plan->flags&FFT5D_DEBUG && thread == 0)
+        {
+            print_localdata(lin, "%d %d: copy in lin\n", s, plan);
         }
-        if (times!=0)
-            time_fft+=MPI_Wtime()-time;
-    
-        if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
-        
+
+        for (s=0;s<2;s++) {  /*loop over first two FFT steps (corner rotations)*/
+
 #ifdef GMX_MPI
         if (GMX_PARALLEL_ENV_INITIALIZED && cart[s] !=0 && P[s]>1 )
         {
-            if (times!=0)
-                time=MPI_Wtime(); 
-            /*prepare for AllToAll
-              1. (most outer) axes (x) is split into P[s] parts of size N[s] 
-              for sending*/
-            splitaxes(lin,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s]);
+            bParallelDim = 1;
+        }
+        else
+#endif
+        {
+            bParallelDim = 0;
+        }
 
-            if (times!=0)
+        /* ---------- START FFT ------------ */
+#ifdef NOGMX
+        if (times!=0 && thread == 0)
+        {
+            time=MPI_Wtime();
+        }
+#endif
+
+        if (bParallelDim) {
+            fftout = lout;
+        }
+        else
+        {
+            if (s==0)
+            {
+                fftout = lout3;
+            } else
+            {
+                fftout = lout2;
+            }
+        }
+
+        tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
+        if ((plan->flags&FFT5D_REALCOMPLEX) && !(plan->flags&FFT5D_BACKWARD) && s==0)
+        {
+            gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,fftout+tstart);
+        } else
+        {
+            gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,fftout+tstart);
+
+        }
+
+#ifdef NOGMX
+        if (times != NULL && thread == 0)
+        {
+            time_fft+=MPI_Wtime()-time;
+        }
+#endif
+        if (plan->flags&FFT5D_DEBUG && thread == 0)
+        {
+            print_localdata(lout, "%d %d: FFT %d\n", s, plan);
+        }
+        /* ---------- END FFT ------------ */
+
+        /* ---------- START SPLIT + TRANSPOSE ------------ (if parallel in this dimension) */
+        if (bParallelDim) {
+#ifdef NOGMX
+            if (times != NULL && thread == 0)
             {
-                time_local+=MPI_Wtime()-time;
-            
-                /*send, recv*/
                 time=MPI_Wtime();
             }
+#endif
+            /*prepare for AllToAll
+              1. (most outer) axes (x) is split into P[s] parts of size N[s]
+              for sending*/
+            if (pM[s]>0)
+            {
+                tend = ((thread+1)*pM[s]*pK[s]/plan->nthreads);
+                tstart/=C[s];
+                splitaxes(lout2,lout,N[s],M[s],K[s], pN[s],pM[s],pK[s],P[s],C[s],iNout[s],oNout[s],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
+            }
+#pragma omp barrier /*barrier required before AllToAll (all input has to be there) - before timing to make timing more accurate*/
+#ifdef NOGMX
+            if (times != NULL && thread == 0)
+            {
+                time_local+=MPI_Wtime()-time;
+            }
+#endif
 
+        /* ---------- END SPLIT , START TRANSPOSE------------ */
+
+            if (thread == 0)
+            {
+#ifdef NOGMX
+                if (times!=0)
+                {
+                    time=MPI_Wtime();
+                }
+#else
+                wallcycle_start(times,ewcPME_FFTCOMM);
+#endif
 #ifdef FFT5D_MPI_TRANSPOSE
-            FFTW(execute)(mpip[s]);  
+                FFTW(execute)(mpip[s]);
 #else
-            if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) 
-                MPI_Alltoall(lin,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
-            else
-                MPI_Alltoall(lin,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
-#endif /*FFT5D_MPI_TRANSPOSE*/
-            if (times!=0)
-                time_mpi[s]=MPI_Wtime()-time;
-        }
+#ifdef GMX_MPI
+                if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ)))
+                    MPI_Alltoall(lout2,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*pM[s]*K[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
+                else
+                    MPI_Alltoall(lout2,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,lout3,N[s]*M[s]*pK[s]*sizeof(t_complex)/sizeof(real),GMX_MPI_REAL,cart[s]);
+#else
+                gmx_incons("fft5d MPI call without MPI configuration");
 #endif /*GMX_MPI*/
+#endif /*FFT5D_MPI_TRANSPOSE*/
+#ifdef NOGMX
+                if (times!=0)
+                {
+                    time_mpi[s]=MPI_Wtime()-time;
+                }
+#else
+                wallcycle_stop(times,ewcPME_FFTCOMM);
+#endif
+            }  /*master*/
+        }  /* bParallelDim */
+#pragma omp barrier  /*both needed for parallel and non-parallel dimension (either have to wait on data from AllToAll or from the last FFT)*/
 
-    
-        if (times!=0)
+        /* ---------- END SPLIT + TRANSPOSE------------ */
+
+        /* ---------- START JOIN ------------ */
+#ifdef NOGMX
+        if (times != NULL && thread == 0)
+        {
             time=MPI_Wtime();
+        }
+#endif
+
+        if (bParallelDim) {
+            joinin = lout3;
+        } else {
+            joinin = fftout;
+        }
         /*bring back in matrix form 
           thus make  new 1. axes contiguos
-          also local transpose 1 and 2/3 */
-        if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) 
-            joinAxesTrans13(lin,lout,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);
-        else 
-            joinAxesTrans12(lin,lout,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1]);    
-        if (times!=0)
+          also local transpose 1 and 2/3 
+          runs on thread used for following FFT (thus needing a barrier before but not afterwards)
+        */
+        if ((s==0 && !(plan->flags&FFT5D_ORDER_YZ)) || (s==1 && (plan->flags&FFT5D_ORDER_YZ))) {
+            if (pM[s]>0)
+            {
+                tstart = ( thread   *pM[s]*pN[s]/plan->nthreads);
+                tend   = ((thread+1)*pM[s]*pN[s]/plan->nthreads);
+                joinAxesTrans13(lin,joinin,N[s],pM[s],K[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pM[s],tstart/pM[s],tend%pM[s],tend/pM[s]);
+            }
+        }
+        else {
+            if (pN[s]>0)
+            {
+                tstart = ( thread   *pK[s]*pN[s]/plan->nthreads);
+                tend   = ((thread+1)*pK[s]*pN[s]/plan->nthreads);
+                joinAxesTrans12(lin,joinin,N[s],M[s],pK[s],pN[s],pM[s],pK[s],P[s],C[s+1],iNin[s+1],oNin[s+1],tstart%pN[s],tstart/pN[s],tend%pN[s],tend/pN[s]);
+            }
+        }
+
+#ifdef NOGMX
+        if (times != NULL && thread == 0)
+        {
             time_local+=MPI_Wtime()-time;
-    
-        if (plan->flags&FFT5D_DEBUG) print_localdata(lin, "%d %d: tranposed %d\n", s+1, plan);
-                
+        }
+#endif
+        if (plan->flags&FFT5D_DEBUG && thread == 0)
+        {
+            print_localdata(lin, "%d %d: transposed %d\n", s+1, plan);
+        }
+        /* ---------- END JOIN ------------ */
+
         /*if (debug) print_localdata(lin, "%d %d: transposed x-z\n", N1, M0, K, ZYX, coor);*/
-    }    
-    
-    if (times!=0)
-        time=MPI_Wtime();
-    if (plan->flags&FFT5D_INPLACE) lout=lin;
+    }  /* for(s=0;s<2;s++) */
+#ifdef NOGMX
+        if (times != NULL && thread == 0)
+        {
+            time=MPI_Wtime();
+        }
+#endif
+
+    if (plan->flags&FFT5D_INPLACE) lout=lin; /*in place currently not supported*/
+
+    /*  ----------- FFT ----------- */
+    tstart = (thread*pM[s]*pK[s]/plan->nthreads)*C[s];
     if ((plan->flags&FFT5D_REALCOMPLEX) && (plan->flags&FFT5D_BACKWARD)) {
-        gmx_fft_many_1d_real(p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin,lout);
+        gmx_fft_many_1d_real(p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_COMPLEX_TO_REAL:GMX_FFT_REAL_TO_COMPLEX,lin+tstart,lout+tstart);
     } else {
-        gmx_fft_many_1d(     p1d[s],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin,lout);
+        gmx_fft_many_1d(     p1d[s][thread],(plan->flags&FFT5D_BACKWARD)?GMX_FFT_BACKWARD:GMX_FFT_FORWARD,               lin+tstart,lout+tstart);
     }
+    /* ------------ END FFT ---------*/
 
-    if (times!=0)
+#ifdef NOGMX
+    if (times != NULL && thread == 0)
+    {
         time_fft+=MPI_Wtime()-time;
-    if (plan->flags&FFT5D_DEBUG) print_localdata(lout, "%d %d: FFT %d\n", s, plan);
-    /*if (debug) print_localdata(lout, "%d %d: FFT in y\n", N1, M, K0, YZX, coor);*/
-    
-    if (times!=0) {
+
         times->fft+=time_fft;
         times->local+=time_local;
         times->mpi2+=time_mpi[1];
         times->mpi1+=time_mpi[0];
     }
+#endif
+
+    if (plan->flags&FFT5D_DEBUG && thread == 0)
+    {
+        print_localdata(lout, "%d %d: FFT %d\n", s, plan);
+    }
 }
 
 void fft5d_destroy(fft5d_plan plan) {
-    int s;
-    for (s=0;s<3;s++) {
-        gmx_many_fft_destroy(plan->p1d[s]);
-        if (plan->iNin[s]) {
+    int s,t;
+    for (s=0;s<3;s++)
+    {
+        if (plan->p1d[s])
+        {
+            for (t=0;t<plan->nthreads;t++)
+            {
+                gmx_many_fft_destroy(plan->p1d[s][t]);
+            }
+            free(plan->p1d[s]);
+        }
+        if (plan->iNin[s])
+        {
             free(plan->iNin[s]);
             plan->iNin[s]=0;
         }
-        if (plan->oNin[s]) {
+        if (plan->oNin[s])
+        {
             free(plan->oNin[s]);
             plan->oNin[s]=0;
         }
-        if (plan->iNout[s]) {
+        if (plan->iNout[s])
+        {
             free(plan->iNout[s]);
             plan->iNout[s]=0;
         }
-        if (plan->oNout[s]) {
+        if (plan->oNout[s])
+        {
             free(plan->oNout[s]);
             plan->oNout[s]=0;
         }
@@ -965,7 +1149,13 @@ void fft5d_destroy(fft5d_plan plan) {
     FFTW_LOCK;
 #ifdef FFT5D_MPI_TRANSPOS
     for (s=0;s<2;s++)    
+    {
         FFTW(destroy_plan)(plan->mpip[s]);
+    }
+    if (plan->p3d)
+    {
+        FFTW(destroy_plan)(plan->p3d);
+    }
 #endif /* FFT5D_MPI_TRANSPOS */
 #endif /* GMX_FFT_FFTW3 */
 
@@ -974,7 +1164,7 @@ void fft5d_destroy(fft5d_plan plan) {
     
 #ifdef FFT5D_THREADS
 #ifdef FFT5D_FFTW_THREADS
-    FFTW(cleanup_threads)();
+    /*FFTW(cleanup_threads)();*/
 #endif
 #endif
 
@@ -995,7 +1185,7 @@ void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor
 
 /*same as fft5d_plan_3d but with cartesian coordinator and automatic splitting 
   of processor dimensions*/
-fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout) {
+fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int flags, t_complex** rlin, t_complex** rlout, t_complex** rlout2, t_complex** rlout3, int nthreads) {
     MPI_Comm cart[2]={0};
 #ifdef GMX_MPI
     int size=1,prank=0;
@@ -1009,7 +1199,8 @@ fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int
     MPI_Comm_rank(comm,&prank);
 
     if (P0==0) P0 = lfactor(size);
-    if (size%P0!=0) {
+    if (size%P0!=0)
+    {
         if (prank==0) printf("FFT5D: WARNING: Number of processors %d not evenly dividable by %d\n",size,P0);
         P0 = lfactor(size);
     }
@@ -1024,7 +1215,7 @@ fft5d_plan fft5d_plan_3d_cart(int NG, int MG, int KG, MPI_Comm comm, int P0, int
     MPI_Cart_sub(gcart, rdim1 , &cart[0]);
     MPI_Cart_sub(gcart, rdim2 , &cart[1]);
 #endif
-    return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout); 
+    return fft5d_plan_3d(NG, MG, KG, cart, flags, rlin, rlout,rlout2,rlout3,nthreads);
 }
 
 
index d4444f268bee849c4d3c64449f2da1b9487066ce..cfc7c9328c683fa76371f798c7cfc6e5624868c9 100644 (file)
@@ -1,3 +1,37 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ * 
+ *                This source code is part of
+ * 
+ *                 G   R   O   M   A   C   S
+ * 
+ *          GROningen MAchine for Chemical Simulations
+ * 
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * 
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ * 
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ * 
+ * For more info, check our website at http://www.gromacs.org
+ * 
+ * And Hey:
+ * Groningen Machine for Chemical Simulation
+ */
+
 #ifndef FFT5D_H_     
 #define FFT5D_H_
 
@@ -30,56 +64,63 @@ double MPI_Wtime();
 #define FFTW(x) fftw_##x
 #endif
 
+#ifdef NOGMX
 struct fft5d_time_t {
-       double fft,local,mpi1,mpi2;
+    double fft,local,mpi1,mpi2;
 };
 typedef struct fft5d_time_t *fft5d_time;
+#else
+#include "gmx_wallcycle.h"
+typedef gmx_wallcycle_t fft5d_time;
+#endif
 
 typedef enum fft5d_flags_t {
-       FFT5D_ORDER_YZ=1,
-       FFT5D_BACKWARD=2,
-       FFT5D_REALCOMPLEX=4,
-       FFT5D_DEBUG=8,
-       FFT5D_NOMEASURE=16,
-       FFT5D_INPLACE=32,
-       FFT5D_NOMALLOC=64
+    FFT5D_ORDER_YZ=1,
+    FFT5D_BACKWARD=2,
+    FFT5D_REALCOMPLEX=4,
+    FFT5D_DEBUG=8,
+    FFT5D_NOMEASURE=16,
+    FFT5D_INPLACE=32,
+    FFT5D_NOMALLOC=64
 } fft5d_flags;
 
 struct fft5d_plan_t {
-       t_complex *lin;
-       t_complex *lout;
-        gmx_fft_t p1d[3];   /*1D plans*/
+    t_complex *lin;
+    t_complex *lout,*lout2,*lout3;
+    gmx_fft_t* p1d[3];   /*1D plans*/
 #ifdef GMX_FFT_FFTW3 
-        FFTW(plan) p2d;  /*2D plan: used for 1D decomposition if FFT supports transposed output*/
-        FFTW(plan) p3d;  /*3D plan: used for 0D decomposition if FFT supports transposed output*/
-       FFTW(plan) mpip[2];
+    FFTW(plan) p2d;  /*2D plan: used for 1D decomposition if FFT supports transposed output*/
+    FFTW(plan) p3d;  /*3D plan: used for 0D decomposition if FFT supports transposed output*/
+    FFTW(plan) mpip[2];
 #endif
-       MPI_Comm cart[2];
+    MPI_Comm cart[2];
 
     int N[3],M[3],K[3]; /*local length in transposed coordinate system (if not divisisable max)*/
     int pN[3],pM[3], pK[3]; /*local length - not max but length for this processor*/
     int oM[3],oK[3]; /*offset for current processor*/
     int *iNin[3],*oNin[3],*iNout[3],*oNout[3]; /*size for each processor (if divisisable=max) for out(=split) 
-                                                and in (=join) and offsets in transposed coordinate system*/
+                                                 and in (=join) and offsets in transposed coordinate system*/
     int C[3],rC[3]; /*global length (of the one global axes) */
     /* C!=rC for real<->complex. then C=rC/2 but with potential padding*/
     int P[2]; /*size of processor grid*/
-/*     int fftorder;*/
-/*     int direction;*/
-/*     int realcomplex;*/
-       int flags;
+/*  int fftorder;*/
+/*  int direction;*/
+/*  int realcomplex;*/
+    int flags;
     /*int N0,N1,M0,M1,K0,K1;*/
-       int NG,MG,KG;
-    /*int P[2];*/
-       int coor[2];
+    int NG,MG,KG;
+    /*int P[2];*/
+    int coor[2];
+    int nthreads;
 }; 
 
 typedef struct fft5d_plan_t *fft5d_plan;
 
-void fft5d_execute(fft5d_plan plan,fft5d_time times);
-fft5d_plan fft5d_plan_3d(int N, int M, int K, MPI_Comm comm[2], int flags, t_complex** lin, t_complex** lin2);
+void fft5d_execute(fft5d_plan plan,int thread,fft5d_time times);
+fft5d_plan fft5d_plan_3d(int N, int M, int K, MPI_Comm comm[2], int flags, t_complex** lin, t_complex** lin2, t_complex** lout2, t_complex** lout3, int nthreads);
 void fft5d_local_size(fft5d_plan plan,int* N1,int* M0,int* K0,int* K1,int** coor);
 void fft5d_destroy(fft5d_plan plan);
-fft5d_plan fft5d_plan_3d_cart(int N, int M, int K, MPI_Comm comm, int P0, int flags, t_complex** lin, t_complex** lin2);
+fft5d_plan fft5d_plan_3d_cart(int N, int M, int K, MPI_Comm comm, int P0, int flags, t_complex** lin, t_complex** lin2, t_complex** lout2, t_complex** lout3, int nthreads);
 void fft5d_compare_data(const t_complex* lin, const t_complex* in, fft5d_plan plan, int bothLocal, int normarlize);
 #endif /*FFTLIB_H_*/
+
index 2696f939d60c5396bbce660aac292ab074afc4c9..449a91766ef4e56a21c8d7338748d693b5706630 100644 (file)
@@ -75,12 +75,14 @@ gmx_parallel_3dfft_init   (gmx_parallel_3dfft_t *    pfft_setup,
                            MPI_Comm                  comm[2],
                            int *                     slab2index_major,
                            int *                     slab2index_minor,
-                           gmx_bool                      bReproducible)
+                           gmx_bool                      bReproducible,
+                           int nthreads)
 {
     int rN=ndata[2],M=ndata[1],K=ndata[0];
     int flags = FFT5D_REALCOMPLEX | FFT5D_ORDER_YZ; /* FFT5D_DEBUG */
     MPI_Comm rcomm[]={comm[1],comm[0]};
     int Nb,Mb,Kb; /* dimension for backtransform (in starting order) */
+    t_complex *buf1, *buf2; /*intermediate buffers - used internally.*/
     
     snew(*pfft_setup,1);
     if (bReproducible) flags |= FFT5D_NOMEASURE; 
@@ -91,10 +93,10 @@ gmx_parallel_3dfft_init   (gmx_parallel_3dfft_t *    pfft_setup,
         Nb=K;Mb=rN;Kb=M;  /* currently always true because ORDER_YZ always set */
     }
     
-    (*pfft_setup)->p1 = fft5d_plan_3d(rN,M,K,rcomm, flags, (t_complex**)real_data, complex_data);
+    (*pfft_setup)->p1 = fft5d_plan_3d(rN,M,K,rcomm, flags, (t_complex**)real_data, complex_data, &buf1, &buf2, nthreads);
     
     (*pfft_setup)->p2 = fft5d_plan_3d(Nb,Mb,Kb,rcomm,
-                                      (flags|FFT5D_BACKWARD|FFT5D_NOMALLOC)^FFT5D_ORDER_YZ, complex_data, (t_complex**)real_data);
+                                      (flags|FFT5D_BACKWARD|FFT5D_NOMALLOC)^FFT5D_ORDER_YZ, complex_data, (t_complex**)real_data, &buf1, &buf2, nthreads);
     
     return (*pfft_setup)->p1 != 0 && (*pfft_setup)->p2 !=0;
 }
@@ -173,14 +175,17 @@ int
 gmx_parallel_3dfft_execute(gmx_parallel_3dfft_t    pfft_setup,
                            enum gmx_fft_direction  dir,
                            void *                  in_data,
-                           void *                  out_data) {
-    if ((!(pfft_setup->p1->flags&FFT5D_REALCOMPLEX)) ^ (dir==GMX_FFT_FORWARD ||dir==GMX_FFT_BACKWARD)) { 
+                           void *                  out_data,
+                           int                     thread,
+                           gmx_wallcycle_t         wcycle) {
+    if ((!(pfft_setup->p1->flags&FFT5D_REALCOMPLEX)) ^ (dir==GMX_FFT_FORWARD ||dir==GMX_FFT_BACKWARD))
+    {
         gmx_fatal(FARGS,"Invalid transform. Plan and execution don't match regarding reel/complex");
     }
     if (dir==GMX_FFT_FORWARD || dir==GMX_FFT_REAL_TO_COMPLEX) {
-        fft5d_execute(pfft_setup->p1,0);
+        fft5d_execute(pfft_setup->p1,thread,wcycle);
     } else {
-        fft5d_execute(pfft_setup->p2,0);
+        fft5d_execute(pfft_setup->p2,thread,wcycle);
     }
     return 0;
 }
index af3fa95996ec80ecafc5af4a8574f95aab83414e..964de1a582d4b8ffba30751260631643eba83ed3 100644 (file)
@@ -72,18 +72,19 @@ typedef struct gmx_wallcycle
 #ifdef GMX_MPI
     MPI_Comm     mpi_comm_mygroup;
 #endif
+    int          omp_nthreads;
 } gmx_wallcycle_t_t;
 
 /* Each name should not exceed 19 characters */
 static const char *wcn[ewcNR] =
-{ "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load", "DD comm. bounds", "Vsite constr.", "Send X to PME", "Comm. coord.", "Neighbor search", "Born radii", "Force", "Wait + Comm. F", "PME mesh", "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME solve", "Wait + Comm. X/F", "Wait + Recv. PME F", "Vsite spread", "Write traj.", "Update", "Constraints", "Comm. energies", "Enforced rotation", "Add rot. forces", "Test" };
+{ "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load", "DD comm. bounds", "Vsite constr.", "Send X to PME", "Comm. coord.", "Neighbor search", "Born radii", "Force", "Wait + Comm. F", "PME mesh", "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve", "Wait + Comm. X/F", "Wait + Recv. PME F", "Vsite spread", "Write traj.", "Update", "Constraints", "Comm. energies", "Enforced rotation", "Add rot. forces", "Test" };
 
 gmx_bool wallcycle_have_counter(void)
 {
   return gmx_cycles_have_counter();
 }
 
-gmx_wallcycle_t wallcycle_init(FILE *fplog,int resetstep,t_commrec *cr)
+gmx_wallcycle_t wallcycle_init(FILE *fplog,int resetstep,t_commrec *cr, int omp_nthreads)
 {
     gmx_wallcycle_t wc;
     
@@ -100,6 +101,7 @@ gmx_wallcycle_t wallcycle_init(FILE *fplog,int resetstep,t_commrec *cr)
     wc->wc_depth   = 0;
     wc->ewc_prev   = -1;
     wc->reset_counters = resetstep;
+    wc->omp_nthreads = omp_nthreads;
 
 #ifdef GMX_MPI
     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != NULL)
@@ -246,6 +248,11 @@ void wallcycle_reset_all(gmx_wallcycle_t wc)
     }
 }
 
+static gmx_bool pme_subdivision(int ewc)
+{
+    return (ewc >= ewcPME_REDISTXF && ewc <= ewcPME_SOLVE);
+}
+
 void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc,double cycles[])
 {
     wallcc_t *wcc;
@@ -259,6 +266,17 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc,double cycles[])
 
     wcc = wc->wcc;
 
+    if (wc->omp_nthreads>1)
+    {
+        for(i=0; i<ewcNR; i++)
+        {
+            if (pme_subdivision(i) || i==ewcPMEMESH || (i==ewcRUN && cr->duty == DUTY_PME))
+            {
+                wcc[i].c *= wc->omp_nthreads;
+            }
+        }
+    }
+
     if (wcc[ewcDDCOMMLOAD].n > 0)
     {
         wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMLOAD].c;
@@ -267,6 +285,11 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc,double cycles[])
     {
         wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMBOUND].c;
     }
+    if (wcc[ewcPME_FFTCOMM].n > 0)
+    {
+        wcc[ewcPME_FFT].c -= wcc[ewcPME_FFTCOMM].c;
+    }
+
     if (cr->npmenodes == 0)
     {
         /* All nodes do PME (or no PME at all) */
@@ -349,10 +372,6 @@ static void print_cycles(FILE *fplog, double c2t, const char *name, int nnodes,
     }
 }
 
-static gmx_bool subdivision(int ewc)
-{
-    return (ewc >= ewcPME_REDISTXF && ewc <= ewcPME_SOLVE);
-}
 
 void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                      gmx_wallcycle_t wc, double cycles[])
@@ -377,10 +396,15 @@ void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
         npme = nnodes;
     }
     tot = cycles[ewcRUN];
+    /* The PME mesh part has to be multiplied by the number of OpenMP threads */
+    if (npme == 0)
+    {
+        tot += cycles[ewcPMEMESH]*(wc->omp_nthreads-1);
+    }
     /* Conversion factor from cycles to seconds */
     if (tot > 0)
     {
-      c2t = nnodes*realtime/tot;
+      c2t = (npp+npme*wc->omp_nthreads)*realtime/tot;
     }
     else
     {
@@ -394,7 +418,7 @@ void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
     sum = 0;
     for(i=ewcPPDURINGPME+1; i<ewcNR; i++)
     {
-        if (!subdivision(i))
+        if (!pme_subdivision(i))
         {
             print_cycles(fplog,c2t,wcn[i],
                          (i==ewcPMEMESH || i==ewcPMEWAITCOMM) ? npme : npp,
@@ -430,7 +454,7 @@ void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
         fprintf(fplog,"%s\n",myline);
         for(i=ewcPPDURINGPME+1; i<ewcNR; i++)
         {
-            if (subdivision(i))
+            if (pme_subdivision(i))
             {
                 print_cycles(fplog,c2t,wcn[i],
                              (i>=ewcPMEMESH || i<=ewcPME_SOLVE) ? npme : npp,
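In the wallcycle changes above, the PME counters are scaled by omp_nthreads in wallcycle_sum because with OpenMP only the master thread's cycles are recorded for those regions, and wallcycle_print converts cycles to seconds with npp + npme*omp_nthreads so the denominator counts executing threads rather than nodes. A tiny stand-alone illustration of that bookkeeping, with made-up numbers (not taken from any real run):

#include <stdio.h>

int main(void)
{
    /* Hypothetical counters from one rank, 4 OpenMP threads in the PME mesh */
    int    nthreads  = 4;
    double force_cyc = 6.0e11;     /* single-threaded region, counted as-is  */
    double pme_cyc   = 2.0e11;     /* measured on the master thread only     */
    double pme_thread_cyc;

    /* all threads worked during the PME mesh, so express it in thread-cycles
     * to make it comparable with the counters of the other regions          */
    pme_thread_cyc = pme_cyc*nthreads;

    printf("Force    : %4.0f Gcycles\n", force_cyc*1e-9);
    printf("PME mesh : %4.0f Gcycles (thread-cycles)\n", pme_thread_cyc*1e-9);
    return 0;
}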
index 622d15f5444458c08a2d7d3747b8165b89d5d9b9..bf57c3f1a1511be41b762bf71e6fd0a6b7bfd1f8 100644 (file)
@@ -1,12 +1,12 @@
 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
  *
- * 
+ *
  *                This source code is part of
- * 
+ *
  *                 G   R   O   M   A   C   S
- * 
+ *
  *          GROningen MAchine for Chemical Simulations
- * 
+ *
  *                        VERSION 3.2.0
  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version 2
  * of the License, or (at your option) any later version.
- * 
+ *
  * If you want to redistribute modifications, please consider that
  * scientific software is very special. Version control is crucial -
  * bugs must be traceable. We will be happy to consider code for
  * inclusion in the official distribution, but derived work must not
  * be called official GROMACS. Details are found in the README & COPYING
  * files - if they are missing, get the official version at www.gromacs.org.
- * 
+ *
  * To help us fund GROMACS development, we humbly ask that you cite
  * the papers on the package - you can find them in the top README file.
- * 
+ *
  * For more info, check our website at http://www.gromacs.org
- * 
+ *
  * And Hey:
  * GROwing Monsters And Cloning Shrimps
  */
@@ -54,7 +54,7 @@
  *
  * It might seem an overkill, but better safe than sorry.
  * /Erik 001109
- */ 
+ */
 
 #ifdef HAVE_CONFIG_H
 #include <config.h>
@@ -67,6 +67,9 @@
 #include "tmpi.h"
 #endif
 
+#ifdef GMX_OPENMP
+#include <omp.h>
+#endif
 
 #include <stdio.h>
 #include <string.h>
 #include "gmx_wallcycle.h"
 #include "gmx_parallel_3dfft.h"
 #include "pdbio.h"
+#include "gmx_cyclecounter.h"
 
 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
 #include "gmx_sse2_single.h"
+
+#define PME_SSE
+/* Some old AMD processors could have problems with unaligned loads+stores */
+#ifndef GMX_FAHCORE
+#define PME_SSE_UNALIGNED
+#endif
 #endif
 
 #include "mpelogging.h"
 /* #define TAKETIME (step > 1 && timesteps < 10) */
 #define TAKETIME FALSE
 
+/* #define PME_TIME_THREADS */
+
 #ifdef GMX_DOUBLE
 #define mpi_type MPI_DOUBLE
 #else
 #define mpi_type MPI_FLOAT
 #endif
 
+/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
+#define GMX_CACHE_SEP 64
+
+/* We only define a maximum to be able to use local arrays without allocation.
+ * An order larger than 12 should never be needed, even for test cases.
+ * If needed it can be changed here.
+ */
+#define PME_ORDER_MAX 12
+
 /* Internal datastructures */
 typedef struct {
     int send_index0;
@@ -124,8 +145,23 @@ typedef struct {
     int  noverlap_nodes;
     int  *send_id,*recv_id;
     pme_grid_comm_t *comm_data;
+    real *sendbuf;
+    real *recvbuf;
 } pme_overlap_t;
 
+typedef struct {
+    int *n;     /* Cumulative counts of the number of particles per thread */
+    int nalloc; /* Allocation size of i */
+    int *i;     /* Particle indices ordered on thread index (n) */
+} thread_plist_t;
+
+typedef struct {
+    int  n;
+    int  *ind;
+    splinevec theta;
+    splinevec dtheta;
+} splinedata_t;
+
 typedef struct {
     int  dimind;            /* The index of the dimension, 0=x, 1=y */
     int  nslab;
@@ -144,6 +180,7 @@ typedef struct {
     int  pd_nalloc;
     int  *pd;
     int  *count;            /* The number of atoms to send to each node */
+    int  **count_thread;
     int  *rcount;           /* The number of atoms to receive */
 
     int  n;
@@ -151,15 +188,65 @@ typedef struct {
     rvec *x;
     real *q;
     rvec *f;
-    gmx_bool bSpread;           /* These coordinates are used for spreading */
+    gmx_bool bSpread;       /* These coordinates are used for spreading */
     int  pme_order;
-    splinevec theta,dtheta;
     ivec *idx;
     rvec *fractx;            /* Fractional coordinate relative to the
-                              * lower cell boundary 
+                              * lower cell boundary
                               */
+    int  nthread;
+    int  *thread_idx;        /* Which thread should spread which charge */
+    thread_plist_t *thread_plist;
+    splinedata_t *spline;
 } pme_atomcomm_t;
 
+#define FLBS  3  /* x/y block size of the flag array used when clearing thread-local grids */
+#define FLBSZ 4  /* z block size of that flag array */
+
+typedef struct {
+    ivec ci;     /* The spatial location of this grid       */
+    ivec n;      /* The size of *grid, including order-1    */
+    ivec offset; /* The grid offset from the full node grid */
+    int  order;  /* PME spreading order                     */
+    real *grid;  /* The grid of this thread, size n         */
+} pmegrid_t;
+
+typedef struct {
+    pmegrid_t grid;     /* The full node grid (non thread-local)            */
+    int  nthread;       /* The number of threads operating on this grid     */
+    ivec nc;            /* The local spatial decomposition over the threads */
+    pmegrid_t *grid_th; /* Array of grids for each thread                   */
+    int  **g2t;         /* The grid to thread index                         */
+    ivec nthread_comm;  /* The number of threads to communicate with        */
+} pmegrids_t;
+
+
+typedef struct {
+#ifdef PME_SSE
+    /* Masks for SSE aligned spreading and gathering */
+    __m128 mask_SSE0[6],mask_SSE1[6];
+#else
+    int dummy; /* C89 requires that struct has at least one member */
+#endif
+} pme_spline_work_t;
+
+typedef struct {
+    /* work data for solve_pme */
+    int      nalloc;
+    real *   mhx;
+    real *   mhy;
+    real *   mhz;
+    real *   m2;
+    real *   denom;
+    real *   tmp1_alloc;
+    real *   tmp1;
+    real *   eterm;
+    real *   m2inv;
+
+    real     energy;
+    matrix   vir;
+} pme_work_t;
+
 typedef struct gmx_pme {
     int  ndecompdim;         /* The number of decomposition dimensions */
     int  nodeid;             /* Our nodeid in mpi->mpi_comm */
@@ -175,76 +262,76 @@ typedef struct gmx_pme {
     MPI_Datatype  rvec_mpi;  /* the pme vector's MPI type */
 #endif
 
-    gmx_bool bPPnode;            /* Node also does particle-particle forces */
-    gmx_bool bFEP;               /* Compute Free energy contribution */
+    int  nthread;            /* The number of threads doing PME */
+
+    gmx_bool bPPnode;        /* Node also does particle-particle forces */
+    gmx_bool bFEP;           /* Compute Free energy contribution */
     int nkx,nky,nkz;         /* Grid dimensions */
     int pme_order;
-    real epsilon_r;           
-    
-    real *  pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
-    real *  pmegridB;
+    real epsilon_r;
+
+    pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
+    pmegrids_t pmegridB;
+    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
     int     pmegrid_nx,pmegrid_ny,pmegrid_nz;
-    int     pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;    
-    
-    real *  pmegrid_sendbuf;
-    real *  pmegrid_recvbuf;
-    
+    /* pmegrid_nz might be larger than strictly necessary to ensure
+     * memory alignment, pmegrid_nz_base gives the real base size.
+     */
+    int     pmegrid_nz_base;
+    /* The local PME grid starting indices */
+    int     pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
+
+    /* Work data for spreading and gathering */
+    pme_spline_work_t spline_work;
+
     real *fftgridA;             /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
     real *fftgridB;             /* inside the interpolation grid, but separate for 2D PME decomp. */
     int   fftgrid_nx,fftgrid_ny,fftgrid_nz;
-    
+
     t_complex *cfftgridA;             /* Grids for complex FFT data */
-    t_complex *cfftgridB;            
+    t_complex *cfftgridB;
     int   cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
-    
+
     gmx_parallel_3dfft_t  pfft_setupA;
     gmx_parallel_3dfft_t  pfft_setupB;
-    
+
     int  *nnx,*nny,*nnz;
     real *fshx,*fshy,*fshz;
-    
+
     pme_atomcomm_t atc[2];  /* Indexed on decomposition index */
     matrix    recipbox;
     splinevec bsp_mod;
-    
-    pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
 
+    pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
 
     pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
-    
+
     rvec *bufv;             /* Communication buffer */
     real *bufr;             /* Communication buffer */
     int  buf_nalloc;        /* The communication buffer size */
 
-    /* work data for solve_pme */
-    int      work_nalloc;
-    real *   work_mhx;
-    real *   work_mhy;
-    real *   work_mhz;
-    real *   work_m2;
-    real *   work_denom;
-    real *   work_tmp1_alloc;
-    real *   work_tmp1;
-    real *   work_m2inv;
+    /* thread local work data for solve_pme */
+    pme_work_t *work;
 
     /* Work data for PME_redist */
-    gmx_bool     redist_init;
-    int *    scounts; 
+    gmx_bool redist_init;
+    int *    scounts;
     int *    rcounts;
     int *    sdispls;
     int *    rdispls;
     int *    sidx;
-    int *    idxa;    
+    int *    idxa;
     real *   redist_buf;
     int      redist_buf_nalloc;
-    
+
     /* Work data for sum_qgrid */
     real *   sum_qgrid_tmp;
     real *   sum_qgrid_dd_tmp;
 } t_gmx_pme;
 
 
-static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
+static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
+                                   int start,int end,int thread)
 {
     int  i;
     int  *idxptr,tix,tiy,tiz;
@@ -252,42 +339,65 @@ static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
     real rxx,ryx,ryy,rzx,rzy,rzz;
     int  nx,ny,nz;
     int  start_ix,start_iy,start_iz;
-    
+    int  *g2tx,*g2ty,*g2tz;
+    gmx_bool bThreads;
+    int  *thread_idx=NULL;
+    thread_plist_t *tpl=NULL;
+    int  *tpl_n=NULL;
+    int  thread_i;
+
     nx  = pme->nkx;
     ny  = pme->nky;
     nz  = pme->nkz;
-    
+
     start_ix = pme->pmegrid_start_ix;
     start_iy = pme->pmegrid_start_iy;
     start_iz = pme->pmegrid_start_iz;
-    
+
     rxx = pme->recipbox[XX][XX];
     ryx = pme->recipbox[YY][XX];
     ryy = pme->recipbox[YY][YY];
     rzx = pme->recipbox[ZZ][XX];
     rzy = pme->recipbox[ZZ][YY];
     rzz = pme->recipbox[ZZ][ZZ];
-    
-    for(i=0; (i<atc->n); i++) {
+
+    g2tx = pme->pmegridA.g2t[XX];
+    g2ty = pme->pmegridA.g2t[YY];
+    g2tz = pme->pmegridA.g2t[ZZ];
+
+    bThreads = (atc->nthread > 1);
+    if (bThreads)
+    {
+        thread_idx = atc->thread_idx;
+
+        tpl   = &atc->thread_plist[thread];
+        tpl_n = tpl->n;
+        for(i=0; i<atc->nthread; i++)
+        {
+            tpl_n[i] = 0;
+        }
+    }
+
+    for(i=start; i<end; i++) {
         xptr   = atc->x[i];
         idxptr = atc->idx[i];
         fptr   = atc->fractx[i];
-        
+
         /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
         tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
         ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
         tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
-        
+
         tix = (int)(tx);
         tiy = (int)(ty);
         tiz = (int)(tz);
-        
+
         /* Because decomposition only occurs in x and y,
          * we never have a fraction correction in z.
          */
         fptr[XX] = tx - tix + pme->fshx[tix];
         fptr[YY] = ty - tiy + pme->fshy[tiy];
-        fptr[ZZ] = tz - tiz;   
+        fptr[ZZ] = tz - tiz;
 
         idxptr[XX] = pme->nnx[tix];
         idxptr[YY] = pme->nny[tiy];
@@ -298,40 +408,109 @@ static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
         range_check(idxptr[YY],0,pme->pmegrid_ny);
         range_check(idxptr[ZZ],0,pme->pmegrid_nz);
 #endif
-  }  
+
+        if (bThreads)
+        {
+            thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
+            thread_idx[i] = thread_i;
+            tpl_n[thread_i]++;
+        }
+    }
+
+    if (bThreads)
+    {
+        /* Make a list of particle indices sorted on thread */
+
+        /* Get the cumulative count */
+        for(i=1; i<atc->nthread; i++)
+        {
+            tpl_n[i] += tpl_n[i-1];
+        }
+        /* The current implementation distributes particles equally
+         * over the threads, so we could actually allocate for that
+         * in pme_realloc_atomcomm_things.
+         */
+        if (tpl_n[atc->nthread-1] > tpl->nalloc)
+        {
+            tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
+            srenew(tpl->i,tpl->nalloc);
+        }
+        /* Set tpl_n to the cumulative start */
+        for(i=atc->nthread-1; i>=1; i--)
+        {
+            tpl_n[i] = tpl_n[i-1];
+        }
+        tpl_n[0] = 0;
+
+        /* Fill our thread local array with indices sorted on thread */
+        for(i=start; i<end; i++)
+        {
+            tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
+        }
+        /* Now tpl_n contains the cumulative count again */
+    }
 }
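The threaded path of calc_interpolation_idx is a counting sort: count how many particles go to each destination thread, convert the counts to cumulative offsets, then scatter the particle indices into a thread-ordered list. A self-contained sketch of the same idiom on made-up data (names are illustrative only):

#include <stdio.h>

#define NPART   8
#define NTHREAD 3

int main(void)
{
    /* destination thread of each particle (what the g2t lookup produces) */
    int dest[NPART] = { 2, 0, 1, 2, 0, 0, 1, 2 };
    int count[NTHREAD] = { 0 };
    int sorted[NPART];
    int i, t;

    /* 1. count particles per destination thread */
    for (i = 0; i < NPART; i++)
    {
        count[dest[i]]++;
    }
    /* 2. cumulative counts -> end offsets */
    for (t = 1; t < NTHREAD; t++)
    {
        count[t] += count[t-1];
    }
    /* 3. shift to start offsets */
    for (t = NTHREAD-1; t >= 1; t--)
    {
        count[t] = count[t-1];
    }
    count[0] = 0;
    /* 4. scatter indices; count[] ends up cumulative again */
    for (i = 0; i < NPART; i++)
    {
        sorted[count[dest[i]]++] = i;
    }

    for (i = 0; i < NPART; i++)
    {
        printf("%d ", sorted[i]);
    }
    printf("\n");                  /* prints: 1 4 5 2 6 0 3 7 */
    return 0;
}

In the patch this runs once per source thread (thread_plist), and make_thread_local_ind below concatenates, for each destination thread, its slice from every source thread's list.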
 
-static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
-                          pme_atomcomm_t *atc)
+static void make_thread_local_ind(pme_atomcomm_t *atc,
+                                  int thread,splinedata_t *spline)
+{
+    int  n,t,i,start,end;
+    thread_plist_t *tpl;
+
+    /* Combine the indices made by each thread into one index */
+
+    n = 0;
+    start = 0;
+    for(t=0; t<atc->nthread; t++)
+    {
+        tpl = &atc->thread_plist[t];
+        /* Copy our part (start - end) from the list of thread t */
+        if (thread > 0)
+        {
+            start = tpl->n[thread-1];
+        }
+        end = tpl->n[thread];
+        for(i=start; i<end; i++)
+        {
+            spline->ind[n++] = tpl->i[i];
+        }
+    }
+
+    spline->n = n;
+}
+
+
+static void pme_calc_pidx(int start, int end,
+                          matrix recipbox, rvec x[],
+                          pme_atomcomm_t *atc, int *count)
 {
     int  nslab,i;
     int  si;
     real *xptr,s;
     real rxx,ryx,rzx,ryy,rzy;
-    int *pd,*count;
+    int *pd;
 
     /* Calculate PME task index (pidx) for each grid index.
      * Here we always assign equally sized slabs to each node
      * for load balancing reasons (the PME grid spacing is not used).
      */
-    
+
     nslab = atc->nslab;
     pd    = atc->pd;
-    count = atc->count;
 
     /* Reset the count */
     for(i=0; i<nslab; i++)
     {
         count[i] = 0;
     }
-    
+
     if (atc->dimind == 0)
     {
         rxx = recipbox[XX][XX];
         ryx = recipbox[YY][XX];
         rzx = recipbox[ZZ][XX];
         /* Calculate the node index in x-dimension */
-        for(i=0; (i<natoms); i++)
+        for(i=start; i<end; i++)
         {
             xptr   = x[i];
             /* Fractional coordinates along box vectors */
@@ -346,7 +525,7 @@ static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
         ryy = recipbox[YY][YY];
         rzy = recipbox[ZZ][YY];
         /* Calculate the node index in y-dimension */
-        for(i=0; (i<natoms); i++)
+        for(i=start; i<end; i++)
         {
             xptr   = x[i];
             /* Fractional coordinates along box vectors */
@@ -358,10 +537,53 @@ static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
     }
 }
 
+static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
+                                  pme_atomcomm_t *atc)
+{
+    int nthread,thread,slab;
+
+    nthread = atc->nthread;
+
+#pragma omp parallel for num_threads(nthread) schedule(static)
+    for(thread=0; thread<nthread; thread++)
+    {
+        pme_calc_pidx(natoms* thread   /nthread,
+                      natoms*(thread+1)/nthread,
+                      recipbox,x,atc,atc->count_thread[thread]);
+    }
+    /* Non-parallel reduction, since nslab is small */
+
+    for(thread=1; thread<nthread; thread++)
+    {
+        for(slab=0; slab<atc->nslab; slab++)
+        {
+            atc->count_thread[0][slab] += atc->count_thread[thread][slab];
+        }
+    }
+}
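pme_calc_pidx_wrapper gives every thread a private count array and reduces the arrays serially afterwards, which is cheap because nslab is small. A minimal OpenMP sketch of that count-then-reduce pattern (sizes and the slab assignment are stand-ins, not the real ones):

#include <stdio.h>

#define NPART 1000
#define NSLAB 4
#define NTH   4

int main(void)
{
    int count[NTH][NSLAB];
    int slab_of[NPART];
    int i, t, s;

    for (i = 0; i < NPART; i++)
    {
        slab_of[i] = i % NSLAB;          /* stand-in for the real slab index */
    }

    /* each thread fills its own row of count[][] for its slice of particles */
#pragma omp parallel for num_threads(NTH) schedule(static)
    for (t = 0; t < NTH; t++)
    {
        int start = (NPART* t   )/NTH;
        int end   = (NPART*(t+1))/NTH;
        int j, k;

        for (k = 0; k < NSLAB; k++)
        {
            count[t][k] = 0;
        }
        for (j = start; j < end; j++)
        {
            count[t][slab_of[j]]++;
        }
    }

    /* serial reduction into row 0, cheap because NSLAB is small */
    for (t = 1; t < NTH; t++)
    {
        for (s = 0; s < NSLAB; s++)
        {
            count[0][s] += count[t][s];
        }
    }

    for (s = 0; s < NSLAB; s++)
    {
        printf("slab %d: %d\n", s, count[0][s]);
    }
    return 0;
}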
+
+static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
+{
+    int i,d;
+
+    srenew(spline->ind,atc->nalloc);
+    /* Initialize the index to identity so it works without threads */
+    for(i=0; i<atc->nalloc; i++)
+    {
+        spline->ind[i] = i;
+    }
+
+    for(d=0;d<DIM;d++)
+    {
+        srenew(spline->theta[d] ,atc->pme_order*atc->nalloc);
+        srenew(spline->dtheta[d],atc->pme_order*atc->nalloc);
+    }
+}
+
 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
 {
-    int nalloc_old,i;
-    
+    int nalloc_old,i,j,nalloc_tpl;
+
     /* We have to avoid a NULL pointer for atc->x to avoid
      * possible fatal errors in MPI routines.
      */
@@ -369,7 +591,7 @@ static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
     {
         nalloc_old = atc->nalloc;
         atc->nalloc = over_alloc_dd(max(atc->n,1));
-        
+
         if (atc->nslab > 1) {
             srenew(atc->x,atc->nalloc);
             srenew(atc->q,atc->nalloc);
@@ -380,12 +602,18 @@ static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
             }
         }
         if (atc->bSpread) {
-            for(i=0;i<DIM;i++) {
-                srenew(atc->theta[i] ,atc->pme_order*atc->nalloc); 
-                srenew(atc->dtheta[i],atc->pme_order*atc->nalloc);
-            }
-            srenew(atc->fractx,atc->nalloc); 
+            srenew(atc->fractx,atc->nalloc);
             srenew(atc->idx   ,atc->nalloc);
+
+            if (atc->nthread > 1)
+            {
+                srenew(atc->thread_idx,atc->nalloc);
+            }
+
+            for(i=0; i<atc->nthread; i++)
+            {
+                pme_realloc_splinedata(&atc->spline[i],atc);
+            }
         }
     }
 }
@@ -398,7 +626,7 @@ static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
 {
     int *idxa;
     int i, ii;
-    
+
     if(FALSE == pme->redist_init) {
         snew(pme->scounts,atc->nslab);
         snew(pme->rcounts,atc->nslab);
@@ -411,19 +639,19 @@ static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
         pme->redist_buf_nalloc = over_alloc_dd(n);
         srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
     }
-    
+
     pme->idxa = atc->pd;
 
 #ifdef GMX_MPI
     if (forw && bXF) {
-        /* forward, redistribution from pp to pme */ 
-        
+        /* forward, redistribution from pp to pme */
+
         /* Calculate send counts and exchange them with other nodes */
         for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
         for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
         MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
-        
-        /* Calculate send and receive displacements and index into send 
+
+        /* Calculate send and receive displacements and index into send
            buffer */
         pme->sdispls[0]=0;
         pme->rdispls[0]=0;
@@ -435,9 +663,9 @@ static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
         }
         /* Total # of particles to be received */
         atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
-        
+
         pme_realloc_atomcomm_things(atc);
-        
+
         /* Copy particle coordinates into send buffer and exchange*/
         for(i=0; (i<n); i++) {
             ii=DIM*pme->sidx[pme->idxa[i]];
@@ -446,8 +674,8 @@ static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
             pme->redist_buf[ii+YY]=x_f[i][YY];
             pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
         }
-        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, 
-                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls, 
+        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
+                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
                       pme->rvec_mpi, atc->mpi_comm);
     }
     if (forw) {
@@ -462,11 +690,11 @@ static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
                       atc->q, pme->rcounts, pme->rdispls, mpi_type,
                       atc->mpi_comm);
     }
-    else { /* backward, redistribution from pme to pp */ 
+    else { /* backward, redistribution from pme to pp */
         MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
-                      pme->redist_buf, pme->scounts, pme->sdispls, 
+                      pme->redist_buf, pme->scounts, pme->sdispls,
                       pme->rvec_mpi, atc->mpi_comm);
-        
+
         /* Copy data from receive buffer */
         for(i=0; i<atc->nslab; i++)
             pme->sidx[i] = pme->sdispls[i];
@@ -478,7 +706,7 @@ static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
             pme->sidx[pme->idxa[i]]++;
         }
     }
-#endif 
+#endif
 }
 
 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
@@ -489,7 +717,7 @@ static void pme_dd_sendrecv(pme_atomcomm_t *atc,
 #ifdef GMX_MPI
     int dest,src;
     MPI_Status stat;
-    
+
     if (bBackward == FALSE) {
         dest = atc->node_dest[shift];
         src  = atc->node_src[shift];
@@ -497,7 +725,7 @@ static void pme_dd_sendrecv(pme_atomcomm_t *atc,
         dest = atc->node_src[shift];
         src  = atc->node_dest[shift];
     }
-    
+
     if (nbyte_s > 0 && nbyte_r > 0) {
         MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
                      dest,shift,
@@ -516,18 +744,18 @@ static void pme_dd_sendrecv(pme_atomcomm_t *atc,
 #endif
 }
 
-static void dd_pmeredist_x_q(gmx_pme_t pme, 
+static void dd_pmeredist_x_q(gmx_pme_t pme,
                              int n, gmx_bool bX, rvec *x, real *charge,
                              pme_atomcomm_t *atc)
 {
     int *commnode,*buf_index;
     int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
-    
+
     commnode  = atc->node_dest;
     buf_index = atc->buf_index;
-    
+
     nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
-    
+
     nsend = 0;
     for(i=0; i<nnodes_comm; i++) {
         buf_index[commnode[i]] = nsend;
@@ -539,13 +767,13 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
                       "This usually means that your system is not well equilibrated.",
                       n - (atc->count[atc->nodeid] + nsend),
                       pme->nodeid,'x'+atc->dimind);
-        
+
         if (nsend > pme->buf_nalloc) {
             pme->buf_nalloc = over_alloc_dd(nsend);
             srenew(pme->bufv,pme->buf_nalloc);
             srenew(pme->bufr,pme->buf_nalloc);
         }
-        
+
         atc->n = atc->count[atc->nodeid];
         for(i=0; i<nnodes_comm; i++) {
             scount = atc->count[commnode[i]];
@@ -558,10 +786,10 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
                             &atc->rcount[i],sizeof(int));
             atc->n += atc->rcount[i];
         }
-        
+
         pme_realloc_atomcomm_things(atc);
     }
-    
+
     local_pos = 0;
     for(i=0; i<n; i++) {
         node = atc->pd[i];
@@ -581,7 +809,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
             buf_index[node]++;
         }
     }
-    
+
     buf_pos = 0;
     for(i=0; i<nnodes_comm; i++) {
         scount = atc->count[commnode[i]];
@@ -673,7 +901,7 @@ static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
 }
 
 #ifdef GMX_MPI
-static void 
+static void
 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 {
     pme_overlap_t *overlap;
@@ -684,10 +912,10 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
     int ipulse,send_id,recv_id,datasize;
     real *p;
     real *sendptr,*recvptr;
-    
+
     /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
     overlap = &pme->overlap[1];
-    
+
     for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
     {
         /* Since we have already (un)wrapped the overlap in the z-dimension,
@@ -707,7 +935,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
             send_id = overlap->recv_id[ipulse];
             recv_id = overlap->send_id[ipulse];
             send_index0   = overlap->comm_data[ipulse].recv_index0;
-            send_nindex   = overlap->comm_data[ipulse].recv_nindex;            
+            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
             recv_index0   = overlap->comm_data[ipulse].send_index0;
             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
         }
@@ -731,19 +959,19 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
                 for(k=0;k<pme->nkz;k++)
                 {
                     iz = k;
-                    pme->pmegrid_sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
+                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
                 }
             }
         }
-            
+
         datasize      = pme->pmegrid_nx * pme->nkz;
-        
-        MPI_Sendrecv(pme->pmegrid_sendbuf,send_nindex*datasize,GMX_MPI_REAL,
+
+        MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
                      send_id,ipulse,
-                     pme->pmegrid_recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
+                     overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
                      recv_id,ipulse,
                      overlap->mpi_comm,&stat);
-        
+
         /* Get data from contiguous recv buffer */
         if (debug)
         {
@@ -765,24 +993,24 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
                     iz = k;
                     if(direction==GMX_SUM_QGRID_FORWARD)
                     {
-                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += pme->pmegrid_recvbuf[icnt++];
+                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
                     }
                     else
                     {
-                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = pme->pmegrid_recvbuf[icnt++];
+                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
                     }
                 }
             }
         }
     }
-    
+
     /* Major dimension is easier, no copying required,
      * but we might have to sum to separate array.
      * Since we don't copy, we have to communicate up to pmegrid_nz,
      * not nkz as for the minor direction.
      */
     overlap = &pme->overlap[0];
-    
+
     for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
     {
         if(direction==GMX_SUM_QGRID_FORWARD)
@@ -793,19 +1021,19 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
             send_nindex   = overlap->comm_data[ipulse].send_nindex;
             recv_index0   = overlap->comm_data[ipulse].recv_index0;
             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
-            recvptr   = pme->pmegrid_recvbuf;
+            recvptr   = overlap->recvbuf;
         }
         else
         {
             send_id = overlap->recv_id[ipulse];
             recv_id = overlap->send_id[ipulse];
             send_index0   = overlap->comm_data[ipulse].recv_index0;
-            send_nindex   = overlap->comm_data[ipulse].recv_nindex;            
+            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
             recv_index0   = overlap->comm_data[ipulse].send_index0;
             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
             recvptr   = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
         }
-                
+
         sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
         datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
 
@@ -828,14 +1056,14 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
                      recvptr,recv_nindex*datasize,GMX_MPI_REAL,
                      recv_id,ipulse,
                      overlap->mpi_comm,&stat);
-        
+
         /* ADD data from contiguous recv buffer */
         if(direction==GMX_SUM_QGRID_FORWARD)
-        {        
+        {
             p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
             for(i=0;i<recv_nindex*datasize;i++)
             {
-                p[i] += pme->pmegrid_recvbuf[i];
+                p[i] += overlap->recvbuf[i];
             }
         }
     }
@@ -856,12 +1084,12 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
-    
+
     local_pme_size[0] = pme->pmegrid_nx;
     local_pme_size[1] = pme->pmegrid_ny;
     local_pme_size[2] = pme->pmegrid_nz;
-    
-    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, 
+
+    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
      the offset is identical, and the PME grid always has more data (due to overlap)
      */
     {
@@ -875,6 +1103,7 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
         fp2 = ffopen(fn,"w");
      sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
 #endif
+
     for(ix=0;ix<local_fft_ndata[XX];ix++)
     {
         for(iy=0;iy<local_fft_ndata[YY];iy++)
@@ -909,14 +1138,34 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
 }
 
 
+static gmx_cycles_t omp_cyc_start()
+{
+    return gmx_cycles_read();
+}
+
+static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
+{
+    return gmx_cycles_read() - c;
+}
+
+
 static int
-copy_fftgrid_to_pmegrid(gmx_pme_t pme, real *fftgrid, real *pmegrid)
+copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
+                        int nthread,int thread)
 {
     ivec    local_fft_ndata,local_fft_offset,local_fft_size;
     ivec    local_pme_size;
-    int     i,ix,iy,iz;
+    int     ixy0,ixy1,ixy,ix,iy,iz;
     int     pmeidx,fftidx;
-    
+#ifdef PME_TIME_THREADS
+    gmx_cycles_t c1;
+    static double cs1=0;
+    static int cnt=0;
+#endif
+
+#ifdef PME_TIME_THREADS
+    c1 = omp_cyc_start();
+#endif
     /* Dimensions should be identical for A/B grid, so we just use A here */
     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
                                    local_fft_ndata,
@@ -926,22 +1175,36 @@ copy_fftgrid_to_pmegrid(gmx_pme_t pme, real *fftgrid, real *pmegrid)
     local_pme_size[0] = pme->pmegrid_nx;
     local_pme_size[1] = pme->pmegrid_ny;
     local_pme_size[2] = pme->pmegrid_nz;
-    
-    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, 
+
+    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
      the offset is identical, and the PME grid always has more data (due to overlap)
      */
-    for(ix=0;ix<local_fft_ndata[XX];ix++)
+    ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
+    ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
+
+    for(ixy=ixy0;ixy<ixy1;ixy++)
     {
-        for(iy=0;iy<local_fft_ndata[YY];iy++)
+        ix = ixy/local_fft_ndata[YY];
+        iy = ixy - ix*local_fft_ndata[YY];
+
+        pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
+        fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
+        for(iz=0;iz<local_fft_ndata[ZZ];iz++)
         {
-            for(iz=0;iz<local_fft_ndata[ZZ];iz++)
-            {
-                pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
-                fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
-                pmegrid[pmeidx] = fftgrid[fftidx];
-            }
+            pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
         }
-    }   
+    }
+
+#ifdef PME_TIME_THREADS
+    c1 = omp_cyc_end(c1);
+    cs1 += (double)c1;
+    cnt++;
+    if (cnt % 20 == 0)
+    {
+        printf("copy %.2f\n",cs1*1e-9);
+    }
+#endif
+
     return 0;
 }
 
@@ -962,9 +1225,9 @@ wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
     overlap = pme->pme_order - 1;
 
     /* Add periodic overlap in z */
-    for(ix=0; ix<pnx; ix++)
+    for(ix=0; ix<pme->pmegrid_nx; ix++)
     {
-        for(iy=0; iy<pny; iy++)
+        for(iy=0; iy<pme->pmegrid_ny; iy++)
         {
             for(iz=0; iz<overlap; iz++)
             {
@@ -976,7 +1239,7 @@ wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 
     if (pme->nnodes_minor == 1)
     {
-       for(ix=0; ix<pnx; ix++)
+       for(ix=0; ix<pme->pmegrid_nx; ix++)
        {
            for(iy=0; iy<overlap; iy++)
            {
@@ -988,10 +1251,10 @@ wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
            }
        }
     }
-     
+
     if (pme->nnodes_major == 1)
     {
-        ny_x = (pme->nnodes_minor == 1 ? ny : pny);
+        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 
         for(ix=0; ix<overlap; ix++)
         {
@@ -1011,7 +1274,7 @@ wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 static void
 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 {
-    int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
+    int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;
 
     nx = pme->nkx;
     ny = pme->nky;
@@ -1025,10 +1288,12 @@ unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 
     if (pme->nnodes_major == 1)
     {
-        ny_x = (pme->nnodes_minor == 1 ? ny : pny);
+        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 
         for(ix=0; ix<overlap; ix++)
         {
+            int iy,iz;
+
             for(iy=0; iy<ny_x; iy++)
             {
                 for(iz=0; iz<nz; iz++)
@@ -1042,8 +1307,11 @@ unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 
     if (pme->nnodes_minor == 1)
     {
-       for(ix=0; ix<pnx; ix++)
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+       for(ix=0; ix<pme->pmegrid_nx; ix++)
        {
+           int iy,iz;
+
            for(iy=0; iy<overlap; iy++)
            {
                for(iz=0; iz<nz; iz++)
@@ -1056,9 +1324,12 @@ unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
     }
 
     /* Copy periodic overlap in z */
-    for(ix=0; ix<pnx; ix++)
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+    for(ix=0; ix<pme->pmegrid_nx; ix++)
     {
-        for(iy=0; iy<pny; iy++)
+        int iy,iz;
+
+        for(iy=0; iy<pme->pmegrid_ny; iy++)
         {
             for(iz=0; iz<overlap; iz++)
             {
@@ -1069,6 +1340,51 @@ unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
     }
 }
 
+static void clear_grid(int nx,int ny,int nz,real *grid,
+                       ivec fs,int *flag,
+                       int fx,int fy,int fz,
+                       int order)
+{
+    int nc,ncz;
+    int fsx,fsy,fsz,gx,gy,gz,g0x,g0y,x,y,z;
+    int flind;
+
+    nc  = 2 + (order - 2)/FLBS;
+    ncz = 2 + (order - 2)/FLBSZ;
+
+    for(fsx=fx; fsx<fx+nc; fsx++)
+    {
+        for(fsy=fy; fsy<fy+nc; fsy++)
+        {
+            for(fsz=fz; fsz<fz+ncz; fsz++)
+            {
+                flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
+                if (flag[flind] == 0)
+                {
+                    gx = fsx*FLBS;
+                    gy = fsy*FLBS;
+                    gz = fsz*FLBSZ;
+                    g0x = (gx*ny + gy)*nz + gz;
+                    for(x=0; x<FLBS; x++)
+                    {
+                        g0y = g0x;
+                        for(y=0; y<FLBS; y++)
+                        {
+                            for(z=0; z<FLBSZ; z++)
+                            {
+                                grid[g0y+z] = 0;
+                            }
+                            g0y += nz;
+                        }
+                        g0x += ny*nz;
+                    }
+
+                    flag[flind] = 1;
+                }
+            }
+        }
+    }
+}
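clear_grid zeroes only the blocks of the thread-local grid that a charge is about to write to, using a flag array with one entry per block of FLBS x FLBS x FLBSZ points, so no block is cleared twice. A reduced one-dimensional sketch of that flag-guarded clearing (hypothetical sizes, not the patch itself):

#include <stdio.h>

#define N    32      /* grid points           */
#define BS   4       /* points per flag block */
#define NBLK (N/BS)

int main(void)
{
    double grid[N];
    int    flag[NBLK] = { 0 };
    int    first[2]   = { 5, 7 };  /* first grid point touched by each charge */
    int    order      = 4;
    int    i, b, c;

    for (i = 0; i < N; i++)
    {
        grid[i] = 1.0;             /* stale data from a previous step */
    }

    /* clear only the blocks a charge will touch, and each block only once */
    for (c = 0; c < 2; c++)
    {
        int b0 = first[c]/BS;
        int b1 = (first[c] + order - 1)/BS;

        for (b = b0; b <= b1; b++)
        {
            if (flag[b] == 0)
            {
                for (i = b*BS; i < (b+1)*BS; i++)
                {
                    grid[i] = 0;
                }
                flag[b] = 1;
            }
        }
    }

    for (b = 0; b < NBLK; b++)
    {
        printf("%d", flag[b]);     /* prints 01100000 */
    }
    printf("\n");
    return 0;
}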
 
 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
 #define DO_BSPLINE(order)                            \
@@ -1091,11 +1407,13 @@ for(ithx=0; (ithx<order); ithx++)                    \
 }
 
 
-static void spread_q_bsplines(gmx_pme_t pme, pme_atomcomm_t *atc, 
-                              real *grid)
+static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
+                                     pme_atomcomm_t *atc, splinedata_t *spline,
+                                     pme_spline_work_t *work)
 {
 
     /* spread charges from home atoms to local grid */
+    real     *grid;
     pme_overlap_t *ol;
     int      b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
     int *    idxptr;
@@ -1103,302 +1421,671 @@ static void spread_q_bsplines(gmx_pme_t pme, pme_atomcomm_t *atc,
     real     valx,valxy,qn;
     real     *thx,*thy,*thz;
     int      localsize, bndsize;
-  
     int      pnx,pny,pnz,ndatatot;
-  
-    pnx = pme->pmegrid_nx;
-    pny = pme->pmegrid_ny;
-    pnz = pme->pmegrid_nz;
+    int      offx,offy,offz;
+
+    pnx = pmegrid->n[XX];
+    pny = pmegrid->n[YY];
+    pnz = pmegrid->n[ZZ];
+
+    offx = pmegrid->offset[XX];
+    offy = pmegrid->offset[YY];
+    offz = pmegrid->offset[ZZ];
+
     ndatatot = pnx*pny*pnz;
-    
+    grid = pmegrid->grid;
     for(i=0;i<ndatatot;i++)
     {
         grid[i] = 0;
     }
 
-    order = pme->pme_order;
+    order = pmegrid->order;
 
-    for(nn=0; (nn<atc->n);nn++) 
+    for(nn=0; nn<spline->n; nn++)
     {
-        n      = nn;
-        qn     = atc->q[n];
+        n  = spline->ind[nn];
+        qn = atc->q[n];
 
-        if (qn != 0) 
+        if (qn != 0)
         {
             idxptr = atc->idx[n];
-            norder = n*order;
-            
-            i0   = idxptr[XX]; 
-            j0   = idxptr[YY];
-            k0   = idxptr[ZZ];
-            thx = atc->theta[XX] + norder;
-            thy = atc->theta[YY] + norder;
-            thz = atc->theta[ZZ] + norder;
-            
+            norder = nn*order;
+
+            i0   = idxptr[XX] - offx;
+            j0   = idxptr[YY] - offy;
+            k0   = idxptr[ZZ] - offz;
+
+            thx = spline->theta[XX] + norder;
+            thy = spline->theta[YY] + norder;
+            thz = spline->theta[ZZ] + norder;
+
             switch (order) {
-            case 4:  DO_BSPLINE(4);     break;
-            case 5:  DO_BSPLINE(5);     break;
-            default: DO_BSPLINE(order); break;
+            case 4:
+#ifdef PME_SSE
+#ifdef PME_SSE_UNALIGNED
+#define PME_SPREAD_SSE_ORDER4
+#else
+#define PME_SPREAD_SSE_ALIGNED
+#define PME_ORDER 4
+#endif
+#include "pme_sse_single.h"
+#else
+                DO_BSPLINE(4);
+#endif
+                break;
+            case 5:
+#ifdef PME_SSE
+#define PME_SPREAD_SSE_ALIGNED
+#define PME_ORDER 5
+#include "pme_sse_single.h"
+#else
+                DO_BSPLINE(5);
+#endif
+                break;
+            default:
+                DO_BSPLINE(order);
+                break;
             }
         }
-    }  
+    }
 }
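For the common orders the spreading loop is specialized: the SSE branches include pme_sse_single.h with PME_ORDER and the PME_SPREAD_SSE_* macros predefined, so one header expands into several vectorized kernels, while the generic path still uses the DO_BSPLINE tensor-product loop. A minimal runnable plain-C sketch of that tensor-product spread, with made-up spline weights (illustrative names only, not the SSE header's contents):

#include <stdio.h>

#define ORDER 4
#define NX 8
#define NY 8
#define NZ 8

int main(void)
{
    static double grid[NX][NY][NZ];               /* zero-initialized        */
    double thx[ORDER] = { 0.1, 0.4, 0.4, 0.1 };   /* made-up spline weights  */
    double thy[ORDER] = { 0.2, 0.3, 0.3, 0.2 };
    double thz[ORDER] = { 0.25, 0.25, 0.25, 0.25 };
    double qn = 1.0, sum = 0;
    int    i0 = 2, j0 = 3, k0 = 1;                /* lower-left grid corner  */
    int    ithx, ithy, ithz;

    /* tensor-product spread: weight in x times weight in y times weight in z */
    for (ithx = 0; ithx < ORDER; ithx++)
    {
        double valx = qn*thx[ithx];

        for (ithy = 0; ithy < ORDER; ithy++)
        {
            double valxy = valx*thy[ithy];

            for (ithz = 0; ithz < ORDER; ithz++)
            {
                grid[i0+ithx][j0+ithy][k0+ithz] += valxy*thz[ithz];
            }
        }
    }

    for (ithx = 0; ithx < ORDER; ithx++)
    {
        for (ithy = 0; ithy < ORDER; ithy++)
        {
            for (ithz = 0; ithz < ORDER; ithz++)
            {
                sum += grid[i0+ithx][j0+ithy][k0+ithz];
            }
        }
    }
    printf("total spread charge: %g\n", sum);  /* ~ qn, each weight set sums to 1 */
    return 0;
}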
 
 
-#if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
-    /* Calculate exponentials through SSE in float precision */
-#define CALC_EXPONENTIALS(start,end,r_aligned)      \
-    {                                               \
-        __m128 tmp_sse;                             \
-        for(kx=0; kx<end; kx+=4)                    \
-        {                                           \
-            tmp_sse = _mm_load_ps(r_aligned+kx);    \
-            tmp_sse = gmx_mm_exp_ps(tmp_sse);       \
-            _mm_store_ps(r_aligned+kx,tmp_sse);     \
-        }                                           \
+static void alloc_real_aligned(int n,real **ptr_raw,real **ptr)
+{
+    snew(*ptr_raw,n+8);
+
+    *ptr = (real *) (((size_t) *ptr_raw + 16) & (~((size_t) 15)));
+}
+
+static void set_grid_alignment(int *pmegrid_nz,int pme_order)
+{
+#ifdef PME_SSE
+    if (pme_order == 5
+#ifndef PME_SSE_UNALIGNED
+        || pme_order == 4
+#endif
+        )
+    {
+        /* Round nz up to a multiple of 4 to ensure alignment */
+        *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
     }
-#else
-#define CALC_EXPONENTIALS(start,end,r)          \
-    for(kx=start; kx<end; kx++)                 \
-    {                                           \
-        r[kx] = exp(r[kx]);                     \
+#endif
+}
+
+static void set_gridsize_alignment(int *gridsize,int pme_order)
+{
+#ifdef PME_SSE
+#ifndef PME_SSE_UNALIGNED
+    if (pme_order == 4)
+    {
+         * Add extra elements to ensure that aligned operations do not go
+         * beyond the allocated grid size.
+         * Note that for pme_order=5, the pme grid z-size alignment
+         * ensures that we will not go beyond the grid size.
+         */
+         *gridsize += 4;
     }
 #endif
+#endif
+}
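set_grid_alignment rounds the z-size of the spreading grid up to a multiple of 4 floats and set_gridsize_alignment pads the allocation, so that aligned SSE loads and stores along each z-line stay inside the grid; alloc_real_aligned applies the same idea to a raw pointer. A short stand-alone sketch of both rounding idioms, with illustrative values:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int    nz = 29;
    int    nz_aligned;
    char  *raw;
    float *aligned;

    /* round a size up to the next multiple of 4 (floats per 16-byte SSE vector) */
    nz_aligned = (nz + 3) & ~3;                              /* 29 -> 32 */

    /* bump a raw allocation up to the next 16-byte boundary */
    raw     = malloc(1000*sizeof(float) + 16);
    aligned = (float *) (((size_t) raw + 16) & ~((size_t) 15));

    printf("nz %d -> %d, aligned pointer mod 16 = %d\n",
           nz, nz_aligned, (int) ((size_t) aligned % 16));

    free(raw);
    return 0;
}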
 
+static void pmegrid_init(pmegrid_t *grid,
+                         int cx, int cy, int cz,
+                         int x0, int y0, int z0,
+                         int x1, int y1, int z1,
+                         gmx_bool set_alignment,
+                         int pme_order,
+                         real *ptr)
+{
+    int nz,gridsize;
+
+    grid->ci[XX] = cx;
+    grid->ci[YY] = cy;
+    grid->ci[ZZ] = cz;
+    grid->offset[XX] = x0;
+    grid->offset[YY] = y0;
+    grid->offset[ZZ] = z0;
+    grid->n[XX]      = x1 - x0 + pme_order - 1;
+    grid->n[YY]      = y1 - y0 + pme_order - 1;
+    grid->n[ZZ]      = z1 - z0 + pme_order - 1;
+
+    nz = grid->n[ZZ];
+    set_grid_alignment(&nz,pme_order);
+    if (set_alignment)
+    {
+        grid->n[ZZ] = nz;
+    }
+    else if (nz != grid->n[ZZ])
+    {
+        gmx_incons("pmegrid_init call with an unaligned z size");
+    }
 
-static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
-                         real ewaldcoeff,real vol,
-                         gmx_bool bEnerVir,real *mesh_energy,matrix vir)
+    grid->order = pme_order;
+    if (ptr == NULL)
+    {
+        gridsize = grid->n[XX]*grid->n[YY]*grid->n[ZZ];
+        set_gridsize_alignment(&gridsize,pme_order);
+        snew_aligned(grid->grid,gridsize,16);
+    }
+    else
+    {
+        grid->grid = ptr;
+    }
+}
+
+static int div_round_up(int enumerator,int denominator)
+{
+    return (enumerator + denominator - 1)/denominator;
+}
+
+static void make_subgrid_division(const ivec n,int ovl,int nthread,
+                                  ivec nsub)
+{
+    int gsize_opt,gsize;
+    int nsx,nsy,nsz;
+    char *env;
+
+    gsize_opt = -1;
+    for(nsx=1; nsx<=nthread; nsx++)
+    {
+        if (nthread % nsx == 0)
+        {
+            for(nsy=1; nsy<=nthread; nsy++)
+            {
+                if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
+                {
+                    nsz = nthread/(nsx*nsy);
+
+                    /* Determine the number of grid points per thread */
+                    gsize =
+                        (div_round_up(n[XX],nsx) + ovl)*
+                        (div_round_up(n[YY],nsy) + ovl)*
+                        (div_round_up(n[ZZ],nsz) + ovl);
+
+                    /* Minimize the number of grid points per thread
+                     * and, secondarily, the number of cuts in minor dimensions.
+                     */
+                    if (gsize_opt == -1 ||
+                        gsize < gsize_opt ||
+                        (gsize == gsize_opt &&
+                         (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
+                    {
+                        nsub[XX] = nsx;
+                        nsub[YY] = nsy;
+                        nsub[ZZ] = nsz;
+                        gsize_opt = gsize;
+                    }
+                }
+            }
+        }
+    }
+
+    env = getenv("GMX_PME_THREAD_DIVISION");
+    if (env != NULL)
+    {
+        sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);
+    }
+
+    if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
+    {
+        gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
+    }
+}
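make_subgrid_division enumerates all factorizations nsx*nsy*nsz of the thread count and keeps the one that minimizes the per-thread grid volume including the spreading overlap, tie-breaking towards fewer cuts in the minor dimensions. A self-contained sketch of the same cost function, for an assumed 32x32x32 interpolation grid, overlap 3 and 8 threads (only the cost minimization is reproduced here, not the GROMACS routine itself):

    #include <stdio.h>

    int main(void)
    {
        const int n[3] = {32, 32, 32}, ovl = 3, nthread = 8;
        int best[3] = {0, 0, 0}, best_cost = -1;
        int nsx, nsy, nsz, cost;

        for (nsx = 1; nsx <= nthread; nsx++)
        {
            if (nthread % nsx != 0) continue;
            for (nsy = 1; nsx*nsy <= nthread; nsy++)
            {
                if (nthread % (nsx*nsy) != 0) continue;
                nsz  = nthread/(nsx*nsy);
                /* grid points per thread, including the overlap region */
                cost = ((n[0] + nsx - 1)/nsx + ovl)*
                       ((n[1] + nsy - 1)/nsy + ovl)*
                       ((n[2] + nsz - 1)/nsz + ovl);
                if (best_cost < 0 || cost < best_cost)
                {
                    best[0] = nsx; best[1] = nsy; best[2] = nsz;
                    best_cost = cost;
                }
            }
        }
        /* prints: division 2 x 2 x 2, 6859 grid points per thread */
        printf("division %d x %d x %d, %d grid points per thread\n",
               best[0], best[1], best[2], best_cost);
        return 0;
    }

The GMX_PME_THREAD_DIVISION environment variable read above overrides the heuristic; its value is three space-separated integers, for example GMX_PME_THREAD_DIVISION="4 2 1" with 8 PME threads.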
+
+static void pmegrids_init(pmegrids_t *grids,
+                          int nx,int ny,int nz,int nz_base,
+                          int pme_order,
+                          int nthread,
+                          int overlap_x,
+                          int overlap_y)
+{
+    ivec n,n_base,g0,g1;
+    int t,x,y,z,d,i,tfac;
+    int max_comm_lines;
+
+    n[XX] = nx - (pme_order - 1);
+    n[YY] = ny - (pme_order - 1);
+    n[ZZ] = nz - (pme_order - 1);
+
+    copy_ivec(n,n_base);
+    n_base[ZZ] = nz_base;
+
+    pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
+                 NULL);
+
+    grids->nthread = nthread;
+
+    make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);
+
+    if (grids->nthread > 1)
+    {
+        ivec nst;
+        int gridsize;
+        real *grid_all;
+
+        for(d=0; d<DIM; d++)
+        {
+            nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
+        }
+        set_grid_alignment(&nst[ZZ],pme_order);
+
+        if (debug)
+        {
+            fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
+                    grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
+            fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
+                    nx,ny,nz,
+                    nst[XX],nst[YY],nst[ZZ]);
+        }
+
+        snew(grids->grid_th,grids->nthread);
+        t = 0;
+        gridsize = nst[XX]*nst[YY]*nst[ZZ];
+        set_gridsize_alignment(&gridsize,pme_order);
+        snew_aligned(grid_all,
+                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
+                     16);
+
+        for(x=0; x<grids->nc[XX]; x++)
+        {
+            for(y=0; y<grids->nc[YY]; y++)
+            {
+                for(z=0; z<grids->nc[ZZ]; z++)
+                {
+                    pmegrid_init(&grids->grid_th[t],
+                                 x,y,z,
+                                 (n[XX]*(x  ))/grids->nc[XX],
+                                 (n[YY]*(y  ))/grids->nc[YY],
+                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
+                                 (n[XX]*(x+1))/grids->nc[XX],
+                                 (n[YY]*(y+1))/grids->nc[YY],
+                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
+                                 TRUE,
+                                 pme_order,
+                                 grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
+                    t++;
+                }
+            }
+        }
+    }
+
+    snew(grids->g2t,DIM);
+    tfac = 1;
+    for(d=DIM-1; d>=0; d--)
+    {
+        snew(grids->g2t[d],n[d]);
+        t = 0;
+        for(i=0; i<n[d]; i++)
+        {
+            /* The second check should match the parameters
+             * of the pmegrid_init call above.
+             */
+            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
+            {
+                t++;
+            }
+            grids->g2t[d][i] = t*tfac;
+        }
+
+        tfac *= grids->nc[d];
+
+        switch (d)
+        {
+        case XX: max_comm_lines = overlap_x;     break;
+        case YY: max_comm_lines = overlap_y;     break;
+        case ZZ: max_comm_lines = pme_order - 1; break;
+        }
+        grids->nthread_comm[d] = 0;
+        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines)
+        {
+            grids->nthread_comm[d]++;
+        }
+        if (debug != NULL)
+        {
+            fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
+                    'x'+d,grids->nthread_comm[d]);
+        }
+        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
+         * work, but this is not a problematic restriction.
+         */
+        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
+        {
+            gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
+        }
+    }
+}
+
+
+static void pmegrids_destroy(pmegrids_t *grids)
+{
+    int t;
+
+    if (grids->grid.grid != NULL)
+    {
+        sfree(grids->grid.grid);
+
+        if (grids->nthread > 0)
+        {
+            for(t=0; t<grids->nthread; t++)
+            {
+                sfree(grids->grid_th[t].grid);
+            }
+            sfree(grids->grid_th);
+        }
+    }
+}
+
+
+static void realloc_work(pme_work_t *work,int nkx)
+{
+    if (nkx > work->nalloc)
+    {
+        work->nalloc = nkx;
+        srenew(work->mhx  ,work->nalloc);
+        srenew(work->mhy  ,work->nalloc);
+        srenew(work->mhz  ,work->nalloc);
+        srenew(work->m2   ,work->nalloc);
+        /* Allocate an aligned pointer for SSE operations, including 3 extra
+         * elements at the end since SSE operates on 4 elements at a time.
+         */
+        sfree_aligned(work->denom);
+        sfree_aligned(work->tmp1);
+        sfree_aligned(work->eterm);
+        snew_aligned(work->denom,work->nalloc+3,16);
+        snew_aligned(work->tmp1 ,work->nalloc+3,16);
+        snew_aligned(work->eterm,work->nalloc+3,16);
+        srenew(work->m2inv,work->nalloc);
+    }
+}
+
+
+static void free_work(pme_work_t *work)
+{
+    sfree(work->mhx);
+    sfree(work->mhy);
+    sfree(work->mhz);
+    sfree(work->m2);
+    sfree_aligned(work->denom);
+    sfree_aligned(work->tmp1);
+    sfree_aligned(work->eterm);
+    sfree(work->m2inv);
+}
+
+
+#ifdef PME_SSE
+    /* Calculate exponentials through SSE in float precision */
+inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
+{
+    {
+        const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
+        __m128 f_sse;
+        __m128 lu;
+        __m128 tmp_d1,d_inv,tmp_r,tmp_e;
+        int kx;
+        f_sse = _mm_load1_ps(&f);
+        for(kx=0; kx<end; kx+=4)
+        {
+            tmp_d1   = _mm_load_ps(d_aligned+kx);
+            lu       = _mm_rcp_ps(tmp_d1);
+            d_inv    = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
+            tmp_r    = _mm_load_ps(r_aligned+kx);
+            tmp_r    = gmx_mm_exp_ps(tmp_r);
+            tmp_e    = _mm_mul_ps(f_sse,d_inv);
+            tmp_e    = _mm_mul_ps(tmp_e,tmp_r);
+            _mm_store_ps(e_aligned+kx,tmp_e);
+        }
+    }
+}
+#else
+inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
+{
+    int kx;
+    for(kx=start; kx<end; kx++)
+    {
+        d[kx] = 1.0/d[kx];
+    }
+    for(kx=start; kx<end; kx++)
+    {
+        r[kx] = exp(r[kx]);
+    }
+    for(kx=start; kx<end; kx++)
+    {
+        e[kx] = f*r[kx]*d[kx];
+    }
+}
+#endif
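The SSE branch of calc_exponentials avoids a full division: _mm_rcp_ps gives an approximate reciprocal (roughly 12 correct bits), which is then refined with one Newton-Raphson step, lu*(2 - lu*d). A scalar sketch of that refinement (the 1e-3 starting error is an assumed value, used only for illustration):

    #include <math.h>
    #include <stdio.h>

    /* One Newton-Raphson refinement of a reciprocal estimate: given
     * lu ~ 1/d, lu*(2 - lu*d) roughly squares the relative error. */
    int main(void)
    {
        float d  = 3.7f;
        float lu = (1.0f/d)*(1.0f + 1e-3f);  /* crude estimate of 1/d */
        float r  = lu*(2.0f - lu*d);         /* refined estimate      */
        printf("error before %.2e, after %.2e\n",
               fabsf(lu - 1.0f/d), fabsf(r - 1.0f/d));
        return 0;
    }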
+
+
+static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
+                         real ewaldcoeff,real vol,
+                         gmx_bool bEnerVir,
+                         int nthread,int thread)
 {
     /* do recip sum over local cells in grid */
     /* y major, z middle, x minor or continuous */
     t_complex *p0;
     int     kx,ky,kz,maxkx,maxky,maxkz;
-    int     nx,ny,nz,iy,iz,kxstart,kxend;
+    int     nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
     real    mx,my,mz;
     real    factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
     real    ets2,struct2,vfactor,ets2vf;
-    real    eterm,d1,d2,energy=0;
+    real    d1,d2,energy=0;
     real    by,bz;
     real    virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
     real    rxx,ryx,ryy,rzx,rzy,rzz;
-       real    *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*m2inv;
+    pme_work_t *work;
+    real    *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
     real    mhxk,mhyk,mhzk,m2k;
     real    corner_fac;
     ivec    complex_order;
     ivec    local_ndata,local_offset,local_size;
-    
+    real    elfac;
+
+    elfac = ONE_4PI_EPS0/pme->epsilon_r;
+
     nx = pme->nkx;
     ny = pme->nky;
     nz = pme->nkz;
-    
+
     /* Dimensions should be identical for A/B grid, so we just use A here */
     gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
                                       complex_order,
                                       local_ndata,
                                       local_offset,
                                       local_size);
-    
+
     rxx = pme->recipbox[XX][XX];
     ryx = pme->recipbox[YY][XX];
     ryy = pme->recipbox[YY][YY];
     rzx = pme->recipbox[ZZ][XX];
     rzy = pme->recipbox[ZZ][YY];
     rzz = pme->recipbox[ZZ][ZZ];
-    
+
     maxkx = (nx+1)/2;
     maxky = (ny+1)/2;
     maxkz = nz/2+1;
-       
-       mhx   = pme->work_mhx;
-       mhy   = pme->work_mhy;
-       mhz   = pme->work_mhz;
-       m2    = pme->work_m2;
-       denom = pme->work_denom;
-       tmp1  = pme->work_tmp1;
-       m2inv = pme->work_m2inv;        
 
-    for(iy=0;iy<local_ndata[YY];iy++)
+    work = &pme->work[thread];
+    mhx   = work->mhx;
+    mhy   = work->mhy;
+    mhz   = work->mhz;
+    m2    = work->m2;
+    denom = work->denom;
+    tmp1  = work->tmp1;
+    eterm = work->eterm;
+    m2inv = work->m2inv;
+
+    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
+    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
+
+    for(iyz=iyz0; iyz<iyz1; iyz++)
     {
+        iy = iyz/local_ndata[ZZ];
+        iz = iyz - iy*local_ndata[ZZ];
+
         ky = iy + local_offset[YY];
-        
-        if (ky < maxky) 
+
+        if (ky < maxky)
         {
             my = ky;
         }
-        else 
+        else
         {
             my = (ky - ny);
         }
-        
+
         by = M_PI*vol*pme->bsp_mod[YY][ky];
 
-        for(iz=0;iz<local_ndata[ZZ];iz++)
-        {
-            kz = iz + local_offset[ZZ];
-            
-            mz = kz;
-
-            bz = pme->bsp_mod[ZZ][kz];
-            
-            /* 0.5 correction for corner points */
-                       corner_fac = 1;
-            if (kz == 0)
-                corner_fac = 0.5;
-            if (kz == (nz+1)/2)
-                corner_fac = 0.5;
-                      
-            p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
-            
-            /* We should skip the k-space point (0,0,0) */
-            if (local_offset[XX] > 0 ||
-                local_offset[YY] > 0 || ky > 0 ||
-                kz > 0)
+        kz = iz + local_offset[ZZ];
+
+        mz = kz;
+
+        bz = pme->bsp_mod[ZZ][kz];
+
+        /* 0.5 correction for corner points */
+        corner_fac = 1;
+        if (kz == 0 || kz == (nz+1)/2)
+        {
+            corner_fac = 0.5;
+        }
+
+        p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+
+        /* We should skip the k-space point (0,0,0) */
+        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
+        {
+            kxstart = local_offset[XX];
+        }
+        else
+        {
+            kxstart = local_offset[XX] + 1;
+            p0++;
+        }
+        kxend = local_offset[XX] + local_ndata[XX];
+
+        if (bEnerVir)
+        {
+            /* More expensive inner loop, especially because of the storage
+             * of the mh elements in arrays.
+             * Because x is the minor grid index, all mh elements
+             * depend on kx for triclinic unit cells.
+             */
+
+            /* Two explicit loops to avoid a conditional inside the loop */
+            for(kx=kxstart; kx<maxkx; kx++)
             {
-                kxstart = local_offset[XX];
+                mx = kx;
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                mhx[kx]   = mhxk;
+                mhy[kx]   = mhyk;
+                mhz[kx]   = mhzk;
+                m2[kx]    = m2k;
+                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
             }
-            else
+
+            for(kx=maxkx; kx<kxend; kx++)
             {
-                kxstart = local_offset[XX] + 1;
-                p0++;
+                mx = (kx - nx);
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                mhx[kx]   = mhxk;
+                mhy[kx]   = mhyk;
+                mhz[kx]   = mhzk;
+                m2[kx]    = m2k;
+                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
             }
-            kxend = local_offset[XX] + local_ndata[XX];
-                       
-            if (bEnerVir)
+
+            for(kx=kxstart; kx<kxend; kx++)
             {
-                /* More expensive inner loop, especially because of the storage
-                 * of the mh elements in array's.
-                 * Because x is the minor grid index, all mh elements
-                 * depend on kx for triclinic unit cells.
-                 */
+                m2inv[kx] = 1.0/m2[kx];
+            }
 
-                /* Two explicit loops to avoid a conditional inside the loop */
-                for(kx=kxstart; kx<maxkx; kx++)
-                {
-                    mx = kx;
-                    
-                    mhxk      = mx * rxx;
-                    mhyk      = mx * ryx + my * ryy;
-                    mhzk      = mx * rzx + my * rzy + mz * rzz;
-                    m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
-                    mhx[kx]   = mhxk;
-                    mhy[kx]   = mhyk;
-                    mhz[kx]   = mhzk;
-                    m2[kx]    = m2k;
-                    denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
-                    tmp1[kx]  = -factor*m2k;
-                }
-                
-                for(kx=maxkx; kx<kxend; kx++)
-                {
-                    mx = (kx - nx);
-
-                    mhxk      = mx * rxx;
-                    mhyk      = mx * ryx + my * ryy;
-                    mhzk      = mx * rzx + my * rzy + mz * rzz;
-                    m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
-                    mhx[kx]   = mhxk;
-                    mhy[kx]   = mhyk;
-                    mhz[kx]   = mhzk;
-                    m2[kx]    = m2k;
-                    denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
-                    tmp1[kx]  = -factor*m2k;
-                }
-                
-                for(kx=kxstart; kx<kxend; kx++)
-                {
-                    m2inv[kx] = 1.0/m2[kx];
-                }
-                for(kx=kxstart; kx<kxend; kx++)
-                {
-                    denom[kx] = 1.0/denom[kx];
-                }
+            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
 
-                CALC_EXPONENTIALS(kxstart,kxend,tmp1);
+            for(kx=kxstart; kx<kxend; kx++,p0++)
+            {
+                d1      = p0->re;
+                d2      = p0->im;
 
-                for(kx=kxstart; kx<kxend; kx++,p0++)
-                {
-                    d1      = p0->re;
-                    d2      = p0->im;
-                    
-                    eterm    = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
-                    
-                    p0->re  = d1*eterm;
-                    p0->im  = d2*eterm;
-                    
-                    struct2 = 2.0*(d1*d1+d2*d2);
-                    
-                    tmp1[kx] = eterm*struct2;
-                }
-                
-                for(kx=kxstart; kx<kxend; kx++)
-                {
-                    ets2     = corner_fac*tmp1[kx];
-                    vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
-                    energy  += ets2;
-                    
-                    ets2vf   = ets2*vfactor;
-                    virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
-                    virxy   += ets2vf*mhx[kx]*mhy[kx];
-                    virxz   += ets2vf*mhx[kx]*mhz[kx];
-                    viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
-                    viryz   += ets2vf*mhy[kx]*mhz[kx];
-                    virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
-                }
+                p0->re  = d1*eterm[kx];
+                p0->im  = d2*eterm[kx];
+
+                struct2 = 2.0*(d1*d1+d2*d2);
+
+                tmp1[kx] = eterm[kx]*struct2;
             }
-            else
+
+            for(kx=kxstart; kx<kxend; kx++)
             {
-                /* We don't need to calculate the energy and the virial.
-                 * In this case the triclinic overhead is small.
-                 */
+                ets2     = corner_fac*tmp1[kx];
+                vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
+                energy  += ets2;
+
+                ets2vf   = ets2*vfactor;
+                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
+                virxy   += ets2vf*mhx[kx]*mhy[kx];
+                virxz   += ets2vf*mhx[kx]*mhz[kx];
+                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
+                viryz   += ets2vf*mhy[kx]*mhz[kx];
+                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
+            }
+        }
+        else
+        {
+            /* We don't need to calculate the energy and the virial.
+             * In this case the triclinic overhead is small.
+             */
 
-                /* Two explicit loops to avoid a conditional inside the loop */
+            /* Two explicit loops to avoid a conditional inside the loop */
 
-                for(kx=kxstart; kx<maxkx; kx++)
-                {
-                    mx = kx;
-                    
-                    mhxk      = mx * rxx;
-                    mhyk      = mx * ryx + my * ryy;
-                    mhzk      = mx * rzx + my * rzy + mz * rzz;
-                    m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
-                    denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
-                    tmp1[kx]  = -factor*m2k;
-                }
-                
-                for(kx=maxkx; kx<kxend; kx++)
-                {
-                    mx = (kx - nx);
-                    
-                    mhxk      = mx * rxx;
-                    mhyk      = mx * ryx + my * ryy;
-                    mhzk      = mx * rzx + my * rzy + mz * rzz;
-                    m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
-                    denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
-                    tmp1[kx]  = -factor*m2k;
-                }
-                
-                for(kx=kxstart; kx<kxend; kx++)
-                {
-                    denom[kx] = 1.0/denom[kx];
-                }
+            for(kx=kxstart; kx<maxkx; kx++)
+            {
+                mx = kx;
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
+            }
 
-                CALC_EXPONENTIALS(kxstart,kxend,tmp1);
-               
-                for(kx=kxstart; kx<kxend; kx++,p0++)
-                {
-                    d1      = p0->re;
-                    d2      = p0->im;
-                    
-                    eterm    = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
-                    
-                    p0->re  = d1*eterm;
-                    p0->im  = d2*eterm;
-                }
+            for(kx=maxkx; kx<kxend; kx++)
+            {
+                mx = (kx - nx);
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
+            }
+
+            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
+
+            for(kx=kxstart; kx<kxend; kx++,p0++)
+            {
+                d1      = p0->re;
+                d2      = p0->im;
+
+                p0->re  = d1*eterm[kx];
+                p0->im  = d2*eterm[kx];
             }
         }
     }
-    
+
     if (bEnerVir)
     {
         /* Update virial with local values.
@@ -1407,38 +2094,85 @@ static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
          * experiencing problems on semiisotropic membranes.
          * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
          */
-        vir[XX][XX] = 0.25*virxx;
-        vir[YY][YY] = 0.25*viryy;
-        vir[ZZ][ZZ] = 0.25*virzz;
-        vir[XX][YY] = vir[YY][XX] = 0.25*virxy;
-        vir[XX][ZZ] = vir[ZZ][XX] = 0.25*virxz;
-        vir[YY][ZZ] = vir[ZZ][YY] = 0.25*viryz;
-        
+        work->vir[XX][XX] = 0.25*virxx;
+        work->vir[YY][YY] = 0.25*viryy;
+        work->vir[ZZ][ZZ] = 0.25*virzz;
+        work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
+        work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
+        work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
+
         /* This energy should be corrected for a charged system */
-        *mesh_energy = 0.5*energy;
+        work->energy = 0.5*energy;
     }
 
     /* Return the loop count */
-    return local_ndata[YY]*local_ndata[ZZ]*local_ndata[XX];
+    return local_ndata[YY]*local_ndata[XX];
+}
+
+static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
+                             real *mesh_energy,matrix vir)
+{
+    /* This function sums output over threads
+     * and should therefore only be called after thread synchronization.
+     */
+    int thread;
+
+    *mesh_energy = pme->work[0].energy;
+    copy_mat(pme->work[0].vir,vir);
+
+    for(thread=1; thread<nthread; thread++)
+    {
+        *mesh_energy += pme->work[thread].energy;
+        m_add(vir,pme->work[thread].vir,vir);
+    }
+}
+
+static int solve_pme_yzx_wrapper(gmx_pme_t pme,t_complex *grid,
+                                 real ewaldcoeff,real vol,
+                                 gmx_bool bEnerVir,real *mesh_energy,matrix vir)
+{
+    int  nthread,thread;
+    int  nelements=0;
+
+    nthread = pme->nthread;
+
+#pragma omp parallel for num_threads(nthread) schedule(static)
+    for(thread=0; thread<nthread; thread++)
+    {
+        int n;
+
+        n = solve_pme_yzx(pme,grid,ewaldcoeff,vol,bEnerVir,nthread,thread);
+        if (thread == 0)
+        {
+            nelements = n;
+        }
+    }
+
+    if (bEnerVir)
+    {
+        get_pme_ener_vir(pme,nthread,mesh_energy,vir);
+    }
+
+    return nelements;
 }
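The wrapper above splits solve_pme_yzx over OpenMP threads by flattening the y and z loops into a single iyz index and handing each thread a contiguous slice [iyz0,iyz1); iy and iz are recovered by division and remainder. A minimal sketch of the same decomposition (the small sizes ny=5, nz=3, nthread=4 are chosen only for illustration):

    #include <stdio.h>

    int main(void)
    {
        int ny = 5, nz = 3, nthread = 4, thread;

        for (thread = 0; thread < nthread; thread++)
        {
            int iyz0 = ny*nz* thread   /nthread;
            int iyz1 = ny*nz*(thread+1)/nthread;
            int iyz;
            printf("thread %d:", thread);
            for (iyz = iyz0; iyz < iyz1; iyz++)
            {
                int iy = iyz/nz, iz = iyz - iy*nz;  /* recover 2D indices */
                printf(" (%d,%d)", iy, iz);
            }
            printf("\n");
        }
        return 0;
    }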
 
 
 #define DO_FSPLINE(order)                      \
 for(ithx=0; (ithx<order); ithx++)              \
-{                                                                         \
+{                                              \
     index_x = (i0+ithx)*pny*pnz;               \
     tx      = thx[ithx];                       \
     dx      = dthx[ithx];                      \
                                                \
     for(ithy=0; (ithy<order); ithy++)          \
-    {                                                                             \
+    {                                          \
         index_xy = index_x+(j0+ithy)*pnz;      \
         ty       = thy[ithy];                  \
         dy       = dthy[ithy];                 \
         fxy1     = fz1 = 0;                    \
                                                \
         for(ithz=0; (ithz<order); ithz++)      \
-        {                                                                 \
+        {                                      \
             gval  = grid[index_xy+(k0+ithz)];  \
             fxy1 += thz[ithz]*gval;            \
             fz1  += dthz[ithz]*gval;           \
@@ -1451,9 +2185,11 @@ for(ithx=0; (ithx<order); ithx++)              \
 
 
 void gather_f_bsplines(gmx_pme_t pme,real *grid,
-                       gmx_bool bClearF,pme_atomcomm_t *atc,real scale)
+                       gmx_bool bClearF,pme_atomcomm_t *atc,
+                       splinedata_t *spline,
+                       real scale)
 {
-    /* sum forces for local particles */  
+    /* sum forces for local particles */
     int     nn,n,ithx,ithy,ithz,i0,j0,k0;
     int     index_x,index_xy;
     int     nx,ny,nz,pnx,pny,pnz;
@@ -1465,21 +2201,25 @@ void gather_f_bsplines(gmx_pme_t pme,real *grid,
     int     norder;
     real    rxx,ryx,ryy,rzx,rzy,rzz;
     int     order;
-    
+
+    pme_spline_work_t *work;
+
+    work = &pme->spline_work;
+
     order = pme->pme_order;
-    thx   = atc->theta[XX];
-    thy   = atc->theta[YY];
-    thz   = atc->theta[ZZ];
-    dthx  = atc->dtheta[XX];
-    dthy  = atc->dtheta[YY];
-    dthz  = atc->dtheta[ZZ];
+    thx   = spline->theta[XX];
+    thy   = spline->theta[YY];
+    thz   = spline->theta[ZZ];
+    dthx  = spline->dtheta[XX];
+    dthy  = spline->dtheta[YY];
+    dthz  = spline->dtheta[ZZ];
     nx    = pme->nkx;
     ny    = pme->nky;
     nz    = pme->nkz;
     pnx   = pme->pmegrid_nx;
     pny   = pme->pmegrid_ny;
     pnz   = pme->pmegrid_nz;
-    
+
     rxx   = pme->recipbox[XX][XX];
     ryx   = pme->recipbox[YY][XX];
     ryy   = pme->recipbox[YY][YY];
@@ -1487,41 +2227,63 @@ void gather_f_bsplines(gmx_pme_t pme,real *grid,
     rzy   = pme->recipbox[ZZ][YY];
     rzz   = pme->recipbox[ZZ][ZZ];
 
-    for(nn=0; (nn<atc->n); nn++) 
+    for(nn=0; nn<spline->n; nn++)
     {
-        n = nn;
-        qn      = scale*atc->q[n];
-        
-        if (bClearF) 
+        n  = spline->ind[nn];
+        qn = scale*atc->q[n];
+
+        if (bClearF)
         {
             atc->f[n][XX] = 0;
             atc->f[n][YY] = 0;
             atc->f[n][ZZ] = 0;
         }
-        if (qn != 0) 
+        if (qn != 0)
         {
             fx     = 0;
             fy     = 0;
             fz     = 0;
             idxptr = atc->idx[n];
-            norder = n*order;
-            
-            i0   = idxptr[XX]; 
+            norder = nn*order;
+
+            i0   = idxptr[XX];
             j0   = idxptr[YY];
             k0   = idxptr[ZZ];
-            
+
             /* Pointer arithmetic alert, next six statements */
-            thx  = atc->theta[XX] + norder;
-            thy  = atc->theta[YY] + norder;
-            thz  = atc->theta[ZZ] + norder;
-            dthx = atc->dtheta[XX] + norder;
-            dthy = atc->dtheta[YY] + norder;
-            dthz = atc->dtheta[ZZ] + norder;
-            
+            thx  = spline->theta[XX] + norder;
+            thy  = spline->theta[YY] + norder;
+            thz  = spline->theta[ZZ] + norder;
+            dthx = spline->dtheta[XX] + norder;
+            dthy = spline->dtheta[YY] + norder;
+            dthz = spline->dtheta[ZZ] + norder;
+
             switch (order) {
-            case 4:  DO_FSPLINE(4);     break;
-            case 5:  DO_FSPLINE(5);     break;
-            default: DO_FSPLINE(order); break;
+            case 4:
+#ifdef PME_SSE
+#ifdef PME_SSE_UNALIGNED
+#define PME_GATHER_F_SSE_ORDER4
+#else
+#define PME_GATHER_F_SSE_ALIGNED
+#define PME_ORDER 4
+#endif
+#include "pme_sse_single.h"
+#else
+                DO_FSPLINE(4);
+#endif
+                break;
+            case 5:
+#ifdef PME_SSE
+#define PME_GATHER_F_SSE_ALIGNED
+#define PME_ORDER 5
+#include "pme_sse_single.h"
+#else
+                DO_FSPLINE(5);
+#endif
+                break;
+            default:
+                DO_FSPLINE(order);
+                break;
             }
 
             atc->f[n][XX] += -qn*( fx*nx*rxx );
@@ -1540,9 +2302,11 @@ void gather_f_bsplines(gmx_pme_t pme,real *grid,
      */
 }
 
+
 static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
                                    pme_atomcomm_t *atc)
 {
+    splinedata_t *spline;
     int     n,ithx,ithy,ithz,i0,j0,k0;
     int     index_x,index_xy;
     int *   idxptr;
@@ -1550,26 +2314,27 @@ static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
     real    *thx,*thy,*thz;
     int     norder;
     int     order;
-    
-    
+
+    spline = &atc->spline[0];
+
     order = pme->pme_order;
-    
+
     energy = 0;
     for(n=0; (n<atc->n); n++) {
         qn      = atc->q[n];
-        
+
         if (qn != 0) {
             idxptr = atc->idx[n];
             norder = n*order;
-            
-            i0   = idxptr[XX]; 
+
+            i0   = idxptr[XX];
             j0   = idxptr[YY];
             k0   = idxptr[ZZ];
-            
+
             /* Pointer arithmetic alert, next three statements */
-            thx  = atc->theta[XX] + norder;
-            thy  = atc->theta[YY] + norder;
-            thz  = atc->theta[ZZ] + norder;
+            thx  = spline->theta[XX] + norder;
+            thy  = spline->theta[YY] + norder;
+            thz  = spline->theta[ZZ] + norder;
 
             pot = 0;
             for(ithx=0; (ithx<order); ithx++)
@@ -1598,54 +2363,81 @@ static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
     return energy;
 }
 
+/* Macro to force loop unrolling by fixing order.
+ * This gives a significant performance gain.
+ */
+#define CALC_SPLINE(order)                     \
+{                                              \
+    int j,k,l;                                 \
+    real dr,div;                               \
+    real data[PME_ORDER_MAX];                  \
+    real ddata[PME_ORDER_MAX];                 \
+                                               \
+    for(j=0; (j<DIM); j++)                     \
+    {                                          \
+        dr  = xptr[j];                         \
+                                               \
+        /* dr is relative offset from lower cell limit */ \
+        data[order-1] = 0;                     \
+        data[1] = dr;                          \
+        data[0] = 1 - dr;                      \
+                                               \
+        for(k=3; (k<order); k++)               \
+        {                                      \
+            div = 1.0/(k - 1.0);               \
+            data[k-1] = div*dr*data[k-2];      \
+            for(l=1; (l<(k-1)); l++)           \
+            {                                  \
+                data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
+                                   data[k-l-1]);                \
+            }                                  \
+            data[0] = div*(1-dr)*data[0];      \
+        }                                      \
+        /* differentiate */                    \
+        ddata[0] = -data[0];                   \
+        for(k=1; (k<order); k++)               \
+        {                                      \
+            ddata[k] = data[k-1] - data[k];    \
+        }                                      \
+                                               \
+        div = 1.0/(order - 1);                 \
+        data[order-1] = div*dr*data[order-2];  \
+        for(l=1; (l<(order-1)); l++)           \
+        {                                      \
+            data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
+                               (order-l-dr)*data[order-l-1]); \
+        }                                      \
+        data[0] = div*(1 - dr)*data[0];        \
+                                               \
+        for(k=0; k<order; k++)                 \
+        {                                      \
+            theta[j][i*order+k]  = data[k];    \
+            dtheta[j][i*order+k] = ddata[k];   \
+        }                                      \
+    }                                          \
+}
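Unrolled as plain C, the CALC_SPLINE recursion for order 4 and dr = 0.5 reproduces the familiar cubic B-spline weights 1/48, 23/48, 23/48, 1/48, which sum to 1. A sketch for checking the recursion (not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        const int order = 4;
        double dr = 0.5, data[4], div;
        int k, l;

        data[order-1] = 0; data[1] = dr; data[0] = 1 - dr;
        for (k = 3; k < order; k++)
        {
            div = 1.0/(k - 1.0);
            data[k-1] = div*dr*data[k-2];
            for (l = 1; l < k-1; l++)
                data[k-l-1] = div*((dr + l)*data[k-l-2] + (k - l - dr)*data[k-l-1]);
            data[0] = div*(1 - dr)*data[0];
        }
        div = 1.0/(order - 1);
        data[order-1] = div*dr*data[order-2];
        for (l = 1; l < order-1; l++)
            data[order-l-1] = div*((dr + l)*data[order-l-2] + (order - l - dr)*data[order-l-1]);
        data[0] = div*(1 - dr)*data[0];

        /* prints: 0.0208333 0.479167 0.479167 0.0208333 */
        printf("%g %g %g %g\n", data[0], data[1], data[2], data[3]);
        return 0;
    }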
+
 void make_bsplines(splinevec theta,splinevec dtheta,int order,
-                   rvec fractx[],int nr,real charge[],
+                   rvec fractx[],int nr,int ind[],real charge[],
                    gmx_bool bFreeEnergy)
 {
     /* construct splines for local atoms */
-    int  i,j,k,l;
-    real dr,div;
-    real *data,*ddata,*xptr;
-    
-    for(i=0; (i<nr); i++) {
+    int  i,ii;
+    real *xptr;
+
+    for(i=0; i<nr; i++)
+    {
         /* With free energy we do not use the charge check.
          * In most cases this will be more efficient than calling make_bsplines
          * twice, since usually more than half the particles have charges.
          */
-        if (bFreeEnergy || charge[i] != 0.0) {
-            xptr = fractx[i];
-            for(j=0; (j<DIM); j++) {
-                dr  = xptr[j];
-                
-                /* dr is relative offset from lower cell limit */
-                data=&(theta[j][i*order]);
-                data[order-1]=0;
-                data[1]=dr;
-                data[0]=1-dr;
-                
-                for(k=3; (k<order); k++) {
-                    div=1.0/(k-1.0);    
-                    data[k-1]=div*dr*data[k-2];
-                    for(l=1; (l<(k-1)); l++) {
-                        data[k-l-1]=div*((dr+l)*data[k-l-2]+(k-l-dr)*
-                                         data[k-l-1]);
-                    }
-                    data[0]=div*(1-dr)*data[0];
-                }
-                /* differentiate */
-                ddata    = &(dtheta[j][i*order]);
-                ddata[0] = -data[0];
-                for(k=1; (k<order); k++) {
-                    ddata[k]=data[k-1]-data[k];
-                }
-                
-                div=1.0/(order-1);
-                data[order-1]=div*dr*data[order-2];
-                for(l=1; (l<(order-1)); l++) {
-                    data[order-l-1]=div*((dr+l)*data[order-l-2]+
-                                         (order-l-dr)*data[order-l-1]);
-                }
-                data[0]=div*(1-dr)*data[0]; 
+        ii = ind[i];
+        if (bFreeEnergy || charge[ii] != 0.0) {
+            xptr = fractx[ii];
+            switch(order) {
+            case 4:  CALC_SPLINE(4);     break;
+            case 5:  CALC_SPLINE(5);     break;
+            default: CALC_SPLINE(order); break;
             }
         }
     }
@@ -1656,7 +2448,7 @@ void make_dft_mod(real *mod,real *data,int ndata)
 {
   int i,j;
   real sc,ss,arg;
-    
+
   for(i=0;i<ndata;i++) {
     sc=ss=0;
     for(j=0;j<ndata;j++) {
@@ -1679,7 +2471,7 @@ void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
   real *data,*ddata,*bsp_data;
   int i,k,l;
   real div;
-    
+
   snew(data,order);
   snew(ddata,order);
   snew(bsp_data,nmax);
@@ -1687,7 +2479,7 @@ void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
   data[order-1]=0;
   data[1]=0;
   data[0]=1;
-           
+
   for(k=3;k<order;k++) {
     div=1.0/(k-1.0);
     data[k-1]=0;
@@ -1703,13 +2495,13 @@ void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
   data[order-1]=0;
   for(l=1;l<(order-1);l++)
     data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
-  data[0]=div*data[0]; 
+  data[0]=div*data[0];
 
   for(i=0;i<nmax;i++)
     bsp_data[i]=0;
   for(i=1;i<=order;i++)
     bsp_data[i]=data[i-1];
-    
+
   make_dft_mod(bsp_mod[XX],bsp_data,nx);
   make_dft_mod(bsp_mod[YY],bsp_data,ny);
   make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
@@ -1734,7 +2526,7 @@ static void setup_coordinate_communication(pme_atomcomm_t *atc)
       atc->node_dest[n] = fw;
       atc->node_src[n]  = bw;
       n++;
-    } 
+    }
     if (n < nslab - 1) {
       atc->node_dest[n] = bw;
       atc->node_src[n]  = fw;
@@ -1745,6 +2537,8 @@ static void setup_coordinate_communication(pme_atomcomm_t *atc)
 
 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
 {
+    int thread;
+
     if(NULL != log)
     {
         fprintf(log,"Destroying PME data structures.\n");
@@ -1753,28 +2547,29 @@ int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
     sfree((*pmedata)->nnx);
     sfree((*pmedata)->nny);
     sfree((*pmedata)->nnz);
-       
-    sfree((*pmedata)->pmegridA);
+
+    pmegrids_destroy(&(*pmedata)->pmegridA);
+
     sfree((*pmedata)->fftgridA);
     sfree((*pmedata)->cfftgridA);
     gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
-    
-    if((*pmedata)->pmegridB)
+
+    if ((*pmedata)->pmegridB.grid.grid != NULL)
     {
-        sfree((*pmedata)->pmegridB);
+        pmegrids_destroy(&(*pmedata)->pmegridB);
         sfree((*pmedata)->fftgridB);
         sfree((*pmedata)->cfftgridB);
         gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
     }
-    sfree((*pmedata)->work_mhz);
-    sfree((*pmedata)->work_m2);
-    sfree((*pmedata)->work_denom);
-    sfree((*pmedata)->work_tmp1_alloc);
-    sfree((*pmedata)->work_m2inv);
-       
+    for(thread=0; thread<(*pmedata)->nthread; thread++)
+    {
+        free_work(&(*pmedata)->work[thread]);
+    }
+    sfree((*pmedata)->work);
+
     sfree(*pmedata);
     *pmedata = NULL;
-  
+
   return 0;
 }
 
@@ -1804,7 +2599,7 @@ static double pme_load_imbalance(gmx_pme_t pme)
 static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
                           int dimind,gmx_bool bSpread)
 {
-    int nk,k,s;
+    int nk,k,s,thread;
 
     atc->dimind = dimind;
     atc->nslab  = 1;
@@ -1832,22 +2627,43 @@ static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
         snew(atc->node_dest,atc->nslab);
         snew(atc->node_src,atc->nslab);
         setup_coordinate_communication(atc);
-        
-        snew(atc->count,atc->nslab);
+
+        snew(atc->count_thread,pme->nthread);
+        for(thread=0; thread<pme->nthread; thread++)
+        {
+            snew(atc->count_thread[thread],atc->nslab);
+        }
+        atc->count = atc->count_thread[0];
         snew(atc->rcount,atc->nslab);
         snew(atc->buf_index,atc->nslab);
     }
+
+    atc->nthread = pme->nthread;
+    if (atc->nthread > 1)
+    {
+        snew(atc->thread_plist,atc->nthread);
+    }
+    snew(atc->spline,atc->nthread);
+    for(thread=0; thread<atc->nthread; thread++)
+    {
+        if (atc->nthread > 1)
+        {
+            snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
+            atc->thread_plist[thread].n += GMX_CACHE_SEP;
+        }
+    }
 }
 
-static void 
+static void
 init_overlap_comm(pme_overlap_t *  ol,
                   int              norder,
 #ifdef GMX_MPI
-                  MPI_Comm         comm,  
+                  MPI_Comm         comm,
 #endif
-                  int              nnodes, 
+                  int              nnodes,
                   int              nodeid,
-                  int              ndata)
+                  int              ndata,
+                  int              commplainsize)
 {
     int lbnd,rbnd,maxlr,b,i;
     int exten;
@@ -1855,11 +2671,11 @@ init_overlap_comm(pme_overlap_t *  ol,
     pme_grid_comm_t *pgc;
     gmx_bool bCont;
     int fft_start,fft_end,send_index1,recv_index1;
-    
+
 #ifdef GMX_MPI
     ol->mpi_comm = comm;
 #endif
-    
+
     ol->nnodes = nnodes;
     ol->nodeid = nodeid;
 
@@ -1873,7 +2689,7 @@ init_overlap_comm(pme_overlap_t *  ol,
     snew(ol->s2g0,ol->nnodes+1);
     snew(ol->s2g1,ol->nnodes);
     if (debug) { fprintf(debug,"PME slab boundaries:"); }
-    for(i=0; i<nnodes; i++) 
+    for(i=0; i<nnodes; i++)
     {
         /* s2g0 the local interpolation grid start.
          * s2g1 the local interpolation grid end.
@@ -1946,6 +2762,10 @@ init_overlap_comm(pme_overlap_t *  ol,
         pgc->recv_index0 = fft_start;
         pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
     }
+
+    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
+    snew(ol->sendbuf,norder*commplainsize);
+    snew(ol->recvbuf,norder*commplainsize);
 }
 
 static void
@@ -1985,7 +2805,7 @@ make_gridindex5_to_localindex(int n,int local_start,int local_range,
                 if (gtl[i] == n-1)
                 {
                     gtl[i] = 0;
-                    fsh[i] = -1; 
+                    fsh[i] = -1;
                 }
                 else if (gtl[i] == local_range)
                 {
@@ -2000,6 +2820,29 @@ make_gridindex5_to_localindex(int n,int local_start,int local_range,
     *fraction_shift  = fsh;
 }
 
+static void sse_mask_init(pme_spline_work_t *work,int order)
+{
+#ifdef PME_SSE
+    float  tmp[8];
+    __m128 zero_SSE;
+    int    of,i;
+
+    zero_SSE = _mm_setzero_ps();
+
+    for(of=0; of<8-(order-1); of++)
+    {
+        for(i=0; i<8; i++)
+        {
+            tmp[i] = (i >= of && i < of+order ? 1 : 0);
+        }
+        work->mask_SSE0[of] = _mm_loadu_ps(tmp);
+        work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
+        work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
+        work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
+    }
+#endif
+}
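sse_mask_init builds, for every possible offset of the first grid point inside an 8-wide window, a pair of SSE masks that together select exactly pme_order consecutive elements; the aligned kernels included from pme_sse_single.h presumably use them to blend out neighbouring grid cells. The selection pattern can be printed with a plain C sketch:

    #include <stdio.h>

    /* Print the element-selection pattern for an assumed order of 4:
     * for each offset, exactly 'order' consecutive positions are enabled. */
    int main(void)
    {
        int order = 4, of, i;

        for (of = 0; of < 8 - (order - 1); of++)
        {
            printf("offset %d:", of);
            for (i = 0; i < 8; i++)
            {
                printf(" %d", (i >= of && i < of + order) ? 1 : 0);
            }
            printf("\n");
        }
        return 0;
    }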
+
 static void
 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
 {
@@ -2014,13 +2857,13 @@ gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
             gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
                       dim,*nk,dim,nnodes,dim);
         }
-        
+
         if (fplog != NULL)
         {
             fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
                     dim,*nk,dim,nnodes,dim,nk_new,dim);
         }
-            
+
         *nk = nk_new;
     }
 }
@@ -2031,42 +2874,38 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                  int                 nnodes_minor,
                  t_inputrec *        ir,
                  int                 homenr,
-                 gmx_bool                bFreeEnergy,
-                 gmx_bool                bReproducible)
+                 gmx_bool            bFreeEnergy,
+                 gmx_bool            bReproducible,
+                 int                 nthread)
 {
     gmx_pme_t pme=NULL;
-    
+
     pme_atomcomm_t *atc;
-    int bufsizex,bufsizey,bufsize;
     ivec ndata;
-    
+
     if (debug)
         fprintf(debug,"Creating PME data structures.\n");
     snew(pme,1);
-        
+
     pme->redist_init         = FALSE;
     pme->sum_qgrid_tmp       = NULL;
     pme->sum_qgrid_dd_tmp    = NULL;
     pme->buf_nalloc          = 0;
     pme->redist_buf_nalloc   = 0;
-    
+
     pme->nnodes              = 1;
     pme->bPPnode             = TRUE;
-    
+
     pme->nnodes_major        = nnodes_major;
     pme->nnodes_minor        = nnodes_minor;
 
 #ifdef GMX_MPI
-    if (nnodes_major*nnodes_minor > 1 && PAR(cr)) 
+    if (PAR(cr))
     {
         pme->mpi_comm        = cr->mpi_comm_mygroup;
-        
+
         MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
         MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
-        if (pme->nnodes != nnodes_major*nnodes_minor)
-        {
-            gmx_incons("PME node count mismatch");
-        }
     }
 #endif
 
@@ -2075,6 +2914,9 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         pme->ndecompdim = 0;
         pme->nodeid_major = 0;
         pme->nodeid_minor = 0;
+#ifdef GMX_MPI
+        pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
+#endif
     }
     else
     {
@@ -2082,37 +2924,37 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         {
 #ifdef GMX_MPI
             pme->mpi_comm_d[0] = pme->mpi_comm;
-            pme->mpi_comm_d[1] = NULL;
+            pme->mpi_comm_d[1] = MPI_COMM_NULL;
 #endif
             pme->ndecompdim = 1;
             pme->nodeid_major = pme->nodeid;
             pme->nodeid_minor = 0;
-            
+
         }
         else if (nnodes_major == 1)
         {
 #ifdef GMX_MPI
-            pme->mpi_comm_d[0] = NULL;
+            pme->mpi_comm_d[0] = MPI_COMM_NULL;
             pme->mpi_comm_d[1] = pme->mpi_comm;
 #endif
             pme->ndecompdim = 1;
             pme->nodeid_major = 0;
             pme->nodeid_minor = pme->nodeid;
         }
-        else 
+        else
         {
             if (pme->nnodes % nnodes_major != 0)
             {
                 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
             }
             pme->ndecompdim = 2;
-            
+
 #ifdef GMX_MPI
             MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
                            pme->nodeid,&pme->mpi_comm_d[0]);  /* My communicator along major dimension */
             MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
                            pme->nodeid,&pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
-            
+
             MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
             MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
             MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
@@ -2121,19 +2963,27 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         }
         pme->bPPnode = (cr->duty & DUTY_PP);
     }
-    
+
+    pme->nthread = nthread;
+
     if (ir->ePBC == epbcSCREW)
     {
         gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
     }
-    
+
     pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
     pme->nkx         = ir->nkx;
     pme->nky         = ir->nky;
     pme->nkz         = ir->nkz;
     pme->pme_order   = ir->pme_order;
     pme->epsilon_r   = ir->epsilon_r;
-    
+
+    if (pme->pme_order > PME_ORDER_MAX)
+    {
+        gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
+                  pme->pme_order,PME_ORDER_MAX);
+    }
+
     /* Currently pme.c supports only the fft5d FFT code.
      * Therefore the grid always needs to be divisible by nnodes.
      * When the old 1D code is also supported again, change this check.
@@ -2153,184 +3003,832 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         */
     }
 
-    if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
-        pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
-        pme->nkz <= pme->pme_order)
+    if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
+        pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
+        pme->nkz <= pme->pme_order)
+    {
+        gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_ordern for x and/or y",pme->pme_order);
+    }
+
+    if (pme->nnodes > 1) {
+        double imbal;
+
+#ifdef GMX_MPI
+        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
+        MPI_Type_commit(&(pme->rvec_mpi));
+#endif
+
+        /* Note that the charge spreading and force gathering, which usually
+         * takes about the same amount of time as FFT+solve_pme,
+         * is always fully load balanced
+         * (unless the charge distribution is inhomogeneous).
+         */
+
+        imbal = pme_load_imbalance(pme);
+        if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
+        {
+            fprintf(stderr,
+                    "\n"
+                    "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
+                    "      For optimal PME load balancing\n"
+                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
+                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
+                    "\n",
+                    (int)((imbal-1)*100 + 0.5),
+                    pme->nkx,pme->nky,pme->nnodes_major,
+                    pme->nky,pme->nkz,pme->nnodes_minor);
+        }
+    }
+
+    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
+    /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
+     * y is always copied through a buffer: we don't need padding in z,
+     * but we do need the overlap in x because of the communication order.
+     */
+    init_overlap_comm(&pme->overlap[0],pme->pme_order,
+#ifdef GMX_MPI
+                      pme->mpi_comm_d[0],
+#endif
+                      pme->nnodes_major,pme->nodeid_major,
+                      pme->nkx,
+                      (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
+
+    init_overlap_comm(&pme->overlap[1],pme->pme_order,
+#ifdef GMX_MPI
+                      pme->mpi_comm_d[1],
+#endif
+                      pme->nnodes_minor,pme->nodeid_minor,
+                      pme->nky,
+                      (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz);
+
+    /* Check for a limitation of the (current) sum_fftgrid_dd code */
+    if (pme->nthread > 1 &&
+        (pme->overlap[0].noverlap_nodes > 1 ||
+         pme->overlap[1].noverlap_nodes > 1))
+    {
+        gmx_fatal(FARGS,"With threads the number of grid lines per node along x and or y should be pme_order (%d) or more or exactly pme_order-1",pme->pme_order);
+    }
+
+    snew(pme->bsp_mod[XX],pme->nkx);
+    snew(pme->bsp_mod[YY],pme->nky);
+    snew(pme->bsp_mod[ZZ],pme->nkz);
+
+    /* The required size of the interpolation grid, including overlap.
+     * The allocated size (pmegrid_n?) might be slightly larger.
+     */
+    pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
+                      pme->overlap[0].s2g0[pme->nodeid_major];
+    pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
+                      pme->overlap[1].s2g0[pme->nodeid_minor];
+    pme->pmegrid_nz_base = pme->nkz;
+    pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
+    set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
+
+    pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
+    pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
+    pme->pmegrid_start_iz = 0;
+
+    make_gridindex5_to_localindex(pme->nkx,
+                                  pme->pmegrid_start_ix,
+                                  pme->pmegrid_nx - (pme->pme_order-1),
+                                  &pme->nnx,&pme->fshx);
+    make_gridindex5_to_localindex(pme->nky,
+                                  pme->pmegrid_start_iy,
+                                  pme->pmegrid_ny - (pme->pme_order-1),
+                                  &pme->nny,&pme->fshy);
+    make_gridindex5_to_localindex(pme->nkz,
+                                  pme->pmegrid_start_iz,
+                                  pme->pmegrid_nz_base,
+                                  &pme->nnz,&pme->fshz);
+
+    pmegrids_init(&pme->pmegridA,
+                  pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
+                  pme->pmegrid_nz_base,
+                  pme->pme_order,
+                  pme->nthread,
+                  pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
+                  pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
+
+    sse_mask_init(&pme->spline_work,pme->pme_order);
+
+    ndata[0] = pme->nkx;
+    ndata[1] = pme->nky;
+    ndata[2] = pme->nkz;
+
+    /* This routine will allocate the grid data to fit the FFTs */
+    gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
+                            &pme->fftgridA,&pme->cfftgridA,
+                            pme->mpi_comm_d,
+                            pme->overlap[0].s2g0,pme->overlap[1].s2g0,
+                            bReproducible,pme->nthread);
+
+    if (bFreeEnergy)
+    {
+        pmegrids_init(&pme->pmegridB,
+                      pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
+                      pme->pmegrid_nz_base,
+                      pme->pme_order,
+                      pme->nthread,
+                      pme->nkx % pme->nnodes_major != 0,
+                      pme->nky % pme->nnodes_minor != 0);
+
+        gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
+                                &pme->fftgridB,&pme->cfftgridB,
+                                pme->mpi_comm_d,
+                                pme->overlap[0].s2g0,pme->overlap[1].s2g0,
+                                bReproducible,pme->nthread);
+    }
+    else
+    {
+        pme->pmegridB.grid.grid = NULL;
+        pme->fftgridB           = NULL;
+        pme->cfftgridB          = NULL;
+    }
+
+    make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
+
+    /* Use atc[0] for spreading */
+    init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
+    if (pme->ndecompdim >= 2)
+    {
+        init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
+    }
+
+    if (pme->nnodes == 1) {
+        pme->atc[0].n = homenr;
+        pme_realloc_atomcomm_things(&pme->atc[0]);
+    }
+
+    {
+        int thread;
+
+        /* Use fft5d, order after FFT is y major, z, x minor */
+
+        snew(pme->work,pme->nthread);
+        for(thread=0; thread<pme->nthread; thread++)
+        {
+            realloc_work(&pme->work[thread],pme->nkx);
+        }
+    }
+
+    *pmedata = pme;
+
+    return 0;
+}
+
+
+static void copy_local_grid(gmx_pme_t pme,
+                            pmegrids_t *pmegrids,int thread,real *fftgrid)
+{
+    ivec local_fft_ndata,local_fft_offset,local_fft_size;
+    int  fft_my,fft_mz;
+    int  nsx,nsy,nsz;
+    ivec nf;
+    int  offx,offy,offz,x,y,z,i0,i0t;
+    int  d;
+    pmegrid_t *pmegrid;
+    real *grid_th;
+
+    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+                                   local_fft_ndata,
+                                   local_fft_offset,
+                                   local_fft_size);
+    fft_my = local_fft_size[YY];
+    fft_mz = local_fft_size[ZZ];
+
+    pmegrid = &pmegrids->grid_th[thread];
+
+    nsx = pmegrid->n[XX];
+    nsy = pmegrid->n[YY];
+    nsz = pmegrid->n[ZZ];
+
+    for(d=0; d<DIM; d++)
+    {
+        nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
+                    local_fft_ndata[d] - pmegrid->offset[d]);
+    }
+
+    offx = pmegrid->offset[XX];
+    offy = pmegrid->offset[YY];
+    offz = pmegrid->offset[ZZ];
+
+    /* Directly copy the non-overlapping parts of the local grids.
+     * This also initializes the full grid.
+     */
+    grid_th = pmegrid->grid;
+    for(x=0; x<nf[XX]; x++)
+    {
+        for(y=0; y<nf[YY]; y++)
+        {
+            i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
+            i0t = (x*nsy + y)*nsz;
+            for(z=0; z<nf[ZZ]; z++)
+            {
+                fftgrid[i0+z] = grid_th[i0t+z];
+            }
+        }
+    }
+}
+
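The copy above relies on the row-major flattening used throughout this file: element (x,y,z) of a grid padded to row sizes my and mz lives at flat index (x*my + y)*mz + z. A minimal standalone sketch of copying a small block into a padded grid at an offset, with made-up sizes rather than the GROMACS data structures:

#include <stdio.h>

/* Row-major flat index of (x,y,z) in a grid padded to my*mz per x-slab. */
static int flat_index(int x, int y, int z, int my, int mz)
{
    return (x*my + y)*mz + z;
}

int main(void)
{
    /* Hypothetical sizes: a 4x4x4 thread block copied into a 6x8x8 padded
     * grid at offset (1,2,3), mirroring the offx/offy/offz logic above.
     */
    enum { NX = 4, NY = 4, NZ = 4, MY = 8, MZ = 8 };
    float padded[6*MY*MZ] = {0};
    float block[NX*NY*NZ];
    int   x, y, z;

    for (x = 0; x < NX*NY*NZ; x++)
    {
        block[x] = (float)(x + 1);
    }

    for (x = 0; x < NX; x++)
    {
        for (y = 0; y < NY; y++)
        {
            for (z = 0; z < NZ; z++)
            {
                padded[flat_index(1 + x, 2 + y, 3 + z, MY, MZ)] =
                    block[(x*NY + y)*NZ + z];
            }
        }
    }

    printf("corner value: %g (expect 1)\n",
           padded[flat_index(1, 2, 3, MY, MZ)]);

    return 0;
}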
+static void print_sendbuf(gmx_pme_t pme,real *sendbuf)
+{
+    ivec local_fft_ndata,local_fft_offset,local_fft_size;
+    pme_overlap_t *overlap;
+    int datasize,nind;
+    int i,x,y,z,n;
+
+    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+                                   local_fft_ndata,
+                                   local_fft_offset,
+                                   local_fft_size);
+    /* Major dimension */
+    overlap = &pme->overlap[0];
+
+    nind   = overlap->comm_data[0].send_nindex;
+
+    for(y=0; y<local_fft_ndata[YY]; y++) {
+         printf(" %2d",y);
+    }
+    printf("\n");
+
+    i = 0;
+    for(x=0; x<nind; x++) {
+        for(y=0; y<local_fft_ndata[YY]; y++) {
+            n = 0;
+            for(z=0; z<local_fft_ndata[ZZ]; z++) {
+                if (sendbuf[i] != 0) n++;
+                i++;
+            }
+            printf(" %2d",n);
+        }
+        printf("\n");
+    }
+}
+
+static void
+reduce_threadgrid_overlap(gmx_pme_t pme,
+                          const pmegrids_t *pmegrids,int thread,
+                          real *fftgrid,real *commbuf_x,real *commbuf_y)
+{
+    ivec local_fft_ndata,local_fft_offset,local_fft_size;
+    int  fft_nx,fft_ny,fft_nz;
+    int  fft_my,fft_mz;
+    int  buf_my=-1;
+    int  nsx,nsy,nsz;
+    ivec ne;
+    int  offx,offy,offz,x,y,z,i0,i0t;
+    int  sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
+    gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
+    gmx_bool bCommX,bCommY;
+    int  d;
+    int  thread_f;
+    const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
+    const real *grid_th;
+    real *commbuf=NULL;
+
+    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+                                   local_fft_ndata,
+                                   local_fft_offset,
+                                   local_fft_size);
+    fft_nx = local_fft_ndata[XX];
+    fft_ny = local_fft_ndata[YY];
+    fft_nz = local_fft_ndata[ZZ];
+
+    fft_my = local_fft_size[YY];
+    fft_mz = local_fft_size[ZZ];
+
+    /* This routine is called when all threads have finished spreading.
+     * Here each thread sums grid contributions calculated by other threads
+     * into its own thread-local grid volume.
+     * To minimize the number of grid copying operations,
+     * this routine sums directly from the pmegrid into the fftgrid.
+     */
+
+    /* Determine which part of the full node grid we should operate on;
+     * this is our thread-local part of the full grid.
+     */
+    pmegrid = &pmegrids->grid_th[thread];
+
+    for(d=0; d<DIM; d++)
+    {
+        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+                    local_fft_ndata[d]);
+    }
+
+    offx = pmegrid->offset[XX];
+    offy = pmegrid->offset[YY];
+    offz = pmegrid->offset[ZZ];
+
+
+    bClearBufX  = TRUE;
+    bClearBufY  = TRUE;
+    bClearBufXY = TRUE;
+
+    /* Now loop over all the thread data blocks that contribute
+     * to the grid region we (our thread) are operating on.
+     */
+    /* Note that fft_nx/fft_ny is equal to the number of grid points
+     * between the first point of our node grid and the first point
+     * of the next node's grid.
+     */
+    for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
+    {
+        fx = pmegrid->ci[XX] + sx;
+        ox = 0;
+        bCommX = FALSE;
+        if (fx < 0) {
+            fx += pmegrids->nc[XX];
+            ox -= fft_nx;
+            bCommX = (pme->nnodes_major > 1);
+        }
+        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
+        ox += pmegrid_g->offset[XX];
+        if (!bCommX)
+        {
+            tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
+        }
+        else
+        {
+            tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
+        }
+
+        for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
+        {
+            fy = pmegrid->ci[YY] + sy;
+            oy = 0;
+            bCommY = FALSE;
+            if (fy < 0) {
+                fy += pmegrids->nc[YY];
+                oy -= fft_ny;
+                bCommY = (pme->nnodes_minor > 1);
+            }
+            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
+            oy += pmegrid_g->offset[YY];
+            if (!bCommY)
+            {
+                ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
+            }
+            else
+            {
+                ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
+            }
+
+            for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
+            {
+                fz = pmegrid->ci[ZZ] + sz;
+                oz = 0;
+                if (fz < 0)
+                {
+                    fz += pmegrids->nc[ZZ];
+                    oz -= fft_nz;
+                }
+                pmegrid_g = &pmegrids->grid_th[fz];
+                oz += pmegrid_g->offset[ZZ];
+                tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
+
+                if (sx == 0 && sy == 0 && sz == 0)
+                {
+                    /* We have already added our local contribution
+                     * before calling this routine, so skip it here.
+                     */
+                    continue;
+                }
+
+                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
+
+                pmegrid_f = &pmegrids->grid_th[thread_f];
+
+                grid_th = pmegrid_f->grid;
+
+                nsx = pmegrid_f->n[XX];
+                nsy = pmegrid_f->n[YY];
+                nsz = pmegrid_f->n[ZZ];
+
+#ifdef DEBUG_PME_REDUCE
+                printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
+                       pme->nodeid,thread,thread_f,
+                       pme->pmegrid_start_ix,
+                       pme->pmegrid_start_iy,
+                       pme->pmegrid_start_iz,
+                       sx,sy,sz,
+                       offx-ox,tx1-ox,offx,tx1,
+                       offy-oy,ty1-oy,offy,ty1,
+                       offz-oz,tz1-oz,offz,tz1);
+#endif
+
+                if (!(bCommX || bCommY))
+                {
+                    /* Copy from the thread local grid to the node grid */
+                    for(x=offx; x<tx1; x++)
+                    {
+                        for(y=offy; y<ty1; y++)
+                        {
+                            i0  = (x*fft_my + y)*fft_mz;
+                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
+                            for(z=offz; z<tz1; z++)
+                            {
+                                fftgrid[i0+z] += grid_th[i0t+z];
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    /* The order of this conditional decides
+                     * where the corner volume gets stored with x+y decomp.
+                     */
+                    if (bCommY)
+                    {
+                        commbuf = commbuf_y;
+                        buf_my  = ty1 - offy;
+                        if (bCommX)
+                        {
+                            /* We index commbuf modulo the local grid size */
+                            commbuf += buf_my*fft_nx*fft_nz;
+
+                            bClearBuf  = bClearBufXY;
+                            bClearBufXY = FALSE;
+                        }
+                        else
+                        {
+                            bClearBuf  = bClearBufY;
+                            bClearBufY = FALSE;
+                        }
+                    }
+                    else
+                    {
+                        commbuf = commbuf_x;
+                        buf_my  = fft_ny;
+                        bClearBuf  = bClearBufX;
+                        bClearBufX = FALSE;
+                    }
+
+                    /* Copy to the communication buffer */
+                    for(x=offx; x<tx1; x++)
+                    {
+                        for(y=offy; y<ty1; y++)
+                        {
+                            i0  = (x*buf_my + y)*fft_nz;
+                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
+
+                            if (bClearBuf)
+                            {
+                                /* First access of commbuf, initialize it */
+                                for(z=offz; z<tz1; z++)
+                                {
+                                    commbuf[i0+z]  = grid_th[i0t+z];
+                                }
+                            }
+                            else
+                            {
+                                for(z=offz; z<tz1; z++)
+                                {
+                                    commbuf[i0+z] += grid_th[i0t+z];
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
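The reduction above walks, for each thread, over the neighbouring thread grids whose (order-1)-point spreading overlap reaches into this thread's region of the node grid and accumulates them. A one-dimensional sketch of the same idea with made-up sizes and periodic wrap-around, not the real pmegrid_t layout:

#include <stdio.h>

#define NTHREAD 4
#define NLOCAL  8   /* interior grid points owned per thread            */
#define ORDER   4   /* spreading order; the overlap is ORDER - 1 points */

int main(void)
{
    /* Each thread grid holds its own NLOCAL points plus ORDER-1 overlap
     * points that really belong to the next thread (periodic).
     */
    float thread_grid[NTHREAD][NLOCAL + ORDER - 1];
    float full_grid[NTHREAD*NLOCAL] = {0};
    int   t, i;

    for (t = 0; t < NTHREAD; t++)
    {
        for (i = 0; i < NLOCAL + ORDER - 1; i++)
        {
            thread_grid[t][i] = 1.0f;   /* dummy spread contribution */
        }
    }

    /* Reduction: thread t owns full_grid[t*NLOCAL .. (t+1)*NLOCAL-1].
     * It adds its own interior points plus the overlap spilled over by
     * the previous thread, mirroring the sx/sy/sz <= 0 loops above.
     */
    for (t = 0; t < NTHREAD; t++)
    {
        int prev = (t + NTHREAD - 1) % NTHREAD;

        for (i = 0; i < NLOCAL; i++)
        {
            full_grid[t*NLOCAL + i] += thread_grid[t][i];
        }
        for (i = 0; i < ORDER - 1; i++)
        {
            full_grid[t*NLOCAL + i] += thread_grid[prev][NLOCAL + i];
        }
    }

    printf("full_grid[0] = %g (expect 2)\n", full_grid[0]);

    return 0;
}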
+
+static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
+{
+    ivec local_fft_ndata,local_fft_offset,local_fft_size;
+    pme_overlap_t *overlap;
+    int  send_nindex;
+    int  recv_index0,recv_nindex;
+#ifdef GMX_MPI
+    MPI_Status stat;
+#endif
+    int  ipulse,send_id,recv_id,datasize,gridsize,size_yx;
+    real *sendptr,*recvptr;
+    int  x,y,z,indg,indb;
+
+    /* Note that this routine is only used for forward communication.
+     * Since the force gathering, unlike the charge spreading,
+     * can be trivially parallelized over the particles,
+     * the backwards process is much simpler and can use the "old"
+     * communication setup.
+     */
+
+    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+                                   local_fft_ndata,
+                                   local_fft_offset,
+                                   local_fft_size);
+
+    /* Currently supports only a single communication pulse */
+
+/* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
+    if (pme->nnodes_minor > 1)
+    {
+        /* Minor dimension */
+        overlap = &pme->overlap[1];
+
+        if (pme->nnodes_major > 1)
+        {
+             size_yx = pme->overlap[0].comm_data[0].send_nindex;
+        }
+        else
+        {
+            size_yx = 0;
+        }
+        datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ];
+
+        ipulse = 0;
+
+        send_id = overlap->send_id[ipulse];
+        recv_id = overlap->recv_id[ipulse];
+        send_nindex   = overlap->comm_data[ipulse].send_nindex;
+        /* recv_index0   = overlap->comm_data[ipulse].recv_index0; */
+        recv_index0 = 0;
+        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
+
+        sendptr = overlap->sendbuf;
+        recvptr = overlap->recvbuf;
+
+        /*
+        printf("node %d comm %2d x %2d x %2d\n",pme->nodeid,
+               local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]);
+        printf("node %d send %f, %f\n",pme->nodeid,
+               sendptr[0],sendptr[send_nindex*datasize-1]);
+        */
+
+#ifdef GMX_MPI
+        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
+                     send_id,ipulse,
+                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
+                     recv_id,ipulse,
+                     overlap->mpi_comm,&stat);
+#endif
+
+        for(x=0; x<local_fft_ndata[XX]; x++)
+        {
+            for(y=0; y<recv_nindex; y++)
+            {
+                indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
+                indb = (x*recv_nindex        + y)*local_fft_ndata[ZZ];
+                for(z=0; z<local_fft_ndata[ZZ]; z++)
+                {
+                    fftgrid[indg+z] += recvptr[indb+z];
+                }
+            }
+        }
+        if (pme->nnodes_major > 1)
+        {
+            sendptr = pme->overlap[0].sendbuf;
+            for(x=0; x<size_yx; x++)
+            {
+                for(y=0; y<recv_nindex; y++)
+                {
+                    indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
+                    indb = ((local_fft_ndata[XX] + x)*recv_nindex +y)*local_fft_ndata[ZZ];
+                    for(z=0; z<local_fft_ndata[ZZ]; z++)
+                    {
+                        sendptr[indg+z] += recvptr[indb+z];
+                    }
+                }
+            }
+        }
+    }
+
+    /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
+    if (pme->nnodes_major > 1)
     {
-        gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
-    }
+        /* Major dimension */
+        overlap = &pme->overlap[0];
 
-    if (pme->nnodes > 1) {
-        double imbal;
+        datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
+        gridsize = local_fft_size[YY] *local_fft_size[ZZ];
+
+        ipulse = 0;
+
+        send_id = overlap->send_id[ipulse];
+        recv_id = overlap->recv_id[ipulse];
+        send_nindex   = overlap->comm_data[ipulse].send_nindex;
+        /* recv_index0   = overlap->comm_data[ipulse].recv_index0; */
+        recv_index0 = 0;
+        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
+
+        sendptr = overlap->sendbuf;
+        recvptr = overlap->recvbuf;
+
+        if (debug != NULL)
+        {
+            fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
+                   send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
+        }
 
 #ifdef GMX_MPI
-        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
-        MPI_Type_commit(&(pme->rvec_mpi));
+        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
+                     send_id,ipulse,
+                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
+                     recv_id,ipulse,
+                     overlap->mpi_comm,&stat);
 #endif
-        
-        /* Note that the charge spreading and force gathering, which usually
-         * takes about the same amount of time as FFT+solve_pme,
-         * is always fully load balanced
-         * (unless the charge distribution is inhomogeneous).
-         */
-        
-        imbal = pme_load_imbalance(pme);
-        if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
+
+        for(x=0; x<recv_nindex; x++)
         {
-            fprintf(stderr,
-                    "\n"
-                    "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
-                    "      For optimal PME load balancing\n"
-                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
-                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
-                    "\n",
-                    (int)((imbal-1)*100 + 0.5),
-                    pme->nkx,pme->nky,pme->nnodes_major,
-                    pme->nky,pme->nkz,pme->nnodes_minor);
+            for(y=0; y<local_fft_ndata[YY]; y++)
+            {
+                indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
+                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
+                for(z=0; z<local_fft_ndata[ZZ]; z++)
+                {
+                    fftgrid[indg+z] += recvptr[indb+z];
+                }
+            }
         }
     }
+}
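sum_fftgrid_dd exchanges each overlap slab with a single MPI_Sendrecv and then adds the received buffer into the local FFT grid. A minimal standalone sketch of that exchange-and-accumulate pattern in plain MPI, using a hypothetical buffer size instead of the pme_overlap_t bookkeeping:

/* Build with an MPI C compiler (e.g. mpicc) and run with at least 2 ranks. */
#include <stdio.h>
#include <mpi.h>

#define NOVERLAP 16   /* hypothetical number of overlapping grid points */

int main(int argc, char **argv)
{
    int        rank, size, dest, source, i;
    float      sendbuf[NOVERLAP], recvbuf[NOVERLAP], grid[NOVERLAP];
    MPI_Status stat;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* The overlap region that this rank spread onto its neighbour's grid */
    for (i = 0; i < NOVERLAP; i++)
    {
        sendbuf[i] = (float)rank;
        grid[i]    = 1.0f;
    }

    /* Ring pattern: send our overlap forward, receive the previous rank's */
    dest   = (rank + 1) % size;
    source = (rank + size - 1) % size;

    MPI_Sendrecv(sendbuf, NOVERLAP, MPI_FLOAT, dest,   0,
                 recvbuf, NOVERLAP, MPI_FLOAT, source, 0,
                 MPI_COMM_WORLD, &stat);

    /* Accumulate the received overlap into the local grid */
    for (i = 0; i < NOVERLAP; i++)
    {
        grid[i] += recvbuf[i];
    }

    printf("rank %d: grid[0] = %g\n", rank, grid[0]);

    MPI_Finalize();
    return 0;
}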
 
-    init_overlap_comm(&pme->overlap[0],pme->pme_order,
-#ifdef GMX_MPI
-                      pme->mpi_comm_d[0],
+
+static void spread_on_grid(gmx_pme_t pme,
+                           pme_atomcomm_t *atc,pmegrids_t *grids,
+                           gmx_bool bCalcSplines,gmx_bool bSpread,
+                           real *fftgrid)
+{
+    int nthread,thread;
+#ifdef PME_TIME_THREADS
+    gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
+    static double cs1=0,cs2=0,cs3=0;
+    static double cs1a[6]={0,0,0,0,0,0};
+    static int cnt=0;
 #endif
-                      pme->nnodes_major,pme->nodeid_major,pme->nkx);
-    
-    init_overlap_comm(&pme->overlap[1],pme->pme_order,
-#ifdef GMX_MPI
-                      pme->mpi_comm_d[1],
+
+    nthread = pme->nthread;
+
+#ifdef PME_TIME_THREADS
+    c1 = omp_cyc_start();
 #endif
-                      pme->nnodes_minor,pme->nodeid_minor,pme->nky);
-    
-    snew(pme->bsp_mod[XX],pme->nkx);
-    snew(pme->bsp_mod[YY],pme->nky);
-    snew(pme->bsp_mod[ZZ],pme->nkz);
-    
-    /* Allocate data for the interpolation grid, including overlap */
-    pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
-                      pme->overlap[0].s2g0[pme->nodeid_major];
-    pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] - 
-                      pme->overlap[1].s2g0[pme->nodeid_minor];
-    pme->pmegrid_nz = pme->nkz + pme->pme_order - 1;
-    
-    pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
-    pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
-    pme->pmegrid_start_iz = 0;
-    
-    make_gridindex5_to_localindex(pme->nkx,
-                                  pme->pmegrid_start_ix,
-                                  pme->pmegrid_nx - (pme->pme_order-1),
-                                  &pme->nnx,&pme->fshx);
-    make_gridindex5_to_localindex(pme->nky,
-                                  pme->pmegrid_start_iy,
-                                  pme->pmegrid_ny - (pme->pme_order-1),
-                                  &pme->nny,&pme->fshy);
-    make_gridindex5_to_localindex(pme->nkz,
-                                  pme->pmegrid_start_iz,
-                                  pme->pmegrid_nz - (pme->pme_order-1),
-                                  &pme->nnz,&pme->fshz);
-    
-    snew(pme->pmegridA,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
-    
-    /* For non-divisible grid we need pme_order iso pme_order-1 */
-    /* x overlap is copied in place: take padding into account.
-     * y is always copied through a buffer: we don't need padding in z,
-     * but we do need the overlap in x because of the communication order.
-     */
-    bufsizex = pme->pme_order*pme->pmegrid_ny*pme->pmegrid_nz;
-    bufsizey = pme->pme_order*pme->pmegrid_nx*pme->nkz;
-    bufsize  = (bufsizex>bufsizey) ? bufsizex : bufsizey;
-    
-    snew(pme->pmegrid_sendbuf,bufsize);
-    snew(pme->pmegrid_recvbuf,bufsize);
-    
-    ndata[0] = pme->nkx;
-    ndata[1] = pme->nky;
-    ndata[2] = pme->nkz;
-    
-    /* This routine will allocate the grid data to fit the FFTs */
-    gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
-                            &pme->fftgridA,&pme->cfftgridA,
-                            pme->mpi_comm_d,
-                            pme->overlap[0].s2g0,pme->overlap[1].s2g0,
-                            bReproducible);
-    
-    if (bFreeEnergy)
-    {
-        snew(pme->pmegridB,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);    
-        gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
-                                &pme->fftgridB,&pme->cfftgridB,
-                                pme->mpi_comm_d,
-                                pme->overlap[0].s2g0,pme->overlap[1].s2g0,
-                                bReproducible);
-    } else 
+    if (bCalcSplines)
     {
-        pme->pmegridB    = NULL;
-        pme->fftgridB    = NULL;
-        pme->cfftgridB   = NULL;
+#pragma omp parallel for num_threads(nthread) schedule(static)
+        for(thread=0; thread<nthread; thread++)
+        {
+            int start,end;
+
+            start = atc->n* thread   /nthread;
+            end   = atc->n*(thread+1)/nthread;
+
+            /* Compute fftgrid index for all atoms,
+             * with help of some extra variables.
+             */
+            calc_interpolation_idx(pme,atc,start,end,thread);
+        }
     }
-    
-    make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
-    
-    /* Use atc[0] for spreading */
-    init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
-    if (pme->ndecompdim >= 2)
+#ifdef PME_TIME_THREADS
+    c1 = omp_cyc_end(c1);
+    cs1 += (double)c1;
+#endif
+
+#ifdef PME_TIME_THREADS
+    c2 = omp_cyc_start();
+#endif
+#pragma omp parallel for num_threads(nthread) schedule(static)
+    for(thread=0; thread<nthread; thread++)
     {
-        init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
+        splinedata_t *spline;
+        pmegrid_t *grid;
+
+        /* make local bsplines  */
+        if (grids->nthread == 1)
+        {
+            spline = &atc->spline[0];
+
+            spline->n = atc->n;
+
+            grid = &grids->grid;
+        }
+        else
+        {
+            spline = &atc->spline[thread];
+
+            make_thread_local_ind(atc,thread,spline);
+
+            grid = &grids->grid_th[thread];
+        }
+
+        if (bCalcSplines)
+        {
+            make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
+                          atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
+        }
+
+        if (bSpread)
+        {
+            /* put local atoms on grid. */
+#ifdef PME_TIME_SPREAD
+            ct1a = omp_cyc_start();
+#endif
+            spread_q_bsplines_thread(grid,atc,spline,&pme->spline_work);
+
+            if (grids->nthread > 1)
+            {
+                copy_local_grid(pme,grids,thread,fftgrid);
+            }
+#ifdef PME_TIME_SPREAD
+            ct1a = omp_cyc_end(ct1a);
+            cs1a[thread] += (double)ct1a;
+#endif
+        }
     }
-    
-    if (pme->nnodes == 1) {
-        pme->atc[0].n = homenr;
-        pme_realloc_atomcomm_things(&pme->atc[0]);
+#ifdef PME_TIME_THREADS
+    c2 = omp_cyc_end(c2);
+    cs2 += (double)c2;
+#endif
+
+    if (grids->nthread > 1)
+    {
+#ifdef PME_TIME_THREADS
+        c3 = omp_cyc_start();
+#endif
+#pragma omp parallel for num_threads(grids->nthread) schedule(static)
+        for(thread=0; thread<grids->nthread; thread++)
+        {
+            reduce_threadgrid_overlap(pme,grids,thread,
+                                      fftgrid,
+                                      pme->overlap[0].sendbuf,
+                                      pme->overlap[1].sendbuf);
+#ifdef PRINT_PME_SENDBUF
+            print_sendbuf(pme,pme->overlap[0].sendbuf);
+#endif
+        }
+#ifdef PME_TIME_THREADS
+        c3 = omp_cyc_end(c3);
+        cs3 += (double)c3;
+#endif
+
+        if (pme->nnodes > 1)
+        {
+            /* Communicate the overlapping part of the fftgrid */
+            sum_fftgrid_dd(pme,fftgrid);
+        }
     }
-    
-    /* Use fft5d, order after FFT is y major, z, x minor */
-    pme->work_nalloc = pme->nkx;
-    snew(pme->work_mhx,pme->work_nalloc);
-    snew(pme->work_mhy,pme->work_nalloc);
-    snew(pme->work_mhz,pme->work_nalloc);
-    snew(pme->work_m2,pme->work_nalloc);
-    snew(pme->work_denom,pme->work_nalloc);
-    /* Allocate an aligned pointer for SSE operations, including 3 extra
-     * elements at the end since SSE operates on 4 elements at a time.
-     */
-    snew(pme->work_tmp1_alloc,pme->work_nalloc+8);
-    pme->work_tmp1 = (real *) (((size_t) pme->work_tmp1_alloc + 16) & (~((size_t) 15)));
-    snew(pme->work_m2inv,pme->work_nalloc);
 
-    *pmedata = pme;
-    
-    return 0;
+#ifdef PME_TIME_THREADS
+    cnt++;
+    if (cnt % 20 == 0)
+    {
+        printf("idx %.2f spread %.2f red %.2f",
+               cs1*1e-9,cs2*1e-9,cs3*1e-9);
+#ifdef PME_TIME_SPREAD
+        for(thread=0; thread<nthread; thread++)
+            printf(" %.2f",cs1a[thread]*1e-9);
+#endif
+        printf("\n");
+    }
+#endif
 }
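The threaded spread above partitions the atoms statically over the OpenMP threads: thread t handles the index range [n*t/nthread, n*(t+1)/nthread). A minimal standalone sketch of that partitioning pattern, with a hypothetical do_work() standing in for calc_interpolation_idx() and the spline routines:

#include <stdio.h>

#define NATOMS  1000
#define NTHREAD 4

/* Hypothetical per-range work; here it just sums the indices it was given. */
static void do_work(int start, int end, double *sum)
{
    int i;

    for (i = start; i < end; i++)
    {
        *sum += (double)i;
    }
}

int main(void)
{
    double partial[NTHREAD] = {0};
    double total = 0;
    int    thread;

#pragma omp parallel for num_threads(NTHREAD) schedule(static)
    for (thread = 0; thread < NTHREAD; thread++)
    {
        /* Same block partition as in spread_on_grid above */
        int start = NATOMS* thread   /NTHREAD;
        int end   = NATOMS*(thread+1)/NTHREAD;

        do_work(start, end, &partial[thread]);
    }

    for (thread = 0; thread < NTHREAD; thread++)
    {
        total += partial[thread];
    }
    printf("sum of 0..%d = %g (expect %g)\n",
           NATOMS - 1, total, 0.5*NATOMS*(NATOMS - 1));

    return 0;
}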
 
-static void spread_on_grid(gmx_pme_t pme,
-                           pme_atomcomm_t *atc,real *grid,
-                           gmx_bool bCalcSplines,gmx_bool bSpread)
-{    
-    if (bCalcSplines)
-    {
-    
-        /* Compute fftgrid index for all atoms,
-         * with help of some extra variables.
-         */
-        calc_interpolation_idx(pme,atc);
-        
-        /* make local bsplines  */
-        make_bsplines(atc->theta,atc->dtheta,pme->pme_order,
-                      atc->fractx,atc->n,atc->q,pme->bFEP);
-    }    
-    
-    if (bSpread)
+
+static void dump_grid(FILE *fp,
+                      int sx,int sy,int sz,int nx,int ny,int nz,
+                      int my,int mz,const real *g)
+{
+    int x,y,z;
+
+    for(x=0; x<nx; x++)
     {
-        /* put local atoms on grid. */
-        spread_q_bsplines(pme,atc,grid);
+        for(y=0; y<ny; y++)
+        {
+            for(z=0; z<nz; z++)
+            {
+                fprintf(fp,"%2d %2d %2d %6.3f\n",
+                        sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
+            }
+        }
     }
 }
 
+static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
+{
+    ivec local_fft_ndata,local_fft_offset,local_fft_size;
+
+    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+                                   local_fft_ndata,
+                                   local_fft_offset,
+                                   local_fft_size);
+
+    dump_grid(stderr,
+              pme->pmegrid_start_ix,
+              pme->pmegrid_start_iy,
+              pme->pmegrid_start_iz,
+              pme->pmegrid_nx-pme->pme_order+1,
+              pme->pmegrid_ny-pme->pme_order+1,
+              pme->pmegrid_nz-pme->pme_order+1,
+              local_fft_size[YY],
+              local_fft_size[ZZ],
+              fftgrid);
+}
+
+
 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
 {
     pme_atomcomm_t *atc;
-    real *grid;
+    pmegrids_t *grid;
 
     if (pme->nnodes > 1)
     {
@@ -2349,13 +3847,13 @@ void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
     pme_realloc_atomcomm_things(atc);
     atc->x         = x;
     atc->q         = q;
-    
+
     /* We only use the A-charges grid */
-    grid = pme->pmegridA;
+    grid = &pme->pmegridA;
 
-    spread_on_grid(pme,atc,NULL,TRUE,FALSE);
+    spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
 
-    *V = gather_energy_bsplines(pme,grid,atc);
+    *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
 }
 
 
@@ -2391,12 +3889,12 @@ int gmx_pmeonly(gmx_pme_t pme,
     int  count;
     gmx_bool bEnerVir;
     gmx_large_int_t step,step_rel;
-    
-    
+
+
     pme_pp = gmx_pme_pp_init(cr);
-    
+
     init_nrnb(nrnb);
-    
+
     count = 0;
     do /****** this is a quasi-loop over time steps! */
     {
@@ -2407,32 +3905,32 @@ int gmx_pmeonly(gmx_pme_t pme,
                                   &pme->bFEP,&lambda,
                                   &bEnerVir,
                                   &step);
-        
+
         if (natoms == -1) {
             /* We should stop: break out of the loop */
             break;
         }
-        
+
         step_rel = step - ir->init_step;
-        
+
         if (count == 0)
             wallcycle_start(wcycle,ewcRUN);
-        
+
         wallcycle_start(wcycle,ewcPMEMESH);
-        
+
         dvdlambda = 0;
         clear_mat(vir);
         gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
                    cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
                    &energy,lambda,&dvdlambda,
                    GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
-        
+
         cycles = wallcycle_stop(wcycle,ewcPMEMESH);
-        
+
         gmx_pme_send_force_vir_ener(pme_pp,
                                     f_pp,vir,energy,dvdlambda,
                                     cycles);
-        
+
         count++;
 
         if (step_rel == wcycle_get_reset_counters(wcycle))
@@ -2441,10 +3939,10 @@ int gmx_pmeonly(gmx_pme_t pme,
             reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
             wcycle_set_reset_counters(wcycle, 0);
         }
-        
+
     } /***** end of quasi-loop, we stop with the break above */
     while (TRUE);
-    
+
     return 0;
 }
 
@@ -2452,28 +3950,29 @@ int gmx_pme_do(gmx_pme_t pme,
                int start,       int homenr,
                rvec x[],        rvec f[],
                real *chargeA,   real *chargeB,
-               matrix box,     t_commrec *cr,
+               matrix box, t_commrec *cr,
                int  maxshift_x, int maxshift_y,
                t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
                matrix vir,      real ewaldcoeff,
-               real *energy,    real lambda, 
+               real *energy,    real lambda,
                real *dvdlambda, int flags)
 {
     int     q,d,i,j,ntot,npme;
     int     nx,ny,nz;
     int     n_d,local_ny;
-    int     loop_count;
     pme_atomcomm_t *atc=NULL;
-    real *  grid=NULL;
+    pmegrids_t *pmegrid=NULL;
+    real    *grid=NULL;
     real    *ptr;
     rvec    *x_d,*f_d;
-    real    *charge=NULL,*q_d,vol;
+    real    *charge=NULL,*q_d;
     real    energy_AB[2];
     matrix  vir_AB[2];
-    gmx_bool    bClearF;
+    gmx_bool bClearF;
     gmx_parallel_3dfft_t pfft_setup;
     real *  fftgrid;
     t_complex * cfftgrid;
+    int     thread;
 
     if (pme->nnodes > 1) {
         atc = &pme->atc[0];
@@ -2489,21 +3988,22 @@ int gmx_pme_do(gmx_pme_t pme,
         /* This could be necessary for TPI */
         pme->atc[0].n = homenr;
     }
-    
+
     for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
         if (q == 0) {
-            grid = pme->pmegridA;
+            pmegrid = &pme->pmegridA;
             fftgrid = pme->fftgridA;
             cfftgrid = pme->cfftgridA;
             pfft_setup = pme->pfft_setupA;
             charge = chargeA+start;
         } else {
-            grid = pme->pmegridB;
+            pmegrid = &pme->pmegridB;
             fftgrid = pme->fftgridB;
             cfftgrid = pme->cfftgridB;
             pfft_setup = pme->pfft_setupB;
             charge = chargeB+start;
         }
+        grid = pmegrid->grid.grid;
         /* Unpack structure */
         if (debug) {
             fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
@@ -2513,8 +4013,8 @@ int gmx_pme_do(gmx_pme_t pme,
                 gmx_fatal(FARGS,"No grid!");
         }
         where();
-        
-        m_inv_ur0(box,pme->recipbox); 
+
+        m_inv_ur0(box,pme->recipbox);
 
         if (pme->nnodes == 1) {
             atc = &pme->atc[0];
@@ -2548,9 +4048,9 @@ int gmx_pme_do(gmx_pme_t pme,
                     srenew(atc->pd,atc->pd_nalloc);
                 }
                 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
-                pme_calc_pidx(n_d,pme->recipbox,x_d,atc);
+                pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
                 where();
-                
+
                 GMX_BARRIER(cr->mpi_comm_mygroup);
                 /* Redistribute x (only once) and qA or qB */
                 if (DOMAINDECOMP(cr)) {
@@ -2563,7 +4063,7 @@ int gmx_pme_do(gmx_pme_t pme,
 
             wallcycle_stop(wcycle,ewcPME_REDISTXF);
         }
-        
+
         if (debug)
             fprintf(debug,"Node= %6d, pme local particles=%6d\n",
                     cr->nodeid,atc->n);
@@ -2574,9 +4074,9 @@ int gmx_pme_do(gmx_pme_t pme,
 
             /* Spread the charges on a grid */
             GMX_MPE_LOG(ev_spread_on_grid_start);
-            
+
             /* Spread the charges on a grid */
-            spread_on_grid(pme,&pme->atc[0],grid,q==0,TRUE);
+            spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
             GMX_MPE_LOG(ev_spread_on_grid_finish);
 
             if (q == 0)
@@ -2586,75 +4086,114 @@ int gmx_pme_do(gmx_pme_t pme,
             inc_nrnb(nrnb,eNR_SPREADQBSP,
                      pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
 
-            wrap_periodic_pmegrid(pme,grid);
+            if (pme->nthread == 1)
+            {
+                wrap_periodic_pmegrid(pme,grid);
 
-            /* sum contributions to local grid from other nodes */
+                /* sum contributions to local grid from other nodes */
 #ifdef GMX_MPI
-            if (pme->nnodes > 1) {
-                GMX_BARRIER(cr->mpi_comm_mygroup);
-                gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
-                where();
-            }
+                if (pme->nnodes > 1)
+                {
+                    GMX_BARRIER(cr->mpi_comm_mygroup);
+                    gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
+                    where();
+                }
 #endif
-            where();
 
-            copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
+                copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
+            }
 
             wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
-        }
-         
-        if (flags & GMX_PME_SOLVE)
-        {
-            /* do 3d-fft */ 
-            GMX_BARRIER(cr->mpi_comm_mygroup);
-            GMX_MPE_LOG(ev_gmxfft3d_start);
-            wallcycle_start(wcycle,ewcPME_FFT);
-            gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,fftgrid,cfftgrid);
-            wallcycle_stop(wcycle,ewcPME_FFT);
-            GMX_MPE_LOG(ev_gmxfft3d_finish);
-            where();
-            
-            /* solve in k-space for our local cells */
-            vol = det(box);
-            GMX_BARRIER(cr->mpi_comm_mygroup);
-            GMX_MPE_LOG(ev_solve_pme_start);
-            wallcycle_start(wcycle,ewcPME_SOLVE);
-            loop_count =
-                solve_pme_yzx(pme,cfftgrid,ewaldcoeff,vol,
-                              flags & GMX_PME_CALC_ENER_VIR,
-                              &energy_AB[q],vir_AB[q]);
-            wallcycle_stop(wcycle,ewcPME_SOLVE);
-            where();
-            GMX_MPE_LOG(ev_solve_pme_finish);
-            inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
+
+            /*
+            dump_local_fftgrid(pme,fftgrid);
+            exit(0);
+            */
         }
 
-        if ((flags & GMX_PME_CALC_F) ||
-            (flags & GMX_PME_CALC_POT))
+        /* Here we start a large thread parallel region */
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+        for(thread=0; thread<pme->nthread; thread++)
         {
-            
-            /* do 3d-invfft */
-            GMX_BARRIER(cr->mpi_comm_mygroup);
-            GMX_MPE_LOG(ev_gmxfft3d_start);
-            where();
-            wallcycle_start(wcycle,ewcPME_FFT);
-            gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,cfftgrid,fftgrid);
-            wallcycle_stop(wcycle,ewcPME_FFT);
+            if (flags & GMX_PME_SOLVE)
+            {
+                int loop_count;
 
-            where();
-            GMX_MPE_LOG(ev_gmxfft3d_finish);
+                /* do 3d-fft */
+                if (thread == 0)
+                {
+                    GMX_BARRIER(cr->mpi_comm_mygroup);
+                    GMX_MPE_LOG(ev_gmxfft3d_start);
+                    wallcycle_start(wcycle,ewcPME_FFT);
+                }
+                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
+                                           fftgrid,cfftgrid,thread,wcycle);
+                if (thread == 0)
+                {
+                    wallcycle_stop(wcycle,ewcPME_FFT);
+                    GMX_MPE_LOG(ev_gmxfft3d_finish);
+                }
+                where();
 
-            if (pme->nodeid == 0)
-            {
-                ntot = pme->nkx*pme->nky*pme->nkz;
-                npme  = ntot*log((real)ntot)/log(2.0);
-                inc_nrnb(nrnb,eNR_FFT,2*npme);
+                /* solve in k-space for our local cells */
+                if (thread == 0)
+                {
+                    GMX_BARRIER(cr->mpi_comm_mygroup);
+                    GMX_MPE_LOG(ev_solve_pme_start);
+                    wallcycle_start(wcycle,ewcPME_SOLVE);
+                }
+                loop_count =
+                    solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
+                                  box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+                                  flags & GMX_PME_CALC_ENER_VIR,
+                                  pme->nthread,thread);
+                if (thread == 0)
+                {
+                    wallcycle_stop(wcycle,ewcPME_SOLVE);
+                    where();
+                    GMX_MPE_LOG(ev_solve_pme_finish);
+                    inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
+                }
             }
 
-            wallcycle_start(wcycle,ewcPME_SPREADGATHER);
+            if (flags & GMX_PME_CALC_F)
+            {
+                /* do 3d-invfft */
+                if (thread == 0)
+                {
+                    GMX_BARRIER(cr->mpi_comm_mygroup);
+                    GMX_MPE_LOG(ev_gmxfft3d_start);
+                    where();
+                    wallcycle_start(wcycle,ewcPME_FFT);
+                }
+                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
+                                           cfftgrid,fftgrid,thread,wcycle);
+                if (thread == 0)
+                {
+                    wallcycle_stop(wcycle,ewcPME_FFT);
+
+                    where();
+                    GMX_MPE_LOG(ev_gmxfft3d_finish);
+
+                    if (pme->nodeid == 0)
+                    {
+                        ntot = pme->nkx*pme->nky*pme->nkz;
+                        npme  = ntot*log((real)ntot)/log(2.0);
+                        inc_nrnb(nrnb,eNR_FFT,2*npme);
+                    }
+
+                    wallcycle_start(wcycle,ewcPME_SPREADGATHER);
+                }
 
-            copy_fftgrid_to_pmegrid(pme,fftgrid,grid);
+                copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
+            }
+        }
+        /* End of thread parallel section.
+         * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
+         */
 
+        if (flags & GMX_PME_CALC_F)
+        {
             /* distribute local grid to all nodes */
 #ifdef GMX_MPI
             if (pme->nnodes > 1) {
@@ -2665,33 +4204,44 @@ int gmx_pme_do(gmx_pme_t pme,
             where();
 
             unwrap_periodic_pmegrid(pme,grid);
-        }
 
-        if (flags & GMX_PME_CALC_F)
-        {
             /* interpolate forces for our local atoms */
             GMX_BARRIER(cr->mpi_comm_mygroup);
             GMX_MPE_LOG(ev_gather_f_bsplines_start);
 
             where();
-            
+
             /* If we are running without parallelization,
              * atc->f is the actual force array, not a buffer,
              * therefore we should not clear it.
              */
             bClearF = (q == 0 && PAR(cr));
-            gather_f_bsplines(pme,grid,bClearF,&pme->atc[0],
-                              pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+            for(thread=0; thread<pme->nthread; thread++)
+            {
+                gather_f_bsplines(pme,grid,bClearF,atc,
+                                  &atc->spline[thread],
+                                  pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
+            }
+
             where();
-            
+
             GMX_MPE_LOG(ev_gather_f_bsplines_finish);
-            
+
             inc_nrnb(nrnb,eNR_GATHERFBSP,
                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
             wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
-       }
+        }
+
+        if (flags & GMX_PME_CALC_ENER_VIR)
+        {
+            /* This should only be called on the master thread
+             * and after the threads have synchronized.
+             */
+            get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
+        }
     } /* of q-loop */
-    
+
     if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
         wallcycle_start(wcycle,ewcPME_REDISTXF);
         for(d=0; d<pme->ndecompdim; d++)
@@ -2719,7 +4269,7 @@ int gmx_pme_do(gmx_pme_t pme,
         wallcycle_stop(wcycle,ewcPME_REDISTXF);
     }
     where();
-    
+
     if (!pme->bFEP) {
         *energy = energy_AB[0];
         m_add(vir,vir_AB[0],vir);
@@ -2732,7 +4282,9 @@ int gmx_pme_do(gmx_pme_t pme,
     }
 
     if (debug)
+    {
         fprintf(debug,"PME mesh energy: %g\n",*energy);
-    
+    }
+
     return 0;
 }
diff --git a/src/mdlib/pme_sse_single.h b/src/mdlib/pme_sse_single.h
new file mode 100644 (file)
index 0000000..1b0b617
--- /dev/null
@@ -0,0 +1,323 @@
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ *                This source code is part of
+ *
+ *                 G   R   O   M   A   C   S
+ *
+ *          GROningen MAchine for Chemical Simulations
+ *
+ *                        VERSION 4.5
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * GROwing Monsters And Cloning Shrimps
+ */
+
+/* This include file has code between #ifdef's to make sure
+ * that this performance-sensitive code is inlined
+ * and that conditionals and variable loop bounds are removed at compile time.
+ */
+
+#ifdef PME_SPREAD_SSE_ORDER4
+/* This code does not assume any memory alignment.
+ * This code only works for pme_order = 4.
+ */
+{
+    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3;
+    __m128 tz_SSE;
+    __m128 vx_SSE;
+    __m128 vx_tz_SSE;
+    __m128 sum_SSE0,sum_SSE1,sum_SSE2,sum_SSE3;
+    __m128 gri_SSE0,gri_SSE1,gri_SSE2,gri_SSE3;
+
+    ty_SSE0 = _mm_load1_ps(&thy[0]);
+    ty_SSE1 = _mm_load1_ps(&thy[1]);
+    ty_SSE2 = _mm_load1_ps(&thy[2]);
+    ty_SSE3 = _mm_load1_ps(&thy[3]);
+
+    tz_SSE  = _mm_loadu_ps(thz);
+
+    for(ithx=0; (ithx<4); ithx++)
+    {
+        index_x = (i0+ithx)*pny*pnz;
+        valx    = qn*thx[ithx];
+
+        vx_SSE   = _mm_load1_ps(&valx);
+
+        vx_tz_SSE = _mm_mul_ps(vx_SSE,tz_SSE);
+
+        gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0);
+        gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0);
+        gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0);
+        gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0);
+
+        sum_SSE0 = _mm_add_ps(gri_SSE0,_mm_mul_ps(vx_tz_SSE,ty_SSE0));
+        sum_SSE1 = _mm_add_ps(gri_SSE1,_mm_mul_ps(vx_tz_SSE,ty_SSE1));
+        sum_SSE2 = _mm_add_ps(gri_SSE2,_mm_mul_ps(vx_tz_SSE,ty_SSE2));
+        sum_SSE3 = _mm_add_ps(gri_SSE3,_mm_mul_ps(vx_tz_SSE,ty_SSE3));
+
+        _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0,sum_SSE0);
+        _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0,sum_SSE1);
+        _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0,sum_SSE2);
+        _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0,sum_SSE3);
+    }
+}
+#undef PME_SPREAD_SSE_ORDER4
+#endif
+
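The block above vectorizes the inner z-loop of the order-4 charge spread over the four SSE lanes. For reference, a scalar sketch of the equivalent update; float stands in for the GROMACS real type, and the meanings of qn, thx/thy/thz, i0/j0/k0 and pny/pnz are assumed to be those of the SSE code above:

/* Scalar reference for the order-4 spread: add the separable contribution
 * qn*thx[i]*thy[j]*thz[k] of one charge to a grid padded to pny*pnz per
 * x-slab, starting at grid point (i0,j0,k0).
 */
static void spread_order4_scalar(float *grid, float qn,
                                 const float *thx, const float *thy,
                                 const float *thz,
                                 int i0, int j0, int k0, int pny, int pnz)
{
    int ithx, ithy, ithz;

    for (ithx = 0; ithx < 4; ithx++)
    {
        int   index_x = (i0+ithx)*pny*pnz;
        float valx    = qn*thx[ithx];

        for (ithy = 0; ithy < 4; ithy++)
        {
            int   index_xy = index_x + (j0+ithy)*pnz;
            float valxy    = valx*thy[ithy];

            for (ithz = 0; ithz < 4; ithz++)
            {
                grid[index_xy + k0 + ithz] += valxy*thz[ithz];
            }
        }
    }
}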
+
+#ifdef PME_GATHER_F_SSE_ORDER4
+/* This code does not assume any memory alignment.
+ * This code only works for pme_order = 4.
+ */
+{
+    float fx_tmp[4],fy_tmp[4],fz_tmp[4];
+
+    __m128 fx_SSE,fy_SSE,fz_SSE;
+
+    __m128 tx_SSE,ty_SSE,tz_SSE;
+    __m128 dx_SSE,dy_SSE,dz_SSE;
+
+    __m128 gval_SSE;
+
+    __m128 fxy1_SSE;
+    __m128 fz1_SSE;
+
+    fx_SSE = _mm_setzero_ps();
+    fy_SSE = _mm_setzero_ps();
+    fz_SSE = _mm_setzero_ps();
+
+    tz_SSE  = _mm_loadu_ps(thz);
+    dz_SSE  = _mm_loadu_ps(dthz);
+
+    for(ithx=0; (ithx<4); ithx++)
+    {
+        index_x = (i0+ithx)*pny*pnz;
+        tx_SSE   = _mm_load1_ps(thx+ithx);
+        dx_SSE   = _mm_load1_ps(dthx+ithx);
+
+        for(ithy=0; (ithy<4); ithy++)
+        {
+            index_xy = index_x+(j0+ithy)*pnz;
+            ty_SSE   = _mm_load1_ps(thy+ithy);
+            dy_SSE   = _mm_load1_ps(dthy+ithy);
+
+            gval_SSE = _mm_loadu_ps(grid+index_xy+k0);
+
+            fxy1_SSE = _mm_mul_ps(tz_SSE,gval_SSE);
+            fz1_SSE  = _mm_mul_ps(dz_SSE,gval_SSE);
+
+            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
+            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
+            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
+        }
+    }
+
+    _mm_storeu_ps(fx_tmp,fx_SSE);
+    _mm_storeu_ps(fy_tmp,fy_SSE);
+    _mm_storeu_ps(fz_tmp,fz_SSE);
+
+    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
+    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
+    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
+}
+#undef PME_GATHER_F_SSE_ORDER4
+#endif
+
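Likewise, a scalar sketch of the order-4 force gather above; float again stands in for real, and the caller is assumed to apply the charge and any sign or scaling factors afterwards, as the surrounding gather code (not shown in this hunk) presumably does:

/* Scalar reference for the order-4 gather: accumulate the three force
 * components of one charge from the grid, using the B-spline values
 * (thx/thy/thz) and their derivatives (dthx/dthy/dthz).
 */
static void gather_f_order4_scalar(const float *grid,
                                   const float *thx,  const float *thy,
                                   const float *thz,
                                   const float *dthx, const float *dthy,
                                   const float *dthz,
                                   int i0, int j0, int k0, int pny, int pnz,
                                   float *fx, float *fy, float *fz)
{
    int ithx, ithy, ithz;

    for (ithx = 0; ithx < 4; ithx++)
    {
        int index_x = (i0+ithx)*pny*pnz;

        for (ithy = 0; ithy < 4; ithy++)
        {
            int index_xy = index_x + (j0+ithy)*pnz;

            for (ithz = 0; ithz < 4; ithz++)
            {
                float gval = grid[index_xy + k0 + ithz];

                *fx += dthx[ithx]* thy[ithy]* thz[ithz]*gval;
                *fy +=  thx[ithx]*dthy[ithy]* thz[ithz]*gval;
                *fz +=  thx[ithx]* thy[ithy]*dthz[ithz]*gval;
            }
        }
    }
}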
+
+#ifdef PME_SPREAD_SSE_ALIGNED
+/* This code assumes that the grid is allocated 16-byte aligned
+ * and that pnz is a multiple of 4.
+ * This code supports pme_order <= 5.
+ */
+{
+    int offset;
+    int index;
+    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3,ty_SSE4;
+    __m128 tz_SSE0;
+    __m128 tz_SSE1;
+    __m128 vx_SSE;
+    __m128 vx_tz_SSE0;
+    __m128 vx_tz_SSE1;
+    __m128 sum_SSE00,sum_SSE01,sum_SSE02,sum_SSE03,sum_SSE04;
+    __m128 sum_SSE10,sum_SSE11,sum_SSE12,sum_SSE13,sum_SSE14;
+    __m128 gri_SSE00,gri_SSE01,gri_SSE02,gri_SSE03,gri_SSE04;
+    __m128 gri_SSE10,gri_SSE11,gri_SSE12,gri_SSE13,gri_SSE14;
+
+    offset = k0 & 3;
+
+    ty_SSE0 = _mm_load1_ps(&thy[0]);
+    ty_SSE1 = _mm_load1_ps(&thy[1]);
+    ty_SSE2 = _mm_load1_ps(&thy[2]);
+    ty_SSE3 = _mm_load1_ps(&thy[3]);
+#if PME_ORDER == 5
+    ty_SSE4 = _mm_load1_ps(&thy[4]);
+#endif
+
+    tz_SSE0 = _mm_loadu_ps(thz-offset);
+    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
+    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
+    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);
+
+    for(ithx=0; (ithx<PME_ORDER); ithx++)
+    {
+        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
+        valx  = qn*thx[ithx];
+
+        vx_SSE   = _mm_load1_ps(&valx);
+
+        vx_tz_SSE0 = _mm_mul_ps(vx_SSE,tz_SSE0);
+        vx_tz_SSE1 = _mm_mul_ps(vx_SSE,tz_SSE1);
+
+        gri_SSE00 = _mm_load_ps(grid+index+0*pnz);
+        gri_SSE01 = _mm_load_ps(grid+index+1*pnz);
+        gri_SSE02 = _mm_load_ps(grid+index+2*pnz);
+        gri_SSE03 = _mm_load_ps(grid+index+3*pnz);
+#if PME_ORDER == 5
+        gri_SSE04 = _mm_load_ps(grid+index+4*pnz);
+#endif
+        gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4);
+        gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4);
+        gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4);
+        gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4);
+#if PME_ORDER == 5
+        gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4);
+#endif
+
+        sum_SSE00 = _mm_add_ps(gri_SSE00,_mm_mul_ps(vx_tz_SSE0,ty_SSE0));
+        sum_SSE01 = _mm_add_ps(gri_SSE01,_mm_mul_ps(vx_tz_SSE0,ty_SSE1));
+        sum_SSE02 = _mm_add_ps(gri_SSE02,_mm_mul_ps(vx_tz_SSE0,ty_SSE2));
+        sum_SSE03 = _mm_add_ps(gri_SSE03,_mm_mul_ps(vx_tz_SSE0,ty_SSE3));
+#if PME_ORDER == 5
+        sum_SSE04 = _mm_add_ps(gri_SSE04,_mm_mul_ps(vx_tz_SSE0,ty_SSE4));
+#endif
+        sum_SSE10 = _mm_add_ps(gri_SSE10,_mm_mul_ps(vx_tz_SSE1,ty_SSE0));
+        sum_SSE11 = _mm_add_ps(gri_SSE11,_mm_mul_ps(vx_tz_SSE1,ty_SSE1));
+        sum_SSE12 = _mm_add_ps(gri_SSE12,_mm_mul_ps(vx_tz_SSE1,ty_SSE2));
+        sum_SSE13 = _mm_add_ps(gri_SSE13,_mm_mul_ps(vx_tz_SSE1,ty_SSE3));
+#if PME_ORDER == 5
+        sum_SSE14 = _mm_add_ps(gri_SSE14,_mm_mul_ps(vx_tz_SSE1,ty_SSE4));
+#endif
+
+        _mm_store_ps(grid+index+0*pnz,sum_SSE00);
+        _mm_store_ps(grid+index+1*pnz,sum_SSE01);
+        _mm_store_ps(grid+index+2*pnz,sum_SSE02);
+        _mm_store_ps(grid+index+3*pnz,sum_SSE03);
+#if PME_ORDER == 5
+        _mm_store_ps(grid+index+4*pnz,sum_SSE04);
+#endif
+        _mm_store_ps(grid+index+0*pnz+4,sum_SSE10);
+        _mm_store_ps(grid+index+1*pnz+4,sum_SSE11);
+        _mm_store_ps(grid+index+2*pnz+4,sum_SSE12);
+        _mm_store_ps(grid+index+3*pnz+4,sum_SSE13);
+#if PME_ORDER == 5
+        _mm_store_ps(grid+index+4*pnz+4,sum_SSE14);
+#endif
+    }
+}
+#undef PME_ORDER
+#undef PME_SPREAD_SSE_ALIGNED
+#endif
+
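The aligned variants round k0 down to a multiple of 4 (offset = k0 & 3) and use precomputed masks to zero the thz/dthz elements that fall outside [k0, k0+order). A sketch of how such masks could be built for an 8-float window and order <= 5; the real setup is done by sse_mask_init() (called from gmx_pme_init above but not part of this hunk), so treat the details here as an assumption:

#include <xmmintrin.h>

/* Build per-offset masks that select elements [offset, offset+order) of an
 * 8-float window, split into a low and a high __m128 half, for use with
 * _mm_and_ps as in the aligned kernels above.
 */
static void build_spread_masks(int order, __m128 mask0[4], __m128 mask1[4])
{
    float  sel[8];
    __m128 zero = _mm_setzero_ps();
    int    offset, i;

    for (offset = 0; offset < 4; offset++)
    {
        for (i = 0; i < 8; i++)
        {
            sel[i] = (i >= offset && i < offset + order) ? 1.0f : 0.0f;
        }
        /* Comparing the 1.0f selectors against zero yields all-ones bit
         * patterns, so _mm_and_ps passes those lanes through unchanged
         * and zeroes the rest.
         */
        mask0[offset] = _mm_cmpgt_ps(_mm_loadu_ps(sel),     zero);
        mask1[offset] = _mm_cmpgt_ps(_mm_loadu_ps(sel + 4), zero);
    }
}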
+
+#ifdef PME_GATHER_F_SSE_ALIGNED
+/* This code assumes that the grid is allocated 16-byte aligned
+ * and that pnz is a multiple of 4.
+ * This code supports pme_order <= 5.
+ */
+{
+    int   offset;
+
+    float fx_tmp[4],fy_tmp[4],fz_tmp[4];
+
+    __m128 fx_SSE,fy_SSE,fz_SSE;
+
+    __m128 tx_SSE,ty_SSE,tz_SSE0,tz_SSE1;
+    __m128 dx_SSE,dy_SSE,dz_SSE0,dz_SSE1;
+
+    __m128 gval_SSE0;
+    __m128 gval_SSE1;
+
+    __m128 fxy1_SSE0;
+    __m128 fz1_SSE0;
+    __m128 fxy1_SSE1;
+    __m128 fz1_SSE1;
+    __m128 fxy1_SSE;
+    __m128 fz1_SSE;
+
+    offset = k0 & 3;
+
+    fx_SSE = _mm_setzero_ps();
+    fy_SSE = _mm_setzero_ps();
+    fz_SSE = _mm_setzero_ps();
+
+    tz_SSE0 = _mm_loadu_ps(thz-offset);
+    dz_SSE0 = _mm_loadu_ps(dthz-offset);
+    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
+    dz_SSE1 = _mm_loadu_ps(dthz-offset+4);
+    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
+    dz_SSE0 = _mm_and_ps(dz_SSE0,work->mask_SSE0[offset]);
+    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);
+    dz_SSE1 = _mm_and_ps(dz_SSE1,work->mask_SSE1[offset]);
+
+    for(ithx=0; (ithx<PME_ORDER); ithx++)
+    {
+        index_x = (i0+ithx)*pny*pnz;
+        tx_SSE   = _mm_load1_ps(thx+ithx);
+        dx_SSE   = _mm_load1_ps(dthx+ithx);
+
+        for(ithy=0; (ithy<PME_ORDER); ithy++)
+        {
+            index_xy = index_x+(j0+ithy)*pnz;
+            ty_SSE   = _mm_load1_ps(thy+ithy);
+            dy_SSE   = _mm_load1_ps(dthy+ithy);
+
+            gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset);
+            gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4);
+
+            fxy1_SSE0 = _mm_mul_ps(tz_SSE0,gval_SSE0);
+            fz1_SSE0  = _mm_mul_ps(dz_SSE0,gval_SSE0);
+            fxy1_SSE1 = _mm_mul_ps(tz_SSE1,gval_SSE1);
+            fz1_SSE1  = _mm_mul_ps(dz_SSE1,gval_SSE1);
+
+            fxy1_SSE = _mm_add_ps(fxy1_SSE0,fxy1_SSE1);
+            fz1_SSE  = _mm_add_ps(fz1_SSE0,fz1_SSE1);
+
+            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
+            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
+            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
+        }
+    }
+
+    _mm_store_ps(fx_tmp,fx_SSE);
+    _mm_store_ps(fy_tmp,fy_SSE);
+    _mm_store_ps(fz_tmp,fz_SSE);
+
+    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
+    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
+    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
+}
+#undef PME_ORDER
+#undef PME_GATHER_F_SSE_ALIGNED
+#endif
index 6c1dd5718726426b93cdafa082217393d096ed48..8c2c89cd9448b0355493a9e72fe45e221f5d1fde 100644 (file)
 #include <config.h>
 #endif
 
+#ifdef __linux
+#define _GNU_SOURCE
+#include <sched.h>
+#include <sys/syscall.h>
+#endif
 #include <signal.h>
 #include <stdlib.h>
+
 #include "typedefs.h"
 #include "smalloc.h"
 #include "sysstuff.h"
 #include "tmpi.h"
 #endif
 
+#ifdef GMX_OPENMP
+#include <omp.h>
+#endif
+
 /* afm stuf */
 #include "pull.h"
 
@@ -2747,7 +2757,7 @@ int mdrunner_membed(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
     gmx_edsam_t ed=NULL;
     t_commrec   *cr_old=cr;
     int        nthreads=1,nthreads_requested=1;
-
+    int         omp_nthreads = 1;
 
        char                    *ins;
        int                     rm_bonded_at,fr_id,fr_i=0,tmp_id,warn=0;
@@ -3159,7 +3169,32 @@ int mdrunner_membed(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
         gmx_setup_nodecomm(fplog,cr);
     }
 
-    wcycle = wallcycle_init(fplog,resetstep,cr);
+    /* Get the number of OpenMP/PME threads.
+     * The environment variable should be read on only one node
+     * to make sure it is identical everywhere. */
+#ifdef GMX_OPENMP
+   if (EEL_PME(inputrec->coulombtype))
+   {
+       if (MASTER(cr))
+       {
+           char *ptr;
+           omp_nthreads = omp_get_max_threads();
+           if ((ptr=getenv("GMX_PME_NTHREADS")) != NULL)
+           {
+               sscanf(ptr,"%d",&omp_nthreads);
+           }
+           if (fplog!=NULL)
+           {
+               fprintf(fplog,"Using %d threads for PME\n",omp_nthreads);
+           }
+       }
+       if (PAR(cr))
+       {
+           gmx_bcast_sim(sizeof(omp_nthreads),&omp_nthreads,cr);
+       }
+   }
+#endif
+
+    wcycle = wallcycle_init(fplog,resetstep,cr, omp_nthreads);
     if (PAR(cr))
     {
         /* Master synchronizes its value of reset_counters with all nodes
@@ -3300,11 +3335,37 @@ int mdrunner_membed(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
             /* The PME only nodes need to know nChargePerturbed */
             gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
         }
+
+
+        /*set CPU affinity*/
+#ifdef GMX_OPENMP
+#ifdef __linux
+#ifdef GMX_LIB_MPI
+        {
+            int core;
+            MPI_Comm comm_intra; /* intra-node communicator (unlike nc.comm_intra, this one includes the PME nodes) */
+            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),gmx_node_rank(),&comm_intra);
+            int local_omp_nthreads = (cr->duty & DUTY_PME) ? omp_nthreads : 1; /* OpenMP threads used by this rank */
+            MPI_Scan(&local_omp_nthreads,&core, 1, MPI_INT, MPI_SUM, comm_intra);
+            core-=local_omp_nthreads; /* convert the inclusive scan into an exclusive scan */
+    #pragma omp parallel firstprivate(core) num_threads(local_omp_nthreads)
+            {
+                cpu_set_t mask;
+                CPU_ZERO(&mask);
+                core+=omp_get_thread_num();
+                CPU_SET(core,&mask);
+                sched_setaffinity((pid_t) syscall (SYS_gettid),sizeof(cpu_set_t),&mask);
+            }
+        }
+#endif /*GMX_LIB_MPI*/
+#endif /*__linux*/
+#endif /*GMX_OPENMP*/
+
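The affinity code above uses an MPI inclusive scan over the per-rank thread counts to find the first core index of each rank on a node, then pins every OpenMP thread to its own core. A standalone, Linux-only sketch of the pinning step, assuming first_core is already known for this process (in the patch it comes from the MPI_Scan):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#ifdef _OPENMP
#include <omp.h>
#endif

int main(void)
{
#ifdef _OPENMP
    int first_core = 0;   /* hypothetical; comes from the MPI_Scan above */

#pragma omp parallel firstprivate(first_core)
    {
        cpu_set_t mask;
        int       core = first_core + omp_get_thread_num();

        CPU_ZERO(&mask);
        CPU_SET(core, &mask);
        /* Pin this OpenMP thread (identified by its Linux tid) to one core */
        if (sched_setaffinity((pid_t)syscall(SYS_gettid),
                              sizeof(cpu_set_t), &mask) != 0)
        {
            perror("sched_setaffinity");
        }
    }
#endif

    return 0;
}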
         if (cr->duty & DUTY_PME)
         {
             status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
                                   mtop ? mtop->natoms : 0,nChargePerturbed,
-                                  (Flags & MD_REPRODUCIBLE));
+                                  (Flags & MD_REPRODUCIBLE),omp_nthreads);
             if (status != 0)
             {
                 gmx_fatal(FARGS,"Error %d initializing PME",status);